├── .gitignore ├── LICENSE ├── README.md ├── benchmarks └── README.md ├── docs ├── .nojekyll ├── BCQ.md ├── C51.md ├── CONTRIBUTE.md ├── MOQ-Learning.md ├── NoisyDQN.md ├── PER_DQN.md ├── PPO.md ├── QLearning.md ├── README.md ├── Rainbow.md ├── _sidebar.md ├── algo_cfg.md ├── api.md ├── basic_concept.md ├── dev │ └── offline_run.md ├── devlop.md ├── docsify.md ├── figs │ ├── DQN_pseu.png │ ├── branch_merge.png │ ├── collector.png │ ├── data_flow.png │ ├── interaction_mdp.png │ ├── interactor_learner.png │ ├── overall_framework.png │ ├── recorder.png │ └── tasks_dir.png ├── general_cfg.md ├── hyper_cfg.md ├── index.html ├── multiprocessing.md ├── multiprocessing_DRL │ ├── mp_DQN.svg │ └── multiprocessing_DQN.md ├── ray_DRL │ ├── ray_DRL.md │ └── results.png └── usage.md ├── examples ├── custom_env.py ├── quick_start_1.py └── quick_start_2.py ├── joyrl ├── __init__.py ├── algos │ ├── A3C │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ └── policy.py │ ├── CategoricalDQN │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ └── policy.py │ ├── DDPG │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ ├── model.py │ │ └── policy.py │ ├── DQN │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ └── policy.py │ ├── DoubleDQN │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ └── policy.py │ ├── DuelingDQN │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ └── policy.py │ ├── HierarchicalDQN │ │ ├── README.md │ │ ├── agent.py │ │ ├── task0.py │ │ └── train.py │ ├── MonteCarlo │ │ ├── agent.py │ │ └── task0.py │ ├── NoisyDQN │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ └── policy.py │ ├── PPO │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ └── policy.py │ ├── PolicyGradient │ │ ├── main.py │ │ └── pg.py │ ├── QLearning │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ └── policy.py │ ├── RainbowDQN │ │ ├── rainbow_dqn.py │ │ └── task0.py │ ├── SAC-S │ │ └── sac.py │ ├── SAC │ │ └── sacd_cnn.py │ ├── Sarsa │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ └── policy.py │ ├── SoftActorCritic │ │ ├── env_wrapper.py │ │ ├── model.py │ │ ├── sac.py │ │ ├── task0.py │ │ └── task0_train.ipynb │ ├── SoftQ │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ └── policy.py │ ├── TD3 │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_handler.py │ │ ├── model.py │ │ └── policy.py │ ├── __init__.py │ └── base │ │ ├── __init__.py │ │ ├── action_layer.py │ │ ├── base_layer.py │ │ ├── buffer.py │ │ ├── data_handler.py │ │ ├── experience.py │ │ ├── network.py │ │ ├── noise.py │ │ ├── optm.py │ │ └── policy.py ├── envs │ ├── README.md │ ├── __init__.py │ ├── assets │ │ ├── action_grid.png │ │ ├── gym_info_20211130180023.png │ │ ├── image-20200820174307301.png │ │ ├── image-20200820174814084.png │ │ ├── image-20201007211441036.png │ │ ├── image-20201007211858925.png │ │ ├── image-20210429150622353.png │ │ ├── image-20210429150630806.png │ │ └── track_big.png │ ├── blackjack.py │ ├── cliff_walking.py │ ├── gridworld.py │ ├── gridworld_env.py │ ├── gym │ │ ├── __init__.py │ │ ├── config.py │ │ ├── toy_text │ │ │ └── cliff_walking.py │ │ └── wrappers.py │ ├── gym_info.md │ ├── mujoco_info.md │ ├── racetrack.py │ ├── racetrack_env.md │ ├── register.py │ ├── snake │ │ ├── README.md │ │ ├── agent.py │ │ ├── example_assignment_and_report2.pdf │ │ ├── main.py │ │ ├── snake_env.py │ │ └── utils.py │ ├── stochastic_mdp.py │ ├── track.txt │ └── 
windy_gridworld.py ├── framework │ ├── __init__.py │ ├── base.py │ ├── collector.py │ ├── config.py │ ├── core_types.py │ ├── interactor.py │ ├── learner.py │ ├── message.py │ ├── policy_mgr.py │ ├── recorder.py │ ├── tester.py │ ├── tracker.py │ ├── trainer.py │ └── utils.py ├── run.py └── scripts │ ├── __init__.py │ └── scripts.py ├── offline_run.py ├── presets ├── Atari │ ├── AirRaid-v5 │ │ └── AirRaid-v5_PPO.yaml │ ├── Breakout-v5 │ │ ├── Breakout-v5_DQN.yaml │ │ ├── Breakout-v5_PPO.yaml │ │ └── Breakout-v5_PPO_test.yaml │ ├── DemonAttack-v5 │ │ ├── DoubleDQN_DemonAttack-v5_Test.yaml │ │ └── DoubleDQN_DemonAttack-v5_Train.yaml │ └── Enduro-v5 │ │ └── Enduro-v5_DQN.yaml ├── BipedalWalker-v3_DDPG_mp_Train.yaml ├── Box2D │ ├── BipedalWalker-v3 │ │ ├── BipedalWalker-v3_DDPG_Test.yaml │ │ ├── BipedalWalker-v3_DDPG_Train.yaml │ │ ├── BipedalWalker-v3_PPO.yaml │ │ ├── BipedalWalker-v3_SAC_Test.yaml │ │ └── BipedalWalker-v3_SAC_Train.yaml │ ├── BipedalWalkerHardcore-v3 │ │ └── TD3_BipedalWalkerHardcore-v3.yaml │ ├── CarRacing-v2 │ │ ├── CarRacing-v2_PPO.yaml │ │ ├── DQN_carRacing-v2_Test.yaml │ │ ├── DQN_carRacing-v2_Train.yaml │ │ └── TD3_carRacing-v2_Train.yaml │ └── LunarLander-v2 │ │ ├── LunarLander-v2_PPO.yaml │ │ ├── LunarLander-v2_PPO_Test.yaml │ │ ├── LunarLanderContinuous-v2_SAC_Test.yaml │ │ └── LunarLanderContinuous-v2_SAC_Train.yaml ├── ClassControl │ ├── Acrobot-v1 │ │ ├── Acrobot-v1_DQN.yaml │ │ ├── Acrobot-v1_DoubleDQN.yaml │ │ ├── Acrobot-v1_DuelingDQN.yaml │ │ ├── Acrobot-v1_NoisyDQN.yaml │ │ └── Acrobot-v1_PPO.yaml │ ├── CartPole-v1 │ │ ├── CartPole-v1_A3C.yaml │ │ ├── CartPole-v1_BC_Test.yaml │ │ ├── CartPole-v1_BC_Train.yaml │ │ ├── CartPole-v1_C51_Test.yaml │ │ ├── CartPole-v1_C51_Train.yaml │ │ ├── CartPole-v1_CQL_Test.yaml │ │ ├── CartPole-v1_CQL_Train.yaml │ │ ├── CartPole-v1_CategoricalDQN.yaml │ │ ├── CartPole-v1_DQN.yaml │ │ ├── CartPole-v1_DQN_1.yaml │ │ ├── CartPole-v1_DRQN_Test.yaml │ │ ├── CartPole-v1_DRQN_Train.yaml │ │ ├── CartPole-v1_DoubleDQN.yaml │ │ ├── CartPole-v1_DuelingDQN.yaml │ │ ├── CartPole-v1_GAIL_Test.yaml │ │ ├── CartPole-v1_GAIL_Train.yaml │ │ ├── CartPole-v1_NoisyDQN.yaml │ │ ├── CartPole-v1_PER_DQN.yaml │ │ ├── CartPole-v1_PPO.yaml │ │ ├── CartPole-v1_PPO_off_policy.yaml │ │ ├── CartPole-v1_REINFORCE_Test.yaml │ │ ├── CartPole-v1_REINFORCE_Train.yaml │ │ ├── CartPole-v1_RainbowDQN_Test.yaml │ │ ├── CartPole-v1_RainbowDQN_Train.yaml │ │ ├── CartPole-v1_RainbowDQN_Train_mp.yaml │ │ ├── CartPole-v1_SAC_D_Test.yaml │ │ ├── CartPole-v1_SAC_D_Train.yaml │ │ └── CartPole-v1_SoftQ.yaml │ ├── MountainCar-v0 │ │ ├── MountainCar-v0_DQN.yaml │ │ └── MountainCar-v0_PPO.yaml │ ├── MountainCarContinuous-v0 │ │ ├── MountainCarContinuous-v0_PPO-test.yaml │ │ └── MountainCarContinuous-v0_PPO.yaml │ └── Pendulum-v1 │ │ ├── Pendulum-v1_DDPG.yaml │ │ ├── Pendulum-v1_DDPG_HER_Test.yaml │ │ ├── Pendulum-v1_DDPG_HER_Train.yaml │ │ ├── Pendulum-v1_PPO.yaml │ │ ├── Pendulum-v1_PPO_off_policy.yaml │ │ ├── Pendulum-v1_SAC_Train.yaml │ │ ├── Pendulum-v1_TD3.yaml │ │ ├── Pendulum-v1_TD3_BC_Test.yaml │ │ └── Pendulum-v1_TD3_BC_Train.yaml ├── External │ └── Mario │ │ ├── Mario_DQN_CNN_Test.yaml │ │ └── Mario_DQN_CNN_Train.yaml ├── Mujoco │ ├── Ant-v4 │ │ └── Ant-v4_PPO.yaml │ ├── HalfCheetah-v4 │ │ ├── HalfCheetah-v2_TD3_Test.yaml │ │ ├── HalfCheetah-v2_TD3_Train.yaml │ │ └── HalfCheetah-v4_PPO.yaml │ ├── Hopper-v4 │ │ └── Hopper-v4_PPO.yaml │ └── Reacher-v4 │ │ └── Reacher-v4_PPO.yaml ├── Others │ ├── Racetrack-v0 │ │ ├── Racetrack-v0_QLearning_Test.yaml │ │ ├── 
Racetrack-v0_QLearning_Train.yaml │ │ └── Train_Racetrack-v0_FirstVisitMC.yaml │ ├── deep-sea-treasure-v0 │ │ ├── DeepSeaTreasure-v0_MOQLearning_Test.yaml │ │ └── DeepSeaTreasure-v0_MOQLearning_Train.yaml │ └── theAlley │ │ ├── theAlley_VI_Test.yaml │ │ └── theAlley_VI_Train.yaml ├── Pendulum-v1_TD3_mp_Test.yaml ├── Pendulum-v1_TD3_mp_Train.yaml └── ToyText │ ├── CliffWalking-v0 │ ├── CliffWalking-v0_DynaQ_Test.yaml │ ├── CliffWalking-v0_DynaQ_Train.yaml │ ├── CliffWalking-v0_QLearning.yaml │ ├── CliffWalking-v0_Sarsa.yaml │ └── CustomCliffWalking-v0_DQN.yaml │ └── FrozenLake-v1 │ └── FrozenLake-v1_NoSlippery_QLearning.yaml ├── requirements.txt ├── run.bat ├── run.sh ├── setup.cfg ├── setup.py ├── setup.sh ├── stop.bat └── stop.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | __pycache__ 4 | .vscode 5 | .pypirc 6 | dist 7 | build 8 | joyrl.egg-info 9 | tasks 10 | test 11 | test.py 12 | *.log 13 | presets/*/*/*.ipynb 14 | presets/*/*/*/*.ckpt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Tianshou contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # benchmarks 2 | 3 | Save well trained models and test results here. 
4 | 5 | Now we have moved to: [Mega](https://mega.nz/folder/dwxEnRoT#qPiDkhL4eyzvcSfgLxIsHQ) and [天翼云盘,访问码:88yf](https://cloud.189.cn/web/share?code=7JrQRzfQf6Nn)。 -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/.nojekyll -------------------------------------------------------------------------------- /docs/BCQ.md: -------------------------------------------------------------------------------- 1 | # BCQ 算法参数说明 2 | 3 | ```python 4 | class AlgoConfig: 5 | def __init__(self): 6 | self.critic_hidden_dims = [400,300] # Critic隐藏单元 7 | self.actor_hidden_dims = [400,300] # Actor隐藏单元设置 8 | 9 | self.vae_hidden_dims = [750,750] # VAE 隐藏单元设置 10 | 11 | self.critic_lr = 1e-3 12 | self.actor_lr = 1e-3 13 | self.vae_lr = 1e-3 14 | self.batch_size = 128 15 | 16 | self.gamma = 0.99 17 | self.tau = 0.005 # Target critic/actor 更新的快慢 18 | self.lmbda = 0.75 # soft double Q learning: target_Q = lmbda * min(q1,q2) + (1-lmbda) * max(q1,q2) 19 | self.phi = 0.05 # BCQ 特有的参数, 表示 action 相比经验池中的action, 最大的波动范围 (Actor中使用) 20 | 21 | # train parameters 22 | self.iters_per_ep = 10 # 在train BCQ Agent时, 每一个 train_one_episode中 迭代的次数, 每一次迭代都是batch_size的经验。 23 | self.max_buffer_size = int(1e5) # BCQ Agent中 memories的经验池大小 24 | self.start_learn_buffer_size = 1e3 # memories的最少经验数量,少于此数量,会报错。 25 | 26 | # parameters for collecting data 27 | self.collect_explore_data = True # 收集数据时 DDPG是否加噪音 28 | self.behavior_agent_name = "DDPG" # 使用的行为智能体算法 29 | self.behavior_agent_parameters_path = "/behave_param.yaml" # 行为智能体的参数, 在BCQ目录下。 具体参数请看 行为智能体算法参数 30 | self.behavior_policy_path = "/behaviour_models" # 行为智能体的模型, 收集数据时需要用到 31 | self.collect_eps = 500 # 收集的数据 episode 数量 32 | ``` 33 | * `z_dim`: 这是 VAE中latent space的维度参数,固定为action dim的两倍,因此无需设置。 34 | 35 | 36 | # BCQ 训练过程 37 | BCQ算法属于offline RL,因此需要 行为智能体来收集数据, 然后根据数据进行学习,而不与环境进行交互。 38 | 算法执行步骤: 39 | 1. **生成行为智能题模型**:使用 DDPG算法 (可以自行更换其他算法) 与环境交互,模型学习好之后将模型保存。 40 | 2. **获取训练数据**: 主目录的config下开启 "collect" mode,采用“BCQ算法”, 将DDPG算法的model复制到 “BCQ/behaviour_models”下, 41 | 然后在 "tasks/你的训练文档/traj"下会生成memories对应的数据。 42 | 3. **训练BCQ agent**: 主目录下的config开启 "train" mode,采用“BCQ”算法,将上一步骤生成的"traj"复制到"BCQ/traj"下, 43 | 然后训练就可以结果。 **注意**:因为训练过程中不与环境交互,因此每次训练完智能体,我们都会test_one_episode,生成reward。 44 | 45 | 46 | # BCQ学习 47 | 48 | ## VAE介绍 49 | 50 | [一文理解变分自编码器(VAE)](https://zhuanlan.zhihu.com/p/64485020) 51 | [VAE手写体识别项目实现(详细注释)从小项目通俗理解变分自编码器(Variational Autoencoder, VAE)tu](https://blog.csdn.net/weixin_40015791/article/details/89735233) 52 | 53 | ## BCQ算法介绍 54 | 55 | 1. [BCQ 张楚珩](https://zhuanlan.zhihu.com/p/136844574) 56 | 2. [(RL)BCQ](https://zhuanlan.zhihu.com/p/206489894) 57 | 3. [BCQ github code](https://github.com/sfujim/BCQ/tree/master/continuous_BCQ) 58 | 4. [Batch RL与BCQ算法](https://zhuanlan.zhihu.com/p/269475418) 59 | -------------------------------------------------------------------------------- /docs/C51.md: -------------------------------------------------------------------------------- 1 | ## 算法参数说明 2 | 3 | C51的算法参数如下: 4 | 5 | ```python 6 | class AlgoConfig(DefaultConfig): 7 | def __init__(self): 8 | self.gamma = 0.99 # discount factor 9 | self.tau = 1.0 # 1.0 means hard update 10 | self.Vmin = 0. # support of C51 11 | self.Vmax = 200. 
# support of C51 12 | self.num_atoms = 51 # support of C51 13 | self.support = torch.linspace(self.Vmin, self.Vmax, self.num_atoms) # support of C51 14 | self.delta_z = (self.Vmax - self.Vmin) / (self.num_atoms - 1) # support of C51 15 | 16 | self.batch_size = 32 # batch size 17 | self.lr = 0.0001 # learning rate 18 | self.target_update = 200 # target network update frequency 19 | self.memory_capacity = 10000 # size of replay buffer 20 | self.epsilon_start = 0.95 # epsilon start value 21 | self.epsilon_end = 0.01 # epsilon end value 22 | self.epsilon_decay = 500 # epsilon decay rate 23 | ``` 24 | 25 | 其中Vmin表示支撑中的最小值,Vmax表示支撑中的最大值,num_atoms表示支撑的单元数,support表示C51中的支撑表示分布可能取到的值,delta_z表示相邻支撑单元之间的差距。 26 | 27 | 其他的参数内容和DQN基本保持一致。 -------------------------------------------------------------------------------- /docs/CONTRIBUTE.md: -------------------------------------------------------------------------------- 1 | # 如何贡献 2 | 3 | 欢迎广大开发爱好者为 JoyRL 贡献代码,如果你想参与bug修复的话,直接修改对应的代码然后PR即可,PR教程可参考[VS Code快速实现Git PR操作](https://blog.csdn.net/JohnJim0/article/details/128156442)。如果你想贡献新的算法的话,可以按照以下步骤进行,有问题的话随时交流~(微信:johnjim0816) 4 | 5 | ## 新建算法 6 | 7 | 首先在`algos`目录下新建文件夹,明明为你想要新增的算法名称,并且在`config.py`下配置好默认参数 8 | 9 | ## 配置参数 10 | 11 | 在`presets`下配置好`yaml`文件,包括`Train`和`Test`的部分 12 | 13 | ## 运行代码 14 | 15 | 调试好你的算法代码之后,分别训练和测试一次,将对应输出的文件夹放到`benchmarks`目录下 16 | 17 | ## 修改文档 18 | 19 | 在`docs/hyper_tuning.md`文件中写好你贡献的算法的参数说明,最后PR即可 -------------------------------------------------------------------------------- /docs/MOQ-Learning.md: -------------------------------------------------------------------------------- 1 | ## MOQ-learning 2 | 3 | ```python 4 | class AlgoConfig: 5 | def __init__(self) -> None: 6 | # set epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end 7 | self.epsilon_start = 0.95 # epsilon start value 8 | self.epsilon_end = 0.01 # epsilon end value 9 | self.epsilon_decay = 300 # epsilon decay rate 10 | self.gamma = 0.90 # discount factor 11 | self.lr = 0.1 # learning rate 12 | self.weights = [0.5, 0.5] # weights for scalarization 13 | ``` 14 | 15 | 其中gamma是强化学习中的折扣因子,一般调整在0.9-0.999之间即可,可以默认为0.99。weights为目标之间的权重向量;max_buffer_size、target_update以及epsilon都需要根据实际环境的情况来经验性的调整。 16 | 17 | MOQ-Learning中的epsilon的衰减机制和DQN的保持一致。总体来说,MOQ-Learning的参数和DQN大体一致,这里不再赘述。 18 | -------------------------------------------------------------------------------- /docs/NoisyDQN.md: -------------------------------------------------------------------------------- 1 | ## 算法参数说明 2 | 3 | NoisyDQN的算法参数如下,基本和DQN中的一致: 4 | 5 | ```python 6 | class AlgoConfig(DefaultConfig): 7 | def __init__(self) -> None: 8 | # set epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end 9 | self.epsilon_start = 0.95 # epsilon start value 10 | self.epsilon_end = 0.01 # epsilon end value 11 | self.epsilon_decay = 500 # epsilon decay rate 12 | self.hidden_dim = 256 # hidden_dim for MLP 13 | self.gamma = 0.95 # discount factor 14 | self.lr = 0.0001 # learning rate 15 | self.max_buffer_size = 100000 # size of replay buffer 16 | self.batch_size = 64 # batch size 17 | self.target_update = 4 # target network update frequency 18 | ``` 19 | 20 | 其中gamma是强化学习中的折扣因子,一般调整在0.9-0.999之间即可,可以默认为0.99。max_buffer_size、target_update以及epsilon都需要根据实际环境的情况来经验性的调整。 21 | 22 | NoisyDQN中的epsilon的衰减机制和DQN的保持一致。总体来说,NoisyDQN的参数和DQN大体一致,这里不再赘述。 -------------------------------------------------------------------------------- /docs/PER_DQN.md: -------------------------------------------------------------------------------- 1 | ## 算法参数说明 2 | 3 | PER_DQN的算法参数如下,基本和DQN中的一致: 
4 | 5 | ```python 6 | class AlgoConfig(DefaultConfig): 7 | def __init__(self) -> None: 8 | # set epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end 9 | self.epsilon_start = 0.95 # epsilon start value 10 | self.epsilon_end = 0.01 # epsilon end value 11 | self.epsilon_decay = 500 # epsilon decay rate 12 | self.hidden_dim = 256 # hidden_dim for MLP 13 | self.gamma = 0.95 # discount factor 14 | self.lr = 0.0001 # learning rate 15 | self.max_buffer_size = 100000 # size of replay buffer 16 | self.batch_size = 64 # batch size 17 | self.target_update = 4 # target network update frequency 18 | self.value_layers = [ 19 | {'layer_type': 'linear', 'layer_dim': ['n_states', 256], 20 | 'activation': 'relu'}, 21 | {'layer_type': 'linear', 'layer_dim': [256, 256], 22 | 'activation': 'relu'}, 23 | {'layer_type': 'linear', 'layer_dim': [256, 'n_actions'], 24 | 'activation': 'none'}] 25 | ``` 26 | 27 | 28 | 其中gamma是强化学习中的折扣因子,一般调整在0.9-0.999之间即可,可以默认为0.99。max_buffer_size、target_update以及epsilon都需要根据实际环境的情况来经验性的调整。 29 | 30 | PER_DQN中的epsilon的衰减机制和DQN的保持一致。 31 | 32 | 因为PER_DQN只改变了replay buffer,这里的参数相比DQN基本变化不大。 -------------------------------------------------------------------------------- /docs/PPO.md: -------------------------------------------------------------------------------- 1 | 2 | ## 算法参数说明 3 | 4 | PPO算法参数如下: 5 | 6 | ```python 7 | class AlgoConfig: 8 | def __init__(self): 9 | ppo_type = 'clip' # clip or kl 10 | self.gamma = 0.99 # discount factor 11 | self.k_epochs = 4 # update policy for K epochs 12 | self.actor_lr = 0.0003 # learning rate for actor 13 | self.critic_lr = 0.001 # learning rate for critic 14 | self.eps_clip = 0.2 # clip parameter for PPO 15 | self.entropy_coef = 0.01 # entropy coefficient 16 | self.update_freq = 100 # update policy every n steps 17 | self.actor_hidden_dim = 256 # hidden dimension for actor 18 | self.critic_hidden_dim = 256 # hidden dimension for critic 19 | # batch size 20 | self.train_batch_size = 100 # ppo train batch size 21 | self.sgd_batch_size = 64 # sgd batch size 22 | # continuous PPO 23 | self.continuous = False # continuous action space 24 | # KL parameter 25 | self.kl_target = 0.1 # target KL divergence 26 | self.kl_lambda = 0.5 # lambda for KL penalty, 0.5 is the default value in the paper 27 | self.kl_beta = 1.5 # beta for KL penalty, 1.5 is the default value in the paper 28 | self.kl_alpha = 2 # alpha for KL penalty, 2 is the default value in the paper 29 | ``` 30 | 31 | * `ppo_type`: PPO有两种Loss函数更新方式:clip方法和KL散度。现在一般都用clip方法更新,一方面因为KL调参比较费劲,另一方面clip方法基本可以满足所有需求 32 | * `eps_clip`:clip参数,一般设置为0.1-0.2之间即可 33 | * `entropy_coef`:策略熵损失系数,增加该系数提高actor的稳定性,保持0.0001-0.02即可,或者直接设置为0在一些问题中影响也不大 34 | * `update_freq`:更新频率,在JoyRL中设置为每隔几步更新,一般跟环境中每回合最大步数线性相关,例如carpole-v1环境中每回合最大步数是200,这里更新频率可以设置为50,100,200等等,这项参数需要根据实际经验调整 35 | * `k_epochs`:调整每次更新的epoch数,不能太大也不能太小,太大了一方面收敛速度会变慢,另一方面容易过拟合,太小了容易欠拟合 36 | * `train_batch_size`: 一般取值比较大(这里取100实际上是为了计算简便),当batch_size比较大时,训练的结果比较准确,但是训练速度比较慢 37 | * `sgd_batch_size`: 小批量样本,一般取值64或128。当batch_size特别小的时候,训练速度很快,但是训练结果准确性不高,这时就需要一个折中的办法,即使用小批量样本计算 38 | * `continuous`: 动作空间是否连续 39 | * `kl_target`: KL散度的目标值 40 | * `kl_lambda`: KL惩罚项的系数,PPO论文中的默认值是0.5 41 | * `kl_beta`: KL散度目标值的系数,默认值为1.5 42 | * `kl_alpha`: KL惩罚项的系数的更新参数,默认值为2 43 | -------------------------------------------------------------------------------- /docs/QLearning.md: -------------------------------------------------------------------------------- 1 | 2 | ## 算法参数说明 3 | 4 | PPO算法参数如下: 5 | 6 | ```python 7 | class AlgoConfig: 8 | def __init__(self) -> None: 9 | ## 
设置 epsilon_start=epsilon_end 可以得到固定的 epsilon,即等于epsilon_end 10 | self.epsilon_start = 0.95 # epsilon 初始值 11 | self.epsilon_end = 0.01 # epsilon 终止值 12 | self.epsilon_decay = 300 # epsilon 衰减率 13 | self.gamma = 0.90 # 奖励折扣因子 14 | self.lr = 0.1 # 学习率 15 | ``` 16 | 17 | * 适当调整`epsilon_decay`以保证`epsilon`在训练过程中不会过早衰减。 18 | * 由于传统强化学习算法面对的环境都比较简单,因此`gamma`一般设置为`0.9`,`lr`且设置得比较大,不用太担心过拟合的情况。 19 | -------------------------------------------------------------------------------- /docs/Rainbow.md: -------------------------------------------------------------------------------- 1 | ## 算法参数说明 2 | 3 | Rainbow的算法参数如下: 4 | 5 | ```python 6 | class AlgoConfig(DefaultConfig): 7 | def __init__(self): 8 | self.gamma = 0.99 # discount factor 9 | self.tau = 1.0 # 1.0 means hard update 10 | self.hidden_dim = 256 # hidden_dim for MLP 11 | self.Vmin = 0. # support of C51 12 | self.Vmax = 200. # support of C51 13 | self.num_atoms = 51 # support of C51 14 | self.support = torch.linspace(self.Vmin, self.Vmax, self.num_atoms) # support of C51 15 | self.delta_z = (self.Vmax - self.Vmin) / (self.num_atoms - 1) # support of C51 16 | 17 | self.n_step = 1 #the n_step for N-step DQN 18 | self.batch_size = 32 # batch size 19 | self.lr = 0.0001 # learning rate 20 | self.target_update = 200 # target network update frequency 21 | self.memory_capacity = 10000 # size of replay buffer 22 | self.epsilon_start = 0.95 # epsilon start value 23 | self.epsilon_end = 0.01 # epsilon end value 24 | self.epsilon_decay = 500 # epsilon decay rate 25 | ``` 26 | 27 | 参数配置和C51基本一致,其中增加的n_step表示n_step DQN的步长。同样,Vmin表示支撑中的最小值,Vmax表示支撑中的最大值,num_atoms表示支撑的单元数,support表示C51中的支撑表示分布可能取到的值,delta_z表示相邻支撑单元之间的差距。 28 | 29 | 剩下的参数都和DQN基本保持一致,这里不再赘述。 -------------------------------------------------------------------------------- /docs/_sidebar.md: -------------------------------------------------------------------------------- 1 | - [README](/) 2 | - [Basic Concept](./basic_concept.md) 3 | - [Usage](./usage.md) 4 | - [HyperParameter Config](./hyper_cfg.md) 5 | - [General Config](./general_cfg.md) 6 | - [Algo Config](./algo_cfg.md) 7 | - [API](./api.md) 8 | - [Contribution](./CONTRIBUTE.md) -------------------------------------------------------------------------------- /docs/algo_cfg.md: -------------------------------------------------------------------------------- 1 | The default parameter settings for the environment are stored in `joyrl/framework/envs/gym/config.py`, as follows: 2 | 3 | ### Q-learning 4 | 5 | ```python 6 | class AlgoConfig: 7 | def __init__(self) -> None: 8 | self.epsilon_start = 0.95 # epsilon start value 9 | self.epsilon_end = 0.01 # epsilon end value 10 | self.epsilon_decay = 300 # epsilon decay rate 11 | self.gamma = 0.90 # discount factor 12 | self.lr = 0.1 # learning rate 13 | ``` 14 | 15 | Note: 16 | 17 | * Set `epsilon_start=epsilon_end` can obtain fixed `epsilon=epsilon_end`. 18 | * Adjust `epsilon_decay` appropriately to ensure that `epsilon` will not decay too early during the training process. 19 | * Since the traditional reinforcement learning algorithm faces a relatively simple environment, `gamma` is generally set to `0.9`, and `lr` can be set to a relatively large value such as `0.1`, and there is no need to worry too much about overfitting. 
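
To see how `epsilon_start`, `epsilon_end` and `epsilon_decay` interact, here is a minimal sketch assuming the exponential schedule form `epsilon_end + (epsilon_start - epsilon_end) * exp(-sample_count / epsilon_decay)` that the policy implementations in this repo use (driven by the running sample count); it is only a helper for choosing `epsilon_decay`, not part of the JoyRL API.

```python
import math

def epsilon_at(sample_count: int,
               epsilon_start: float = 0.95,
               epsilon_end: float = 0.01,
               epsilon_decay: int = 300) -> float:
    """Exploration rate after `sample_count` sampled actions (assumed exponential decay)."""
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-sample_count / epsilon_decay)

# print the schedule at a few points in training
for step in (0, 100, 300, 1000):
    print(f"sample {step:>4}: epsilon = {epsilon_at(step):.3f}")
```

With `epsilon_decay=300`, epsilon is still about 0.68 after 100 samples and falls below 0.05 after roughly 1000, which is why the note above recommends enlarging `epsilon_decay` when training runs are long.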
20 | 21 | ### DQN 22 | 23 | ```python 24 | class AlgoConfig(DefaultConfig): 25 | def __init__(self) -> None: 26 | # set epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end 27 | self.epsilon_start = 0.95 # epsilon start value 28 | self.epsilon_end = 0.01 # epsilon end value 29 | self.epsilon_decay = 500 # epsilon decay rate 30 | self.gamma = 0.95 # discount factor 31 | self.lr = 0.0001 # learning rate 32 | self.max_buffer_size = 100000 # size of replay buffer 33 | self.batch_size = 64 # batch size 34 | self.target_update = 4 # target network update frequency 35 | self.value_layers = [ 36 | {'layer_type': 'linear', 'layer_dim': ['n_states', 256], 37 | 'activation': 'relu'}, 38 | {'layer_type': 'linear', 'layer_dim': [256, 256], 39 | 'activation': 'relu'}, 40 | {'layer_type': 'linear', 'layer_dim': [256, 'n_actions'], 41 | 'activation': 'none'}] 42 | ``` -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | # API 2 | 3 | 敬请期待 -------------------------------------------------------------------------------- /docs/dev/offline_run.md: -------------------------------------------------------------------------------- 1 | 2 | ### Offline Run 3 | 4 | If you want to run from source code for debugging or other purposes, you can clone this repo: 5 | 6 | ```bash 7 | git clone https://github.com/datawhalechina/joyrl.git 8 | ``` 9 | 10 | Then install the dependencies: 11 | 12 | ```bash 13 | pip install -r requirements.txt 14 | # if you have installed joyrl, you'd better uninstall it to avoid conflicts 15 | pip uninstall joyrl 16 | ``` 17 | 18 | Then you can run the following command to train a DQN agent on CartPole-v1 environment. 19 | 20 | ```bash 21 | python offline_run.py --yaml ./presets/ClassControl/CartPole-v1/CartPole-v1_DQN.yaml 22 | ``` -------------------------------------------------------------------------------- /docs/docsify.md: -------------------------------------------------------------------------------- 1 | 安装 2 | 3 | ```bash 4 | # 安装node(Mac) 5 | brew install node 6 | # windows,安装后需要重启 7 | https://nodejs.org/en/ 8 | # 全局安装docsify-cli,没有代理加速可以安装cnpm镜像加速 9 | npm i docsify-cli -g 10 | # 初始化,会在docs文件夹下生成README,index.html文件 11 | docsify init ./docs 12 | ``` 13 | 14 | 本地部署,预览网站就在http://localhost:3000网址打开 15 | 16 | ```bash 17 | cd docs 18 | docsify serve 19 | ## 或者 20 | docsify serve ./docs 21 | ``` 22 | 23 | 配置侧边栏 24 | 25 | index.html文件`window.$docsify`中增加`loadSidebar: true`,然后在docs下新建`_sidebar.md` 26 | 27 | ```html 28 | window.$docsify = { 29 | name: '', 30 | repo: '', 31 | loadSidebar: true, 32 | } 33 | ``` 34 | 35 | latex 公式显示问题 36 | 37 | https://github.com/scruel/docsify-latex 38 | 39 | ## 编写规范 40 | 41 | 1. 不能在代码块里面加入`\`,否则会导致后面的公式不渲染 42 | 2. 英文和文中的数字用`$\text{}$` 43 | 3. 
公式编号用`\tag{}`,公式和图表引用要在实际的公式和图表之前,便于阅读 44 | ### 参考 45 | 46 | [docsify中文文档](https://jingping-ye.github.io/docsify-docs-zh/#/%E5%BF%AB%E9%80%9F%E4%B8%8A%E6%89%8B/%E5%BC%80%E5%A7%8B) -------------------------------------------------------------------------------- /docs/figs/DQN_pseu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/DQN_pseu.png -------------------------------------------------------------------------------- /docs/figs/branch_merge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/branch_merge.png -------------------------------------------------------------------------------- /docs/figs/collector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/collector.png -------------------------------------------------------------------------------- /docs/figs/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/data_flow.png -------------------------------------------------------------------------------- /docs/figs/interaction_mdp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/interaction_mdp.png -------------------------------------------------------------------------------- /docs/figs/interactor_learner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/interactor_learner.png -------------------------------------------------------------------------------- /docs/figs/overall_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/overall_framework.png -------------------------------------------------------------------------------- /docs/figs/recorder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/recorder.png -------------------------------------------------------------------------------- /docs/figs/tasks_dir.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/figs/tasks_dir.png -------------------------------------------------------------------------------- /docs/hyper_cfg.md: -------------------------------------------------------------------------------- 1 | # HyperParameter Config 2 | 3 | This part mainly introduces the parameter configuration and description of `JoyRL`. 
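
Hyperparameters are grouped into a general config, an algorithm config and an environment config, and can be supplied either through a preset YAML file or programmatically. The sketch below mirrors `examples/quick_start_1.py` and `examples/quick_start_2.py` from this repository; the trimmed-down `GeneralConfig`/`EnvConfig` classes are shortened here for illustration (the full field list lives in `quick_start_2.py`), and it is assumed that fields left out keep their defaults.

```python
import joyrl

# Option 1: load general, algorithm and environment parameters from a preset YAML.
joyrl.run(yaml_path="./presets/ClassControl/CartPole-v1/CartPole-v1_DQN.yaml")

# Option 2: override parameter groups with plain config objects,
# as in examples/quick_start_2.py (trimmed-down configs for illustration;
# see that example for the full field list).
class GeneralConfig:
    def __init__(self) -> None:
        self.env_name = "gym"    # environment module
        self.algo_name = "DQN"   # algorithm name
        self.mode = "train"      # train or test
        self.device = "cpu"      # device to use

class EnvConfig:
    def __init__(self) -> None:
        self.id = "CartPole-v1"  # gym environment id

joyrl.run(general_cfg=GeneralConfig(), env_cfg=EnvConfig())
```

In practice you would pick one of the two entry points; YAML presets under `presets/` are the most common route, and `docs/dev/offline_run.md` shows the equivalent command-line invocation when running from source.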
4 | 5 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Document 6 | 7 | 8 | 9 | 10 | 11 | 12 | 17 | 18 | 19 |
20 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /docs/multiprocessing.md: -------------------------------------------------------------------------------- 1 | [multiprocessing官方教程-Python](https://docs.python.org/zh-cn/3/library/multiprocessing.html) 2 | 3 | 容易陷入的误区: 4 | 5 | * 电脑的CPU核数不等于支持的进程数,实际上能够支持的进程数更多,一般每个核支持两个进程 6 | * 进程与线程也有区别 7 | 8 | 执行下列代码可查看电脑能够支持的最大进程数: 9 | ```python 10 | import multiprocessing as mp 11 | print(mp.cpu_count()) 12 | ``` 13 | 14 | ## 构建子进程的方式 15 | 16 | 一般有三种,即fork,spawn和forkserver。unix环境中默认为fork,win环境下不支持fork,需要设置为spawn。 17 | 18 | fork模式下,除了必要的启动资源,子进程中的其他变量、包和数据等等都继承父进程,因而启动较快,但是大部分用的都是父进程的数据,不是很安全的模式 19 | 20 | spawn模式下,子进程是从头开始创建的,变量、包和数据等等都是从父进程拷贝而来,因此启动较慢,但是安全系数高。 21 | 22 | ```python 23 | import multiprocessing as mp 24 | print(mp.get_all_start_methods()) # 查看所有启动子进程的方法 25 | print(mp.get_start_method()) # 查看当前系统启动子进程的默认方法 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /docs/ray_DRL/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/docs/ray_DRL/results.png -------------------------------------------------------------------------------- /examples/quick_start_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-22 13:42:56 7 | LastEditor: JiangJi 8 | LastEditTime: 2023-12-22 13:49:09 9 | Discription: 10 | ''' 11 | import joyrl 12 | 13 | if __name__ == "__main__": 14 | print(joyrl.__version__) 15 | yaml_path = "./presets/ClassControl/CartPole-v1/CartPole-v1_DQN.yaml" 16 | joyrl.run(yaml_path = yaml_path) -------------------------------------------------------------------------------- /examples/quick_start_2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-23 10:45:01 7 | LastEditor: JiangJi 8 | LastEditTime: 2023-12-24 00:05:32 9 | Discription: 10 | ''' 11 | import joyrl 12 | 13 | class GeneralConfig: 14 | ''' General parameters for running 15 | ''' 16 | def __init__(self) -> None: 17 | # basic settings 18 | self.env_name = "gym" # name of environment 19 | self.algo_name = "DQN" # name of algorithm 20 | self.mode = "train" # train, test 21 | self.interactor_mode = "dummy" # dummy, only works when learner_mode is serial 22 | self.learner_mode = "serial" # serial, parallel, whether workers and learners are in parallel 23 | self.device = "cpu" # device to use 24 | self.seed = 0 # random seed 25 | self.max_episode = -1 # number of episodes for training, set -1 to keep running 26 | self.max_step = 200 # number of episodes for testing, set -1 means unlimited steps 27 | self.collect_traj = False # if collect trajectory or not 28 | # multiprocessing settings 29 | self.n_interactors = 1 # number of workers 30 | # online evaluation settings 31 | self.online_eval = True # online evaluation or not 32 | self.online_eval_episode = 10 # online eval episodes 33 | self.model_save_fre = 500 # model save frequency per update step 34 | # load model settings 35 | self.load_checkpoint = False # if load checkpoint 36 | self.load_path = "Train_single_CartPole-v1_DQN_20230515-211721" # path to load model 
37 | self.load_model_step = 'best' # load model at which step 38 | 39 | class EnvConfig(object): 40 | def __init__(self) -> None: 41 | self.id = "CartPole-v1" # environment id 42 | 43 | if __name__ == "__main__": 44 | general_cfg = GeneralConfig() 45 | env_cfg = EnvConfig() 46 | joyrl.run(general_cfg = general_cfg, env_cfg = env_cfg) -------------------------------------------------------------------------------- /joyrl/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-01-01 16:20:49 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-12-19 13:44:27 9 | Discription: 10 | ''' 11 | from joyrl import algos, framework, envs 12 | from joyrl.run import run 13 | 14 | __version__ = "0.6.8" 15 | 16 | __all__ = [ 17 | "algos", 18 | "config", 19 | "envs", 20 | "framework", 21 | "run" 22 | ] 23 | -------------------------------------------------------------------------------- /joyrl/algos/A3C/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/A3C/__init__.py -------------------------------------------------------------------------------- /joyrl/algos/A3C/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2024-06-03 13:37:11 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-23 17:49:41 9 | Discription: 10 | ''' 11 | 12 | class AlgoConfig(object): 13 | def __init__(self): 14 | self.independ_actor = True # whether to use independent actor 15 | self.action_type_list = "continuous" # continuous action space 16 | self.gae_lambda = 0.95 # lambda for GAE 17 | self.gamma = 0.99 # discount factor 18 | self.lr = 0.0001 # for shared optimizer 19 | self.actor_lr = 0.0003 # learning rate for actor, must be specified if share_optimizer is False 20 | self.critic_lr = 0.001 # learning rate for critic, must be specified if share_optimizer is False 21 | self.critic_loss_coef = 0.001 # critic loss coefficient 22 | self.entropy_coef = 0.01 # entropy coefficient 23 | self.batch_size = 256 # ppo train batch size 24 | self.min_policy = 0 # min value for policy (for discrete action space) 25 | self.buffer_type = 'REPLAY_QUE' 26 | self.branch_layers = [] 27 | self.merge_layers = [] 28 | self.actor_branch_layers = [] 29 | self.actor_merge_layers = [] 30 | self.critic_branch_layers = [] 31 | self.critic_merge_layers = [] 32 | -------------------------------------------------------------------------------- /joyrl/algos/CategoricalDQN/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/CategoricalDQN/__init__.py -------------------------------------------------------------------------------- /joyrl/algos/CategoricalDQN/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2024-12-18 13:15:04 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-12-19 13:24:11 9 | Discription: 10 | ''' 11 | class AlgoConfig(): 12 | ''' algorithm parameters 13 | ''' 14 | def __init__(self) -> None: 
15 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. epsilon=epsilon_end 16 | self.epsilon_start = 0.95 # epsilon start value 17 | self.epsilon_end = 0.01 # epsilon end value 18 | self.epsilon_decay = 500 # epsilon decay 19 | self.gamma = 0.95 # reward discount factor 20 | self.lr = 0.0001 # learning rate 21 | self.buffer_type = 'REPLAY_QUE' # replay buffer type 22 | self.max_buffer_size = 100000 # replay buffer size 23 | self.batch_size = 256 # batch size 24 | self.target_update = 4 # target network update frequency 25 | self.distributional = True # if use distributional dqn 26 | self.n_atoms = 51 # number of atoms 27 | self.v_min = -10 # min value 28 | self.v_max = 10 # max value 29 | self.enable_soft_update = True # enable soft update 30 | self.tau_soft_update = 0.005 # soft update tau 31 | # value network layers config 32 | # [{'name': 'feature_1', 'layers': [{'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}, {'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}]}] 33 | self.branch_layers = [ 34 | # { 35 | # 'name': 'feature_1', 36 | # 'layers': 37 | # [ 38 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 39 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 40 | # ] 41 | # }, 42 | # { 43 | # 'name': 'feature_2', 44 | # 'layers': 45 | # [ 46 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 47 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 48 | # ] 49 | # } 50 | ] 51 | self.merge_layers = [ 52 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'}, 53 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'}, 54 | ] 55 | -------------------------------------------------------------------------------- /joyrl/algos/CategoricalDQN/data_handler.py: -------------------------------------------------------------------------------- 1 | from joyrl.algos.base.data_handler import BaseDataHandler 2 | 3 | class DataHandler(BaseDataHandler): 4 | def __init__(self, cfg): 5 | super().__init__(cfg) 6 | -------------------------------------------------------------------------------- /joyrl/algos/DDPG/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/DDPG/__init__.py -------------------------------------------------------------------------------- /joyrl/algos/DDPG/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-15 13:16:24 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-01-25 12:01:27 9 | Discription: 10 | ''' 11 | import numpy as np 12 | class AlgoConfig: 13 | def __init__(self): 14 | self.action_type_list = 'dpg' # action type, dpg: deterministic policy gradient 15 | self.buffer_type = 'REPLAY_QUE' # replay buffer type 16 | self.max_buffer_size = 100000 # replay buffer size 17 | self.batch_size = 128 # batch size 18 | self.gamma = 0.99 # discount factor 19 | self.policy_loss_weight = 0.002 # policy loss weight 20 | self.critic_lr = 1e-3 # learning rate of critic 21 | self.actor_lr = 1e-4 # learning rate of actor 22 | self.tau = 0.001 # soft update parameter 23 | self.value_min = -np.inf # clip min critic value 24 | self.value_max = np.inf # clip max critic value 25 | # self.actor_layers = [ 26 | # {'layer_type': 'Linear', 
'layer_size': [256], 'activation': 'ReLU'}, 27 | # {'layer_type': 'Linear', 'layer_size': [256], 'activation': 'ReLU'}, 28 | # ] 29 | # self.critic_layers = [ 30 | # {'layer_type': 'Linear', 'layer_size': [256], 'activation': 'ReLU'}, 31 | # {'layer_type': 'Linear', 'layer_size': [256], 'activation': 'ReLU'}, 32 | # ] 33 | self.branch_layers = [] 34 | self.merge_layers = [] 35 | self.actor_branch_layers = [] 36 | self.actor_merge_layers = [] 37 | self.critic_branch_layers = [] 38 | self.critic_merge_layers = [] -------------------------------------------------------------------------------- /joyrl/algos/DDPG/data_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2024-02-25 15:46:04 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-07-21 14:45:35 9 | Discription: 10 | ''' 11 | from joyrl.algos.base.data_handler import BaseDataHandler 12 | import numpy as np 13 | class DataHandler(BaseDataHandler): 14 | def __init__(self, cfg): 15 | super().__init__(cfg) 16 | -------------------------------------------------------------------------------- /joyrl/algos/DDPG/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2024-07-20 14:15:24 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-07-21 16:38:14 9 | Discription: 10 | ''' 11 | import torch.nn as nn 12 | from joyrl.algos.base.network import * 13 | 14 | class Model(nn.Module): 15 | def __init__(self, cfg ): 16 | super(Model, self).__init__() 17 | state_size_list = cfg.obs_space_info.size 18 | action_size_list = cfg.action_space_info.size 19 | critic_input_size_list = state_size_list+ [[None, len(action_size_list)]] 20 | self.actor = ActorNetwork(cfg, input_size_list = state_size_list) 21 | self.critic = CriticNetwork(cfg, input_size_list = critic_input_size_list) -------------------------------------------------------------------------------- /joyrl/algos/DQN/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/DQN/__init__.py -------------------------------------------------------------------------------- /joyrl/algos/DQN/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-20 23:39:18 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-14 22:44:09 9 | Discription: 10 | ''' 11 | class AlgoConfig(): 12 | ''' algorithm parameters 13 | ''' 14 | def __init__(self) -> None: 15 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. 
epsilon=epsilon_end 16 | self.epsilon_start = 0.95 # epsilon start value 17 | self.epsilon_end = 0.01 # epsilon end value 18 | self.epsilon_decay = 500 # epsilon decay 19 | self.gamma = 0.95 # reward discount factor 20 | self.lr = 0.0001 # learning rate 21 | self.buffer_type = 'REPLAY_QUE' # replay buffer type 22 | self.max_buffer_size = 100000 # replay buffer size 23 | self.batch_size = 64 # batch size 24 | self.target_update = 4 # target network update frequency 25 | # value network layers config 26 | # [{'name': 'feature_1', 'layers': [{'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}, {'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}]}] 27 | self.branch_layers = [ 28 | # { 29 | # 'name': 'feature_1', 30 | # 'layers': 31 | # [ 32 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 33 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 34 | # ] 35 | # }, 36 | # { 37 | # 'name': 'feature_2', 38 | # 'layers': 39 | # [ 40 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 41 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 42 | # ] 43 | # } 44 | ] 45 | self.merge_layers = [ 46 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'}, 47 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'}, 48 | ] 49 | -------------------------------------------------------------------------------- /joyrl/algos/DQN/data_handler.py: -------------------------------------------------------------------------------- 1 | from joyrl.algos.base.data_handler import BaseDataHandler 2 | 3 | class DataHandler(BaseDataHandler): 4 | def __init__(self, cfg): 5 | super().__init__(cfg) 6 | -------------------------------------------------------------------------------- /joyrl/algos/DoubleDQN/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/DoubleDQN/__init__.py -------------------------------------------------------------------------------- /joyrl/algos/DoubleDQN/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-03-15 22:04:42 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-15 14:03:54 9 | Discription: 10 | ''' 11 | class AlgoConfig(object): 12 | ''' algorithm parameters 13 | ''' 14 | def __init__(self) -> None: 15 | # set epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end 16 | self.epsilon_start = 0.95 # epsilon start value 17 | self.epsilon_end = 0.01 # epsilon end value 18 | self.epsilon_decay = 500 # epsilon decay rate 19 | self.gamma = 0.99 # discount factor 20 | self.lr = 0.0001 # learning rate 21 | self.max_buffer_size = 100000 # size of replay buffer 22 | self.batch_size = 64 # batch size 23 | self.target_update = 4 # target network update frequency 24 | # value network layers config 25 | self.branch_layers = [ 26 | # { 27 | # 'name': 'feature_1', 28 | # 'layers': 29 | # [ 30 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 31 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 32 | # ] 33 | # }, 34 | # { 35 | # 'name': 'feature_2', 36 | # 'layers': 37 | # [ 38 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 39 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 
'ReLU'}, 40 | # ] 41 | # } 42 | ] 43 | self.merge_layers = [ 44 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'}, 45 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'}, 46 | ] 47 | 48 | -------------------------------------------------------------------------------- /joyrl/algos/DoubleDQN/data_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | @Author: John 5 | @Email: johnjim0816@gmail.com 6 | @Date: 2020-06-12 00:50:49 7 | @LastEditor: John 8 | LastEditTime: 2023-12-24 19:57:50 9 | @Discription: 10 | @Environment: 11 | ''' 12 | from joyrl.algos.base.data_handler import BaseDataHandler 13 | class DataHandler(BaseDataHandler): 14 | def __init__(self, cfg): 15 | super().__init__(cfg) -------------------------------------------------------------------------------- /joyrl/algos/DoubleDQN/policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-22 23:02:13 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-14 22:49:54 9 | Discription: 10 | ''' 11 | import torch 12 | import torch.nn as nn 13 | from joyrl.algos.DQN.policy import Policy as DQNPolicy 14 | class Policy(DQNPolicy): 15 | def __init__(self,cfg) -> None: 16 | super(Policy, self).__init__(cfg) 17 | 18 | def learn(self, **kwargs): 19 | ''' learn policy 20 | ''' 21 | self.prepare_data_before_learn(**kwargs) 22 | actor_outputs = self.model(self.states)['actor_outputs'] 23 | target_actor_outputs = self.target_model(self.next_states)['actor_outputs'] 24 | tot_loss = 0 25 | self.summary_loss = [] 26 | for i in range(len(self.action_size_list)): 27 | actual_q_value = actor_outputs[i]['q_value'].gather(1, self.actions[i].long()) 28 | next_q_values = target_actor_outputs[i]['q_value'] 29 | next_target_q_values_action = next_q_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)) 30 | expected_q_value = self.rewards + self.gamma * next_target_q_values_action * (1 - self.dones) 31 | loss_i = nn.MSELoss()(actual_q_value, expected_q_value) 32 | tot_loss += loss_i 33 | self.summary_loss.append(loss_i.item()) 34 | self.optimizer.zero_grad() 35 | tot_loss.backward() 36 | # clip to avoid gradient explosion 37 | for param in self.model.parameters(): 38 | param.grad.data.clamp_(-1, 1) 39 | self.optimizer.step() 40 | # update target net every C steps 41 | if self.update_step % self.target_update == 0: 42 | self.target_model.load_state_dict(self.model.state_dict()) 43 | self.update_step += 1 44 | self.update_summary() # update summary 45 | 46 | -------------------------------------------------------------------------------- /joyrl/algos/DuelingDQN/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/DuelingDQN/__init__.py -------------------------------------------------------------------------------- /joyrl/algos/DuelingDQN/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2024-02-25 15:46:04 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-14 22:54:08 9 | Discription: 10 | ''' 11 | class AlgoConfig(object): 12 | def __init__(self) -> None: 13 | # set 
epsilon_start=epsilon_end can obtain fixed epsilon=epsilon_end 14 | self.dueling = True # use dueling network 15 | self.epsilon_start = 0.95 # epsilon start value 16 | self.epsilon_end = 0.01 # epsilon end value 17 | self.epsilon_decay = 500 # epsilon decay rate 18 | self.gamma = 0.99 # discount factor 19 | self.lr = 0.0001 # learning rate 20 | self.buffer_type = 'REPLAY_QUE' # replay buffer type 21 | self.max_buffer_size = 100000 # replay buffer size 22 | self.batch_size = 64 # batch size 23 | self.target_update = 4 # target network update frequency 24 | # value network layers config 25 | self.branch_layers = [ 26 | # { 27 | # 'name': 'feature_1', 28 | # 'layers': 29 | # [ 30 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 31 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 32 | # ] 33 | # }, 34 | # { 35 | # 'name': 'feature_2', 36 | # 'layers': 37 | # [ 38 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 39 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 40 | # ] 41 | # } 42 | ] 43 | self.merge_layers = [ 44 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'}, 45 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'}, 46 | ] 47 | -------------------------------------------------------------------------------- /joyrl/algos/DuelingDQN/data_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-02-21 20:32:11 7 | LastEditor: JiangJi 8 | LastEditTime: 2023-05-17 11:39:38 9 | Discription: 10 | ''' 11 | from joyrl.algos.base.data_handler import BaseDataHandler 12 | class DataHandler(BaseDataHandler): 13 | def __init__(self, cfg): 14 | super().__init__(cfg) 15 | -------------------------------------------------------------------------------- /joyrl/algos/DuelingDQN/policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2022-11-14 23:50:59 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-14 22:49:36 9 | Discription: 10 | ''' 11 | from joyrl.algos.DQN.policy import Policy as DQNPolicy 12 | 13 | class Policy(DQNPolicy): 14 | def __init__(self,cfg) -> None: 15 | super(Policy, self).__init__(cfg) 16 | -------------------------------------------------------------------------------- /joyrl/algos/HierarchicalDQN/README.md: -------------------------------------------------------------------------------- 1 | # Hierarchical DQN 2 | 3 | ## 原理简介 4 | 5 | Hierarchical DQN是一种分层强化学习方法,与DQN相比增加了一个meta controller, 6 | 7 | ![image-20210331153115575](assets/image-20210331153115575.png) 8 | 9 | 即学习时,meta controller每次会生成一个goal,然后controller或者说下面的actor就会达到这个goal,直到done为止。这就相当于给agent增加了一个队长,队长擅长制定局部目标,指导agent前行,这样应对一些每回合步数较长或者稀疏奖励的问题会有所帮助。 10 | 11 | ## 伪代码 12 | 13 | ![image-20210331153542314](assets/image-20210331153542314.png) -------------------------------------------------------------------------------- /joyrl/algos/NoisyDQN/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-05-30 23:57:29 7 | LastEditor: JiangJi 8 | LastEditTime: 2023-05-30 23:57:30 9 | Discription: 10 | ''' 11 | 
-------------------------------------------------------------------------------- /joyrl/algos/NoisyDQN/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-02-21 20:32:11 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-14 22:54:02 9 | Discription: 10 | ''' 11 | class AlgoConfig(object): 12 | ''' algorithm parameters 13 | ''' 14 | def __init__(self) -> None: 15 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. epsilon=epsilon_end 16 | self.epsilon_start = 0.95 # epsilon start value 17 | self.epsilon_end = 0.01 # epsilon end value 18 | self.epsilon_decay = 500 # epsilon decay 19 | self.gamma = 0.95 # reward discount factor 20 | self.lr = 0.0001 # learning rate 21 | self.max_buffer_size = 100000 # replay buffer size 22 | self.batch_size = 64 # batch size 23 | self.target_update = 4 # target network update frequency 24 | self.branch_layers = [] 25 | self.merge_layers = [ 26 | # {'layer_type': 'noisy_linear', 'layer_size': [64], 'activation': 'ReLU','std_init': 0.4}, 27 | # {'layer_type': 'noisy_linear', 'layer_size': [64], 'activation': 'ReLU','std_init': 0.4}, 28 | ] 29 | -------------------------------------------------------------------------------- /joyrl/algos/NoisyDQN/data_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-05-18 13:21:15 7 | LastEditor: JiangJi 8 | LastEditTime: 2023-12-25 00:07:41 9 | Discription: 10 | ''' 11 | from joyrl.algos.base.data_handler import BaseDataHandler 12 | class DataHandler(BaseDataHandler): 13 | def __init__(self, cfg): 14 | super().__init__(cfg) 15 | -------------------------------------------------------------------------------- /joyrl/algos/NoisyDQN/policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-04-17 11:23:49 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-14 22:53:19 9 | Discription: 10 | ''' 11 | import torch 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | import torch.nn.functional as F 15 | import numpy as np 16 | import math 17 | import random 18 | 19 | from joyrl.algos.DQN.policy import Policy as DQNPolicy 20 | 21 | 22 | class Policy(DQNPolicy): 23 | def __init__(self,cfg) -> None: 24 | super(Policy, self).__init__(cfg) 25 | 26 | 27 | def learn(self, **kwargs): 28 | ''' train policy 29 | ''' 30 | self.prepare_data_before_learn(**kwargs) 31 | self.summary_loss = [] 32 | tot_loss = 0 33 | actor_outputs = self.model(self.states)['actor_outputs'] 34 | target_actor_outputs = self.target_model(self.next_states)['actor_outputs'] 35 | for i in range(len(self.action_size_list)): 36 | actual_q_value = actor_outputs[i]['q_value'].gather(1, self.actions[i].long()) 37 | # compute next max q value 38 | next_q_value_max = target_actor_outputs[i]['q_value'].max(1)[0].unsqueeze(dim=1) 39 | # compute target Q values 40 | target_q_value = self.rewards + (1 - self.dones) * self.gamma * next_q_value_max 41 | # compute loss 42 | loss_i = nn.MSELoss()(actual_q_value, target_q_value) 43 | tot_loss += loss_i 44 | self.summary_loss.append(loss_i.item()) 45 | self.optimizer.zero_grad() 46 | tot_loss.backward() 47 | # clip to avoid gradient explosion 48 | for 
param in self.model.parameters(): 49 | param.grad.data.clamp_(-1, 1) 50 | self.optimizer.step() 51 | # update target net every C steps 52 | if self.update_step % self.target_update == 0: 53 | self.target_model.load_state_dict(self.model.state_dict()) 54 | self.update_step += 1 55 | self.model.reset_noise() 56 | self.target_model.reset_noise() 57 | self.update_summary() # update summary 58 | 59 | -------------------------------------------------------------------------------- /joyrl/algos/PPO/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/PPO/__init__.py -------------------------------------------------------------------------------- /joyrl/algos/PPO/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-02-20 21:53:39 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-23 17:50:12 9 | Discription: 10 | ''' 11 | class AlgoConfig(object): 12 | def __init__(self): 13 | self.independ_actor = False # whether to use independent actor 14 | # whether actor and critic share the same optimizer 15 | self.ppo_type = 'clip' # clip or kl 16 | self.eps_clip = 0.2 # clip parameter for PPO 17 | self.gae_lambda = 0.95 # lambda for GAE 18 | # for kl penalty version of PPO 19 | self.kl_target = 0.1 # target KL divergence 20 | self.kl_lambda = 0.5 # lambda for KL penalty, 0.5 is the default value in the paper 21 | self.kl_beta = 1.5 # beta for KL penalty, 1.5 is the default value in the paper 22 | self.kl_alpha = 2 # alpha for KL penalty, 2 is the default value in the paper 23 | self.action_type_list = "continuous" # continuous action space 24 | self.return_form = 'mc' # 'mc' or 'td' or 'gae' 25 | self.gamma = 0.99 # discount factor 26 | self.k_epochs = 4 # update policy for K epochs 27 | self.lr = 0.0001 # for shared optimizer 28 | self.actor_lr = 0.0003 # learning rate for actor, must be specified if share_optimizer is False 29 | self.critic_lr = 0.001 # learning rate for critic, must be specified if share_optimizer is False 30 | self.critic_loss_coef = 0.001 # critic loss coefficient 31 | self.entropy_coef = 0.01 # entropy coefficient 32 | self.batch_size = 256 # ppo train batch size 33 | self.sgd_batch_size = 32 # sgd batch size 34 | self.min_policy = 0 # min value for policy (for discrete action space) 35 | self.buffer_type = 'REPLAY_QUE' 36 | self.branch_layers = [] 37 | self.merge_layers = [] 38 | self.actor_branch_layers = [] 39 | self.actor_merge_layers = [] 40 | self.critic_branch_layers = [] 41 | self.critic_merge_layers = [] 42 | -------------------------------------------------------------------------------- /joyrl/algos/QLearning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/QLearning/__init__.py -------------------------------------------------------------------------------- /joyrl/algos/QLearning/config.py: -------------------------------------------------------------------------------- 1 | 2 | class AlgoConfig(): 3 | ''' algorithm parameters 4 | ''' 5 | def __init__(self) -> None: 6 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. 
epsilon=epsilon_end 7 | self.epsilon_start = 0.95 # epsilon start value 8 | self.epsilon_end = 0.01 # epsilon end value 9 | self.epsilon_decay = 500 # epsilon decay 10 | self.gamma = 0.95 # reward discount factor 11 | self.lr = 0.0001 # learning rate 12 | -------------------------------------------------------------------------------- /joyrl/algos/QLearning/data_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-24 19:13:11 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-01-13 16:03:27 9 | Discription: 10 | ''' 11 | import numpy as np 12 | from joyrl.algos.base.data_handler import BaseDataHandler 13 | 14 | 15 | class DataHandler(BaseDataHandler): 16 | def __init__(self,cfg) -> None: 17 | self.cfg = cfg 18 | self.buffer = [] 19 | self.data_after_train = {} 20 | def add_exps(self, exps): 21 | ''' add transition to buffer 22 | ''' 23 | self.buffer.append(exps) 24 | 25 | def sample_training_data(self): 26 | ''' sample training data from buffer 27 | ''' 28 | if len(self.buffer) == 0: 29 | return None 30 | exp = self.buffer.pop()[0] 31 | return self._handle_exps_before_train(exp) 32 | def _handle_exps_before_train(self, exp, **kwargs): 33 | ''' convert exps to training data 34 | ''' 35 | state = np.array(exp.state) 36 | action = np.array(exp.action) 37 | reward = np.array(exp.reward) 38 | next_state = np.array(exp.next_state) 39 | done = np.array(exp.done) 40 | data = {'state': state, 'action': action, 'reward': reward, 'next_state': next_state, 'done': done} 41 | return data 42 | def handle_exps_after_train(self): 43 | ''' handle exps after train 44 | ''' 45 | pass 46 | -------------------------------------------------------------------------------- /joyrl/algos/QLearning/policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-24 15:09:47 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-01-13 18:26:43 9 | Discription: 10 | ''' 11 | import math 12 | import numpy as np 13 | from collections import defaultdict 14 | from joyrl.algos.base.policy import ToyPolicy 15 | 16 | class Policy(ToyPolicy): 17 | def __init__(self,cfg) -> None: 18 | super(Policy, self).__init__(cfg) 19 | self.lr = cfg.lr 20 | self.gamma = cfg.gamma 21 | self.epsilon = cfg.epsilon_start 22 | self.epsilon_start = cfg.epsilon_start 23 | self.epsilon_end = cfg.epsilon_end 24 | self.epsilon_decay = cfg.epsilon_decay 25 | self.Q_table = defaultdict(lambda: np.zeros(self.n_actions)) 26 | self.sample_count = 0 27 | self.create_summary() 28 | 29 | def sample_action(self, state, **kwargs): 30 | self.sample_count += 1 31 | self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ 32 | math.exp(-1. 
* self.sample_count / self.epsilon_decay) 33 | if np.random.uniform(0, 1) > self.epsilon: 34 | action = np.argmax(self.Q_table[str(state)]) # select the action with max Q value 35 | else: 36 | action = np.random.choice(self.n_actions) # random select an action 37 | return action 38 | 39 | def predict_action(self, state, **kwargs): 40 | action = np.argmax(self.Q_table[str(state)]) 41 | return action 42 | 43 | def learn(self, **kwargs): 44 | state, action, reward, next_state, done = kwargs.get('state'), kwargs.get('action'), kwargs.get('reward'), kwargs.get('next_state'), kwargs.get('done') 45 | Q_predict = self.Q_table[str(state)][action] 46 | if done: 47 | Q_target = reward 48 | else: 49 | Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)]) 50 | self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict) 51 | self.loss = (Q_target - Q_predict) ** 2 52 | self.update_summary() # update summary 53 | -------------------------------------------------------------------------------- /joyrl/algos/SAC-S/sac.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | import torch.nn as nn 4 | import numpy as np 5 | class SAC: 6 | def __init__(self,n_actions,models,memory,cfg): 7 | self.device = cfg.device 8 | self.value_net = models['ValueNet'].to(self.device) # $\psi$ 9 | self.target_value_net = models['ValueNet'].to(self.device) # $\bar{\psi}$ 10 | self.soft_q_net = models['SoftQNet'].to(self.device) # $\theta$ 11 | self.policy_net = models['PolicyNet'].to(self.device) # $\phi$ 12 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=cfg.value_lr) 13 | self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=cfg.soft_q_lr) 14 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.policy_lr) 15 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): 16 | target_param.data.copy_(param.data) 17 | self.value_criterion = nn.MSELoss() 18 | self.soft_q_criterion = nn.MSELoss() 19 | def update(self): 20 | # sample a batch of transitions from replay buffer 21 | state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample( 22 | self.batch_size) 23 | state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states) 24 | action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1) 25 | reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize) 26 | next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states) 27 | done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1) 28 | -------------------------------------------------------------------------------- /joyrl/algos/SAC/sacd_cnn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/SAC/sacd_cnn.py -------------------------------------------------------------------------------- /joyrl/algos/Sarsa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/Sarsa/__init__.py 
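
To make the preceding tabular policies concrete: the QLearning policy above (and the Sarsa policy in the files that follow) both decay epsilon exponentially with the sample count and apply a one-step TD update to a `defaultdict` Q-table. The sketch below restates that schedule and the two targets as standalone Python; the constants mirror the `AlgoConfig` defaults shown above, and the helper names (`epsilon`, `q_learning_update`, `sarsa_update`) are illustrative only, not part of joyrl's API.

```python
# Minimal standalone sketch (not part of joyrl): the epsilon schedule and the
# tabular TD targets used by the QLearning policy above and the Sarsa policy below.
import math
from collections import defaultdict

import numpy as np

# same defaults as the AlgoConfig classes shown above
EPSILON_START, EPSILON_END, EPSILON_DECAY = 0.95, 0.01, 500
GAMMA, LR, N_ACTIONS = 0.95, 0.0001, 4

def epsilon(sample_count: int) -> float:
    # exponential decay from epsilon_start to epsilon_end, as in sample_action()
    return EPSILON_END + (EPSILON_START - EPSILON_END) * math.exp(-sample_count / EPSILON_DECAY)

Q_table = defaultdict(lambda: np.zeros(N_ACTIONS))

def q_learning_update(state, action, reward, next_state, done):
    # off-policy target: bootstrap from the greedy action in next_state
    target = reward if done else reward + GAMMA * np.max(Q_table[str(next_state)])
    Q_table[str(state)][action] += LR * (target - Q_table[str(state)][action])

def sarsa_update(state, action, reward, next_state, next_action, done):
    # on-policy target: bootstrap from the action the policy actually takes next
    target = reward if done else reward + GAMMA * Q_table[str(next_state)][next_action]
    Q_table[str(state)][action] += LR * (target - Q_table[str(state)][action])

print(round(epsilon(0), 3), round(epsilon(5000), 3))  # ~0.95 at the start, ~0.01 after decay
```

The only difference between the two updates is the bootstrap term: Q-learning maxes over next-state actions (off-policy), while Sarsa uses the action the policy will actually take next (on-policy), which is why the Sarsa policy above caches `self.next_action` between `learn` and `sample_action`.
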
-------------------------------------------------------------------------------- /joyrl/algos/Sarsa/config.py: -------------------------------------------------------------------------------- 1 | 2 | class AlgoConfig(): 3 | ''' algorithm parameters 4 | ''' 5 | def __init__(self) -> None: 6 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. epsilon=epsilon_end 7 | self.epsilon_start = 0.95 # epsilon start value 8 | self.epsilon_end = 0.01 # epsilon end value 9 | self.epsilon_decay = 500 # epsilon decay 10 | self.gamma = 0.95 # reward discount factor 11 | self.lr = 0.0001 # learning rate 12 | -------------------------------------------------------------------------------- /joyrl/algos/Sarsa/data_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-24 19:13:11 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-01-13 16:03:27 9 | Discription: 10 | ''' 11 | import numpy as np 12 | from joyrl.algos.base.data_handler import BaseDataHandler 13 | 14 | 15 | class DataHandler(BaseDataHandler): 16 | def __init__(self,cfg) -> None: 17 | self.cfg = cfg 18 | self.buffer = [] 19 | self.data_after_train = {} 20 | def add_exps(self, exps): 21 | ''' add transition to buffer 22 | ''' 23 | self.buffer.append(exps) 24 | 25 | def sample_training_data(self): 26 | ''' sample training data from buffer 27 | ''' 28 | if len(self.buffer) == 0: 29 | return None 30 | exp = self.buffer.pop()[0] 31 | return self._handle_exps_before_train(exp) 32 | def _handle_exps_before_train(self, exp, **kwargs): 33 | ''' convert exps to training data 34 | ''' 35 | state = np.array(exp.state) 36 | action = np.array(exp.action) 37 | reward = np.array(exp.reward) 38 | next_state = np.array(exp.next_state) 39 | done = np.array(exp.done) 40 | data = {'state': state, 'action': action, 'reward': reward, 'next_state': next_state, 'done': done} 41 | return data 42 | def handle_exps_after_train(self): 43 | ''' handle exps after train 44 | ''' 45 | pass 46 | -------------------------------------------------------------------------------- /joyrl/algos/Sarsa/policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-24 15:09:47 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-01-13 18:37:13 9 | Discription: 10 | ''' 11 | import math 12 | import numpy as np 13 | from collections import defaultdict 14 | from joyrl.algos.base.policy import ToyPolicy 15 | 16 | class Policy(ToyPolicy): 17 | def __init__(self,cfg) -> None: 18 | super(Policy, self).__init__(cfg) 19 | self.lr = cfg.lr 20 | self.gamma = cfg.gamma 21 | self.epsilon = cfg.epsilon_start 22 | self.epsilon_start = cfg.epsilon_start 23 | self.epsilon_end = cfg.epsilon_end 24 | self.epsilon_decay = cfg.epsilon_decay 25 | self.Q_table = defaultdict(lambda: np.zeros(self.n_actions)) 26 | self.sample_count = 0 27 | self.next_action = None 28 | self.create_summary() 29 | 30 | def sample_action(self, state, **kwargs): 31 | self.sample_count += 1 32 | self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ 33 | math.exp(-1. 
* self.sample_count / self.epsilon_decay) 34 | if np.random.uniform(0, 1) > self.epsilon: 35 | action = self.predict_action(state) 36 | if self.next_action is not None: 37 | action = self.next_action 38 | self.next_action = None 39 | else: 40 | action = np.random.choice(self.n_actions) # random select an action 41 | return action 42 | 43 | def predict_action(self, state, **kwargs): 44 | action = np.argmax(self.Q_table[str(state)]) 45 | return action 46 | 47 | def learn(self, **kwargs): 48 | state, action, reward, next_state, done = kwargs.get('state'), kwargs.get('action'), kwargs.get('reward'), kwargs.get('next_state'), kwargs.get('done') 49 | Q_predict = self.Q_table[str(state)][action] 50 | self.next_action = self.predict_action(next_state) 51 | if done: 52 | Q_target = reward 53 | else: 54 | Q_target = reward + self.gamma * self.Q_table[str(next_state)][self.next_action] 55 | self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict) 56 | self.loss = (Q_target - Q_predict) ** 2 57 | self.update_summary() # update summary 58 | -------------------------------------------------------------------------------- /joyrl/algos/SoftActorCritic/env_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2021-04-29 12:52:11 7 | LastEditor: JiangJi 8 | LastEditTime: 2021-12-22 15:36:36 9 | Discription: 10 | Environment: 11 | ''' 12 | import gym 13 | import numpy as np 14 | 15 | class NormalizedActions(gym.ActionWrapper): 16 | def action(self, action): 17 | low = self.action_space.low 18 | high = self.action_space.high 19 | 20 | action = low + (action + 1.0) * 0.5 * (high - low) 21 | action = np.clip(action, low, high) 22 | 23 | return action 24 | 25 | def reverse_action(self, action): 26 | low = self.action_space.low 27 | high = self.action_space.high 28 | action = 2 * (action - low) / (high - low) - 1 29 | action = np.clip(action, low, high) 30 | return action -------------------------------------------------------------------------------- /joyrl/algos/SoftQ/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2024-07-30 13:40:26 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-07-30 13:40:27 9 | Discription: 10 | ''' 11 | -------------------------------------------------------------------------------- /joyrl/algos/SoftQ/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-20 23:39:18 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-07-30 13:40:49 9 | Discription: 10 | ''' 11 | class AlgoConfig(): 12 | ''' algorithm parameters 13 | ''' 14 | def __init__(self) -> None: 15 | # set epsilon_start=epsilon_end to get fixed epsilon, i.e. 
epsilon=epsilon_end 16 | self.epsilon_start = 0.95 # epsilon start value 17 | self.epsilon_end = 0.01 # epsilon end value 18 | self.epsilon_decay = 500 # epsilon decay 19 | self.gamma = 0.95 # reward discount factor 20 | self.alpha = 0.4 # temperature parameter of softmax 21 | self.lr = 0.0001 # learning rate 22 | self.buffer_type = 'REPLAY_QUE' # replay buffer type 23 | self.max_buffer_size = 100000 # replay buffer size 24 | self.batch_size = 64 # batch size 25 | self.target_update = 4 # target network update frequency 26 | # value network layers config 27 | # [{'name': 'feature_1', 'layers': [{'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}, {'layer_type': 'linear', 'layer_size': [256], 'activation': 'relu'}]}] 28 | self.branch_layers = [ 29 | # { 30 | # 'name': 'feature_1', 31 | # 'layers': 32 | # [ 33 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 34 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 35 | # ] 36 | # }, 37 | # { 38 | # 'name': 'feature_2', 39 | # 'layers': 40 | # [ 41 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 42 | # {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'}, 43 | # ] 44 | # } 45 | ] 46 | self.merge_layers = [ 47 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'}, 48 | # {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'}, 49 | ] 50 | -------------------------------------------------------------------------------- /joyrl/algos/SoftQ/data_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2024-07-30 13:40:11 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-07-30 13:40:12 9 | Discription: 10 | ''' 11 | from joyrl.algos.base.data_handler import BaseDataHandler 12 | 13 | class DataHandler(BaseDataHandler): 14 | def __init__(self, cfg): 15 | super().__init__(cfg) 16 | -------------------------------------------------------------------------------- /joyrl/algos/TD3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/TD3/__init__.py -------------------------------------------------------------------------------- /joyrl/algos/TD3/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-25 00:37:19 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-01-27 11:54:20 9 | Discription: 10 | ''' 11 | class AlgoConfig: 12 | def __init__(self) -> None: 13 | self.action_type_list = 'dpg' # action type, dpg: deterministic policy gradient 14 | self.buffer_type = 'REPLAY_QUE' # replay buffer type 15 | self.explore_steps = 100 # exploration steps before training 16 | self.policy_freq = 2 # policy update frequency 17 | self.actor_lr = 1e-4 # actor learning rate 3e-4 18 | self.critic_lr = 1e-3 # critic learning rate 19 | self.gamma = 0.99 # discount factor 20 | self.tau = 0.005 # target smoothing coefficient 21 | self.policy_noise = 0.2 # noise added to target policy during critic update 22 | self.expl_noise = 0.1 # std of Gaussian exploration noise 23 | self.noise_clip = 0.5 # range to clip target policy noise 24 | self.batch_size = 100 # batch size for both actor and critic 25 | 
self.max_buffer_size = 1000000 # replay buffer size 26 | self.branch_layers = [] 27 | self.merge_layers = [] 28 | self.actor_branch_layers = [] 29 | self.actor_merge_layers = [] 30 | self.critic_branch_layers = [] 31 | self.critic_merge_layers = [] -------------------------------------------------------------------------------- /joyrl/algos/TD3/data_handler.py: -------------------------------------------------------------------------------- 1 | from joyrl.algos.base.data_handler import BaseDataHandler 2 | 3 | class DataHandler(BaseDataHandler): 4 | def __init__(self, cfg): 5 | super().__init__(cfg) -------------------------------------------------------------------------------- /joyrl/algos/TD3/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2024-07-21 16:37:59 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-07-21 16:38:00 9 | Discription: 10 | ''' 11 | import torch.nn as nn 12 | from joyrl.algos.base.network import * 13 | 14 | class Model(nn.Module): 15 | def __init__(self, cfg ): 16 | super(Model, self).__init__() 17 | state_size_list = cfg.obs_space_info.size 18 | action_size_list = cfg.action_space_info.size 19 | critic_input_size_list = state_size_list+ [[None, len(action_size_list)]] 20 | self.actor = ActorNetwork(cfg, input_size_list = state_size_list) 21 | self.critic_1 = CriticNetwork(cfg, input_size_list = critic_input_size_list) 22 | self.critic_2 = CriticNetwork(cfg, input_size_list = critic_input_size_list) -------------------------------------------------------------------------------- /joyrl/algos/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-01-01 16:20:49 7 | LastEditor: JiangJi 8 | LastEditTime: 2023-12-25 12:52:47 9 | Discription: 10 | ''' 11 | from joyrl.algos import base,DQN,DoubleDQN,DuelingDQN,NoisyDQN,PPO 12 | __all__ = [ 13 | "base", 14 | "QLearning", 15 | "DQN", 16 | "DoubleDQN", 17 | "DuelingDQN", 18 | "NoisyDQN", 19 | "PPO" 20 | ] 21 | -------------------------------------------------------------------------------- /joyrl/algos/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/algos/base/__init__.py -------------------------------------------------------------------------------- /joyrl/algos/base/experience.py: -------------------------------------------------------------------------------- 1 | class Exp: 2 | def __init__(self, **kwargs) -> None: 3 | for k,v in kwargs.items(): 4 | setattr(self,k,v) 5 | 6 | -------------------------------------------------------------------------------- /joyrl/algos/base/optm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | class SharedAdam(torch.optim.Adam): 4 | """Implements Adam algorithm with shared states. 
5 | """ 6 | 7 | def __init__(self, 8 | params, 9 | lr=1e-3, 10 | betas=(0.9, 0.999), 11 | eps=1e-8, 12 | weight_decay=0): 13 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 14 | 15 | for group in self.param_groups: 16 | for p in group['params']: 17 | state = self.state[p] 18 | state['step'] = torch.zeros(1) 19 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 20 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 21 | 22 | def share_memory(self): 23 | for group in self.param_groups: 24 | for p in group['params']: 25 | state = self.state[p] 26 | state['step'].share_memory_() 27 | state['exp_avg'].share_memory_() 28 | state['exp_avg_sq'].share_memory_() 29 | 30 | def step(self, closure=None): 31 | """Performs a single optimization step. 32 | Arguments: 33 | closure (callable, optional): A closure that reevaluates the model 34 | and returns the loss. 35 | """ 36 | loss = None 37 | if closure is not None: 38 | loss = closure() 39 | 40 | for group in self.param_groups: 41 | for p in group['params']: 42 | if p.grad is None: 43 | continue 44 | grad = p.grad.data 45 | state = self.state[p] 46 | 47 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 48 | beta1, beta2 = group['betas'] 49 | 50 | state['step'] += 1 51 | 52 | if group['weight_decay'] != 0: 53 | grad = grad.add(group['weight_decay'], p.data) 54 | 55 | # Decay the first and second moment running average coefficient 56 | exp_avg.mul_(beta1).add_(grad,alpha = 1 - beta1) 57 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad,value = 1 - beta2) 58 | 59 | denom = exp_avg_sq.sqrt().add_(group['eps']) 60 | 61 | bias_correction1 = 1 - beta1 ** state['step'].item() 62 | bias_correction2 = 1 - beta2 ** state['step'].item() 63 | step_size = group['lr'] * math.sqrt( 64 | bias_correction2) / bias_correction1 65 | 66 | p.data.addcdiv_(exp_avg, denom,value = -step_size) 67 | return loss -------------------------------------------------------------------------------- /joyrl/envs/README.md: -------------------------------------------------------------------------------- 1 | # 环境说明汇总 2 | 3 | ## 算法SAR一览 4 | 5 | 说明:SAR分别指状态(S)、动作(A)以及奖励(R),下表的Reward Range表示每回合能获得的奖励范围,Steps表示环境中每回合的最大步数 6 | 7 | | Environment ID | Observation Space | Action Space | Reward Range | Steps | 8 | | :--------------------------------: | :---------------: | :----------: | :----------: | :------: | 9 | | CartPole-v0 | Box(4,) | Discrete(2) | [0,200] | 200 | 10 | | CartPole-v1 | Box(4,) | Discrete(2) | [0,500] | 500 | 11 | | CliffWalking-v0 | Discrete(48) | Discrete(4) | [-inf,-13] | [13,inf] | 12 | | FrozenLake-v1(*is_slippery*=False) | Discrete(16) | Discrete(4) | 0 or 1 | [6,info] | 13 | 14 | ## 环境描述 15 | 16 | [OpenAI Gym](./gym_info.md) 17 | [MuJoCo](./mujoco_info.md) 18 | 19 | -------------------------------------------------------------------------------- /joyrl/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from joyrl.envs import gym 2 | __all__ = [ 3 | "gym", 4 | ] -------------------------------------------------------------------------------- /joyrl/envs/assets/action_grid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/action_grid.png -------------------------------------------------------------------------------- /joyrl/envs/assets/gym_info_20211130180023.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/gym_info_20211130180023.png -------------------------------------------------------------------------------- /joyrl/envs/assets/image-20200820174307301.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20200820174307301.png -------------------------------------------------------------------------------- /joyrl/envs/assets/image-20200820174814084.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20200820174814084.png -------------------------------------------------------------------------------- /joyrl/envs/assets/image-20201007211441036.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20201007211441036.png -------------------------------------------------------------------------------- /joyrl/envs/assets/image-20201007211858925.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20201007211858925.png -------------------------------------------------------------------------------- /joyrl/envs/assets/image-20210429150622353.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20210429150622353.png -------------------------------------------------------------------------------- /joyrl/envs/assets/image-20210429150630806.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/image-20210429150630806.png -------------------------------------------------------------------------------- /joyrl/envs/assets/track_big.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/assets/track_big.png -------------------------------------------------------------------------------- /joyrl/envs/gym/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-05-28 18:49:43 7 | LastEditor: JiangJi 8 | LastEditTime: 2023-05-28 18:49:46 9 | Discription: 10 | ''' 11 | from joyrl.envs.gym.wrappers import * -------------------------------------------------------------------------------- /joyrl/envs/gym/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-05-27 20:55:27 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-17 01:31:52 9 | Discription: 10 | ''' 11 | from 
joyrl.envs.register import register_env 12 | class EnvConfig(object): 13 | def __init__(self) -> None: 14 | self.id = "CartPole-v1" # environment id 15 | register_env(self.id) 16 | self.render_mode = None # render mode: None, rgb_array, human 17 | self.wrappers = [] 18 | self.ignore_params = ["wrappers", "ignore_params"] -------------------------------------------------------------------------------- /joyrl/envs/gym_info.md: -------------------------------------------------------------------------------- 1 | # OpenAi Gym 环境说明 2 | ## 基础控制 3 | 4 | ### [CartPole v0](https://github.com/openai/gym/wiki/CartPole-v0) 5 | 6 | image-20200820174307301 7 | 8 | 通过向左或向右推车能够实现平衡,所以动作空间由两个动作组成。每进行一个step就会给一个reward,如果无法保持平衡那么done等于true,本次episode失败。理想状态下,每个episode至少能进行200个step,也就是说每个episode的reward总和至少为200,step数目至少为200 9 | 10 | ### CartPole-v1 11 | 12 | ```CartPole v1```环境其实跟```CartPole v0```是一模一样的,区别在于每回合最大步数(max_episode_steps)以及奖励阈值(reward_threshold),如下是相关源码: 13 | 14 | ![](assets/gym_info_20211130180023.png) 15 | 16 | 这里先解释一下奖励阈值(reward_threshold),即Gym设置的一个合格标准,比如对于```CartPole v0```如果算法能够将奖励收敛到195以上,说明该算法合格。但实际上```CartPole v0```的每回合最大步数(max_episode_steps)是200,每步的奖励最大是1,也就是每回合最大奖励是200,比Gym设置的奖励阈值高。笔者猜测这是Gym可能是给算法学习者们设置的一个参考线,而实际中在写算法时并不会用到这个算法阈值,所以可以忽略。 17 | 18 | 再看每回合最大步数,可以看到```CartPole v1```的步数更长,相应的奖励要求更高,可以理解为```v1```是```v0```的难度升级版。 19 | 20 | 21 | ### [Pendulum-v0](https://github.com/openai/gym/wiki/Pendulum-v0) 22 | 23 | 注:gym 0.18.0之后版本中Pendulum-v0已经改为Pendulum-v1 24 | image-20200820174814084 25 | 26 | 钟摆以随机位置开始,目标是将其摆动,使其保持向上直立。动作空间是连续的,值的区间为[-2,2]。每个step给的reward最低为-16.27,最高为0。目前最好的成绩是100个episode的reward之和为-123.11 ± 6.86。 27 | 28 | ### 29 | 30 | 悬崖寻路问题(CliffWalking)是指在一个4 x 12的网格中,智能体以网格的左下角位置为起点,以网格的下角位置为终点,目标是移动智能体到达终点位置,智能体每次可以在上、下、左、右这4个方向中移动一步,每移动一步会得到-1单位的奖励。 31 | 32 | image-20201007211441036 33 | 34 | 如图,红色部分表示悬崖,数字代表智能体能够观测到的位置信息,即observation,总共会有0-47等48个不同的值,智能体再移动中会有以下限制: 35 | 36 | * 智能体不能移出网格,如果智能体想执行某个动作移出网格,那么这一步智能体不会移动,但是这个操作依然会得到-1单位的奖励 37 | 38 | * 如果智能体“掉入悬崖” ,会立即回到起点位置,并得到-100单位的奖励 39 | 40 | * 当智能体移动到终点时,该回合结束,该回合总奖励为各步奖励之和 41 | 42 | 实际的仿真界面如下: 43 | 44 | image-20201007211858925 45 | 46 | 由于从起点到终点最少需要13步,每步得到-1的reward,因此最佳训练算法下,每个episode下reward总和应该为-13。 47 | 48 | ## 参考 49 | 50 | [Gym环境相关源码](https://github.com/openai/gym/tree/master/gym/envs) -------------------------------------------------------------------------------- /joyrl/envs/mujoco_info.md: -------------------------------------------------------------------------------- 1 | # MuJoCo 2 | 3 | MuJoCo(Multi-Joint dynamics with Contact)是一个物理模拟器,可以用于机器人控制优化等研究。安装见[Mac安装MuJoCo以及mujoco_py](https://blog.csdn.net/JohnJim0/article/details/115656392?spm=1001.2014.3001.5501) 4 | 5 | 6 | 7 | ## HalfCheetah-v2 8 | 9 | 10 | 11 | 该环境基于mujoco仿真引擎,该环境的目的是使一只两只脚的“猎豹”跑得越快越好(下面图谷歌HalfCheetah-v2的,https://gym.openai.com/envs/HalfCheetah-v2/)。 12 | 13 | image-20210429150630806 14 | 15 | 动作空间:Box(6,),一只脚需要控制三个关节一共6个关节,每个关节的运动范围为[-1, 1]。 16 | 17 | 状态空间:Box(17, ),包含各种状态,每个值的范围为![img](assets/9cd6ae68c9aad008ede4139da358ec26.svg),主要描述“猎豹”本身的姿态等信息。 18 | 19 | 回报定义:每一步的回报与这一步的中猎豹的速度和猎豹行动的消耗有关,定义回报的代码如下。 20 | 21 | ```python 22 | def step(self, action): 23 | xposbefore = self.sim.data.qpos[0] 24 | self.do_simulation(action, self.frame_skip) 25 | xposafter = self.sim.data.qpos[0] 26 | ob = self._get_obs() 27 | reward_ctrl = - 0.1 * np.square(action).sum() 28 | reward_run = (xposafter - xposbefore)/self.dt 29 | # =========== reward =========== 30 | reward = reward_ctrl + reward_run 31 | # =========== reward =========== 32 | done = False 33 | return ob, reward, 
done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 34 | ``` 35 | 36 | 当猎豹无法控制平衡而倒下时,一个回合(episode)结束。 37 | 38 | 但是这个环境有一些问题,目前经过搜索并不知道一个回合的reward上限,实验中训练好的episode能跑出平台之外: 39 | 40 | image-20210429150622353 41 | 42 | 加上时间有限,所以训练中reward一直处于一个平缓上升的状态,本人猜测这可能是gym的一个bug。 -------------------------------------------------------------------------------- /joyrl/envs/register.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-22 23:02:13 7 | LastEditor: JiangJi 8 | LastEditTime: 2023-12-24 22:52:08 9 | Discription: 10 | ''' 11 | import gymnasium as gym 12 | from gymnasium.envs.registration import register 13 | 14 | def register_env(env_name): 15 | if env_name == 'Racetrack-v0': 16 | register( 17 | id='Racetrack-v0', 18 | entry_point='envs.racetrack:RacetrackEnv', 19 | max_episode_steps=1000, 20 | kwargs={} 21 | ) 22 | elif env_name == 'FrozenLakeNoSlippery-v1': 23 | register( 24 | id='FrozenLakeNoSlippery-v1', 25 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', 26 | kwargs={'map_name':"4x4",'is_slippery':False}, 27 | ) 28 | elif env_name == 'CustomCliffWalking-v0': 29 | register( 30 | id='CustomCliffWalking-v0', 31 | entry_point='joyrl.envs.gym.toy_text.cliff_walking:CustomCliffWalkingEnv', 32 | max_episode_steps=1000, 33 | kwargs={} 34 | ) 35 | else: 36 | pass 37 | 38 | # if __name__ == "__main__": 39 | # import random 40 | # import gym 41 | # env = gym.make('FrozenLakeNoSlippery-v1') 42 | # num_steps = 1000000 43 | # state = env.reset() 44 | # n_actions = env.action_space.n 45 | # print(state) 46 | # for _ in range(num_steps) : 47 | # next_state, reward, done,_ = env.step(random.choice(range(n_actions))) 48 | # print(next_state) 49 | # if (done) : 50 | # _ = env.reset() 51 | -------------------------------------------------------------------------------- /joyrl/envs/snake/README.md: -------------------------------------------------------------------------------- 1 | # 贪吃蛇 2 | 3 | 贪吃蛇是一个起源于1976年的街机游戏 Blockade,玩家控制蛇上下左右吃到食物并将身体增长,吃到食物后移动速度逐渐加快,直到碰到墙体或者蛇的身体算游戏结束。 4 | 5 | ![image-20200901202636603](img/image-20200901202636603.png) 6 | 7 | 如图,本次任务整个游戏版面大小为560X560,绿色部分就是我们的智能体贪吃蛇,红色方块就是食物,墙位于四周,一旦食物被吃掉,会在下一个随机位置刷出新的食物。蛇的每一节以及食物的大小为40X40,除开墙体(厚度也为40),蛇可以活动的范围为480X480,也就是12X12的栅格。环境的状态等信息如下: 8 | 9 | * state:为一个元组,包含(adjoining_wall_x, adjoining_wall_y, food_dir_x, food_dir_y, adjoining_body_top, adjoining_body_bottom, adjoining_body_left, adjoining_body_right). 
10 | 11 | * [adjoining_wall_x, adjoining_wall_y]:提供蛇头是否与墙体相邻的信息,具体包含9个状态 12 | 13 | adjoining_wall_x:0表示x轴方向蛇头无墙体相邻,1表示有墙在蛇头左边,2表示有墙在右边adjoining_wall_y:0表示y轴方向蛇头无墙体相邻,1表示有墙在蛇头上边,2表示有墙在下边 14 | 15 | 注意[0,0]也包括蛇跑出480X480范围的情况 16 | 17 | * [food_dir_x, food_dir_y]:表示食物与蛇头的位置关系 18 | 19 | food_dir_x:0表示食物与蛇头同在x轴上,1表示食物在蛇头左侧(不一定相邻),2表示在右边 20 | 21 | food_dir_y:0表示食物与蛇头同在y轴上,1表示食物在蛇头上面,2表示在下面 22 | 23 | * [adjoining_body_top, adjoining_body_bottom, adjoining_body_left, adjoining_body_right]:用以检查蛇的身体是否在蛇头的附近 24 | 25 | adjoining_body_top:1表示蛇头上边有蛇的身体,0表示没有 26 | 27 | adjoining_body_bottom:1表示蛇头下边有蛇的身体,0表示没有 28 | 29 | adjoining_body_left:1表示蛇头左边有蛇的身体,0表示没有 30 | 31 | adjoining_body_right:1表示蛇头右边有蛇的身体,0表示没有 32 | 33 | * action:即上下左右 34 | 35 | * reward:如果吃到食物给一个+1的reward,如果蛇没了就-1,其他情况给-0.1的reward 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /joyrl/envs/snake/example_assignment_and_report2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/envs/snake/example_assignment_and_report2.pdf -------------------------------------------------------------------------------- /joyrl/envs/snake/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | DISPLAY_SIZE = 560 3 | GRID_SIZE = 40 4 | WALL_SIZE = 40 5 | WHITE = (255, 255, 255) 6 | RED = (255, 0, 0) 7 | BLUE = (72, 61, 139) 8 | BLACK = (0, 0, 0) 9 | GREEN = (0, 255, 0) 10 | 11 | NUM_ADJOINING_WALL_X_STATES=3 12 | NUM_ADJOINING_WALL_Y_STATES=3 13 | NUM_FOOD_DIR_X=3 14 | NUM_FOOD_DIR_Y=3 15 | NUM_ADJOINING_BODY_TOP_STATES=2 16 | NUM_ADJOINING_BODY_BOTTOM_STATES=2 17 | NUM_ADJOINING_BODY_LEFT_STATES=2 18 | NUM_ADJOINING_BODY_RIGHT_STATES=2 19 | NUM_ACTIONS = 4 20 | 21 | CHECKPOINT = 'checkpoint.npy' 22 | 23 | def create_q_table(): 24 | return np.zeros((NUM_ADJOINING_WALL_X_STATES, NUM_ADJOINING_WALL_Y_STATES, NUM_FOOD_DIR_X, NUM_FOOD_DIR_Y, 25 | NUM_ADJOINING_BODY_TOP_STATES, NUM_ADJOINING_BODY_BOTTOM_STATES, NUM_ADJOINING_BODY_LEFT_STATES, 26 | NUM_ADJOINING_BODY_RIGHT_STATES, NUM_ACTIONS)) 27 | 28 | def sanity_check(arr): 29 | if (type(arr) is np.ndarray and 30 | arr.shape==(NUM_ADJOINING_WALL_X_STATES, NUM_ADJOINING_WALL_Y_STATES, NUM_FOOD_DIR_X, NUM_FOOD_DIR_Y, 31 | NUM_ADJOINING_BODY_TOP_STATES, NUM_ADJOINING_BODY_BOTTOM_STATES, NUM_ADJOINING_BODY_LEFT_STATES, 32 | NUM_ADJOINING_BODY_RIGHT_STATES,NUM_ACTIONS)): 33 | return True 34 | else: 35 | return False 36 | 37 | def save(filename, arr): 38 | if sanity_check(arr): 39 | np.save(filename,arr) 40 | return True 41 | else: 42 | print("Failed to save model") 43 | return False 44 | 45 | def load(filename): 46 | try: 47 | arr = np.load(filename) 48 | if sanity_check(arr): 49 | print("Loaded model successfully") 50 | return arr 51 | print("Model loaded is not in the required format") 52 | return None 53 | except: 54 | print("Filename doesnt exist") 55 | return None -------------------------------------------------------------------------------- /joyrl/envs/stochastic_mdp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: John 5 | Email: johnjim0816@gmail.com 6 | Date: 2021-03-24 22:12:19 7 | LastEditor: John 8 | LastEditTime: 2021-03-26 17:12:43 9 | Discription: 10 | Environment: 11 | ''' 12 | import numpy as np 13 | import random 14 | 15 | 16 | class StochasticMDP: 17 | def 
__init__(self): 18 | self.end = False 19 | self.curr_state = 2 20 | self.n_actions = 2 21 | self.n_states = 6 22 | self.p_right = 0.5 23 | 24 | def reset(self): 25 | self.end = False 26 | self.curr_state = 2 27 | state = np.zeros(self.n_states) 28 | state[self.curr_state - 1] = 1. 29 | return state 30 | 31 | def step(self, action): 32 | if self.curr_state != 1: 33 | if action == 1: 34 | if random.random() < self.p_right and self.curr_state < self.n_states: 35 | self.curr_state += 1 36 | else: 37 | self.curr_state -= 1 38 | 39 | if action == 0: 40 | self.curr_state -= 1 41 | if self.curr_state == self.n_states: 42 | self.end = True 43 | 44 | state = np.zeros(self.n_states) 45 | state[self.curr_state - 1] = 1. 46 | 47 | if self.curr_state == 1: 48 | if self.end: 49 | return state, 1.00, True, {} 50 | else: 51 | return state, 1.00/100.00, True, {} 52 | else: 53 | return state, 0.0, False, {} 54 | -------------------------------------------------------------------------------- /joyrl/envs/track.txt: -------------------------------------------------------------------------------- 1 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 | 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 1 3 | 1 1 1 1 1 1 0 0 0 0 0 0 0 3 3 3 3 3 1 4 | 1 1 1 1 1 0 0 0 0 0 0 0 0 3 3 3 3 3 1 5 | 1 1 1 1 0 0 0 0 0 0 0 0 0 3 3 3 3 3 1 6 | 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 7 | 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 8 | 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 9 | 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 10 | 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 11 | 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 12 | 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 13 | 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 14 | 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 15 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 -------------------------------------------------------------------------------- /joyrl/envs/windy_gridworld.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import sys 4 | from gym.envs.toy_text import discrete 5 | 6 | UP = 0 7 | RIGHT = 1 8 | DOWN = 2 9 | LEFT = 3 10 | 11 | class WindyGridworldEnv(discrete.DiscreteEnv): 12 | 13 | metadata = {'render.modes': ['human', 'ansi']} 14 | 15 | def _limit_coordinates(self, coord): 16 | coord[0] = min(coord[0], self.shape[0] - 1) 17 | coord[0] = max(coord[0], 0) 18 | coord[1] = min(coord[1], self.shape[1] - 1) 19 | coord[1] = max(coord[1], 0) 20 | return coord 21 | 22 | def _calculate_transition_prob(self, current, delta, winds): 23 | new_position = np.array(current) + np.array(delta) + np.array([-1, 0]) * winds[tuple(current)] 24 | new_position = self._limit_coordinates(new_position).astype(int) 25 | new_state = np.ravel_multi_index(tuple(new_position), self.shape) 26 | is_done = tuple(new_position) == (3, 7) 27 | return [(1.0, new_state, -1.0, is_done)] 28 | 29 | def __init__(self): 30 | self.shape = (7, 10) 31 | 32 | nS = np.prod(self.shape) 33 | n_actions = 4 34 | 35 | # Wind strength 36 | winds = np.zeros(self.shape) 37 | winds[:,[3,4,5,8]] = 1 38 | winds[:,[6,7]] = 2 39 | 40 | # Calculate transition probabilities 41 | P = {} 42 | for s in range(nS): 43 | position = np.unravel_index(s, self.shape) 44 | P[s] = { a : [] for a in range(n_actions) } 45 | P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds) 46 | P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds) 47 | P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds) 48 | P[s][LEFT] = self._calculate_transition_prob(position, [0, -1], winds) 49 | 50 | # We always start in 
state (3, 0) 51 | isd = np.zeros(nS) 52 | isd[np.ravel_multi_index((3,0), self.shape)] = 1.0 53 | 54 | super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd) 55 | 56 | def render(self, mode='human', close=False): 57 | self._render(mode, close) 58 | 59 | def _render(self, mode='human', close=False): 60 | if close: 61 | return 62 | 63 | outfile = StringIO() if mode == 'ansi' else sys.stdout 64 | 65 | for s in range(self.nS): 66 | position = np.unravel_index(s, self.shape) 67 | # print(self.s) 68 | if self.s == s: 69 | output = " x " 70 | elif position == (3,7): 71 | output = " T " 72 | else: 73 | output = " o " 74 | 75 | if position[1] == 0: 76 | output = output.lstrip() 77 | if position[1] == self.shape[1] - 1: 78 | output = output.rstrip() 79 | output += "\n" 80 | 81 | outfile.write(output) 82 | outfile.write("\n") 83 | -------------------------------------------------------------------------------- /joyrl/framework/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datawhalechina/joyrl/b36e8de522a663fb6cdfc34b2fa5d074d0fe9024/joyrl/framework/__init__.py -------------------------------------------------------------------------------- /joyrl/framework/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-02 17:30:36 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-02 10:50:42 9 | Discription: 10 | ''' 11 | import ray 12 | from joyrl.framework.config import MergedConfig 13 | from joyrl.framework.message import Msg 14 | from joyrl.framework.utils import Logger, create_module 15 | 16 | 17 | class Moduler(object): 18 | def __init__(self, cfg: MergedConfig, **kwargs) -> None: 19 | self.cfg = cfg 20 | self.name = kwargs.get('name', 'Moduler') 21 | self.logger = Logger(self.cfg.log_dir, log_name = self.name) 22 | 23 | def _t_start(self): 24 | ''' start threads 25 | ''' 26 | raise NotImplementedError 27 | 28 | def _p_start(self): 29 | ''' start processes 30 | ''' 31 | raise NotImplementedError 32 | 33 | def pub_msg(self, msg: Msg): 34 | ''' publish message 35 | ''' 36 | raise NotImplementedError 37 | 38 | def init(self): 39 | ''' init module 40 | ''' 41 | raise NotImplementedError 42 | 43 | def run(self): 44 | ''' run module 45 | ''' 46 | raise NotImplementedError 47 | 48 | 49 | -------------------------------------------------------------------------------- /joyrl/framework/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-02 15:30:09 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-07-20 13:03:03 9 | Discription: 10 | ''' 11 | class DefaultConfig: 12 | ''' Default parameters for running 13 | ''' 14 | def __init__(self) -> None: 15 | pass 16 | def print_cfg(self): 17 | ''' Print all parameters 18 | ''' 19 | print(self.__dict__) 20 | 21 | class MergedConfig(object): 22 | ''' Merge general, algorithm and environment config 23 | ''' 24 | def __init__(self) -> None: 25 | self.general_cfg = None 26 | self.algo_cfg = None 27 | self.env_cfg = None 28 | 29 | class GeneralConfig(object): 30 | ''' General parameters for running 31 | ''' 32 | def __init__(self) -> None: 33 | # basic settings 34 | self.joyrl_version = "0.5.0" 35 | self.env_name = "gym" # name of environment 36 | self.algo_name = "DQN" # 
name of algorithm 37 | self.mode = "train" # train, test 38 | self.device = "custom" # set all device to cpu, cuda, custom 39 | self.interactor_device = "cpu" # device to use for interactor 40 | self.learner_device = "cpu" # device to use for learner 41 | self.seed = 0 # random seed 42 | self.is_learner_async = False # if learner is async 43 | self.max_episode = -1 # number of episodes for training, set -1 to keep running 44 | self.max_step = -1 # number of episodes for testing, set -1 means unlimited steps 45 | self.reward_threshold = float('inf') # reward threshold to stop training 46 | self.reward_threshold_limit = 10 # number of episodes to check reward threshold 47 | self.collect_traj = False # if collect trajectory or not 48 | # multiprocessing settings 49 | self.n_interactors = 1 # number of workers 50 | self.exps_trucation_size = 1 # size of exps to truncate 51 | self.n_learners = 1 # number of learners if using multi-processing, default 1 52 | self.share_buffer = True # if all learners share the same buffer 53 | # online evaluation settings 54 | self.online_eval = False # online evaluation or not 55 | self.online_eval_episode = 10 # online eval episodes 56 | self.model_save_fre = 500 # model save frequency per update step 57 | # load model settings 58 | self.load_checkpoint = False # if load checkpoint 59 | self.restore_model_meta = True # if restore model meta 60 | self.load_path = "Train_single_CartPole-v1_DQN_20230515-211721" # path to load model 61 | self.load_model_step = 'best' # load model at which step 62 | # stats recorder settings 63 | self.interact_summary_fre = 10 # record interact stats per episode 64 | self.policy_summary_fre = 100 # record update stats per update step 65 | -------------------------------------------------------------------------------- /joyrl/framework/message.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2024-02-25 15:46:04 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-14 17:49:49 9 | Discription: 10 | ''' 11 | from enum import Enum, unique 12 | from typing import Optional, Any 13 | from dataclasses import dataclass 14 | 15 | @unique 16 | class MsgType(Enum): 17 | # tracker 18 | TRACKER_GET_EPISODE = 0 19 | TRACKER_INCREASE_EPISODE = 1 20 | TRACKER_INCREASE_UPDATE_STEP = 2 21 | TRACKER_GET_UPDATE_STEP = 3 22 | TRACKER_CHECK_TASK_END = 4 23 | TRACKER_FORCE_TASK_END = 5 24 | 25 | # interactor 26 | INTERACTOR_SAMPLE = 10 27 | INTERACTOR_GET_SAMPLE_DATA = 11 28 | 29 | # learner 30 | LEARNER_UPDATE_POLICY = 20 31 | LEARNER_GET_UPDATED_MODEL_PARAMS_QUEUE = 21 32 | 33 | # collector 34 | COLLECTOR_PUT_EXPS = 30 35 | COLLECTOR_GET_TRAINING_DATA = 31 36 | COLLECTOR_GET_BUFFER_LENGTH = 32 37 | 38 | # recorder 39 | RECORDER_PUT_SUMMARY = 40 40 | 41 | # policy_mgr 42 | POLICY_MGR_PUT_MODEL_PARAMS = 70 43 | POLICY_MGR_PUT_MODEL_META = 71 44 | 45 | @dataclass 46 | class Msg(object): 47 | type: MsgType 48 | data: Optional[Any] = None -------------------------------------------------------------------------------- /joyrl/framework/tracker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-04-28 16:16:04 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-14 17:49:11 9 | Discription: 10 | ''' 11 | from joyrl.framework.message import Msg, MsgType 12 | from 
joyrl.framework.config import MergedConfig 13 | from joyrl.framework.base import Moduler 14 | 15 | class Tracker(Moduler): 16 | ''' tacker global information 17 | ''' 18 | def __init__(self, cfg: MergedConfig, *args, **kwargs) -> None: 19 | super().__init__(cfg, *args, **kwargs) 20 | self.global_episode = 0 # current global episode 21 | self.global_sample_count = 0 # global sample count 22 | self.global_update_step = 0 # global update step 23 | self.force_task_end = False # force task end 24 | self.max_episode = cfg.max_episode # max episode 25 | 26 | def pub_msg(self, msg: Msg): 27 | msg_type, msg_data = msg.type, msg.data 28 | if msg_type == MsgType.TRACKER_GET_EPISODE: 29 | return self._get_episode() 30 | elif msg_type == MsgType.TRACKER_INCREASE_EPISODE: 31 | episode_delta = 1 if msg_data is None else msg_data 32 | self._increase_episode(i = episode_delta) 33 | elif msg_type == MsgType.TRACKER_GET_UPDATE_STEP: 34 | return self._get_update_step() 35 | elif msg_type == MsgType.TRACKER_INCREASE_UPDATE_STEP: 36 | update_step_delta = 1 if msg_data is None else msg_data 37 | self._increase_update_step(i = update_step_delta) 38 | elif msg_type == MsgType.TRACKER_CHECK_TASK_END: 39 | return self._check_task_end() 40 | elif msg_type == MsgType.TRACKER_FORCE_TASK_END: 41 | self.force_task_end = True 42 | else: 43 | raise NotImplementedError 44 | 45 | def _increase_episode(self, i: int = 1): 46 | ''' increase episode 47 | ''' 48 | self.global_episode += i 49 | 50 | def _get_episode(self): 51 | ''' get current episode 52 | ''' 53 | return self.global_episode 54 | 55 | def _check_task_end(self): 56 | ''' check if episode reaches the max episode 57 | ''' 58 | if self.force_task_end: 59 | return True 60 | if self.max_episode < 0: 61 | return False 62 | return self.global_episode >= self.max_episode 63 | 64 | def _increase_update_step(self, i: int = 1): 65 | ''' increase update step 66 | ''' 67 | self.global_update_step += i 68 | 69 | def _get_update_step(self): 70 | ''' get update step 71 | ''' 72 | return self.global_update_step -------------------------------------------------------------------------------- /joyrl/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-22 14:07:32 7 | LastEditor: JiangJi 8 | LastEditTime: 2023-12-22 14:07:32 9 | Discription: 10 | ''' 11 | -------------------------------------------------------------------------------- /joyrl/scripts/scripts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-22 13:55:56 7 | LastEditor: JiangJi 8 | LastEditTime: 2023-12-22 14:07:35 9 | Discription: 10 | ''' 11 | import argparse 12 | from joyrl import run 13 | 14 | def main(): 15 | parser = argparse.ArgumentParser(description="hyperparameters") 16 | parser.add_argument('--yaml', default=None, type=str, 17 | help='the path of config file') 18 | args = parser.parse_args() 19 | run(yaml_path = args.yaml) 20 | if __name__ == "__main__": 21 | main() -------------------------------------------------------------------------------- /presets/Atari/Breakout-v5/Breakout-v5_DQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DQN # algo name 3 | env_name: gym # env name, differ from env_id in env_cfgs 4 | 
device: cuda # device, cpu or cuda 5 | mode: train # run mode: train, test 6 | collect_traj: false # if collect trajectories or not 7 | mp_backend: single # multi-processing mode: single(default), ray 8 | n_workers: 2 # number of workers if using multi-processing, default 1 9 | load_checkpoint: false # if load checkpoint or not 10 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 # if load checkpoint, then config path in 'tasks' dir 11 | load_model_step: best # load model step 12 | max_episode: 300 # max episodes, set -1 to keep running 13 | max_step: 500 # max steps per episode 14 | seed: 1 # random seed, set 0 not to use seed 15 | online_eval: true # if online eval or not 16 | online_eval_episode: 10 # online eval episodes 17 | model_save_fre: 500 # update step frequency of saving model 18 | algo_cfg: 19 | value_layers: 20 | - layer_type: conv2d 21 | in_channel: 4 22 | out_channel: 32 23 | kernel_size: 8 24 | stride: 4 25 | activation: relu 26 | - layer_type: conv2d 27 | in_channel: 32 28 | out_channel: 64 29 | kernel_size: 4 30 | stride: 2 31 | activation: relu 32 | - layer_type: conv2d 33 | in_channel: 64 34 | out_channel: 64 35 | kernel_size: 3 36 | stride: 1 37 | activation: relu 38 | - layer_type: flatten 39 | - layer_type: linear 40 | layer_size: [512] 41 | activation: relu 42 | batch_size: 64 43 | buffer_type: REPLAY_QUE 44 | max_buffer_size: 100000 45 | epsilon_decay: 500 46 | epsilon_end: 0.01 47 | epsilon_start: 0.95 48 | gamma: 0.95 49 | lr: 0.0001 50 | target_update: 4 51 | env_cfg: 52 | id: ALE/Breakout-v5 53 | wrapper: envs.wrappers.AtariWrapper 54 | render_mode: null -------------------------------------------------------------------------------- /presets/Atari/Breakout-v5/Breakout-v5_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.2.2 3 | algo_name: PPO 4 | env_name: gym 5 | interactor_device: cpu 6 | learner_device: cuda 7 | mode: train # test # test 8 | exps_trucation_size: 1024 9 | exps_trucation_size_input_only: true 10 | is_learner_async: false 11 | load_checkpoint: false # true # false # true # test 12 | # load_path: Train_ALE/Breakout-v5_PPO_20240721-190921 # td 58 13 | # load_path: Train_ALE/Breakout-v5_PPO_20240722-001214 # td 67 258 14 | # load_path: Train_ALE/Breakout-v5_PPO_20240724-001646 # continue-train td 87 369 15 | # load_path: Train_ALE/Breakout-v5_PPO_20240724-223629 # 5.5e-5 continue-train td 99 408 16 | # load_path: Train_ALE/Breakout-v5_PPO_20240725-145732 # 2.5e-4 98 17 | load_path: Train_ALE/Breakout-v5_PPO_20240725-225306 # 2.5e-4 103 419 18 | load_model_step: best 19 | n_interactors: 10 # 1 # test 20 | max_episode: 45000 # 60000 # 3 # test 21 | max_step: 1200 22 | seed: 202407 23 | online_eval: true 24 | online_eval_episode: 15 # 1 # test 25 | model_save_fre: 10 # 1 # test 26 | policy_summary_fre: 10 # 1 # test 27 | interact_summary_fre: 100 # 1 # test 28 | algo_cfg: 29 | independ_actor: false 30 | return_form: gae # td 31 | branch_layers: 32 | - name: feature_1 33 | layers: 34 | - layer_type: conv2d 35 | in_channel: 4 36 | out_channel: 32 37 | kernel_size: 8 38 | stride: 4 39 | activation: relu 40 | - layer_type: conv2d 41 | in_channel: 32 42 | out_channel: 64 43 | kernel_size: 4 44 | stride: 2 45 | activation: relu 46 | - layer_type: conv2d 47 | in_channel: 64 48 | out_channel: 64 49 | kernel_size: 3 50 | stride: 1 51 | activation: relu 52 | - layer_type: flatten 53 | - layer_type: linear 54 | layer_size: [512] 55 | activation: relu 56 | buffer_type: 
ONPOLICY_QUE 57 | lr: 4.5e-4 # 3.0e-4 # 2.5e-4 58 | actor_lr: 3.0e-4 59 | critic_lr: 5.5e-4 60 | entropy_coef: 0.01 61 | critic_loss_coef: 0.5 62 | eps_clip: 0.105 63 | gamma: 0.99 64 | gae_lambda: 0.95 65 | k_epochs: 3 66 | batch_size: 256 67 | sgd_batch_size: 200 68 | max_norm: 0.5 69 | mini_batch_normalize: true 70 | env_cfg: 71 | id: ALE/Breakout-v5 72 | render_mode: null 73 | wrappers: 74 | - wrapper_name: BaseSkipFrame 75 | start_skip: 30 76 | skip: 4 77 | terminal_done_flag: true 78 | max_no_reward_count: 200 79 | - wrapper_name: EpisodicLifeEnv 80 | - wrapper_name: GrayScaleObservation 81 | - wrapper_name: ResizeObservation 82 | shape: 84 83 | - wrapper_name: ClipRewardEnv 84 | - wrapper_name: FrameStack 85 | num_stack: 4 86 | - wrapper_name: MultiHeadObsWrapper 87 | - wrapper_name: MultiHeadActionWrapper 88 | - wrapper_name: FrameStack2Numpy 89 | 90 | -------------------------------------------------------------------------------- /presets/Atari/Breakout-v5/Breakout-v5_PPO_test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.2.2 3 | algo_name: PPO 4 | env_name: gym 5 | interactor_device: cpu 6 | learner_device: cuda 7 | mode: test # test 8 | exps_trucation_size: 1024 9 | exps_trucation_size_input_only: true 10 | is_learner_async: false 11 | load_checkpoint: true 12 | # load_path: Train_ALE/Breakout-v5_PPO_20240814-141323 # 373.000 in 800 steps 13 | # load_path: Train_ALE/Breakout-v5_PPO_20240814-004435 # 252.000 in 800 steps 14 | load_path: Train_ALE/Breakout-v5_PPO_20240923-204459 # 408 in 512 15 | load_model_step: best 16 | n_interactors: 1 # test 17 | max_episode: 3 # test 18 | max_step: 1200 19 | seed: 202407 20 | online_eval: true 21 | online_eval_episode: 1 # test 22 | model_save_fre: 1 # test 23 | policy_summary_fre: 1 # test 24 | interact_summary_fre: 1 # test 25 | algo_cfg: 26 | mini_batch_normalize: true 27 | independ_actor: false 28 | return_form: gae # td 29 | branch_layers: 30 | - name: feature_1 31 | layers: 32 | - layer_type: conv2d 33 | in_channel: 4 34 | out_channel: 32 35 | kernel_size: 8 36 | stride: 4 37 | activation: relu 38 | - layer_type: conv2d 39 | in_channel: 32 40 | out_channel: 64 41 | kernel_size: 4 42 | stride: 2 43 | activation: relu 44 | - layer_type: conv2d 45 | in_channel: 64 46 | out_channel: 64 47 | kernel_size: 3 48 | stride: 1 49 | activation: relu 50 | - layer_type: flatten 51 | - layer_type: linear 52 | layer_size: [512] 53 | activation: relu 54 | buffer_type: ONPOLICY_QUE 55 | lr: 3.5e-4 # 2.5e-4 56 | actor_lr: 3.0e-4 57 | critic_lr: 5.5e-4 58 | entropy_coef: 0.001 59 | critic_loss_coef: 0.5 60 | eps_clip: 0.105 61 | gamma: 0.99 62 | gae_lambda: 0.95 63 | k_epochs: 3 64 | batch_size: 256 65 | sgd_batch_size: 128 66 | max_norm: 0.5 67 | env_cfg: 68 | id: ALE/Breakout-v5 69 | render_mode: null 70 | wrappers: 71 | - wrapper_name: BaseSkipFrame 72 | start_skip: 30 73 | skip: 4 74 | terminal_done_flag: true 75 | max_no_reward_count: 200 76 | - wrapper_name: GrayScaleObservation 77 | - wrapper_name: ResizeObservation 78 | shape: 84 79 | # - wrapper_name: ClipRewardEnv 80 | - wrapper_name: EpisodicLifeEnv 81 | - wrapper_name: FrameStack 82 | num_stack: 4 83 | - wrapper_name: MultiHeadObsWrapper 84 | - wrapper_name: MultiHeadActionWrapper 85 | - wrapper_name: FrameStack2Numpy 86 | 87 | -------------------------------------------------------------------------------- /presets/Atari/DemonAttack-v5/DoubleDQN_DemonAttack-v5_Test.yaml: 
-------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DoubleDQN 3 | env_name: gym 4 | device: cuda 5 | mode: test 6 | collect_traj: false 7 | n_interactors: 1 8 | load_checkpoint: true 9 | # reward 860.000 in 552 steps 10 | load_path: Train_ALE/DemonAttack-v5_DoubleDQN_20240120-200251 11 | load_model_step: best 12 | max_episode: 2 13 | max_step: 1000 14 | seed: 2023 15 | 16 | algo_cfg: 17 | action_type: DISCRETE 18 | merge_layers: 19 | branch_layers: 20 | - name: feature_1 21 | layers: 22 | - layer_type: conv2d 23 | in_channel: 4 24 | out_channel: 16 25 | kernel_size: 4 26 | stride: 2 27 | activation: relu 28 | - layer_type: pooling 29 | pooling_type: max2d 30 | kernel_size: 2 31 | stride: 2 32 | padding: 0 33 | - layer_type: conv2d 34 | in_channel: 16 35 | out_channel: 32 36 | kernel_size: 4 37 | stride: 2 38 | activation: relu 39 | - layer_type: pooling 40 | pooling_type: avg2d 41 | kernel_size: 2 42 | stride: 2 43 | padding: 0 44 | - layer_type: flatten 45 | - layer_type: norm 46 | norm_type: LayerNorm 47 | normalized_shape: 512 48 | - layer_type: linear 49 | layer_size: [200] 50 | activation: relu 51 | - layer_type: linear 52 | layer_size: [200] 53 | activation: relu 54 | batch_size: 32 55 | buffer_type: REPLAY_QUE 56 | buffer_size: 12000 57 | epsilon_decay: 20000 58 | epsilon_end: 0.05 59 | epsilon_start: 0.95 60 | gamma: 0.99 61 | lr: 1.5e-4 62 | target_update: 16 63 | env_cfg: 64 | id: ALE/DemonAttack-v5 65 | render_mode: human 66 | obs_type: rgb 67 | wrappers: 68 | - wrapper_name: BaseSkipFrame 69 | skip: 5 70 | cut_slices: 71 | - [15, 188] 72 | - [0, 160] 73 | start_skip: 14 74 | int_action_flag: true 75 | - wrapper_name: GrayScaleObservation 76 | - wrapper_name: ResizeObservation 77 | shape: 84 78 | - wrapper_name: FrameStack 79 | num_stack: 4 80 | 81 | version: '0.5.2' -------------------------------------------------------------------------------- /presets/Atari/DemonAttack-v5/DoubleDQN_DemonAttack-v5_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DoubleDQN 3 | env_name: gym 4 | device: cuda 5 | mode: train 6 | collect_traj: false 7 | n_interactors: 1 8 | load_checkpoint: false 9 | load_path: Train_ALE/DemonAttack-v5_DoubleDQN_20240114-101724 10 | load_model_step: best 11 | max_episode: 2000 12 | max_step: 300 13 | seed: 2023 14 | online_eval: true 15 | online_eval_episode: 10 16 | model_save_fre: 1500 17 | 18 | algo_cfg: 19 | action_type: DISCRETE 20 | merge_layers: 21 | branch_layers: 22 | - name: feature_1 23 | layers: 24 | - layer_type: conv2d 25 | in_channel: 4 26 | out_channel: 16 27 | kernel_size: 4 28 | stride: 2 29 | activation: relu 30 | - layer_type: pooling 31 | pooling_type: max2d 32 | kernel_size: 2 33 | stride: 2 34 | padding: 0 35 | - layer_type: conv2d 36 | in_channel: 16 37 | out_channel: 32 38 | kernel_size: 4 39 | stride: 2 40 | activation: relu 41 | - layer_type: pooling 42 | pooling_type: avg2d 43 | kernel_size: 2 44 | stride: 2 45 | padding: 0 46 | - layer_type: flatten 47 | - layer_type: norm 48 | norm_type: LayerNorm 49 | normalized_shape: 512 50 | - layer_type: linear 51 | layer_size: [200] 52 | activation: relu 53 | - layer_type: linear 54 | layer_size: [200] 55 | activation: relu 56 | batch_size: 32 57 | buffer_type: REPLAY_QUE 58 | buffer_size: 12000 59 | epsilon_decay: 20000 60 | epsilon_end: 0.05 61 | epsilon_start: 0.95 62 | gamma: 0.99 63 | lr: 1.5e-4 64 | target_update: 16 65 | env_cfg: 66 | id: 
ALE/DemonAttack-v5 67 | render_mode: null 68 | obs_type: rgb 69 | wrappers: 70 | - wrapper_name: BaseSkipFrame 71 | skip: 5 72 | cut_slices: 73 | - [15, 188] 74 | - [0, 160] 75 | start_skip: 14 76 | int_action_flag: true 77 | - wrapper_name: GrayScaleObservation 78 | - wrapper_name: ResizeObservation 79 | shape: 84 80 | - wrapper_name: FrameStack 81 | num_stack: 4 82 | 83 | version: '0.5.2' -------------------------------------------------------------------------------- /presets/Atari/Enduro-v5/Enduro-v5_DQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DQN # algo name 3 | env_name: gym # env name, differ from env_id in env_cfgs 4 | device: cuda # device, cpu or cuda 5 | mode: train # run mode: train, test 6 | collect_traj: false # if collect trajectories or not 7 | mp_backend: single # multi-processing mode: single(default), ray 8 | n_workers: 2 # number of workers if using multi-processing, default 1 9 | load_checkpoint: false # if load checkpoint or not 10 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 # if load checkpoint, then config path in 'tasks' dir 11 | load_model_step: best # load model step 12 | max_episode: 1000 # max episodes, set -1 to keep running 13 | max_step: 500 # max steps per episode 14 | seed: 1 # random seed, set 0 not to use seed 15 | online_eval: true # if online eval or not 16 | online_eval_episode: 10 # online eval episodes 17 | model_save_fre: 500 # update step frequency of saving model 18 | algo_cfg: 19 | value_layers: 20 | - layer_type: conv2d 21 | in_channel: 4 22 | out_channel: 32 23 | kernel_size: 8 24 | stride: 4 25 | activation: relu 26 | - layer_type: conv2d 27 | in_channel: 32 28 | out_channel: 64 29 | kernel_size: 4 30 | stride: 2 31 | activation: relu 32 | - layer_type: conv2d 33 | in_channel: 64 34 | out_channel: 64 35 | kernel_size: 3 36 | stride: 1 37 | activation: relu 38 | - layer_type: flatten 39 | - layer_type: linear 40 | layer_size: [512] 41 | activation: relu 42 | batch_size: 256 43 | buffer_type: REPLAY_QUE 44 | max_buffer_size: 100000 45 | epsilon_decay: 500 46 | epsilon_end: 0.01 47 | epsilon_start: 0.95 48 | gamma: 0.95 49 | lr: 0.0001 50 | target_update: 4 51 | env_cfg: 52 | id: ALE/Enduro-v5 53 | wrapper: envs.wrappers.AtariWrapper -------------------------------------------------------------------------------- /presets/BipedalWalker-v3_DDPG_mp_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DDPG 3 | device: cpu 4 | env_name: gym 5 | mode: train 6 | eval_per_episode: 20 7 | load_checkpoint: false 8 | load_path: Train_BipedalWalker-v3_DDPG_20230414-214211 9 | max_steps: 300 10 | save_fig: true 11 | seed: 0 12 | show_fig: false 13 | test_eps: 3 14 | train_eps: 700 15 | n_workers: 2 16 | render: false 17 | render_mode: rgb_array 18 | algo_cfg: 19 | critic_hidden_dim: 128 20 | actor_hidden_dim: 128 21 | gamma: 0.99 22 | actor_lr: 5.0e-5 23 | critic_lr: 1.0e-3 24 | max_buffer_size: 20480 25 | tau: 0.01 26 | batch_size: 256 27 | env_cfg: 28 | id: BipedalWalker-v3 29 | new_step_api: true 30 | render_mode: rgb_array -------------------------------------------------------------------------------- /presets/Box2D/BipedalWalker-v3/BipedalWalker-v3_DDPG_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DDPG 3 | device: cpu 4 | env_name: gym 5 | mode: test 6 | load_checkpoint: true 7 | load_path: 
Train_BipedalWalker-v3_DDPG_20230414-214211 8 | max_steps: 900 9 | save_fig: true 10 | seed: 2023 11 | show_fig: false 12 | test_eps: 3 13 | train_eps: 2000 14 | n_workers: 1 15 | render: true 16 | render_mode: rgb_array 17 | algo_cfg: 18 | critic_hidden_dim: 128 19 | actor_hidden_dim: 128 20 | gamma: 0.99 21 | actor_lr: 5.0e-5 22 | critic_lr: 1.0e-3 23 | max_buffer_size: 20480 24 | tau: 0.01 25 | batch_size: 256 26 | env_cfg: 27 | id: BipedalWalker-v3 28 | new_step_api: true 29 | render: true 30 | render_mode: rgb_array -------------------------------------------------------------------------------- /presets/Box2D/BipedalWalker-v3/BipedalWalker-v3_DDPG_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DDPG 3 | device: cpu 4 | env_name: gym 5 | mode: train 6 | eval_per_episode: 20 7 | load_checkpoint: false 8 | load_path: Train_BipedalWalker-v3_DDPG_20230414-214211 9 | max_steps: 300 10 | save_fig: true 11 | seed: 2023 12 | show_fig: false 13 | test_eps: 3 14 | train_eps: 700 15 | n_workers: 1 16 | render: false 17 | render_mode: rgb_array 18 | algo_cfg: 19 | critic_hidden_dim: 128 20 | actor_hidden_dim: 128 21 | gamma: 0.99 22 | actor_lr: 5.0e-5 23 | critic_lr: 1.0e-3 24 | max_buffer_size: 20480 25 | tau: 0.01 26 | batch_size: 256 27 | env_cfg: 28 | id: BipedalWalker-v3 29 | new_step_api: true 30 | render_mode: rgb_array -------------------------------------------------------------------------------- /presets/Box2D/BipedalWalker-v3/BipedalWalker-v3_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.2.2 3 | algo_name: PPO 4 | env_name: gym 5 | device: cuda 6 | interactor_device: cpu 7 | learner_device: cuda 8 | mode: train 9 | exps_trucation_size: 1024 10 | is_learner_async: false 11 | load_checkpoint: false 12 | load_path: Train_BipedalWalker-v3_PPO_20240619-222052 # if load checkpoint, then config path in 'tasks' dir 13 | load_model_step: best 14 | n_interactors: 10 15 | max_episode: 50000 16 | max_step: 500 17 | seed: 202406 18 | online_eval: true 19 | online_eval_episode: 15 20 | model_save_fre: 10 21 | policy_summary_fre: 10 22 | interact_summary_fre: 100 23 | algo_cfg: 24 | independ_actor: true 25 | return_form: td 26 | actor_branch_layers: 27 | - name: feature_1 28 | layers: 29 | - layer_type: linear 30 | layer_size: [200] 31 | activation: tanh 32 | - layer_type: linear 33 | layer_size: [200] 34 | activation: tanh 35 | critic_branch_layers: 36 | - name: feature_1 37 | layers: 38 | - layer_type: linear 39 | layer_size: [200] 40 | activation: tanh 41 | - layer_type: linear 42 | layer_size: [200] 43 | activation: tanh 44 | buffer_type: ONPOLICY_QUE 45 | lr: 2.5e-4 46 | actor_lr: 2.5e-4 # 1 47 | critic_lr: 3.0e-4 # 3 48 | entropy_coef: 0.001 49 | critic_loss_coef: 0.001 50 | eps_clip: 0.25 51 | gamma: 0.99 52 | gae_lambda: 0.95 53 | k_epochs: 2 54 | batch_size: 512 55 | sgd_batch_size: 256 56 | env_cfg: 57 | id: BipedalWalker-v3 58 | render_mode: null 59 | wrappers: 60 | - wrapper_name: ClipAction 61 | - wrapper_name: MultiHeadObsWrapper 62 | - wrapper_name: BipedalWalkerV3TFReward 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /presets/Box2D/BipedalWalker-v3/BipedalWalker-v3_SAC_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: SAC 3 | continous: true 4 | device: cuda 5 | env_name: gym 6 | eval_eps: 10 7 | 
eval_per_episode: 5 8 | load_checkpoint: true 9 | load_path: Train_gym_SAC_20230415-140928 10 | max_steps: 800 11 | mode: test 12 | mp_backend: mp 13 | new_step_api: true 14 | render: false 15 | render_mode: human 16 | save_fig: true 17 | seed: 666 18 | show_fig: false 19 | test_eps: 20 20 | train_eps: 2000 21 | wrapper: null 22 | algo_cfg: 23 | alpha: 0.01 24 | automatic_entropy_tuning: false 25 | batch_size: 64 26 | max_buffer_size: 1000000 27 | gamma: 0.98 28 | hidden_dim: 256 29 | lr: 0.0004 30 | n_epochs: 1 31 | policy_type: Gaussian 32 | start_steps: 10000 33 | target_update_fre: 1 34 | tau: 0.01 35 | env_cfg: 36 | id: BipedalWalker-v3 37 | new_step_api: true 38 | render_mode: null 39 | -------------------------------------------------------------------------------- /presets/Box2D/BipedalWalker-v3/BipedalWalker-v3_SAC_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: SAC 3 | continous: true 4 | device: cuda 5 | env_name: gym 6 | eval_eps: 10 7 | eval_per_episode: 5 8 | load_checkpoint: false 9 | load_path: Train_gym_SAC_20230415-140928 10 | max_steps: 500 11 | mode: train 12 | mp_backend: mp 13 | new_step_api: true 14 | render: false 15 | render_mode: human 16 | save_fig: true 17 | seed: 666 18 | show_fig: false 19 | test_eps: 20 20 | train_eps: 2000 21 | wrapper: null 22 | algo_cfg: 23 | alpha: 0.01 24 | automatic_entropy_tuning: false 25 | batch_size: 64 26 | max_buffer_size: 1000000 27 | gamma: 0.98 28 | hidden_dim: 256 29 | lr: 0.0004 30 | n_epochs: 1 31 | policy_type: Gaussian 32 | start_steps: 10000 33 | target_update_fre: 1 34 | tau: 0.01 35 | env_cfg: 36 | id: BipedalWalker-v3 37 | new_step_api: true 38 | render_mode: null 39 | -------------------------------------------------------------------------------- /presets/Box2D/BipedalWalkerHardcore-v3/TD3_BipedalWalkerHardcore-v3.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: TD3 3 | device: cuda 4 | env_name: gym 5 | mode: train 6 | load_checkpoint: false 7 | load_path: Train_single_BipedalWalkerHardcore-v3_TD3_20230528-151042 8 | eval_per_episode: 50 9 | max_episode: 10000 10 | max_step: 1000 11 | seed: 2023 12 | model_save_fre: 70000 13 | online_eval: true 14 | algo_cfg: 15 | action_type: DPG 16 | buffer_type: REPLAY_QUE 17 | actor_layers: 18 | - layer_type: linear 19 | layer_size: [200] 20 | activation: relu 21 | - layer_type: linear 22 | layer_size: [200] 23 | activation: relu 24 | actor_lr: 1.0e-4 25 | batch_size: 256 26 | max_buffer_size: 60000 27 | critic_layers: 28 | - layer_type: linear 29 | layer_size: [200] 30 | activation: relu 31 | - layer_type: linear 32 | layer_size: [200] 33 | activation: relu 34 | critic_lr: 1.2e-4 35 | expl_noise: 0.25 36 | explore_steps: 2048 37 | gamma: 0.99 38 | noise_clip: 0.5 39 | policy_freq: 2 40 | policy_noise: 0.2 41 | tau: 0.005 42 | env_cfg: 43 | id: BipedalWalkerHardcore-v3 44 | -------------------------------------------------------------------------------- /presets/Box2D/CarRacing-v2/CarRacing-v2_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.1.4 3 | algo_name: PPO 4 | env_name: gym 5 | device: cpu 6 | mode: train 7 | exps_trucation_size: 200 8 | is_learner_async: false 9 | load_checkpoint: false 10 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir 11 | load_model_step: best 12 | n_interactors: 10 13 
| max_episode: -1 14 | max_step: 200 15 | seed: 1 16 | online_eval: true 17 | online_eval_episode: 10 18 | model_save_fre: 10 19 | policy_summary_fre: 2 20 | interact_summary_fre: 100 21 | algo_cfg: 22 | branch_layers: 23 | - name: feature_1 24 | layers: 25 | - layer_type: conv2d 26 | in_channel: 3 27 | out_channel: 32 28 | kernel_size: 8 29 | stride: 4 30 | activation: relu 31 | # - layer_type: pooling 32 | # pooling_type: max2d 33 | # kernel_size: 2 34 | # stride: 2 35 | # padding: 0 36 | - layer_type: conv2d 37 | in_channel: 32 38 | out_channel: 64 39 | kernel_size: 4 40 | stride: 2 41 | activation: relu 42 | - layer_type: conv2d 43 | in_channel: 64 44 | out_channel: 64 45 | kernel_size: 3 46 | stride: 1 47 | activation: relu 48 | - layer_type: norm 49 | norm_type: LayerNorm 50 | # - layer_type: pooling 51 | # pooling_type: avg2d 52 | # kernel_size: 2 53 | # stride: 2 54 | # padding: 0 55 | - layer_type: flatten 56 | - layer_type: linear 57 | layer_size: [512] 58 | activation: relu 59 | - layer_type: norm 60 | norm_type: LayerNorm 61 | - layer_type: linear 62 | layer_size: [128] 63 | activation: relu 64 | buffer_type: ONPOLICY_QUE 65 | lr: 0.0003 66 | actor_lr: 0.003 67 | critic_lr: 0.01 68 | entropy_coef: 0.001 69 | critic_loss_coef: 0.5 70 | eps_clip: 0.2 71 | gamma: 0.95 72 | gae_lambda: 0.95 73 | k_epochs: 4 74 | batch_size: 2000 75 | sgd_batch_size: 50 76 | env_cfg: 77 | id: CarRacing-v2 78 | render_mode: null 79 | wrappers: 80 | - wrapper_name: ReshapeImageObsWrapper 81 | - wrapper_name: MultiHeadObsWrapper 82 | 83 | -------------------------------------------------------------------------------- /presets/Box2D/CarRacing-v2/DQN_carRacing-v2_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DQN 3 | env_name: gym 4 | device: cuda 5 | mode: test 6 | collect_traj: false 7 | n_interactors: 1 8 | load_checkpoint: true 9 | load_path: Train_CarRacing-v2_DQN_20240109-221840 10 | load_model_step: best # 754 11 | max_episode: 3 12 | max_step: 1200 13 | seed: 2023 14 | 15 | algo_cfg: 16 | merge_layers: 17 | branch_layers: 18 | - name: feature_1 19 | layers: 20 | - layer_type: conv2d 21 | in_channel: 4 22 | out_channel: 16 23 | kernel_size: 4 24 | stride: 2 25 | activation: relu 26 | - layer_type: pooling 27 | pooling_type: max2d 28 | kernel_size: 2 29 | stride: 2 30 | padding: 0 31 | - layer_type: conv2d 32 | in_channel: 16 33 | out_channel: 32 34 | kernel_size: 4 35 | stride: 2 36 | activation: relu 37 | - layer_type: pooling 38 | pooling_type: avg2d 39 | kernel_size: 2 40 | stride: 2 41 | padding: 0 42 | - layer_type: flatten 43 | - layer_type: norm 44 | norm_type: LayerNorm 45 | normalized_shape: 512 46 | - layer_type: linear 47 | layer_size: [128] 48 | activation: relu 49 | batch_size: 128 50 | buffer_type: REPLAY_QUE 51 | max_buffer_size: 12000 52 | epsilon_decay: 2000 53 | epsilon_end: 0.02 54 | epsilon_start: 0.99 55 | gamma: 0.99 56 | lr: 1.5e-4 # 2.0e-4 57 | target_update: 4 58 | env_cfg: 59 | id: CarRacing-v2 60 | render_mode: human 61 | continuous: False 62 | wrapper: 63 | - wrapper_name: CarV2SkipFrame 64 | skip: 5 65 | - wrapper_name: GrayScaleObservation 66 | - wrapper_name: ResizeObservation 67 | shape: 84 68 | - wrapper_name: FrameStack 69 | num_stack: 4 70 | -------------------------------------------------------------------------------- /presets/Box2D/CarRacing-v2/DQN_carRacing-v2_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | 
algo_name: DQN 3 | env_name: gym 4 | device: cuda 5 | mode: train 6 | collect_traj: false 7 | n_interactors: 1 8 | load_checkpoint: false 9 | load_path: Train_single_carRacing-v2_DQN 10 | load_model_step: best 11 | max_episode: 1200 12 | max_step: 1600 # 1200 13 | seed: 2023 14 | online_eval: true 15 | online_eval_episode: 100 16 | model_save_fre: 1000 17 | 18 | algo_cfg: 19 | merge_layers: 20 | branch_layers: 21 | - name: feature_1 22 | layers: 23 | - layer_type: conv2d 24 | in_channel: 4 25 | out_channel: 16 26 | kernel_size: 4 27 | stride: 2 28 | activation: relu 29 | - layer_type: pooling 30 | pooling_type: max2d 31 | kernel_size: 2 32 | stride: 2 33 | padding: 0 34 | - layer_type: conv2d 35 | in_channel: 16 36 | out_channel: 32 37 | kernel_size: 4 38 | stride: 2 39 | activation: relu 40 | - layer_type: pooling 41 | pooling_type: avg2d 42 | kernel_size: 2 43 | stride: 2 44 | padding: 0 45 | - layer_type: flatten 46 | - layer_type: norm 47 | norm_type: LayerNorm 48 | normalized_shape: 512 49 | - layer_type: linear 50 | layer_size: [128] 51 | activation: relu 52 | batch_size: 128 53 | buffer_type: REPLAY_QUE 54 | max_buffer_size: 12000 55 | epsilon_decay: 2000 56 | epsilon_end: 0.02 57 | epsilon_start: 0.99 58 | gamma: 0.99 59 | lr: 1.5e-4 # 2.0e-4 60 | target_update: 4 61 | env_cfg: 62 | id: CarRacing-v2 63 | render_mode: null 64 | continuous: False 65 | wrappers: 66 | - wrapper_name: CarV2SkipFrame 67 | skip: 5 68 | - wrapper_name: GrayScaleObservation 69 | - wrapper_name: ResizeObservation 70 | shape: 84 71 | - wrapper_name: FrameStack 72 | num_stack: 4 73 | -------------------------------------------------------------------------------- /presets/Box2D/LunarLander-v2/LunarLander-v2_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: PPO 4 | env_name: gym 5 | device: cpu 6 | mode: train 7 | exps_trucation_size: 200 8 | is_learner_async: false 9 | load_checkpoint: true 10 | load_path: Train_LunarLander-v2_PPO_20240617-175014 11 | load_model_step: best 12 | n_interactors: 1 13 | max_episode: -1 14 | max_step: 1000 15 | seed: 1 16 | online_eval: true 17 | online_eval_episode: 10 18 | reward_threshold: 210 19 | model_save_fre: 10 20 | policy_summary_fre: 5 21 | interact_summary_fre: 100 22 | algo_cfg: 23 | independ_actor: false 24 | return_form: td 25 | actor_branch_layers: 26 | - name: state 27 | layers: 28 | - layer_type: linear 29 | layer_size: [256] 30 | activation: relu 31 | critic_branch_layers: 32 | - name: state 33 | layers: 34 | - layer_type: linear 35 | layer_size: [256] 36 | activation: relu 37 | branch_layers: 38 | - name: state 39 | layers: 40 | - layer_type: linear 41 | layer_size: [256] 42 | activation: relu 43 | buffer_type: ONPOLICY_QUE 44 | eps_clip: 0.2 45 | entropy_coef: 0.002 46 | lr: 0.0003 47 | actor_lr: 0.003 48 | critic_lr: 0.01 49 | critic_loss_coef: 0.5 50 | gamma: 0.99 51 | gae_lambda: 0.95 52 | k_epochs: 4 53 | batch_size: 2000 54 | sgd_batch_size: 32 55 | env_cfg: 56 | id: LunarLander-v2 57 | render_mode: null 58 | wrappers: 59 | - wrapper_name: MultiHeadObsWrapper 60 | - wrapper_name: MultiHeadActionWrapper 61 | -------------------------------------------------------------------------------- /presets/Box2D/LunarLander-v2/LunarLander-v2_PPO_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: PPO 3 | device: cpu 4 | env_name: LunarLander-v2 5 | eval_eps: 10 6 | eval_per_episode: 5 7 | 
load_checkpoint: true 8 | load_path: Train_LunarLander-v2_PPO_20230402-223154 9 | max_steps: 1000 10 | mode: test 11 | mp_backend: mp 12 | new_step_api: true 13 | render: false 14 | save_fig: true 15 | seed: 1 16 | show_fig: false 17 | test_eps: 20 18 | train_eps: 600 19 | wrapper: null 20 | algo_cfg: 21 | actor_hidden_dim: 256 22 | actor_lr: 0.0003 23 | continuous: false 24 | critic_hidden_dim: 256 25 | critic_lr: 0.001 26 | entropy_coef: 0.01 27 | eps_clip: 0.2 28 | gamma: 0.99 29 | k_epochs: 4 30 | ppo_type: clip 31 | sgd_batch_size: 32 32 | train_batch_size: 256 33 | -------------------------------------------------------------------------------- /presets/Box2D/LunarLander-v2/LunarLanderContinuous-v2_SAC_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: SAC 3 | continuous: true 4 | device: cpu 5 | env_name: LunarLanderContinuous-v2 6 | eval_eps: 10 7 | eval_per_episode: 5 8 | load_checkpoint: true 9 | load_path: Train_LunarLanderContinuous-v2_SAC_20230402-170158 10 | max_steps: 500 11 | mode: test 12 | mp_backend: mp 13 | new_step_api: true 14 | render: true 15 | render_mode: human 16 | save_fig: true 17 | seed: 666 18 | show_fig: false 19 | test_eps: 20 20 | train_eps: 2000 21 | wrapper: null 22 | algo_cfg: 23 | alpha: 0.2 24 | automatic_entropy_tuning: false 25 | batch_size: 64 26 | max_buffer_size: 1000000 27 | gamma: 0.99 28 | hidden_dim: 256 29 | lr: 0.001 30 | n_epochs: 1 31 | policy_type: Gaussian 32 | start_steps: 10000 33 | target_update_fre: 1 34 | tau: 0.005 35 | -------------------------------------------------------------------------------- /presets/Box2D/LunarLander-v2/LunarLanderContinuous-v2_SAC_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: SAC 3 | continous: true 4 | device: cuda 5 | env_name: gym 6 | eval_eps: 10 7 | eval_per_episode: 5 8 | load_checkpoint: false 9 | load_path: Train_LunarLanderContinuous-v2_PPO_20230401-102521 10 | max_steps: 500 11 | mode: train 12 | new_step_api: true 13 | render: false 14 | save_fig: true 15 | seed: 666 16 | show_fig: false 17 | test_eps: 20 18 | train_eps: 2000 19 | wrapper: null 20 | algo_cfg: 21 | alpha: 0.1 22 | automatic_entropy_tuning: false 23 | batch_size: 64 24 | max_buffer_size: 1000000 25 | gamma: 0.99 26 | hidden_dim: 256 27 | lr: 0.001 28 | n_epochs: 1 29 | policy_type: Gaussian 30 | start_steps: 10000 31 | target_update_fre: 1 32 | tau: 0.005 33 | env_cfg: 34 | id: LunarLanderContinuous-v2 -------------------------------------------------------------------------------- /presets/ClassControl/Acrobot-v1/Acrobot-v1_DQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: DQN 4 | env_name: gym 5 | interactor_device: cpu 6 | learner_device: cpu 7 | mode: train 8 | is_learner_async: false 9 | collect_traj: false 10 | n_interactors: 1 11 | load_checkpoint: true 12 | load_path: Train_Acrobot-v1_DQN_2024613 13 | load_model_step: best 14 | max_episode: -1 15 | max_step: 200 16 | seed: 1 17 | online_eval: true 18 | online_eval_episode: 10 19 | model_save_fre: 500 20 | policy_summary_fre: 100 21 | 22 | algo_cfg: 23 | learn_frequency: 1 24 | merge_layers: 25 | - layer_type: linear 26 | layer_size: [256] 27 | activation: relu 28 | - layer_type: linear 29 | layer_dim: [256] 30 | activation: relu 31 | batch_size: 64 32 | max_buffer_size: 100000 33 | epsilon_decay: 500 34 | epsilon_end: 0.01 35 
| epsilon_start: 0.95 36 | gamma: 0.99 37 | lr: 0.0001 38 | target_update: 4 39 | 40 | env_cfg: 41 | id: Acrobot-v1 42 | render_mode: null 43 | wrappers: 44 | - wrapper_name: MultiHeadObsWrapper 45 | - wrapper_name: MultiHeadActionWrapper 46 | -------------------------------------------------------------------------------- /presets/ClassControl/Acrobot-v1/Acrobot-v1_DoubleDQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: DoubleDQN 4 | env_name: gym 5 | mode: train 6 | is_learner_async: false 7 | collect_traj: false 8 | n_interactors: 1 9 | load_checkpoint: false 10 | load_path: Train_ALE/Acrobot-v1_DoubleDQN_20240114-101724 11 | load_model_step: best 12 | max_episode: -1 13 | max_step: 200 14 | seed: 1 15 | online_eval: true 16 | online_eval_episode: 10 17 | model_save_fre: 500 18 | policy_summary_fre: 100 19 | 20 | algo_cfg: 21 | merege_layers: [] 22 | branch_layers: 23 | - name: feature1 24 | layers: 25 | - layer_type: linear 26 | layer_size: [256] 27 | activation: relu 28 | - layer_type: linear 29 | layer_size: [256] 30 | activation: relu 31 | buffer_type: REPLAY_QUE 32 | batch_size: 64 33 | max_buffer_size: 100000 34 | epsilon_decay: 500 35 | epsilon_end: 0.01 36 | epsilon_start: 0.95 37 | gamma: 0.99 38 | lr: 0.0001 39 | target_update: 4 40 | env_cfg: 41 | id: Acrobot-v1 42 | render_mode: null 43 | wrappers: 44 | - wrapper_name: MultiHeadObsWrapper 45 | - wrapper_name: MultiHeadActionWrapper -------------------------------------------------------------------------------- /presets/ClassControl/Acrobot-v1/Acrobot-v1_DuelingDQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: DuelingDQN 4 | env_name: gym 5 | interactor_device: cuda 6 | learner_device: cuda 7 | mode: train 8 | is_learner_async: false 9 | collect_traj: false 10 | n_interactors: 1 11 | load_checkpoint: false 12 | load_path: Train_CartPole-v1_DQN_20221026-054757 13 | load_model_step: best 14 | max_episode: -1 15 | max_step: 200 16 | seed: 1 17 | online_eval: true 18 | online_eval_episode: 10 19 | model_save_fre: 500 20 | policy_summary_fre: 100 21 | 22 | algo_cfg: 23 | merge_layers: 24 | - layer_type: linear 25 | layer_size: [256] 26 | activation: relu 27 | - layer_type: linear 28 | layer_size: [256] 29 | activation: relu 30 | buffer_type: REPLAY_QUE 31 | batch_size: 64 32 | max_buffer_size: 100000 33 | epsilon_decay: 500 34 | epsilon_end: 0.01 35 | epsilon_start: 0.95 36 | gamma: 0.99 37 | lr: 0.0001 38 | target_update: 4 39 | env_cfg: 40 | id: Acrobot-v1 41 | render_mode: null 42 | wrappers: 43 | - wrapper_name: MultiHeadObsWrapper 44 | - wrapper_name: MultiHeadActionWrapper -------------------------------------------------------------------------------- /presets/ClassControl/Acrobot-v1/Acrobot-v1_NoisyDQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: NoisyDQN 4 | env_name: gym 5 | interactor_device: cuda 6 | learner_device: cuda 7 | mode: train 8 | is_learner_async: false 9 | collect_traj: false 10 | n_interactors: 1 11 | load_checkpoint: false 12 | load_path: Train_CartPole-v1_NoisyDQN_20231225-000846 13 | load_model_step: best 14 | max_episode: -1 15 | max_step: 200 16 | seed: 1 17 | online_eval: true 18 | online_eval_episode: 10 19 | model_save_fre: 500 20 | policy_summary_fre: 100 21 | 22 | algo_cfg: 23 | merge_layers: 24 | - 
layer_type: noisy_linear 25 | layer_size: [256] 26 | activation: relu 27 | std_init: 0.4 28 | - layer_type: noisy_linear 29 | layer_size: [256] 30 | activation: relu 31 | std_init: 0.4 32 | buffer_type: REPLAY_QUE 33 | batch_size: 64 34 | max_buffer_size: 100000 35 | epsilon_decay: 500 36 | epsilon_end: 0.01 37 | epsilon_start: 0.95 38 | gamma: 0.99 39 | lr: 0.0001 40 | target_update: 4 41 | env_cfg: 42 | id: Acrobot-v1 43 | render_mode: null 44 | wrappers: 45 | - wrapper_name: MultiHeadObsWrapper 46 | - wrapper_name: MultiHeadActionWrapper -------------------------------------------------------------------------------- /presets/ClassControl/Acrobot-v1/Acrobot-v1_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: PPO 4 | env_name: gym 5 | device: cpu 6 | mode: train 7 | exps_trucation_size: 200 8 | is_learner_async: false 9 | load_checkpoint: false 10 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir 11 | load_model_step: best 12 | n_interactors: 15 13 | max_episode: -1 14 | max_step: -1 15 | seed: 1 16 | online_eval: true 17 | online_eval_episode: 15 18 | model_save_fre: 10 19 | policy_summary_fre: 10 20 | interact_summary_fre: 100 21 | algo_cfg: 22 | independ_actor: false 23 | return_form: gae 24 | actor_branch_layers: 25 | - name: feature_1 26 | layers: 27 | - layer_type: linear 28 | layer_size: [256] 29 | activation: relu 30 | - layer_type: linear 31 | layer_size: [256] 32 | activation: relu 33 | critic_branch_layers: 34 | - name: feature_1 35 | layers: 36 | - layer_type: linear 37 | layer_size: [256] 38 | activation: relu 39 | - layer_type: linear 40 | layer_size: [256] 41 | activation: relu 42 | branch_layers: 43 | - name: state 44 | layers: 45 | - layer_type: linear 46 | layer_size: [256] 47 | activation: relu 48 | - layer_type: linear 49 | layer_size: [256] 50 | activation: relu 51 | buffer_type: ONPOLICY_QUE 52 | lr: 0.0005 53 | actor_lr: 0.0003 54 | critic_lr: 0.001 55 | entropy_coef: 0.001 56 | critic_loss_coef: 0.001 57 | eps_clip: 0.1 58 | gamma: 0.99 59 | gae_lambda: 0.94 60 | k_epochs: 4 61 | batch_size: 3000 62 | sgd_batch_size: 300 63 | # min_policy: 0.001 64 | env_cfg: 65 | id: Acrobot-v1 66 | render_mode: null 67 | wrappers: 68 | - wrapper_name: MultiHeadObsWrapper 69 | - wrapper_name: MultiHeadActionWrapper 70 | 71 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_A3C.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.6 3 | algo_name: A3C 4 | env_name: gym 5 | device: cpu 6 | mode: train 7 | exps_trucation_size: 20 8 | is_learner_async: false 9 | load_checkpoint: false 10 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir 11 | load_model_step: best 12 | n_interactors: 10 13 | max_episode: -1 14 | max_step: 200 15 | seed: 1 16 | online_eval: true 17 | online_eval_episode: 15 18 | model_save_fre: 10 19 | policy_summary_fre: 10 20 | interact_summary_fre: 100 21 | algo_cfg: 22 | independ_actor: false 23 | return_form: td 24 | actor_branch_layers: 25 | - name: feature_1 26 | layers: 27 | - layer_type: linear 28 | layer_size: [256] 29 | activation: relu 30 | - layer_type: linear 31 | layer_size: [256] 32 | activation: relu 33 | critic_branch_layers: 34 | - name: feature_1 35 | layers: 36 | - layer_type: linear 37 | 
layer_size: [256] 38 | activation: relu 39 | - layer_type: linear 40 | layer_size: [256] 41 | activation: relu 42 | branch_layers: 43 | - name: state 44 | layers: 45 | - layer_type: linear 46 | layer_size: [256] 47 | activation: relu 48 | - layer_type: linear 49 | layer_size: [256] 50 | activation: relu 51 | buffer_type: ONPOLICY_QUE 52 | lr: 0.0003 53 | actor_lr: 0.0003 54 | critic_lr: 0.001 55 | entropy_coef: 0.001 56 | critic_loss_coef: 0.5 57 | gamma: 0.95 58 | gae_lambda: 0.95 59 | batch_size: 200 60 | # min_policy: 0.001 61 | env_cfg: 62 | id: CartPole-v1 63 | render_mode: null 64 | wrappers: 65 | - wrapper_name: MultiHeadObsWrapper 66 | - wrapper_name: MultiHeadActionWrapper 67 | 68 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_BC_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: BC 3 | device: cuda 4 | env_name: CartPole-v1 5 | eval_eps: 10 6 | eval_per_episode: 1 7 | load_checkpoint: True 8 | load_path: Train_CartPole-v1_BC_20230319-190431 9 | max_steps: 200 10 | mode: test 11 | render: false 12 | save_fig: true 13 | seed: 1 14 | show_fig: false 15 | test_eps: 10 16 | train_eps: 1 17 | wrapper: null 18 | algo_cfg: 19 | actor_hidden_dim: 256 20 | lr: 0.0003 21 | critic_hidden_dim: 256 22 | batch_size: 256 23 | train_iterations: 500 24 | expert_path: tasks/Collect_CartPole-v1_PPO_20230319-170351/traj/traj.pkl # path to expert trajectory data 25 | 26 | 27 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_BC_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: BC 3 | device: cuda 4 | env_name: CartPole-v1 5 | eval_eps: 10 6 | eval_per_episode: 1 7 | load_checkpoint: False 8 | load_path: Train_CartPole-v1_BC_20230319-114100 9 | max_steps: 200 10 | mode: train 11 | new_step_api: true 12 | render: false 13 | save_fig: true 14 | seed: 1 15 | show_fig: false 16 | test_eps: 10 17 | train_eps: 1 18 | wrapper: null 19 | algo_cfg: 20 | actor_hidden_dim: 256 21 | lr: 0.0003 22 | critic_hidden_dim: 256 23 | batch_size: 128 24 | train_iterations: 500 25 | expert_path: tasks/Collect_CartPole-v1_PPO_20230319-170351/traj/traj.pkl # path to expert trajectory data 26 | 27 | 28 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_C51_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: C51 3 | device: cuda 4 | env_name: CartPole-v1 5 | mode: test 6 | load_checkpoint: true 7 | load_path: Train_CartPole-v1_C51_20230114-222523 # model path under tasks folder 8 | max_steps: 200 9 | save_fig: true 10 | seed: 0 11 | show_fig: false 12 | test_eps: 10 13 | train_eps: 100 14 | Vmin: 0 # support of C51 15 | Vmax: 200 # support of C51 16 | n_atoms: 51 # support of C51 17 | algo_cfg: 18 | batch_size: 64 19 | max_buffer_size: 100000 20 | epsilon_decay: 500 21 | epsilon_end: 0.01 22 | epsilon_start: 0.95 23 | gamma: 0.95 24 | lr: 0.0001 25 | target_update: 4 26 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_C51_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: C51 3 | device: cuda 4 | env_name: CartPole-v1 5 | new_step_api: True 6 | mode: train 7 |
load_checkpoint: false 8 | load_path: Train_CartPole-v1_C51_20221026-054757 9 | max_steps: 200 10 | save_fig: true 11 | seed: 1 12 | show_fig: false 13 | test_eps: 10 14 | train_eps: 100 15 | Vmin: 0 # support of C51 16 | Vmax: 200 # support of C51 17 | n_atoms: 51 # support of C51 18 | algo_cfg: 19 | value_layers: 20 | - layer_type: linear 21 | layer_dim: ['n_states',256] 22 | activation: relu 23 | - layer_type: linear 24 | layer_dim: [256,256] 25 | activation: relu 26 | - layer_type: linear 27 | layer_dim: [256,'n_actions'] 28 | activation: none 29 | batch_size: 64 30 | max_buffer_size: 100000 31 | epsilon_decay: 500 32 | epsilon_end: 0.01 33 | epsilon_start: 0.95 34 | gamma: 0.95 35 | lr: 0.0001 36 | target_update: 4 37 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_CQL_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: CQL 3 | device: cpu 4 | env_name: CartPole-v1 5 | eval_eps: 10 6 | eval_per_episode: 5 7 | load_checkpoint: true 8 | load_path: Train_CartPole-v1_CQL_20230408-183652 9 | max_steps: 200 10 | mode: test 11 | new_step_api: true 12 | render: false 13 | save_fig: true 14 | seed: 1 15 | show_fig: true 16 | test_eps: 10 17 | train_eps: 300 18 | wrapper: null 19 | algo_cfg: 20 | batch_size: 64 21 | max_buffer_size: 100000 22 | epsilon_decay: 500 23 | epsilon_end: 0.01 24 | epsilon_start: 0.95 25 | gamma: 0.99 26 | lr: 0.001 27 | target_update: 4 28 | tau: 0.001 29 | value_layers: 30 | - activation: relu 31 | layer_dim: 32 | - n_states 33 | - 256 34 | layer_type: linear 35 | - activation: relu 36 | layer_dim: 37 | - 256 38 | - 256 39 | layer_type: linear 40 | - activation: none 41 | layer_dim: 42 | - 256 43 | - n_actions 44 | layer_type: linear 45 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_CQL_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: CQL 3 | device: cpu 4 | env_name: CartPole-v1 5 | eval_eps: 10 6 | eval_per_episode: 5 7 | load_checkpoint: false 8 | load_path: tasks 9 | max_steps: 200 10 | mode: train 11 | new_step_api: true 12 | render: false 13 | save_fig: true 14 | seed: 1 15 | show_fig: true 16 | test_eps: 10 17 | train_eps: 300 18 | wrapper: null 19 | algo_cfg: 20 | batch_size: 64 21 | max_buffer_size: 100000 22 | epsilon_decay: 500 23 | epsilon_end: 0.01 24 | epsilon_start: 0.95 25 | gamma: 0.99 26 | lr: 0.001 27 | target_update: 4 28 | tau: 0.001 29 | value_layers: 30 | - activation: relu 31 | layer_dim: 32 | - n_states 33 | - 256 34 | layer_type: linear 35 | - activation: relu 36 | layer_dim: 37 | - 256 38 | - 256 39 | layer_type: linear 40 | - activation: none 41 | layer_dim: 42 | - 256 43 | - n_actions 44 | layer_type: linear 45 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_CategoricalDQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.8 3 | algo_name: CategoricalDQN 4 | env_name: gym 5 | interactor_device: cpu 6 | learner_device: cpu 7 | mode: train 8 | is_learner_async: false 9 | collect_traj: false 10 | n_interactors: 1 11 | load_checkpoint: false 12 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 13 | load_model_step: best 14 | reward_threshold: 200 15 | max_episode: -1 16 | 
max_step: 200 17 | seed: 1 18 | online_eval: true 19 | online_eval_episode: 10 20 | model_save_fre: 500 21 | policy_summary_fre: 100 22 | 23 | algo_cfg: 24 | enable_soft_update: True 25 | distributional: True 26 | learn_frequency: 1 27 | # branch_layers: 28 | # - name: feature_1 29 | # layers: 30 | # - layer_type: linear 31 | # layer_size: [256] 32 | # activation: relu 33 | # - layer_type: linear 34 | # layer_size: [256] 35 | # activation: relu 36 | # - name: feature_2 37 | # layers: 38 | # - layer_type: linear 39 | # layer_size: [256] 40 | # activation: relu 41 | # - layer_type: linear 42 | # layer_size: [256] 43 | # activation: relu 44 | merge_layers: 45 | - layer_type: linear 46 | layer_size: [256] 47 | activation: relu 48 | - layer_type: linear 49 | layer_size: [256] 50 | activation: relu 51 | batch_size: 128 52 | buffer_type: REPLAY_QUE 53 | max_buffer_size: 100000 54 | epsilon_decay: 500 55 | epsilon_end: 0.01 56 | epsilon_start: 0.95 57 | gamma: 0.95 58 | lr: 0.0001 59 | target_update: 4 60 | env_cfg: 61 | id: CartPole-v1 62 | render_mode: null 63 | wrappers: 64 | - wrapper_name: MultiHeadObsWrapper 65 | - wrapper_name: MultiHeadActionWrapper -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_DQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: DQN 4 | env_name: gym 5 | interactor_device: cpu 6 | learner_device: cpu 7 | mode: train 8 | is_learner_async: false 9 | collect_traj: false 10 | n_interactors: 1 11 | load_checkpoint: false 12 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 13 | load_model_step: best 14 | reward_threshold: 200 15 | max_episode: -1 16 | max_step: 200 17 | seed: 1 18 | online_eval: true 19 | online_eval_episode: 10 20 | model_save_fre: 500 21 | policy_summary_fre: 100 22 | 23 | algo_cfg: 24 | learn_frequency: 1 25 | # branch_layers: 26 | # - name: feature_1 27 | # layers: 28 | # - layer_type: linear 29 | # layer_size: [256] 30 | # activation: relu 31 | # - layer_type: linear 32 | # layer_size: [256] 33 | # activation: relu 34 | # - name: feature_2 35 | # layers: 36 | # - layer_type: linear 37 | # layer_size: [256] 38 | # activation: relu 39 | # - layer_type: linear 40 | # layer_size: [256] 41 | # activation: relu 42 | merge_layers: 43 | - layer_type: linear 44 | layer_size: [256] 45 | activation: relu 46 | - layer_type: linear 47 | layer_size: [256] 48 | activation: relu 49 | batch_size: 128 50 | buffer_type: REPLAY_QUE 51 | max_buffer_size: 100000 52 | epsilon_decay: 500 53 | epsilon_end: 0.01 54 | epsilon_start: 0.95 55 | gamma: 0.95 56 | lr: 0.0001 57 | target_update: 4 58 | env_cfg: 59 | id: CartPole-v1 60 | render_mode: null 61 | wrappers: 62 | - wrapper_name: MultiHeadObsWrapper 63 | - wrapper_name: MultiHeadActionWrapper -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_DQN_1.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: DQN 4 | env_name: gym 5 | device: cpu 6 | mode: train 7 | is_learner_async: true 8 | collect_traj: false 9 | n_interactors: 5 10 | load_checkpoint: false 11 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 12 | load_model_step: best 13 | max_episode: -1 14 | max_step: 200 15 | seed: 1 16 | online_eval: true 17 | online_eval_episode: 10 18 | model_save_fre: 500 19 | 
interact_summary_fre: 10 20 | policy_summary_fre: 100 21 | 22 | algo_cfg: 23 | exps_trucation_size: 20 24 | learn_frequency: 200 25 | # branch_layers: 26 | # - name: feature_1 27 | # layers: 28 | # - layer_type: linear 29 | # layer_size: [256] 30 | # activation: relu 31 | # - layer_type: linear 32 | # layer_size: [256] 33 | # activation: relu 34 | # - name: feature_2 35 | # layers: 36 | # - layer_type: linear 37 | # layer_size: [256] 38 | # activation: relu 39 | # - layer_type: linear 40 | # layer_size: [256] 41 | # activation: relu 42 | merge_layers: 43 | - layer_type: linear 44 | layer_size: [256] 45 | activation: relu 46 | - layer_type: linear 47 | layer_size: [256] 48 | activation: relu 49 | batch_size: 128 50 | buffer_type: REPLAY_QUE 51 | max_buffer_size: 100000 52 | epsilon_decay: 500 53 | epsilon_end: 0.01 54 | epsilon_start: 0.95 55 | gamma: 0.95 56 | lr: 0.0001 57 | target_update: 4 58 | env_cfg: 59 | id: CartPole-v1 60 | render_mode: null 61 | wrappers: 62 | - wrapper_name: MultiHeadObsWrapper 63 | - wrapper_name: MultiHeadActionWrapper -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_DRQN_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DRQN 3 | device: cuda 4 | env_name: CartPole-v1 5 | mode: test 6 | load_checkpoint: true 7 | load_path: Train_CartPole-v1_DRQN_20230204-223146 # model path under tasks folder 8 | max_steps: 200 9 | save_fig: true 10 | seed: 0 11 | show_fig: false 12 | test_eps: 10 13 | train_eps: 100 14 | algo_cfg: 15 | batch_size: 64 16 | max_buffer_size: 100000 17 | epsilon_decay: 0.995 18 | epsilon_end: 0.001 19 | epsilon_start: 0.1 20 | gamma: 0.99 21 | lr: 0.001 22 | target_update: 4 23 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_DRQN_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DRQN 3 | device: cuda 4 | env_name: CartPole-v1 5 | mode: train 6 | load_checkpoint: false 7 | load_path: Train_CartPole-v1_ 8 | max_steps: 200 9 | save_fig: true 10 | seed: 1 11 | show_fig: false 12 | test_eps: 10 13 | train_eps: 200 ### 14 | algo_cfg: 15 | value_layers: 16 | - layer_type: linear 17 | layer_dim: ['n_states',64] 18 | activation: relu 19 | - layer_type: linear 20 | layer_dim: [64,64] 21 | activation: relu 22 | - layer_type: linear 23 | layer_dim: [64,'n_actions'] 24 | activation: none 25 | batch_size: 8 26 | min_epi_num: 16 27 | max_epi_len: 100 28 | lookup_step: 10 29 | max_epi_num: 100 30 | 31 | max_buffer_size: 100000 32 | epsilon_decay: 0.995 33 | epsilon_end: 0.001 34 | epsilon_start: 0.1 35 | gamma: 0.99 36 | lr: 0.001 37 | target_update: 4 38 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_DoubleDQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: DoubleDQN 4 | env_name: gym 5 | interactor_device: cpu 6 | learner_device: cpu 7 | seed: 1 8 | mode: train 9 | max_episode: -1 10 | max_step: 200 11 | load_checkpoint: false 12 | load_path: Train_CartPole-v1_DQN_20221026-054757 13 | load_model_step: best 14 | online_eval: true 15 | online_eval_episode: 10 16 | model_save_fre: 500 17 | policy_summary_fre: 100 18 | 19 | algo_cfg: 20 | learn_frequency: 1 21 | # branch_layers: 22 | # - 
name: feature_1 23 | # layers: 24 | # - layer_type: linear 25 | # layer_size: [256] 26 | # activation: relu 27 | # - layer_type: linear 28 | # layer_size: [256] 29 | # activation: relu 30 | # - name: feature_2 31 | # layers: 32 | # - layer_type: linear 33 | # layer_size: [256] 34 | # activation: relu 35 | # - layer_type: linear 36 | # layer_size: [256] 37 | # activation: relu 38 | merge_layers: 39 | - layer_type: linear 40 | layer_size: [256] 41 | activation: relu 42 | - layer_type: linear 43 | layer_size: [256] 44 | activation: relu 45 | batch_size: 128 46 | buffer_type: REPLAY_QUE 47 | max_buffer_size: 100000 48 | epsilon_decay: 500 49 | epsilon_end: 0.01 50 | epsilon_start: 0.95 51 | gamma: 0.99 52 | lr: 0.0001 53 | target_update: 4 54 | env_cfg: 55 | id: CartPole-v1 56 | render_mode: null 57 | wrappers: 58 | - wrapper_name: MultiHeadObsWrapper 59 | - wrapper_name: MultiHeadActionWrapper -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_DuelingDQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: DuelingDQN 4 | env_name: gym 5 | interactor_device: cpu 6 | learner_device: cpu 7 | seed: 1 8 | mode: train 9 | max_episode: -1 10 | max_step: 200 11 | load_checkpoint: false 12 | load_path: Train_CartPole-v1_DQN_20221026-054757 13 | load_model_step: best 14 | online_eval: true 15 | online_eval_episode: 10 16 | model_save_fre: 500 17 | policy_summary_fre: 100 18 | algo_cfg: 19 | dueling: true 20 | merge_layers: 21 | - layer_type: linear 22 | layer_size: [256] 23 | activation: relu 24 | - layer_type: linear 25 | layer_size: [256] 26 | activation: relu 27 | batch_size: 128 28 | buffer_type: REPLAY_QUE 29 | max_buffer_size: 100000 30 | epsilon_decay: 500 31 | epsilon_end: 0.01 32 | epsilon_start: 0.95 33 | gamma: 0.95 34 | lr: 0.0001 35 | target_update: 4 36 | env_cfg: 37 | id: CartPole-v1 38 | render_mode: null 39 | wrappers: 40 | - wrapper_name: MultiHeadObsWrapper 41 | - wrapper_name: MultiHeadActionWrapper -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_GAIL_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: GAIL 3 | batch_size: 2048 4 | device: cuda 5 | discount: 0.99 6 | env_name: CartPole-v1 7 | eval_eps: 10 8 | eval_per_episode: 5 9 | load_checkpoint: true 10 | load_path: Train_CartPole-v1_GAIL_20221207-160945 11 | max_steps: 200 12 | mode: test 13 | hidden_dim: 32 14 | lr: 0.001 15 | imitation_replay_size: 4 16 | imitation_epochs: 5 17 | imitation_batch_size: 128 18 | new_step_api: true 19 | ppo_epochs: 5 20 | render: false 21 | r1_reg_coeff: 1 22 | save_fig: true 23 | seed: 1 24 | show_fig: false 25 | test_eps: 10 26 | train_eps: 400 27 | wrapper: null 28 | algo_cfg: 29 | 30 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_GAIL_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: GAIL 3 | adversarial_batch_size: 128 4 | adversarial_epochs: 5 5 | batch_size: 2048 6 | device: cuda 7 | discount: 0.99 8 | env_name: CartPole-v1 9 | eval_eps: 10 10 | eval_per_episode: 5 11 | load_checkpoint: false 12 | load_path: Collect_CartPole-v1_PPO_20221206-173222 13 | max_steps: 200 14 | mode: train 15 | new_step_api: true 16 | 
num_workers: 8 17 | hidden_dim: 32 18 | lr: 0.001 19 | imitation_replay_size: 4 20 | r1_reg_coeff: 1 21 | render: false 22 | ppo_epochs: 5 23 | save_fig: true 24 | seed: 1 25 | show_fig: false 26 | test_eps: 10 27 | train_eps: 1500 28 | wrapper: null 29 | algo_cfg: 30 | 31 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_NoisyDQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.4.1 3 | algo_name: NoisyDQN 4 | env_name: gym 5 | interactor_device: cpu 6 | learner_device: cpu 7 | seed: 1 8 | mode: train 9 | collect_traj: false 10 | max_episode: -1 11 | max_step: 200 12 | load_checkpoint: false 13 | load_path: Train_CartPole-v1_NoisyDQN_20231225-000846 14 | load_model_step: best 15 | online_eval: true 16 | online_eval_episode: 10 17 | model_save_fre: 500 18 | policy_summary_fre: 100 19 | algo_cfg: 20 | merge_layers: 21 | - layer_type: noisy_linear 22 | layer_size: [256] 23 | activation: relu 24 | std_init: 0.4 25 | - layer_type: noisy_linear 26 | layer_size: [256] 27 | activation: relu 28 | std_init: 0.4 29 | batch_size: 128 30 | buffer_type: REPLAY_QUE 31 | max_buffer_size: 100000 32 | epsilon_decay: 500 33 | epsilon_end: 0.01 34 | epsilon_start: 0.95 35 | gamma: 0.99 36 | lr: 0.0001 37 | target_update: 4 38 | env_cfg: 39 | id: CartPole-v1 40 | render_mode: null 41 | wrappers: 42 | - wrapper_name: MultiHeadObsWrapper 43 | - wrapper_name: MultiHeadActionWrapper -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_PER_DQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: PER_DQN # algo name 3 | env_name: gym # env name, differ from env_id in env_cfgs 4 | device: cuda # device, cpu or cuda 5 | mode: test # run mode: train, test 6 | collect_traj: false # if collect trajectories or not 7 | mp_backend: ray # multi-processing mode: single(default), ray 8 | n_workers: 2 # number of workers if using multi-processing, default 1 9 | load_checkpoint: false # if load checkpoint or not 10 | load_path: Train_single_CartPole-v1_PER_DQN_20230518-232215 # if load checkpoint, then config path in 'tasks' dir 11 | load_model_step: best # load model step 12 | max_episode: 100 # max episodes, set -1 to keep running 13 | max_step: 200 # max steps per episode 14 | seed: 1 # random seed, set 0 not to use seed 15 | online_eval: true # if online eval or not 16 | online_eval_episode: 10 # online eval episodes 17 | model_save_fre: 500 # update step frequency of saving model 18 | 19 | algo_cfg: 20 | value_layers: 21 | - layer_type: linear 22 | layer_size: [256] 23 | activation: relu 24 | - layer_type: linear 25 | layer_size: [256] 26 | activation: relu 27 | batch_size: 64 28 | buffer_type: PER_QUE 29 | max_buffer_size: 100000 30 | per_alpha: 0.6 31 | per_beta: 0.4 32 | per_beta_annealing: 0.001 33 | per_epsilon: 0.01 34 | epsilon_decay: 1000 35 | epsilon_end: 0.01 36 | epsilon_start: 0.95 37 | gamma: 0.99 38 | lr: 0.0001 39 | target_update: 4 40 | env_cfg: 41 | id: CartPole-v1 42 | render_mode: null -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: PPO 4 | env_name: gym 5 | device: cpu 6 | mode: train 7 | 
exps_trucation_size: 200 8 | is_learner_async: false 9 | load_checkpoint: false 10 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir 11 | load_model_step: best 12 | n_interactors: 10 13 | max_episode: -1 14 | max_step: 200 15 | seed: 1 16 | online_eval: true 17 | online_eval_episode: 15 18 | model_save_fre: 10 19 | policy_summary_fre: 10 20 | interact_summary_fre: 100 21 | algo_cfg: 22 | independ_actor: false 23 | return_form: td 24 | actor_branch_layers: 25 | - name: feature_1 26 | layers: 27 | - layer_type: linear 28 | layer_size: [256] 29 | activation: relu 30 | - layer_type: linear 31 | layer_size: [256] 32 | activation: relu 33 | critic_branch_layers: 34 | - name: feature_1 35 | layers: 36 | - layer_type: linear 37 | layer_size: [256] 38 | activation: relu 39 | - layer_type: linear 40 | layer_size: [256] 41 | activation: relu 42 | branch_layers: 43 | - name: state 44 | layers: 45 | - layer_type: linear 46 | layer_size: [256] 47 | activation: relu 48 | - layer_type: linear 49 | layer_size: [256] 50 | activation: relu 51 | buffer_type: ONPOLICY_QUE 52 | lr: 0.0003 53 | actor_lr: 0.0003 54 | critic_lr: 0.001 55 | entropy_coef: 0.001 56 | critic_loss_coef: 0.001 57 | eps_clip: 0.1 58 | gamma: 0.95 59 | gae_lambda: 0.95 60 | k_epochs: 4 61 | batch_size: 2000 62 | sgd_batch_size: 200 63 | # min_policy: 0.001 64 | env_cfg: 65 | id: CartPole-v1 66 | render_mode: null 67 | wrappers: 68 | - wrapper_name: MultiHeadObsWrapper 69 | - wrapper_name: MultiHeadActionWrapper 70 | 71 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_PPO_off_policy.yaml: -------------------------------------------------------------------------------- 1 | 2 | general_cfg: 3 | joyrl_version: 0.6.5 4 | algo_name: PPO 5 | env_name: gym 6 | device: cpu 7 | mode: train 8 | exps_trucation_size: 200 9 | is_learner_async: true 10 | load_checkpoint: false 11 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir 12 | load_model_step: best 13 | n_interactors: 10 14 | max_episode: -1 15 | max_step: 200 16 | seed: 1 17 | online_eval: true 18 | online_eval_episode: 15 19 | model_save_fre: 10 20 | policy_summary_fre: 10 21 | interact_summary_fre: 100 22 | algo_cfg: 23 | independ_actor: false 24 | return_form: td 25 | actor_branch_layers: 26 | - name: feature_1 27 | layers: 28 | - layer_type: linear 29 | layer_size: [256] 30 | activation: relu 31 | - layer_type: linear 32 | layer_size: [256] 33 | activation: relu 34 | critic_branch_layers: 35 | - name: feature_1 36 | layers: 37 | - layer_type: linear 38 | layer_size: [256] 39 | activation: relu 40 | - layer_type: linear 41 | layer_size: [256] 42 | activation: relu 43 | branch_layers: 44 | - name: state 45 | layers: 46 | - layer_type: linear 47 | layer_size: [256] 48 | activation: relu 49 | - layer_type: linear 50 | layer_size: [256] 51 | activation: relu 52 | buffer_type: REPLAY_QUE 53 | max_buffer_size: 4000 54 | lr: 0.0003 55 | actor_lr: 0.0003 56 | critic_lr: 0.001 57 | entropy_coef: 0.001 58 | critic_loss_coef: 0.001 59 | eps_clip: 0.1 60 | gamma: 0.95 61 | gae_lambda: 0.95 62 | k_epochs: 4 63 | batch_size: 2000 64 | sgd_batch_size: 200 65 | # min_policy: 0.001 66 | env_cfg: 67 | id: CartPole-v1 68 | render_mode: null 69 | wrappers: 70 | - wrapper_name: MultiHeadObsWrapper 71 | - wrapper_name: MultiHeadActionWrapper 72 | 73 | 
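The preset above can be launched programmatically as well as through the command-line entry in joyrl/scripts/scripts.py. A minimal sketch, assuming joyrl is installed and the command is run from the repository root (the script name run_cartpole_ppo.py is illustrative and not part of the repo):

    # run_cartpole_ppo.py -- launch a preset via the same joyrl.run() call used by joyrl/scripts/scripts.py
    import joyrl

    if __name__ == "__main__":
        # yaml_path points at one of the presets shipped in this repository
        joyrl.run(yaml_path="presets/ClassControl/CartPole-v1/CartPole-v1_PPO_off_policy.yaml")

Whether the run trains or only evaluates is controlled by the mode field in the preset's general_cfg, as in the configs above.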
-------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_REINFORCE_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: REINFORCE 3 | device: cpu 4 | env_name: CartPole-v1 5 | mode: test 6 | load_checkpoint: true 7 | load_path: Train_CartPole-v1_REINFORCE_20221203-143307 8 | max_steps: 200 9 | save_fig: true 10 | seed: 1 11 | show_fig: false 12 | test_eps: 10 13 | train_eps: 200 14 | algo_cfg: -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_REINFORCE_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: REINFORCE 3 | device: cpu 4 | env_name: CartPole-v1 5 | eval_eps: 10 6 | eval_per_episode: 5 7 | load_checkpoint: false 8 | load_path: Train_CartPole-v1_DQN_20221026-054757 9 | max_steps: 200 10 | mode: train 11 | new_step_api: true 12 | render: false 13 | save_fig: true 14 | seed: 1 15 | show_fig: false 16 | test_eps: 10 17 | train_eps: 200 18 | wrapper: null 19 | algo_cfg: 20 | gamma: 0.99 21 | hidden_dim: 36 22 | lr: 0.01 23 | update_freq: 200 24 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_RainbowDQN_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: RainbowDQN 3 | device: cuda 4 | env_name: CartPole-v1 5 | mode: test 6 | load_checkpoint: true 7 | load_path: Train_CartPole-v1_RainbowDQN_20230114-222012 # model path under tasks folder 8 | max_steps: 200 9 | save_fig: true 10 | seed: 0 11 | show_fig: false 12 | test_eps: 10 13 | train_eps: 100 14 | Vmin: 0 # support of C51 15 | Vmax: 200 # support of C51 16 | n_atoms: 51 # support of C51 17 | algo_cfg: 18 | batch_size: 64 19 | max_buffer_size: 100000 20 | epsilon_decay: 500 21 | epsilon_end: 0.01 22 | epsilon_start: 0.95 23 | gamma: 0.95 24 | lr: 0.0001 25 | target_update: 4 26 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_RainbowDQN_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: RainbowDQN 3 | device: cpu 4 | env_name: gym 5 | mode: train 6 | load_checkpoint: false 7 | load_path: Train_CartPole-v1_ 8 | max_steps: 200 9 | save_fig: true 10 | seed: 1 11 | show_fig: false 12 | test_eps: 10 13 | train_eps: 100 14 | Vmin: 0 # support of C51 15 | Vmax: 200 # support of C51 16 | n_atoms: 51 # support of C51 17 | algo_cfg: 18 | value_layers: 19 | - layer_type: linear 20 | layer_dim: ['n_states',256] 21 | activation: relu 22 | - layer_type: linear 23 | layer_dim: [256,256] 24 | activation: relu 25 | - layer_type: linear 26 | layer_dim: [256,'n_actions'] 27 | activation: none 28 | batch_size: 64 29 | max_buffer_size: 100000 30 | epsilon_decay: 500 31 | epsilon_end: 0.01 32 | epsilon_start: 0.95 33 | gamma: 0.95 34 | lr: 0.0001 35 | target_update: 4 36 | env_cfg: 37 | id: CartPole-v1 38 | new_step_api: true 39 | render_mode: null 40 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_RainbowDQN_Train_mp.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: RainbowDQN 3 | device: cpu 4 | env_name: gym 5 | mode: 
train 6 | mp_backend: mp # multi-processing backend: ray or mp (multiprocessing), default mp 7 | n_workers: 2 # number of workers for parallel training 8 | load_checkpoint: false 9 | load_path: Train_CartPole-v1_ 10 | max_steps: 200 11 | save_fig: true 12 | seed: 1 13 | show_fig: false 14 | test_eps: 10 15 | train_eps: 100 16 | Vmin: 0 # support of C51 17 | Vmax: 200 # support of C51 18 | n_atoms: 51 # support of C51 19 | algo_cfg: 20 | value_layers: 21 | - layer_type: linear 22 | layer_dim: ['n_states',256] 23 | activation: relu 24 | - layer_type: linear 25 | layer_dim: [256,256] 26 | activation: relu 27 | - layer_type: linear 28 | layer_dim: [256,'n_actions'] 29 | activation: none 30 | batch_size: 64 31 | max_buffer_size: 100000 32 | epsilon_decay: 500 33 | epsilon_end: 0.01 34 | epsilon_start: 0.95 35 | gamma: 0.95 36 | lr: 0.0001 37 | target_update: 4 38 | env_cfg: 39 | id: CartPole-v1 40 | new_step_api: true 41 | render_mode: null 42 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_SAC_D_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: SAC_D 3 | device: cuda 4 | env_name: CartPole-v1 5 | mode: test 6 | load_checkpoint: true 7 | load_path: Train_CartPole-v1_SAC_D_20230305-112849 # model path under tasks folder 8 | max_steps: 200 9 | save_fig: true 10 | seed: 0 11 | show_fig: false 12 | test_eps: 10 13 | train_eps: 200 14 | algo_cfg: 15 | batch_size: 64 16 | max_buffer_size: 100000 17 | alpha: 0.2 18 | epsilon_decay: 500 19 | epsilon_end: 0.01 20 | epsilon_start: 0.95 21 | gamma: 0.95 22 | lr: 0.0001 23 | target_update: 1 24 | -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_SAC_D_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: SAC_D 3 | device: cuda 4 | env_name: CartPole-v1 5 | new_step_api: True 6 | wrapper: null 7 | mode: train 8 | load_checkpoint: false 9 | load_path: Train_CartPole-v1_DQN_20221026-054757 10 | max_steps: 200 11 | save_fig: true 12 | seed: 0 13 | show_fig: false 14 | test_eps: 10 15 | train_eps: 200 16 | algo_cfg: 17 | alpha: 0.2 18 | epsilon_decay: 500 19 | epsilon_end: 0.01 20 | epsilon_start: 0.95 21 | gamma: 0.95 22 | lr: 0.0001 23 | target_update: 1 -------------------------------------------------------------------------------- /presets/ClassControl/CartPole-v1/CartPole-v1_SoftQ.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: SoftQ 4 | env_name: gym 5 | interactor_device: cpu 6 | learner_device: cpu 7 | mode: train 8 | is_learner_async: false 9 | collect_traj: false 10 | n_interactors: 1 11 | load_checkpoint: false 12 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 13 | load_model_step: best 14 | reward_threshold: 200 15 | max_episode: -1 16 | max_step: 200 17 | seed: 1 18 | online_eval: true 19 | online_eval_episode: 10 20 | model_save_fre: 500 21 | policy_summary_fre: 100 22 | 23 | algo_cfg: 24 | alpha: 4 25 | learn_frequency: 1 26 | # branch_layers: 27 | # - name: feature_1 28 | # layers: 29 | # - layer_type: linear 30 | # layer_size: [256] 31 | # activation: relu 32 | # - layer_type: linear 33 | # layer_size: [256] 34 | # activation: relu 35 | # - name: feature_2 36 | # layers: 37 | # - layer_type: linear 38 | # layer_size: [256] 39 | # activation: relu 40 | # - layer_type:
linear 41 | # layer_size: [256] 42 | # activation: relu 43 | merge_layers: 44 | - layer_type: linear 45 | layer_size: [256] 46 | activation: relu 47 | - layer_type: linear 48 | layer_size: [256] 49 | activation: relu 50 | batch_size: 128 51 | buffer_type: REPLAY_QUE 52 | max_buffer_size: 100000 53 | epsilon_decay: 500 54 | epsilon_end: 0.01 55 | epsilon_start: 0.95 56 | gamma: 0.95 57 | lr: 0.0001 58 | target_update: 4 59 | env_cfg: 60 | id: CartPole-v1 61 | render_mode: null 62 | wrappers: 63 | - wrapper_name: MultiHeadObsWrapper 64 | - wrapper_name: MultiHeadActionWrapper -------------------------------------------------------------------------------- /presets/ClassControl/MountainCar-v0/MountainCar-v0_DQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.2.5 3 | algo_name: DQN 4 | env_name: gym 5 | interactor_device: cpu 6 | learner_device: cpu 7 | mode: train 8 | is_learner_async: false 9 | collect_traj: false 10 | n_interactors: 1 11 | load_checkpoint: false 12 | load_path: "Train_MountainCar-v0_DQN" 13 | load_model_step: "best" 14 | max_episode: -1 15 | max_step: 200 16 | seed: 1 17 | online_eval: true 18 | online_eval_episode: 10 19 | algo_cfg: 20 | learn_frequency: 1 21 | merge_layers: 22 | - layer_type: linear 23 | layer_size: [256] 24 | activation: relu 25 | - layer_type: linear 26 | layer_size: [256] 27 | activation: relu 28 | batch_size: 64 29 | max_buffer_size: 10000 30 | epsilon_decay: 1500 31 | epsilon_end: 0.01 32 | epsilon_start: 0.98 33 | gamma: 0.98 34 | lr: 0.001 35 | target_update: 10 36 | env_cfg: 37 | id: MountainCar-v0 38 | render_mode: null 39 | wrappers: 40 | - wrapper_name: MultiHeadActionWrapper 41 | -------------------------------------------------------------------------------- /presets/ClassControl/MountainCar-v0/MountainCar-v0_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: PPO 4 | env_name: gym 5 | device: cpu 6 | interactor_device: cuda 7 | learner_device: cuda 8 | mode: train 9 | exps_trucation_size: 200 10 | is_learner_async: false 11 | load_checkpoint: false 12 | load_path: Train_MountainCar-v0_PPO_20240618-192707 13 | load_model_step: best 14 | n_interactors: 1 15 | max_episode: -1 16 | max_step: 200 17 | seed: 1 18 | online_eval: true 19 | online_eval_episode: 10 20 | reward_threshold: -110 21 | model_save_fre: 10 22 | policy_summary_fre: 5 23 | interact_summary_fre: 100 24 | algo_cfg: 25 | independ_actor: false 26 | return_form: td 27 | actor_branch_layers: 28 | - name: state 29 | layers: 30 | - layer_type: linear 31 | layer_size: [256] 32 | activation: relu 33 | critic_branch_layers: 34 | - name: state 35 | layers: 36 | - layer_type: linear 37 | layer_size: [256] 38 | activation: relu 39 | branch_layers: 40 | - name: state 41 | layers: 42 | - layer_type: linear 43 | layer_size: [256] 44 | activation: relu 45 | buffer_type: ONPOLICY_QUE 46 | eps_clip: 0.2 47 | entropy_coef: 0.002 48 | lr: 0.0003 49 | actor_lr: 0.003 50 | critic_lr: 0.01 51 | critic_loss_coef: 0.5 52 | gamma: 0.99 53 | gae_lambda: 0.95 54 | k_epochs: 4 55 | batch_size: 2000 56 | sgd_batch_size: 32 57 | env_cfg: 58 | id: MountainCar-v0 59 | render_mode: null 60 | wrappers: 61 | - wrapper_name: MultiHeadObsWrapper 62 | - wrapper_name: MultiHeadActionWrapper 63 | --------------------------------------------------------------------------------
/presets/ClassControl/MountainCarContinuous-v0/MountainCarContinuous-v0_PPO-test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5.1 3 | algo_name: PPO 4 | device: cpu 5 | env_name: gym 6 | interactor_device: cuda 7 | learner_device: cuda 8 | mode: test 9 | exps_trucation_size: 1024 10 | is_learner_async: false 11 | load_checkpoint: true 12 | load_path: Train_MountainCarContinuous-v0_PPO_20240715-161812 13 | load_model_step: best 14 | n_interactors: 20 15 | max_episode: -1 16 | max_step: -1 17 | seed: 1 18 | reward_threshold: 90 19 | online_eval: true 20 | online_eval_episode: 20 21 | model_save_fre: 10000 22 | policy_summary_fre: 5000 23 | interact_summary_fre: 5000 24 | algo_cfg: 25 | actor_branch_layers: 26 | - name: action 27 | layers: 28 | - layer_type: linear 29 | layer_size: [256] 30 | activation: tanh 31 | - layer_type: linear 32 | layer_size: [256] 33 | activation: tanh 34 | 35 | critic_branch_layers: 36 | - name: critic 37 | layers: 38 | - layer_type: linear 39 | layer_size: [256] 40 | activation: relu 41 | - layer_type: linear 42 | layer_size: [256] 43 | activation: relu 44 | buffer_type: REPLAY_QUE 45 | max_buffer_size: 100000 46 | action_type_list: continuous 47 | lr: 0.0003 48 | actor_lr: 0.003 49 | critic_lr: 0.005 50 | entropy_coef: 0.003 51 | critic_loss_coef: 0.5 52 | eps_clip: 0.2 53 | gamma: 0.99 54 | return_form: mc 55 | gae_lambda: 0.95 56 | k_epochs: 4 57 | batch_size: 64 58 | env_cfg: 59 | id: MountainCarContinuous-v0 60 | render_mode: human 61 | wrappers: 62 | - wrapper_name: MultiHeadObsWrapper 63 | -------------------------------------------------------------------------------- /presets/ClassControl/MountainCarContinuous-v0/MountainCarContinuous-v0_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5.1 3 | algo_name: PPO 4 | env_name: gym 5 | device: cpu 6 | interactor_device: cuda 7 | learner_device: cuda 8 | mode: train 9 | exps_trucation_size: 1024 10 | is_learner_async: false 11 | load_checkpoint: false 12 | load_path: Train_MountainCar-v0_PPO_20240618-192707 13 | load_model_step: best 14 | n_interactors: 1 15 | n_learners: 1 16 | max_episode: -1 17 | max_step: -1 18 | seed: 1 19 | reward_threshold: 90 20 | online_eval: true 21 | online_eval_episode: 20 22 | model_save_fre: 10000 23 | policy_summary_fre: 10000 24 | interact_summary_fre: 10000 25 | algo_cfg: 26 | actor_branch_layers: 27 | - name: action 28 | layers: 29 | - layer_type: linear 30 | layer_size: [256] 31 | activation: tanh 32 | - layer_type: linear 33 | layer_size: [256] 34 | activation: tanh 35 | 36 | critic_branch_layers: 37 | - name: critic 38 | layers: 39 | - layer_type: linear 40 | layer_size: [256] 41 | activation: relu 42 | - layer_type: linear 43 | layer_size: [256] 44 | activation: relu 45 | buffer_type: REPLAY_QUE 46 | max_buffer_size: 100000 47 | action_type_list: continuous 48 | lr: 0.0003 49 | actor_lr: 0.003 50 | critic_lr: 0.005 51 | entropy_coef: 0.003 52 | critic_loss_coef: 0.5 53 | eps_clip: 0.2 54 | gamma: 0.99 55 | return_form: mc 56 | gae_lambda: 0.95 57 | k_epochs: 4 58 | batch_size: 64 59 | action_std_bias: 0.8 60 | env_cfg: 61 | id: MountainCarContinuous-v0 62 | render_mode: null 63 | wrappers: 64 | - wrapper_name: MultiHeadObsWrapper 65 | -------------------------------------------------------------------------------- /presets/ClassControl/Pendulum-v1/Pendulum-v1_DDPG.yaml:
-------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.7.2 3 | algo_name: DDPG 4 | env_name: gym 5 | device: cpu 6 | mode: train 7 | load_checkpoint: false 8 | load_path: Train_ray_Pendulum-v1_DDPG_20230527-001715 9 | load_model_step: best 10 | max_episode: -1 11 | max_step: 200 12 | seed: 1 13 | online_eval: true 14 | online_eval_episode: 20 15 | model_save_fre: 500 16 | algo_cfg: 17 | action_space: 18 | type: [dpg] 19 | actor_branch_layers: 20 | - name: state 21 | layers: 22 | - layer_type: linear 23 | layer_size: [256] 24 | activation: relu 25 | - layer_type: linear 26 | layer_size: [256] 27 | activation: relu 28 | critic_branch_layers: 29 | - name: state 30 | layers: 31 | - layer_type: none 32 | - name: action 33 | layers: 34 | - layer_type: none 35 | critic_merge_layers: 36 | - layer_type: linear 37 | layer_size: [256] 38 | activation: relu 39 | - layer_type: linear 40 | layer_size: [256] 41 | activation: relu 42 | batch_size: 256 43 | buffer_type: REPLAY_QUE 44 | max_buffer_size: 8000 45 | actor_lr: 0.0001 46 | critic_lr: 0.001 47 | policy_loss_weight: 0.002 48 | gamma: 0.99 49 | tau: 0.01 50 | env_cfg: 51 | id: Pendulum-v1 52 | render_mode: null # null, rgb_array, human 53 | wrappers: 54 | - wrapper_name: MultiHeadObsWrapper -------------------------------------------------------------------------------- /presets/ClassControl/Pendulum-v1/Pendulum-v1_DDPG_HER_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DDPG_HER 3 | device: cpu 4 | env_name: gym 5 | eval_eps: 10 6 | eval_per_episode: 5 7 | her_sample_num: 4 8 | load_checkpoint: true 9 | load_path: tasks 10 | max_steps: 200 11 | mode: test 12 | mp_backend: mp 13 | new_step_api: true 14 | render: false 15 | render_mode: human 16 | save_fig: true 17 | seed: 0 18 | show_fig: false 19 | test_eps: 20 20 | train_eps: 150 21 | update_every: 100 22 | wrapper: null 23 | algo_cfg: 24 | actor_hidden_dim: 256 25 | actor_lr: 0.0001 26 | batch_size: 128 27 | max_buffer_size: 8000 28 | critic_hidden_dim: 256 29 | critic_lr: 0.001 30 | gamma: 0.99 31 | tau: 0.001 32 | env_cfg: 33 | id: Pendulum-v1 -------------------------------------------------------------------------------- /presets/ClassControl/Pendulum-v1/Pendulum-v1_DDPG_HER_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DDPG_HER 3 | device: cpu 4 | env_name: gym 5 | eval_eps: 10 6 | eval_per_episode: 5 7 | her_sample_num: 4 8 | load_checkpoint: false 9 | load_path: tasks 10 | max_steps: 200 11 | mode: train 12 | mp_backend: mp 13 | new_step_api: true 14 | render: false 15 | render_mode: human 16 | save_fig: true 17 | seed: 0 18 | show_fig: false 19 | test_eps: 20 20 | train_eps: 100 21 | update_every: 100 22 | wrapper: null 23 | algo_cfg: 24 | actor_hidden_dim: 256 25 | actor_lr: 0.0001 26 | batch_size: 128 27 | max_buffer_size: 8000 28 | critic_hidden_dim: 256 29 | critic_lr: 0.001 30 | gamma: 0.99 31 | tau: 0.001 32 | env_cfg: 33 | id: Pendulum-v1 -------------------------------------------------------------------------------- /presets/ClassControl/Pendulum-v1/Pendulum-v1_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.5 3 | algo_name: PPO 4 | env_name: gym 5 | device: cpu 6 | mode: train 7 | exps_trucation_size: 200 8 | is_learner_async: false 9 | load_checkpoint: false 10 | load_path: 
Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir 11 | load_model_step: best 12 | n_interactors: 10 13 | max_episode: -1 14 | max_step: 200 15 | seed: 1 16 | online_eval: true 17 | online_eval_episode: 10 18 | model_save_fre: 10 19 | policy_summary_fre: 5 20 | interact_summary_fre: 100 21 | algo_cfg: 22 | actor_branch_layers: 23 | - name: state 24 | layers: 25 | - layer_type: linear 26 | layer_size: [256] 27 | activation: relu 28 | # - layer_type: linear 29 | # layer_size: [256] 30 | # activation: tanh 31 | critic_branch_layers: 32 | - name: state 33 | layers: 34 | - layer_type: linear 35 | layer_size: [256] 36 | activation: relu 37 | # - layer_type: linear 38 | # layer_size: [256] 39 | # activation: tanh 40 | branch_layers: 41 | - name: state 42 | layers: 43 | - layer_type: linear 44 | layer_size: [256] 45 | activation: relu 46 | buffer_type: ONPOLICY_QUE 47 | lr: 0.0003 48 | actor_lr: 0.003 49 | critic_lr: 0.01 50 | entropy_coef: 0.001 51 | critic_loss_coef: 0.5 52 | eps_clip: 0.2 53 | gamma: 0.95 54 | gae_lambda: 0.95 55 | k_epochs: 4 56 | batch_size: 2000 57 | sgd_batch_size: 50 58 | env_cfg: 59 | id: Pendulum-v1 60 | render_mode: null 61 | wrappers: 62 | - wrapper_name: MultiHeadObsWrapper 63 | 64 | -------------------------------------------------------------------------------- /presets/ClassControl/Pendulum-v1/Pendulum-v1_PPO_off_policy.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.2.3 3 | algo_name: PPO 4 | env_name: gym 5 | device: cpu 6 | mode: train 7 | exps_trucation_size: 200 8 | is_learner_async: true 9 | load_checkpoint: false 10 | load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir 11 | load_model_step: best 12 | n_interactors: 10 13 | max_episode: -1 14 | max_step: 200 15 | seed: 1 16 | online_eval: true 17 | online_eval_episode: 10 18 | model_save_fre: 10 19 | policy_summary_fre: 10 20 | interact_summary_fre: 100 21 | algo_cfg: 22 | actor_branch_layers: 23 | - name: state 24 | layers: 25 | - layer_type: linear 26 | layer_size: [256] 27 | activation: relu 28 | # - layer_type: linear 29 | # layer_size: [256] 30 | # activation: tanh 31 | critic_branch_layers: 32 | - name: state 33 | layers: 34 | - layer_type: linear 35 | layer_size: [256] 36 | activation: relu 37 | # - layer_type: linear 38 | # layer_size: [256] 39 | # activation: tanh 40 | branch_layers: 41 | - name: state 42 | layers: 43 | - layer_type: linear 44 | layer_size: [256] 45 | activation: relu 46 | buffer_type: REPLAY_QUE 47 | max_buffer_size: 4000 48 | lr: 0.0003 49 | actor_lr: 0.003 50 | critic_lr: 0.01 51 | entropy_coef: 0.001 52 | critic_loss_coef: 0.5 53 | eps_clip: 0.2 54 | gamma: 0.95 55 | gae_lambda: 0.95 56 | k_epochs: 4 57 | batch_size: 2000 58 | sgd_batch_size: 50 59 | env_cfg: 60 | id: Pendulum-v1 61 | render_mode: null 62 | 63 | -------------------------------------------------------------------------------- /presets/ClassControl/Pendulum-v1/Pendulum-v1_SAC_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: SAC 3 | device: cuda 4 | env_name: Pendulum-v1 5 | new_step_api: True 6 | wrapper: null 7 | mode: train 8 | load_checkpoint: false 9 | load_path: Train_CartPole-v1_DQN_20221026-054757 10 | max_steps: 200 11 | save_fig: true 12 | seed: 10 13 | show_fig: false 14 | test_eps: 10 15 | train_eps: 400 16 | algo_cfg: 17 | continous: false 18 | alpha: 0.2 19 | 
20 | -------------------------------------------------------------------------------- /presets/ClassControl/Pendulum-v1/Pendulum-v1_TD3.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.7.2 3 | algo_name: TD3 4 | device: cpu 5 | mode: train 6 | load_checkpoint: false 7 | load_path: Train_ray_Pendulum-v1_DDPG_20230527-001715 8 | load_model_step: best 9 | max_episode: 400 10 | max_step: 200 11 | seed: 1 12 | online_eval: true 13 | online_eval_episode: 20 14 | model_save_fre: 500 15 | algo_cfg: 16 | action_space: 17 | type: [dpg] 18 | actor_branch_layers: 19 | - name: state 20 | layers: 21 | - layer_type: linear 22 | layer_size: [256] 23 | activation: tanh 24 | - layer_type: linear 25 | layer_size: [256] 26 | activation: tanh 27 | critic_branch_layers: 28 | - name: state 29 | layers: 30 | - layer_type: none 31 | - name: action 32 | layers: 33 | - layer_type: none 34 | critic_merge_layers: 35 | - layer_type: linear 36 | layer_size: [256] 37 | activation: tanh 38 | - layer_type: linear 39 | layer_size: [256] 40 | activation: tanh 41 | batch_size: 128 42 | buffer_type: REPLAY_QUE 43 | max_buffer_size: 8000 44 | actor_lr: 0.001 45 | critic_lr: 0.001 46 | policy_loss_weight: 0.002 47 | gamma: 0.99 48 | tau: 0.005 49 | env_cfg: 50 | id: Pendulum-v1 51 | render_mode: null # null, rgb_array, human 52 | wrappers: 53 | - wrapper_name: MultiHeadObsWrapper -------------------------------------------------------------------------------- /presets/ClassControl/Pendulum-v1/Pendulum-v1_TD3_BC_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: TD3_BC 3 | device: cuda 4 | env_name: gym 5 | eval_eps: 5 6 | eval_per_episode: 10 7 | load_checkpoint: true 8 | load_path: Train_gym_TD3_BC_20230416-111154 9 | max_steps: 200 10 | mode: test 11 | new_step_api: true 12 | render: false 13 | save_fig: true 14 | seed: 1 15 | show_fig: false 16 | test_eps: 10 17 | train_eps: 1 18 | wrapper: null 19 | algo_cfg: 20 | actor_hidden_dim: 256 21 | actor_lr: 0.0003 22 | batch_size: 100 23 | max_buffer_size: 1000000 24 | critic_hidden_dim: 256 25 | critic_lr: 0.0003 26 | expl_noise: 0.1 27 | explore_steps: 1000 28 | gamma: 0.99 29 | noise_clip: 0.5 30 | policy_freq: 2 31 | policy_noise: 0.2 32 | tau: 0.005 33 | alpha: 5 34 | lmbda: 1 35 | normalize: false 36 | train_iterations: 2000 # number of training iterations 37 | expert_path: tasks/Collect_gym_TD3_20230416-111040/traj/traj.pkl # expert data path 38 | env_cfg: 39 | id: Pendulum-v1 40 | new_step_api: true -------------------------------------------------------------------------------- /presets/ClassControl/Pendulum-v1/Pendulum-v1_TD3_BC_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: TD3_BC 3 | device: cuda 4 | env_name: gym 5 | eval_eps: 5 6 | eval_per_episode: 1 7 | load_checkpoint: false 8 | load_path: Train_CartPole-v1_DQN_20221026-054757 9 | max_steps: 200 10 | mode: train 11 | new_step_api: true 12 | render: false 13 | save_fig: true 14 | seed: 1 15 | show_fig: false 16 | test_eps: 10 17 | train_eps: 1 18 | wrapper: null 19 | algo_cfg: 20 | actor_hidden_dim: 256 21 | actor_lr: 0.0003 22 | batch_size: 100 23 | max_buffer_size: 1000000 24 | critic_hidden_dim: 256 25 | critic_lr: 0.0003 26 | expl_noise: 0.1 27 | explore_steps: 1000 28 | gamma: 0.99 29 | noise_clip: 0.5 30 | policy_freq: 2 31 | policy_noise: 0.2 32 | tau: 0.005 33 | alpha: 5 34 | lmbda: 1 35 | normalize: false 36 |
train_iterations: 1500 # number of training iterations 37 | expert_path: tasks/Collect_gym_TD3_20230416-111040/traj/traj.pkl # expert data path 38 | env_cfg: 39 | id: Pendulum-v1 40 | new_step_api: true 41 | -------------------------------------------------------------------------------- /presets/External/Mario/Mario_DQN_CNN_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DQN_CNN 3 | device: cuda 4 | env_name: Mario 5 | mode: test 6 | load_checkpoint: true 7 | load_path: Train_Mario_DQN_CNN_20221207-155552 8 | max_steps: 1000 9 | save_fig: true 10 | seed: 1 11 | show_fig: false 12 | test_eps: 10 13 | train_eps: 100 14 | wrapper: envs.wrappers.MarioWrappers 15 | new_step_api: True 16 | algo_cfg: 17 | batch_size: 64 18 | max_buffer_size: 100000 19 | epsilon_decay: 500 20 | epsilon_end: 0.01 21 | epsilon_start: 0.95 22 | gamma: 0.99 23 | lr: 0.0001 24 | target_update: 4 25 | -------------------------------------------------------------------------------- /presets/External/Mario/Mario_DQN_CNN_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DQN_CNN 3 | device: cuda 4 | env_name: Mario 5 | mode: train 6 | load_checkpoint: false 7 | load_path: Train_Mario_DQN_CNN_20221207-155552 8 | max_steps: 1000 9 | save_fig: true 10 | seed: 1 11 | show_fig: false 12 | test_eps: 10 13 | train_eps: 200 14 | wrapper: envs.wrappers.MarioWrappers 15 | new_step_api: True 16 | algo_cfg: 17 | batch_size: 64 18 | max_buffer_size: 100000 19 | epsilon_decay: 500 20 | epsilon_end: 0.01 21 | epsilon_start: 0.95 22 | gamma: 0.99 23 | lr: 0.0001 24 | target_update: 4 25 | -------------------------------------------------------------------------------- /presets/Mujoco/Ant-v4/Ant-v4_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: PPO # algo name 3 | env_name: gym # env name, differ from env_id in env_cfgs 4 | device: cuda # device, cpu or cuda 5 | mode: train # run mode: train, test 6 | collect_traj: false # if collect trajectories or not 7 | mp_backend: single # multi-processing mode: single(default), ray 8 | n_workers: 2 # number of workers if using multi-processing, default 1 9 | load_checkpoint: false # if load checkpoint or not 10 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 # if load checkpoint, then config path in 'tasks' dir 11 | load_model_step: best # load model step 12 | max_episode: 2000 # max episodes, set -1 to keep running 13 | max_step: 1000 # max steps per episode 14 | seed: 1 # random seed, set 0 not to use seed 15 | online_eval: true # if online eval or not 16 | online_eval_episode: 10 # online eval episodes 17 | model_save_fre: 500 # update step frequency of saving model 18 | algo_cfg: 19 | actor_layers: 20 | - layer_type: linear 21 | layer_size: [256] 22 | activation: relu 23 | - layer_type: linear 24 | layer_size: [256] 25 | activation: relu 26 | critic_layers: 27 | - layer_type: linear 28 | layer_size: [256] 29 | activation: relu 30 | - layer_type: linear 31 | layer_size: [256] 32 | activation: relu 33 | batch_size: 256 34 | sgd_batch_size: 128 35 | k_epochs: 8 36 | buffer_type: ONPOLICY_QUE 37 | max_buffer_size: 100000 38 | epsilon_decay: 500 39 | epsilon_end: 0.01 40 | epsilon_start: 0.95 41 | gamma: 0.95 42 | lr: 0.0001 43 | target_update: 4 44 | env_cfg: 45 | id: Ant-v4 46 | render_mode: null --------------------------------------------------------------------------------
/presets/Mujoco/HalfCheetah-v4/HalfCheetah-v2_TD3_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: TD3 3 | device: cuda 4 | env_name: HalfCheetah-v2 5 | eval_eps: 5 6 | eval_per_episode: 10 7 | load_checkpoint: false 8 | load_path: Train_HalfCheetah-v2_TD3_20230221-213446 9 | max_steps: 1000 10 | mode: test 11 | new_step_api: true 12 | render: false 13 | save_fig: true 14 | seed: 1 15 | show_fig: false 16 | test_eps: 10 17 | train_eps: 1200 18 | wrapper: null 19 | algo_cfg: 20 | actor_hidden_dim: 256 21 | actor_lr: 0.0005 22 | batch_size: 256 23 | max_buffer_size: 1000000 24 | critic_hidden_dim: 256 25 | critic_lr: 0.0005 26 | expl_noise: 0.1 27 | explore_steps: 10000 28 | gamma: 0.99 29 | noise_clip: 0.5 30 | policy_freq: 2 31 | policy_noise: 0.2 32 | tau: 0.005 33 | 34 | -------------------------------------------------------------------------------- /presets/Mujoco/HalfCheetah-v4/HalfCheetah-v2_TD3_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: TD3 3 | device: cuda 4 | env_name: HalfCheetah-v2 5 | eval_eps: 5 6 | eval_per_episode: 10 7 | load_checkpoint: false 8 | load_path: Train_CartPole-v1_DQN_20221026-054757 9 | max_steps: 1000 10 | mode: train 11 | new_step_api: true 12 | render: false 13 | save_fig: true 14 | seed: 1 15 | show_fig: false 16 | test_eps: 10 17 | train_eps: 1200 18 | wrapper: null 19 | algo_cfg: 20 | actor_hidden_dim: 256 21 | actor_lr: 0.0005 22 | batch_size: 256 23 | max_buffer_size: 1000000 24 | critic_hidden_dim: 256 25 | critic_lr: 0.0005 26 | expl_noise: 0.1 27 | explore_steps: 10000 28 | gamma: 0.99 29 | noise_clip: 0.5 30 | policy_freq: 2 31 | policy_noise: 0.2 32 | tau: 0.005 33 | 34 | -------------------------------------------------------------------------------- /presets/Mujoco/HalfCheetah-v4/HalfCheetah-v4_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: PPO # algo name 3 | env_name: gym # env name, differ from env_id in env_cfgs 4 | device: cuda # device, cpu or cuda 5 | mode: train # run mode: train, test 6 | collect_traj: false # if collect trajectories or not 7 | mp_backend: single # multi-processing mode: single(default), ray 8 | n_workers: 2 # number of workers if using multi-processing, default 1 9 | load_checkpoint: false # if load checkpoint or not 10 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 # if load checkpoint, then config path in 'tasks' dir 11 | load_model_step: best # load model step 12 | max_episode: 2000 # max episodes, set -1 to keep running 13 | max_step: 1000 # max steps per episode 14 | seed: 1 # random seed, set 0 not to use seed 15 | online_eval: true # if online eval or not 16 | online_eval_episode: 10 # online eval episodes 17 | model_save_fre: 500 # update step frequency of saving model 18 | algo_cfg: 19 | # value_layers: 20 | # - layer_type: linear 21 | # layer_size: [256] 22 | # activation: relu 23 | # - layer_type: linear 24 | # layer_size: [256] 25 | # activation: relu 26 | actor_layers: 27 | - layer_type: linear 28 | layer_size: [256] 29 | activation: relu 30 | - layer_type: linear 31 | layer_size: [256] 32 | activation: relu 33 | critic_layers: 34 | - layer_type: linear 35 | layer_size: [256] 36 | activation: relu 37 | - layer_type: linear 38 | layer_size: [256] 39 | activation: relu 40 | batch_size: 256 41 | sgd_batch_size: 128 42 | k_epochs: 8 43 | buffer_type: ONPOLICY_QUE 44 | 
max_buffer_size: 100000 45 | epsilon_decay: 500 46 | epsilon_end: 0.01 47 | epsilon_start: 0.95 48 | gamma: 0.95 49 | lr: 0.0001 50 | target_update: 4 51 | env_cfg: 52 | id: HalfCheetah-v4 53 | render_mode: null -------------------------------------------------------------------------------- /presets/Mujoco/Hopper-v4/Hopper-v4_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.2.2 3 | algo_name: PPO 4 | env_name: gym 5 | device: cuda 6 | interactor_device: cpu 7 | learner_device: cuda 8 | mode: train # test # test 9 | exps_trucation_size: 512 10 | is_learner_async: false 11 | load_checkpoint: false # true # test 12 | load_path: Train_Hopper-v4_PPO_20240625-123656 # if load checkpoint, then config path in 'tasks' dir 13 | load_model_step: best 14 | n_interactors: 10 # 1 # test 15 | max_episode: 10000 # 3 # test 16 | max_step: 500 17 | seed: 202406 18 | online_eval: true 19 | online_eval_episode: 15 # 1 # test 20 | model_save_fre: 10 21 | policy_summary_fre: 10 22 | interact_summary_fre: 100 23 | algo_cfg: 24 | independ_actor: true 25 | return_form: td 26 | actor_branch_layers: 27 | - name: feature_1 28 | layers: 29 | - layer_type: linear 30 | layer_size: [256] 31 | activation: tanh 32 | - layer_type: linear 33 | layer_size: [256] 34 | activation: tanh 35 | critic_branch_layers: 36 | - name: feature_1 37 | layers: 38 | - layer_type: linear 39 | layer_size: [256] 40 | activation: tanh 41 | - layer_type: linear 42 | layer_size: [256] 43 | activation: tanh 44 | buffer_type: ONPOLICY_QUE 45 | lr: 2.5e-4 46 | actor_lr: 1.5e-4 47 | critic_lr: 3.5e-4 48 | entropy_coef: 0.001 49 | critic_loss_coef: 0.001 50 | eps_clip: 0.25 51 | gamma: 0.99 52 | gae_lambda: 0.9 53 | k_epochs: 2 54 | batch_size: 256 55 | sgd_batch_size: 24 56 | env_cfg: 57 | id: Hopper-v4 58 | render_mode: null # human # test 59 | wrappers: 60 | - wrapper_name: ClipAction 61 | - wrapper_name: MultiHeadObsWrapper 62 | -------------------------------------------------------------------------------- /presets/Mujoco/Reacher-v4/Reacher-v4_PPO.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | joyrl_version: 0.6.2.2 3 | algo_name: PPO 4 | env_name: gym 5 | device: cuda 6 | interactor_device: cpu 7 | learner_device: cuda 8 | mode: train 9 | exps_trucation_size: 512 10 | is_learner_async: false 11 | load_checkpoint: false 12 | load_path: Train_Reacher-v4_PPO_20240702-005711 13 | load_model_step: best 14 | n_interactors: 10 15 | max_episode: 42000 16 | max_step: 100 17 | seed: 202406 18 | online_eval: true 19 | online_eval_episode: 15 20 | model_save_fre: 10 21 | policy_summary_fre: 10 22 | interact_summary_fre: 100 23 | algo_cfg: 24 | independ_actor: true 25 | return_form: td 26 | actor_branch_layers: 27 | - name: feature_1 28 | layers: 29 | - layer_type: linear 30 | layer_size: [240] 31 | activation: tanh 32 | - layer_type: linear 33 | layer_size: [240] 34 | activation: tanh 35 | critic_branch_layers: 36 | - name: feature_1 37 | layers: 38 | - layer_type: linear 39 | layer_size: [240] 40 | activation: tanh 41 | - layer_type: linear 42 | layer_size: [240] 43 | activation: tanh 44 | buffer_type: ONPOLICY_QUE 45 | lr: 2.5e-4 46 | actor_lr: 5.5e-4 47 | critic_lr: 7.5e-4 48 | entropy_coef: 0.001 49 | critic_loss_coef: 0.001 50 | eps_clip: 0.185 51 | gamma: 0.99 52 | gae_lambda: 0.985 53 | k_epochs: 2 54 | batch_size: 256 55 | sgd_batch_size: 128 56 | env_cfg: 57 | id: Reacher-v4 58 | render_mode: null 59 | 
max_episode_steps: 100 60 | wrappers: 61 | - wrapper_name: ClipAction 62 | - wrapper_name: MultiHeadObsWrapper 63 | - wrapper_name: ReacherDistReward 64 | dis_weight: 0.4 65 | 66 | -------------------------------------------------------------------------------- /presets/Others/Racetrack-v0/Racetrack-v0_QLearning_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: QLearning 3 | device: cpu 4 | env_name: Racetrack-v0 5 | render: True 6 | render_mode: human 7 | mode: test 8 | load_checkpoint: true 9 | load_path: Train_Racetrack-v0_QLearning_20221128-154935 10 | max_steps: 200 11 | save_fig: true 12 | seed: 10 13 | show_fig: false 14 | test_eps: 10 15 | train_eps: 400 16 | algo_cfg: 17 | epsilon_decay: 300 18 | epsilon_end: 0.01 19 | epsilon_start: 0.95 20 | gamma: 0.9 21 | lr: 0.1 22 | -------------------------------------------------------------------------------- /presets/Others/Racetrack-v0/Racetrack-v0_QLearning_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: QLearning 3 | device: cpu 4 | env_name: Racetrack-v0 5 | new_step_api: True 6 | wrapper: null 7 | mode: train 8 | load_checkpoint: false 9 | load_path: Train_CartPole-v1_DQN_20221026-054757 10 | max_steps: 200 11 | save_fig: true 12 | seed: 10 13 | show_fig: false 14 | test_eps: 20 15 | train_eps: 400 16 | algo_cfg: 17 | epsilon_decay: 300 18 | epsilon_end: 0.01 19 | epsilon_start: 0.95 20 | gamma: 0.9 21 | lr: 0.1 22 | -------------------------------------------------------------------------------- /presets/Others/Racetrack-v0/Train_Racetrack-v0_FirstVisitMC.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: FirstVisitMC 3 | device: cpu 4 | env_name: Racetrack-v0 5 | eval_eps: 10 6 | eval_per_episode: 5 7 | load_checkpoint: false 8 | load_path: tasks 9 | max_steps: 200 10 | mode: train 11 | save_fig: true 12 | seed: 1 13 | show_fig: false 14 | test_eps: 20 15 | train_eps: 200 16 | algo_cfg: 17 | epsilon: 0.15 18 | gamma: 0.9 19 | lr: 0.1 20 | -------------------------------------------------------------------------------- /presets/Others/deep-sea-treasure-v0/DeepSeaTreasure-v0_MOQLearning_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: MO-QLearning 3 | device: cpu 4 | env_name: deep-sea-treasure-v0 5 | wrapper: envs.wrappers.DeepSeaTreasure 6 | render: True 7 | mode: test 8 | load_checkpoint: true 9 | load_path: Test_deep-sea-treasure-v0_MO-QLearning_20230329-234802 10 | max_steps: 100 11 | save_fig: true 12 | seed: 1 13 | show_fig: false 14 | test_eps: 10 15 | train_eps: 300 16 | algo_cfg: 17 | epsilon_decay: 300 18 | epsilon_end: 0.01 19 | epsilon_start: 0.95 20 | gamma: 0.99 21 | lr: 0.1 22 | -------------------------------------------------------------------------------- /presets/Others/deep-sea-treasure-v0/DeepSeaTreasure-v0_MOQLearning_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: MO-QLearning 3 | device: cpu 4 | env_name: deep-sea-treasure-v0 5 | wrapper: envs.wrappers.DeepSeaTreasure 6 | mode: train 7 | load_checkpoint: false 8 | load_path: Train_deep-sea-treasure-v0_MO-QLearning_20230329-234319 9 | max_steps: 100 10 | save_fig: true 11 | seed: 1 12 | show_fig: false 13 | test_eps: 10 14 | train_eps: 300 15 | algo_cfg: 16 | exploration_type: e-greedy # 
softmax, ucb 17 | epsilon_decay: 300 18 | epsilon_end: 0.01 19 | epsilon_start: 0.95 20 | gamma: 0.99 21 | lr: 0.1 22 | -------------------------------------------------------------------------------- /presets/Others/theAlley/theAlley_VI_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: VI 3 | device: cpu 4 | env_name: theAlley 5 | mode: test 6 | load_checkpoint: true 7 | load_path: Train_theAlley_VI_20221122-215228 8 | max_steps: 200 9 | save_fig: true 10 | seed: 1 11 | show_fig: false 12 | test_eps: 10 13 | train_eps: 100 14 | algo_cfg: 15 | # value_layers: 16 | # - layer_type: linear 17 | # layer_dim: ['n_states',256] 18 | # activation: relu 19 | # - layer_type: linear 20 | # layer_dim: [256,256] 21 | # activation: relu 22 | # - layer_type: linear 23 | # layer_dim: [256,'n_actions'] 24 | # activation: none 25 | # batch_size: 64 26 | # max_buffer_size: 100000 27 | # epsilon_decay: 500 28 | # epsilon_end: 0.01 29 | # epsilon_start: 0.95 30 | gamma: 0.95 31 | lr: 0.0001 32 | # target_update: 4 33 | -------------------------------------------------------------------------------- /presets/Others/theAlley/theAlley_VI_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: VI 3 | device: cpu 4 | env_name: theAlley 5 | mode: train 6 | load_checkpoint: false 7 | load_path: Train_theAlley_VI_20221122-215228 8 | max_steps: 200 9 | save_fig: true 10 | seed: 1 11 | show_fig: false 12 | test_eps: 10 13 | train_eps: 100 14 | algo_cfg: 15 | # value_layers: 16 | # - layer_type: linear 17 | # layer_dim: ['n_states',256] 18 | # activation: relu 19 | # - layer_type: linear 20 | # layer_dim: [256,256] 21 | # activation: relu 22 | # - layer_type: linear 23 | # layer_dim: [256,'n_actions'] 24 | # activation: none 25 | # batch_size: 64 26 | # max_buffer_size: 100000 27 | # epsilon_decay: 500 28 | # epsilon_end: 0.01 29 | # epsilon_start: 0.95 30 | gamma: 0.95 31 | lr: 0.0001 32 | # target_update: 4 33 | -------------------------------------------------------------------------------- /presets/Pendulum-v1_TD3_mp_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: TD3 3 | device: cpu 4 | env_name: gym 5 | eval_eps: 5 6 | eval_per_episode: 10 7 | load_checkpoint: true 8 | load_path: Train_gym_TD3_20230416-214019 9 | max_steps: 200 10 | mode: test 11 | n_workers: 1 # number of workers for parallel training 12 | render: false 13 | save_fig: true 14 | seed: 1 15 | show_fig: false 16 | test_eps: 10 17 | train_eps: 200 18 | wrapper: null 19 | algo_cfg: 20 | actor_hidden_dim: 256 21 | actor_lr: 0.001 22 | batch_size: 100 23 | max_buffer_size: 1000000 24 | critic_hidden_dim: 256 25 | critic_lr: 0.001 26 | expl_noise: 0.1 27 | explore_steps: 1000 28 | gamma: 0.99 29 | noise_clip: 0.5 30 | policy_freq: 2 31 | policy_noise: 0.2 32 | tau: 0.005 33 | global_best_reward: -1800 34 | env_cfg: 35 | id: Pendulum-v1 36 | new_step_api: true 37 | -------------------------------------------------------------------------------- /presets/Pendulum-v1_TD3_mp_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: TD3 3 | device: cpu 4 | env_name: gym 5 | eval_eps: 1 6 | eval_per_episode: 10 7 | load_checkpoint: false 8 | load_path: Train_CartPole-v1_DQN_20221026-054757 9 | max_steps: 200 10 | mode: train 11 | mp_backend: mp # 
multi-processing backend: ray or mp (multiprocessing), default mp 12 | n_workers: 4 # number of workers for parallel training 13 | render: false 14 | save_fig: true 15 | seed: 1 16 | show_fig: false 17 | test_eps: 10 18 | train_eps: 200 19 | wrapper: null 20 | algo_cfg: 21 | actor_hidden_dim: 256 22 | actor_lr: 0.001 23 | batch_size: 100 24 | max_buffer_size: 1000000 25 | critic_hidden_dim: 256 26 | critic_lr: 0.001 27 | expl_noise: 0.1 28 | explore_steps: 1000 29 | gamma: 0.99 30 | noise_clip: 0.5 31 | policy_freq: 2 32 | policy_noise: 0.2 33 | tau: 0.005 34 | global_best_reward: -1800 35 | env_cfg: 36 | id: Pendulum-v1 37 | new_step_api: true 38 | -------------------------------------------------------------------------------- /presets/ToyText/CliffWalking-v0/CliffWalking-v0_DynaQ_Test.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DynaQ 3 | device: cpu 4 | env_name: CliffWalking-v0 5 | wrapper: envs.wrappers.CliffWalkingWapper 6 | mode: test 7 | load_checkpoint: true 8 | load_path: Train_CliffWalking-v0_DynaQ_20221210-095808 9 | max_steps: 100 10 | save_fig: true 11 | seed: 1 12 | show_fig: false 13 | test_eps: 10 14 | train_eps: 100 15 | algo_cfg: 16 | exploration_type: e-greedy # softmax, ucb 17 | epsilon_decay: 300 18 | epsilon_end: 0.01 19 | epsilon_start: 0.95 20 | gamma: 0.99 21 | lr: 0.1 22 | -------------------------------------------------------------------------------- /presets/ToyText/CliffWalking-v0/CliffWalking-v0_DynaQ_Train.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DynaQ 3 | device: cpu 4 | env_name: CliffWalking-v0 5 | wrapper: envs.wrappers.CliffWalkingWapper 6 | mode: train 7 | load_checkpoint: false 8 | load_path: Train_CliffWalking-v0_DynaQ_20221210-095808 9 | max_steps: 100 10 | save_fig: true 11 | seed: 1 12 | show_fig: false 13 | test_eps: 10 14 | train_eps: 100 15 | algo_cfg: 16 | exploration_type: e-greedy # softmax, ucb 17 | epsilon_decay: 300 18 | epsilon_end: 0.01 19 | epsilon_start: 0.95 20 | gamma: 0.99 21 | lr: 0.1 22 | -------------------------------------------------------------------------------- /presets/ToyText/CliffWalking-v0/CliffWalking-v0_QLearning.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: QLearning # algo name 3 | env_name: gym # env name, differ from env_id in env_cfgs 4 | mode: train # run mode: train, test 5 | collect_traj: false # if collect trajectories or not 6 | load_checkpoint: false # if load checkpoint or not 7 | load_path: Train_CliffWalking-v0_QLearning_20231224-173215 # if load checkpoint, then config path in 'tasks' dir 8 | load_model_step: 12000 # load model step 9 | max_episode: 500 # max episodes, set -1 to keep running 10 | max_step: 100 # max steps per episode 11 | seed: 1 # random seed, set 0 not to use seed 12 | model_save_fre: 200 13 | online_eval: true 14 | algo_cfg: 15 | epsilon_decay: 500 16 | epsilon_end: 0.01 17 | epsilon_start: 0.95 18 | gamma: 0.99 19 | lr: 0.1 20 | env_cfg: 21 | id: CliffWalking-v0 22 | render_mode: null -------------------------------------------------------------------------------- /presets/ToyText/CliffWalking-v0/CliffWalking-v0_Sarsa.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: Sarsa # algo name 3 | env_name: gym # env name, differ from env_id in env_cfgs 4 | mode: train # run mode: train, test 5 | collect_traj: false # if collect
trajectories or not 6 | load_checkpoint: false # if load checkpoint or not 7 | load_path: Train_single_CliffWalking-v0_Sarsa_20230519-010804 # if load checkpoint, then config path in 'tasks' dir 8 | load_model_step: 12000 # load model step 9 | max_episode: 500 # max episodes, set -1 to keep running 10 | max_step: 100 # max steps per episode 11 | seed: 1 # random seed, set 0 not to use seed 12 | model_save_fre: 200 13 | online_eval: true 14 | 15 | algo_cfg: 16 | epsilon_decay: 500 17 | epsilon_end: 0.01 18 | epsilon_start: 0.95 19 | gamma: 0.99 20 | lr: 0.1 21 | 22 | env_cfg: 23 | id: CliffWalking-v0 24 | render_mode: null -------------------------------------------------------------------------------- /presets/ToyText/CliffWalking-v0/CustomCliffWalking-v0_DQN.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: DQN 3 | env_name: gym 4 | device: cpu 5 | mode: train 6 | collect_traj: false 7 | n_interactors: 1 8 | load_checkpoint: false 9 | load_path: Train_single_CartPole-v1_DQN_20230515-211721 10 | load_model_step: best 11 | max_episode: -1 12 | max_step: 20 13 | seed: 1 14 | online_eval: true 15 | online_eval_episode: 10 16 | model_save_fre: 500 17 | 18 | algo_cfg: 19 | value_layers: 20 | - layer_type: embed 21 | n_embeddings: 48 22 | embedding_dim: 4 23 | - layer_type: linear 24 | layer_size: [256] 25 | activation: relu 26 | - layer_type: linear 27 | layer_size: [256] 28 | activation: relu 29 | batch_size: 128 30 | buffer_type: REPLAY_QUE 31 | max_buffer_size: 10000 32 | epsilon_decay: 1000 33 | epsilon_end: 0.01 34 | epsilon_start: 0.99 35 | gamma: 0.95 36 | lr: 0.001 37 | target_update: 4 38 | env_cfg: 39 | id: CustomCliffWalking-v0 40 | render_mode: null -------------------------------------------------------------------------------- /presets/ToyText/FrozenLake-v1/FrozenLake-v1_NoSlippery_QLearning.yaml: -------------------------------------------------------------------------------- 1 | general_cfg: 2 | algo_name: QLearning # algo name 3 | env_name: gym # env name, differ from env_id in env_cfgs 4 | mode: train # run mode: train, test 5 | collect_traj: false # if collect trajectories or not 6 | load_checkpoint: false # if load checkpoint or not 7 | load_path: Train_CliffWalking-v0_QLearning_20231224-173215 # if load checkpoint, then config path in 'tasks' dir 8 | load_model_step: 12000 # load model step 9 | max_episode: 800 # max episodes, set -1 to keep running 10 | max_step: 100 # max steps per episode 11 | seed: 10 # random seed, set 0 not to use seed 12 | model_save_fre: 200 13 | online_eval: true 14 | algo_cfg: 15 | epsilon_decay: 2000 16 | epsilon_end: 0.1 17 | epsilon_start: 0.7 18 | gamma: 0.95 19 | lr: 0.9 20 | env_cfg: 21 | id: FrozenLake-v1 22 | is_slippery: false 23 | render_mode: null -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ray[default]==2.6.3 2 | gymnasium==0.29.1 3 | tensorboard==2.16.2 4 | matplotlib==3.8.4 5 | seaborn==0.13.2 6 | dill==0.3.8 7 | scipy==1.13.0 8 | swig==4.2.1 9 | pygame==2.6.0 10 | gymnasium[box2d]==0.29.1 11 | numpy==1.26.4 12 | pandas==2.2.2 13 | six==1.16.0 14 | setuptools==69.5.1 15 | scipy==1.13.0 16 | PyYAML==6.0.1 17 | pydantic==1.10.17 18 | psutil==6.0.0 19 | colorlog==6.8.2 -------------------------------------------------------------------------------- /run.bat: 
-------------------------------------------------------------------------------- 1 | python offline_run.py --yaml presets/ClassControl/CartPole-v1/CartPole-v1_DQN.yaml -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # conda activate joyrl 2 | python offline_run.py --yaml presets/ClassControl/CartPole-v1/CartPole-v1_DQN.yaml -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | .git 4 | log 5 | __pycache__ 6 | docs 7 | build 8 | dist 9 | *.egg-info 10 | .DS_Store 11 | benchmarks 12 | max-line-length = 87 13 | ignore = B305,W504,B006,B008,B024 14 | 15 | [yapf] 16 | based_on_style = pep8 17 | dedent_closing_brackets = true 18 | column_limit = 87 19 | blank_line_before_nested_class_or_def = true 20 | 21 | [isort] 22 | profile = black 23 | multi_line_output = 3 24 | line_length = 87 25 | 26 | [mypy] 27 | files = joyrl/**/*.py 28 | allow_redefinition = True 29 | check_untyped_defs = True 30 | disallow_incomplete_defs = True 31 | disallow_untyped_defs = True 32 | ignore_missing_imports = True 33 | no_implicit_optional = True 34 | pretty = True 35 | show_error_codes = True 36 | show_error_context = True 37 | show_traceback = True 38 | strict_equality = True 39 | strict_optional = True 40 | warn_no_return = True 41 | warn_redundant_casts = True 42 | warn_unreachable = True 43 | warn_unused_configs = True 44 | warn_unused_ignores = True 45 | 46 | [pydocstyle] 47 | ignore = D100,D102,D104,D105,D107,D203,D213,D401,D402 48 | 49 | [doc8] 50 | max-line-length = 1000 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Author: JiangJi 5 | Email: johnjim0816@gmail.com 6 | Date: 2023-12-22 13:01:23 7 | LastEditor: JiangJi 8 | LastEditTime: 2024-06-17 14:43:29 9 | Discription: 10 | ''' 11 | import sys,os 12 | from setuptools import setup, find_packages 13 | curr_path = os.path.dirname(os.path.abspath(__file__)) # current path 14 | 15 | def get_version() -> str: 16 | # https://packaging.python.org/guides/single-sourcing-package-version/ 17 | init = open(os.path.join("joyrl", "__init__.py"), "r").read().split() 18 | return init[init.index("__version__") + 2][1:-1] 19 | 20 | def get_install_requires() -> str: 21 | return [ 22 | "ray[default]==2.6.3", 23 | "gymnasium==0.29.1", 24 | "gymnasium[box2d]==0.29.1", 25 | "tensorboard==2.16.2", 26 | "matplotlib==3.8.4", 27 | "seaborn==0.13.2", 28 | "dill==0.3.8", 29 | "scipy==1.13.0", 30 | "pygame==2.5.2", 31 | "swig==4.2.1", 32 | "numpy==1.26.4", 33 | "pandas==2.2.2", 34 | "six==1.16.0", 35 | "setuptools==69.5.1", 36 | "scipy==1.13.0", 37 | "PyYAML==6.0.1", 38 | "pydantic==1.10.15", 39 | "psutil==0.3.14", 40 | "" 41 | ] 42 | 43 | def get_extras_require() -> str: 44 | req = { 45 | "atari": ["atari_py", "opencv-python"], 46 | "mujoco": ["mujoco_py"], 47 | "pybullet": ["pybullet"], 48 | } 49 | return req 50 | 51 | setup( 52 | name="joyrl", 53 | version=get_version(), 54 | description="A Library for Deep Reinforcement Learning", 55 | long_description=open(f"{curr_path}/README.md", encoding="utf8").read(), 56 | long_description_content_type="text/markdown", 57 | url="https://github.com/datawhalechina/joyrl", 58 | 
author="johnjim0816", 59 | author_email="johnjim0816@gmail.com", 60 | license="MIT", 61 | python_requires=">=3.7", 62 | keywords="reinforcement learning platform pytorch", 63 | packages=find_packages( 64 | exclude=["test", "test.*", "examples", "examples.*", "docs", "docs.*"] 65 | ), 66 | platforms = "any", 67 | install_requires=get_install_requires(), 68 | extras_require=get_extras_require(), 69 | entry_points={ 70 | "console_scripts": [ 71 | "joyrl=joyrl.scripts.scripts:main", 72 | ], 73 | }, 74 | ) -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | echo "rm -rf old dist" 2 | rm -rf dist/ 3 | echo "rm -rf old dist" 4 | rm -rf build/ 5 | echo "run: python setup.py sdist bdist_wheel" 6 | python setup.py sdist bdist_wheel 7 | echo "run: test pypi" 8 | twine upload --repository pypitest dist/* 9 | echo "run: upload!" 10 | twine upload dist/* -------------------------------------------------------------------------------- /stop.bat: -------------------------------------------------------------------------------- 1 | ray stop -------------------------------------------------------------------------------- /stop.sh: -------------------------------------------------------------------------------- 1 | ray stop --------------------------------------------------------------------------------