├── .DS_Store ├── .idea ├── .gitignore ├── DRL_pytorch.iml ├── deployment.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── other.xml ├── vcs.xml └── webServers.xml ├── Actor_Critic ├── A3C │ ├── Pendulum_A3C_1.png │ ├── __pycache__ │ │ ├── agent_a3c.cpython-37.pyc │ │ ├── agent_a3c.cpython-38.pyc │ │ ├── untils.cpython-37.pyc │ │ ├── untils.cpython-38.pyc │ │ ├── worker.cpython-37.pyc │ │ └── worker.cpython-38.pyc │ ├── a3c_main.py │ ├── agent_a3c.py │ ├── untils.py │ └── worker.py └── SAC │ ├── sac_agent.py │ ├── sac_main.py │ └── sac_network.py ├── BlackBox_optimazation ├── Hill_Climbing │ ├── __pycache__ │ │ └── agent_HC.cpython-36.pyc │ ├── agent_HC.py │ └── main_hillClimb.py └── cross_entropy_method │ ├── CEM.png │ ├── __pycache__ │ └── agent_cem.cpython-37.pyc │ ├── agent_cem.py │ ├── checkpoint.pth │ └── main_cem.py ├── DDPGs ├── DDPG │ ├── DDPG_agent.py │ ├── DDPG_main.py │ ├── DDPG_model.py │ ├── ddpg_1.py │ └── model_save │ │ ├── actor1.pth │ │ ├── actor2.pth │ │ ├── checkpoint_actor.pth │ │ ├── checkpoint_critic.pth │ │ ├── critic1.pth │ │ └── critic2.pth └── TD3 │ ├── TD3_agent.py │ ├── TD3_main.py │ ├── TD3_model.py │ ├── TD3_new.py │ ├── TD3_solved.png │ ├── __pycache__ │ ├── TD3_model.cpython-38.pyc │ └── TD3_new.cpython-38.pyc │ ├── models │ ├── TD3_actor.pth │ └── TD3_critic.pth │ ├── scores_saved.csv │ └── test.py ├── DQNs ├── .DS_Store ├── DDQN │ ├── .DS_Store │ ├── DQN_main.py │ ├── Deep_Q_Network.ipynb │ ├── __pycache__ │ │ ├── ddqn_v3.cpython-38.pyc │ │ └── model_dueling.cpython-38.pyc │ ├── ddqn_v1.py │ ├── ddqn_v2.py │ ├── ddqn_v3.py │ ├── dqn.py │ ├── images │ │ ├── Total Average reward scores plot.png │ │ ├── ddqn_agent_scores.png │ │ ├── ddqn_testing_scores.png │ │ ├── double_dqn_v1.png │ │ ├── dueling-ddqn_testing.png │ │ ├── dueling-ddqn_training.png │ │ ├── runningResult.png │ │ └── runningResult_1.png │ ├── model.py │ ├── model_dueling.py │ ├── models │ │ ├── checkpoint.pth │ │ ├── dueling_model.pth │ │ └── org_dqn.pth │ ├── old_agent.py │ ├── play_env.py │ └── test.py ├── DQN_PER │ ├── .DS_Store │ ├── PER_memory.py │ ├── Plots │ │ ├── cnn_per.png │ │ ├── epsilon_1.png │ │ ├── epsilon_2.png │ │ ├── epsilon_exp-1.png │ │ ├── epsilon_exp-2.png │ │ ├── epsilon_exp-3.png │ │ ├── epsilon_linear-1.png │ │ ├── train_1.png │ │ ├── train_2.png │ │ ├── train_DQN_per.png │ │ ├── train_exp-1.png │ │ ├── train_exp-2.png │ │ ├── train_exp-3.png │ │ ├── train_exp.png │ │ └── train_linear-1.png │ ├── SumTree.py │ ├── __pycache__ │ │ ├── PER_memory.cpython-38.pyc │ │ ├── SumTree.cpython-38.pyc │ │ ├── atari_wappers.cpython-38.pyc │ │ ├── dqn_model.cpython-38.pyc │ │ └── dqn_per.cpython-38.pyc │ ├── atari_wappers.py │ ├── dqn_model.py │ ├── dqn_per.py │ ├── main_dqn_per.py │ └── train_20210520.log ├── DQN_cnn │ ├── .DS_Store │ ├── Models │ │ ├── CNN_model|03-29#19:21.pth │ │ ├── CNN_model|03-30#11:19.pth │ │ ├── CNN_model|03-30#21:05.pth │ │ ├── CNN_model|03-31#19:32.pth │ │ ├── dqnCNN_model_0324.pth │ │ └── dqn_model.pth │ ├── Plots │ │ ├── test-score|03-25#20:00.png │ │ ├── test-score|03-26#09:15.png │ │ ├── test-score|03-26#09:45.png │ │ ├── train-score|03-29#19:21.png │ │ ├── train-score|03-30#11:19.png │ │ ├── train-score|03-30#21:05.png │ │ └── train-score|03-31#19:32.png │ ├── ReadMe.md │ ├── __pycache__ │ │ ├── atari_wappers.cpython-38.pyc │ │ ├── cnn_model.cpython-38.pyc │ │ └── dqn_agent.cpython-38.pyc │ ├── atari_wappers.py │ ├── cnn_model.py │ ├── dqn_agent.py │ ├── image │ │ ├── pic-0.jpg │ │ ├── pic-100.jpg │ │ ├── 
pic-140.jpg │ │ ├── pic-152.jpg │ │ ├── pic-167.jpg │ │ ├── pic-185.jpg │ │ ├── pic-200.jpg │ │ ├── pic-204.jpg │ │ ├── pic-227.jpg │ │ ├── pic-300.jpg │ │ ├── pic-400.jpg │ │ ├── pic-500.jpg │ │ ├── pic-600.jpg │ │ ├── pic-674.jpg │ │ ├── pic-683.jpg │ │ ├── pic-696.jpg │ │ ├── pic-700.jpg │ │ ├── pic-714.jpg │ │ ├── pic-733.jpg │ │ ├── pic-756.jpg │ │ ├── pic-800.jpg │ │ ├── pic-900.jpg │ │ ├── pic-902.jpg │ │ ├── pic-909.jpg │ │ ├── pic-920.jpg │ │ ├── pic-936.jpg │ │ └── pic-956.jpg │ ├── log │ │ ├── train_20210326.log │ │ ├── train_20210329.log │ │ ├── train_20210329_1.log │ │ ├── train_20210330.log │ │ └── train_20210331.log │ ├── main_dqn_atari.py │ ├── main_test.py │ ├── play_atari.py │ └── train_20210401.log └── DQN_iws │ └── ref_ddqn_iws.py ├── Evaluation_Algorithms └── CartPole.py ├── Games_play_train └── atari.py └── Policy_Gradient ├── .DS_Store ├── PGs ├── __pycache__ │ ├── agent_PG.cpython-37.pyc │ ├── model.cpython-36.pyc │ └── model.cpython-37.pyc ├── agent_PG.py ├── main_PG.py ├── model.py └── models │ ├── PPO_model-1.pth │ ├── PPO_new.pth │ ├── PPOv2_model-1.pth │ ├── pg_model_1.pth │ ├── pg_model_2.pth │ ├── pg_model_3.pth │ ├── pg_model_4.pth │ ├── reinforce_model_2.pth │ ├── reinforce_model_3.pth │ ├── reinforce_model_4.pth │ ├── reinforce_model_5.pth │ └── reinforce_model_6.pth ├── PPO ├── .DS_Store ├── PPO_model.py ├── PPO_v1.py ├── PPO_v2.py ├── __pycache__ │ ├── PPO_model.cpython-37.pyc │ ├── PPO_model.cpython-38.pyc │ ├── PPO_v1.cpython-38.pyc │ └── PPO_v2.cpython-38.pyc ├── board │ ├── .DS_Store │ └── logs │ │ ├── .DS_Store │ │ ├── events.out.tfevents.1608693869.bogon.80327.0 │ │ ├── events.out.tfevents.1608694041.bogon.80355.0 │ │ ├── events.out.tfevents.1608778854.bogon.82580.0 │ │ ├── events.out.tfevents.1608779119.bogon.82611.0 │ │ ├── events.out.tfevents.1608779166.bogon.82627.0 │ │ ├── events.out.tfevents.1608779638.bogon.82655.0 │ │ ├── events.out.tfevents.1608779657.bogon.82666.0 │ │ ├── events.out.tfevents.1608780330.bogon.82692.0 │ │ └── events.out.tfevents.1608780689.bogon.82718.0 ├── cartPole_ppo-v1_1.png ├── main_PPO.py └── models │ ├── PPO_new.pth │ ├── cartPole_ppo.pth │ └── cartPole_ppo_20201222.pth ├── PPO_cnn └── cnn_ppo.py ├── envTest.py └── results&plots ├── .DS_Store ├── PPO-A vs. 
PPO-R.png ├── PPO-A vs.PPO-R_1.png ├── PPO-A_train_5_times.png ├── PPO-A_train_5_times1.png ├── PPO-A_train_5times_2.png ├── PPO-entropy_5times.png ├── PPO_cartPole_20201222.png ├── PPO_cartPole_test.png ├── PPO_cartPole_train.png ├── PPO_comparison.png ├── PPO_comparison_1.png ├── PPO_comparison_2.png ├── PPO_comparison_3.png ├── PPO_comparison_4.png ├── PPO_multiple_1.png ├── PPO_with_entropy.png ├── PPO_with_entropy_1.png ├── cartpole_reinforce.png ├── diff_algorithm.png ├── pg_1.png ├── pg_2.png ├── pg_3.png ├── reinforce_1.png ├── reinforce_2.png ├── reinforce_3.png ├── reinforce_4.png ├── reinforce_5.png ├── reinforce_vs_pg.png ├── rf-vs-pg_1.png ├── rf-vs-pg_2.png ├── rf-vs-pg_3.png └── rf-vs-pg_4.png /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/.DS_Store -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/DRL_pytorch.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/webServers.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | -------------------------------------------------------------------------------- /Actor_Critic/A3C/Pendulum_A3C_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/Pendulum_A3C_1.png -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/agent_a3c.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/agent_a3c.cpython-37.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/agent_a3c.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/agent_a3c.cpython-38.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/untils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/untils.cpython-37.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/untils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/untils.cpython-38.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/worker.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/worker.cpython-37.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/worker.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/worker.cpython-38.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/a3c_main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from Actor_Critic.A3C.agent_a3c import A3C 5 | 6 | 7 | def get_env_prop(env_name, continuous): 8 | env = gym.make(env_name) 9 | state_dim = env.observation_space.shape[0] 10 | if continuous: 11 | action_dim = env.action_space.shape[0] 12 | else: 13 | action_dim = env.action_space.n 14 | 15 | return env,state_dim, action_dim 16 | 17 | 18 | def train_a3c(env_name,continuous): 19 | env,state_size,action_size = get_env_prop(env_name,continuous) 20 | agent = A3C(env,continuous,state_size,action_size) 21 | scores = agent.train_worker() 22 | return scores 23 | 24 | 25 | def train_agent_for_env(env_name,continuous): 26 | env = gym.make(env_name) 27 | 28 | state_dim = env.observation_space.shape[0] 29 | if continuous: 30 | action_dim = env.action_space.shape[0] 31 | else: 32 | action_dim = env.action_space.n 33 | 34 | agent = A3C(env, continuous,state_dim,action_dim) 35 | scores = agent.train_worker() 36 | 37 | return agent,scores 38 | 39 | 40 | def plot_scores(scores,filename): 41 | fig = plt.figure() 42 | ax = fig.add_subplot(111) 43 | plt.plot(np.arange(1, len(scores) + 1), scores) 44 | plt.ylabel('Score') 45 | plt.xlabel('Episode #') 46 
| plt.savefig(filename) 47 | plt.show() 48 | 49 | 50 | if __name__ == "__main__": 51 | # env = gym.make("Pendulum-v0") 52 | # train_scores = train_a3c(env,True) 53 | 54 | # train A3C on discrete env : CartPole 55 | scores_cartPole = train_agent_for_env("CartPole-v0",False) 56 | plot_scores(scores_cartPole,"cartPole_trainPlot.png") 57 | 58 | # train A3C on continuous env : continuous 59 | # a3c_mCar = train_agent_for_env("MountainCarContinuous-v0", True) 60 | 61 | -------------------------------------------------------------------------------- /Actor_Critic/A3C/agent_a3c.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import torch.optim as optim 4 | import multiprocessing as mp 5 | from multiprocessing import Process 6 | from Actor_Critic.A3C.untils import ValueNetwork,ActorDiscrete,ActorContinous 7 | from Actor_Critic.A3C.worker import Worker 8 | 9 | GAMMA = 0.9 10 | LR = 1e-4 11 | GLOBAL_MAX_EPISODE = 5000 12 | 13 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 14 | 15 | 16 | class A3C(): 17 | def __init__(self,env,continuous,state_size,action_size): 18 | self.max_episode=GLOBAL_MAX_EPISODE 19 | self.global_episode = mp.Value('i', 0) # 进程之间共享的变量 20 | self.global_epi_rew = mp.Value('d',0) 21 | self.rew_queue = mp.Queue() 22 | self.worker_num = mp.cpu_count() 23 | 24 | # define the global networks 25 | self.global_valueNet= ValueNetwork(state_size,1).to(device) 26 | # global 的网络参数放入 shared memory,以便复制给各个进程中的 worker网络 27 | self.global_valueNet.share_memory() 28 | 29 | if continuous: 30 | self.global_policyNet = ActorContinous(state_size, action_size).to(device) 31 | else: 32 | self.global_policyNet = ActorDiscrete(state_size, action_size).to(device) 33 | self.global_policyNet.share_memory() 34 | 35 | # global optimizer 36 | self.global_optimizer_policy = optim.Adam(self.global_policyNet.parameters(), lr=LR) 37 | self.global_optimizer_value = optim.Adam(self.global_valueNet.parameters(),lr=LR) 38 | 39 | # define the workers 40 | self.workers=[Worker(env,continuous,state_size,action_size,i, 41 | self.global_valueNet,self.global_optimizer_value, 42 | self.global_policyNet,self.global_optimizer_policy, 43 | self.global_episode,self.global_epi_rew,self.rew_queue, 44 | self.max_episode,GAMMA) 45 | for i in range(self.worker_num)] 46 | 47 | def train_worker(self): 48 | scores=[] 49 | [w.start() for w in self.workers] 50 | while True: 51 | r = self.rew_queue.get() 52 | if r is not None: 53 | scores.append(r) 54 | else: 55 | break 56 | [w.join() for w in self.workers] 57 | 58 | return scores 59 | 60 | def save_model(self): 61 | torch.save(self.global_valueNet.state_dict(), "a3c_value_model.pth") 62 | torch.save(self.global_policyNet.state_dict(), "a3c_policy_model.pth") 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /Actor_Critic/A3C/untils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from collections import namedtuple 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.distributions import Categorical 6 | from torch.distributions import Normal 7 | 8 | 9 | class ValueNetwork(nn.Module): 10 | 11 | def __init__(self, input_dim, output_dim): 12 | super(ValueNetwork, self).__init__() 13 | self.fc1 = nn.Linear(input_dim, 256) 14 | self.fc2 = nn.Linear(256, output_dim) 15 | 16 | def forward(self, state): 17 | value = F.relu(self.fc1(state)) 18 | value = 
self.fc2(value) 19 | 20 | return value 21 | 22 | 23 | class ActorDiscrete(nn.Module): 24 | """ 25 | 用于离散动作空间的策略网络 26 | """ 27 | def __init__(self,state_size,action_size): 28 | super(ActorDiscrete, self).__init__() 29 | self.seed = torch.manual_seed(0) 30 | self.fc1 = nn.Linear(state_size, 128) 31 | # self.fc2 = nn.Linear(64,128) 32 | self.fc2= nn.Linear(128, action_size) 33 | 34 | def forward(self, x): 35 | """ 36 | Build a network that maps state -> action probs. 37 | """ 38 | 39 | x=F.relu(self.fc1(x)) 40 | out = F.softmax(self.fc2(x),dim=1) 41 | return out 42 | 43 | def act(self,state): 44 | """ 45 | 返回 action 和 action的概率 46 | """ 47 | # probs for each action (2d tensor) 48 | probs = self.forward(state) 49 | m = Categorical(probs) 50 | action = m.sample() 51 | 52 | # return action for current state, and the corresponding probability 53 | return action.item(),probs[:,action.item()].item() 54 | 55 | 56 | class ActorContinous(nn.Module): 57 | """ 58 | 用于连续动作空间的策略网络 59 | """ 60 | def __init__(self,state_size,action_size): 61 | super(ActorContinous, self).__init__() 62 | self.fc1 = nn.Linear(state_size, 128) 63 | self.fc2 = nn.Linear(128,128) 64 | self.mu_head = nn.Linear(128, action_size) 65 | self.sigma_head = nn.Linear(128, action_size) 66 | 67 | def forward(self, x): 68 | x = F.relu(self.fc1(x)) 69 | x = F.relu(self.fc2(x)) 70 | mu = 2.0 * torch.tanh(self.mu_head(x)) 71 | sigma = F.softplus(self.sigma_head(x)) 72 | return (mu, sigma) 73 | 74 | def act(self,state): 75 | """ 76 | 返回 action 和 action 的 log prob 77 | """ 78 | with torch.no_grad(): 79 | (mu, sigma) = self.policy(state) # 2d tensors 80 | dist = Normal(mu, sigma) 81 | action = dist.sample() 82 | action_log_prob = dist.log_prob(action) 83 | 84 | return action.numpy()[0], action_log_prob.numpy()[0] 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /Actor_Critic/A3C/worker.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.multiprocessing as mp 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.distributions import Normal 6 | from Actor_Critic.A3C.untils import ValueNetwork,ActorDiscrete,ActorContinous 7 | 8 | 9 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 10 | 11 | 12 | class Worker(mp.Process): 13 | def __init__(self,env,continuous,state_size,action_size,id, global_valueNet,global_value_optimizer, 14 | global_policyNet,global_policy_optimizer, 15 | global_epi,global_epi_rew,rew_queue, 16 | max_epi,gamma): 17 | super(Worker, self).__init__() 18 | # define env for individual worker 19 | self.env = env 20 | self.continuous = continuous 21 | self.name = str(id) 22 | self.env.seed(id) 23 | self.state_size = state_size 24 | self.action_size = action_size 25 | self.memory=[] 26 | 27 | # passing global settings to worker 28 | self.global_valueNet,self.global_value_optimizer = global_valueNet,global_value_optimizer 29 | self.global_policyNet,self.global_policy_optimizer = global_policyNet,global_policy_optimizer 30 | self.global_epi,self.global_epi_rew = global_epi,global_epi_rew 31 | self.rew_queue = rew_queue 32 | self.max_epi = max_epi 33 | # self.batch_size = batch_size 34 | self.gamma = gamma 35 | 36 | # define local net for individual worker 37 | self.local_policyNet = ActorDiscrete(self.state_size,self.action_size).to(device) 38 | if self.continuous: 39 | self.local_policyNet = ActorContinous(self.state_size,self.action_size).to(device) 40 | 
self.local_valueNet = ValueNetwork(self.state_size,1).to(device) 41 | 42 | def sync_global(self): 43 | self.local_valueNet.load_state_dict(self.global_valueNet.state_dict()) 44 | self.local_policyNet.load_state_dict(self.global_policyNet.state_dict()) 45 | 46 | def calculate_loss(self): 47 | # get experiences from current trajectory 48 | states = torch.tensor([t[0] for t in self.memory], dtype=torch.float) 49 | log_probs = torch.tensor([t[1] for t in self.memory], dtype=torch.float) 50 | 51 | # -- calculate discount future rewards for every time step 52 | rewards = [t[2] for t in self.memory] 53 | fur_Rewards = [] 54 | for i in range(len(rewards)): 55 | discount = [self.gamma ** i for i in range(len(rewards) - i)] 56 | f_rewards = rewards[i:] 57 | fur_Rewards.append(sum(d * f for d, f in zip(discount, f_rewards))) 58 | fur_Rewards = torch.tensor(fur_Rewards, dtype=torch.float).view(-1, 1) 59 | 60 | # calculate loss for critic 61 | V = self.local_valueNet(states) 62 | value_loss = F.mse_loss(fur_Rewards, V) 63 | 64 | # compute entropy for policy loss 65 | (mu, sigma) = self.local_policyNet(states) 66 | dist = Normal(mu, sigma) 67 | entropy = 0.5 + 0.5 * math.log(2 * math.pi) + torch.log(dist.scale) # exploration 68 | 69 | # calculate loss for actor 70 | advantage = (fur_Rewards - V).detach() 71 | policy_loss = -advantage * log_probs 72 | policy_loss = (policy_loss - 0.005 * entropy).mean() 73 | 74 | return value_loss,policy_loss 75 | 76 | def update_global(self): 77 | value_loss, policy_loss = self.calculate_loss() 78 | 79 | self.global_value_optimizer.zero_grad() 80 | value_loss.backward() 81 | # propagate local gradients to global parameters 82 | for local_params, global_params in zip(self.local_valueNet.parameters(), self.global_valueNet.parameters()): 83 | global_params._grad = local_params._grad 84 | self.global_value_optimizer.step() 85 | 86 | self.global_policy_optimizer.zero_grad() 87 | policy_loss.backward() 88 | # propagate local gradients to global parameters 89 | for local_params, global_params in zip(self.local_policyNet.parameters(), self.global_policyNet.parameters()): 90 | global_params._grad = local_params._grad 91 | self.global_policy_optimizer.step() 92 | 93 | self.memory=[] # clear trajectory 94 | 95 | def run(self): 96 | while self.global_epi.value < self.max_epi: 97 | state = self.env.reset() 98 | total_reward=0 99 | while True: 100 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 101 | action, prob = self.local_policyNet.act(state) # 离散空间取直接prob,连续空间取log prob 102 | next_state, reward, done, _ = self.env.step(action) 103 | self.memory.append([state,action,reward,next_state,done]) 104 | total_reward += reward 105 | state = next_state 106 | 107 | if done: 108 | # recoding global episode and episode reward 109 | with self.global_epi.get_lock(): 110 | self.global_epi.value += 1 111 | with self.global_epi_rew.get_lock(): 112 | if self.global_epi_rew.value == 0.: 113 | self.global_epi_rew.value = total_reward 114 | else: 115 | # Moving average reward 116 | self.global_epi_rew.value = self.global_epi_rew.value * 0.99 + total_reward * 0.01 117 | self.rew_queue.put(self.global_epi_rew.value) 118 | 119 | print("w{} | episode: {}\t , episode reward:{:.4} \t " 120 | .format(self.name,self.global_epi.value,self.global_epi_rew.value)) 121 | break 122 | 123 | # update and sync with the global net when finishing an episode 124 | self.update_global() 125 | self.sync_global() 126 | 127 | self.rew_queue.put(None) 128 | 129 | 130 | 131 | 
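A minimal standalone sketch (not a file from this repository): the per-step discounted future reward that Worker.calculate_loss builds with a nested loop can equivalently be accumulated in a single backward pass, G_t = r_t + gamma * G_{t+1}. The helper name below is illustrative only.

import torch

def discounted_returns(rewards, gamma=0.9):
    # G_t = r_t + gamma * G_{t+1}, accumulated from the last step backwards
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    returns.reverse()
    return torch.tensor(returns, dtype=torch.float).view(-1, 1)

# example: rewards [1, 1, 1] with gamma = 0.9 give returns [2.71, 1.90, 1.00]
print(discounted_returns([1.0, 1.0, 1.0]))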
-------------------------------------------------------------------------------- /Actor_Critic/SAC/sac_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/SAC/sac_agent.py -------------------------------------------------------------------------------- /Actor_Critic/SAC/sac_main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import arrow 3 | import gym 4 | import numpy as np 5 | import pandas as pd 6 | from matplotlib import pyplot as plt 7 | 8 | 9 | def output_scores(start_time,i_episode,scores_deque,score,solve_limit): 10 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}' 11 | .format(i_episode, np.mean(scores_deque), score), end="") 12 | if i_episode % 100 == 0: 13 | print('\rEpisode {}\tAverage Score: {:.2f}\t Running time til now :{}' 14 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 15 | if np.mean(scores_deque) >= solve_limit: 16 | print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}\t Total running time :{}' 17 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 18 | return True 19 | 20 | return False 21 | 22 | 23 | def plot_scores(scores,filename): 24 | plt.plot(np.arange(1, len(scores) + 1), scores) 25 | plt.ylabel('Score') 26 | plt.xlabel('Episode #') 27 | plt.savefig(filename) 28 | plt.show() 29 | 30 | 31 | def get_env_prop(env_name, continuous): 32 | env = gym.make(env_name) 33 | state_dim = env.observation_space.shape[0] 34 | if continuous: 35 | action_dim = env.action_space.shape[0] 36 | else: 37 | action_dim = env.action_space.n 38 | 39 | return env,state_dim, action_dim 40 | 41 | 42 | if __name__=="__main__": 43 | env,state_dim,action_dim = get_env_prop("CartPole-v0",False) 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /Actor_Critic/SAC/sac_network.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/SAC/sac_network.py -------------------------------------------------------------------------------- /BlackBox_optimazation/Hill_Climbing/__pycache__/agent_HC.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/Hill_Climbing/__pycache__/agent_HC.cpython-36.pyc -------------------------------------------------------------------------------- /BlackBox_optimazation/Hill_Climbing/agent_HC.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import deque 4 | 5 | 6 | def hill_climbing(env,policy,n_episodes=1000, max_t=1000, gamma=1.0, print_every=100, noise_scale=1e-2): 7 | """Implementation of hill climbing with adaptive noise scaling. 
8 | 9 | Params 10 | ====== 11 | n_episodes (int): maximum number of training episodes 12 | max_t (int): maximum number of timesteps per episode 13 | gamma (float): discount rate 14 | print_every (int): how often to print average score (over last 100 episodes) 15 | noise_scale (float): standard deviation of additive noise 16 | """ 17 | scores_deque = deque(maxlen=100) 18 | scores = [] # 用于存储各 episode 的得分(总奖励) 19 | best_R = -np.Inf 20 | best_w = policy.w 21 | 22 | for i_episode in range(1, n_episodes + 1): 23 | rewards = [] # 每个episode 重置奖励队列 24 | state = env.reset() 25 | for t in range(max_t): 26 | action = policy.act(state) 27 | state, reward, done, _ = env.step(action) 28 | rewards.append(reward) # 把当前 时间步的奖励加入 rewards 队列 29 | if done: 30 | break 31 | # 设定折扣率 32 | discounts = [gamma ** i for i in range(len(rewards) + 1)] 33 | # 计算当前episode的折扣累计总奖励 34 | R = sum([a * b for a, b in zip(discounts, rewards)]) 35 | 36 | scores_deque.append(sum(rewards)) # 把当前episode的累计奖励(无折扣)加入 scores 队列 37 | scores.append(sum(rewards)) 38 | 39 | # ------- 参数搜索 ----- # 40 | if R >= best_R: # found better weights 41 | best_R = R 42 | best_w = policy.w 43 | noise_scale = max(1e-3, noise_scale / 2) # 缩小搜索范围(下限为 0.001) 44 | policy.w += noise_scale * np.random.rand(*policy.w.shape) 45 | else: # did not find better weights 46 | noise_scale = min(2, noise_scale * 2) # 扩大搜索范围(上限为2) 47 | policy.w = best_w + noise_scale * np.random.rand(*policy.w.shape) 48 | # --------------------- # 49 | 50 | if i_episode % print_every == 0: 51 | print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque))) 52 | if np.mean(scores_deque) >= 195.0: 53 | print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - 100, 54 | np.mean(scores_deque))) 55 | policy.w = best_w 56 | break 57 | 58 | return scores -------------------------------------------------------------------------------- /BlackBox_optimazation/Hill_Climbing/main_hillClimb.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import deque 4 | import matplotlib.pyplot as plt 5 | from CartPole.Hill_Climbing.agent_HC import hill_climbing 6 | 7 | 8 | class Policy(): 9 | """ 10 | 策略函数是一个单层线性神经网络 P(A)=softmax(W*S) 11 | 输出层加入了激活函数softmax,为了把输出值转换成概率(0-1),但没有中间隐藏层,即没有非线性变换 12 | 输入节点数:s_size ;输出节点数:a_size 13 | 参数矩阵 w 的维度 tate_space x action_space 14 | """ 15 | def __init__(self, s_size=4, a_size=2): 16 | self.w = 1e-4 * np.random.rand(s_size, a_size) # weights for simple linear policy: state_space x action_space 17 | 18 | def forward(self, state): 19 | x = np.dot(state, self.w) 20 | return np.exp(x) / sum(np.exp(x)) 21 | 22 | def act(self, state): 23 | probs = self.forward(state) 24 | # action = np.random.choice(2, p=probs) # option 1: stochastic policy 25 | action = np.argmax(probs) # option 2: deterministic policy 26 | return action 27 | 28 | 29 | if __name__=="__main__": 30 | env = gym.make('CartPole-v0') 31 | policy=Policy() 32 | 33 | print(policy.w) 34 | 35 | # 训练智能体:更新 policy (参数w) 36 | scores = hill_climbing(env,policy) 37 | 38 | # 观察训练好的智能体 39 | state = env.reset() 40 | img = plt.imshow(env.render(mode='rgb_array')) 41 | for t in range(200): 42 | action = policy.act(state) 43 | img.set_data(env.render(mode='rgb_array')) 44 | 45 | state, reward, done, _ = env.step(action) 46 | if done: 47 | break 48 | 49 | env.close() 50 | 51 | # 画累计奖励曲线 52 | fig = plt.figure() 53 | ax = fig.add_subplot(111) 54 | plt.plot(np.arange(1, len(scores) + 1), scores) 
55 | plt.ylabel('Score') 56 | plt.xlabel('Episode #') 57 | plt.show() -------------------------------------------------------------------------------- /BlackBox_optimazation/cross_entropy_method/CEM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/cross_entropy_method/CEM.png -------------------------------------------------------------------------------- /BlackBox_optimazation/cross_entropy_method/__pycache__/agent_cem.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/cross_entropy_method/__pycache__/agent_cem.cpython-37.pyc -------------------------------------------------------------------------------- /BlackBox_optimazation/cross_entropy_method/agent_cem.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 8 | 9 | 10 | class Agent(nn.Module): 11 | def __init__(self, env, h_size=16): 12 | super(Agent, self).__init__() 13 | self.env = env 14 | # state, hidden layer, action sizes 15 | self.s_size = env.observation_space.shape[0] 16 | self.h_size = h_size 17 | self.a_size = env.action_space.shape[0] 18 | # define layers 19 | self.fc1 = nn.Linear(self.s_size, self.h_size) 20 | self.fc2 = nn.Linear(self.h_size, self.a_size) 21 | 22 | def set_weights(self, weights): 23 | s_size = self.s_size 24 | h_size = self.h_size 25 | a_size = self.a_size 26 | # separate the weights for each layer 27 | fc1_end = (s_size * h_size) + h_size 28 | fc1_W = torch.from_numpy(weights[:s_size * h_size].reshape(s_size, h_size)) 29 | fc1_b = torch.from_numpy(weights[s_size * h_size:fc1_end]) 30 | fc2_W = torch.from_numpy(weights[fc1_end:fc1_end + (h_size * a_size)].reshape(h_size, a_size)) 31 | fc2_b = torch.from_numpy(weights[fc1_end + (h_size * a_size):]) 32 | # set the weights for each layer 33 | self.fc1.weight.data.copy_(fc1_W.view_as(self.fc1.weight.data)) 34 | self.fc1.bias.data.copy_(fc1_b.view_as(self.fc1.bias.data)) 35 | self.fc2.weight.data.copy_(fc2_W.view_as(self.fc2.weight.data)) 36 | self.fc2.bias.data.copy_(fc2_b.view_as(self.fc2.bias.data)) 37 | 38 | def get_weights_dim(self): 39 | return (self.s_size + 1) * self.h_size + (self.h_size + 1) * self.a_size 40 | 41 | def forward(self, x): 42 | x = F.relu(self.fc1(x)) 43 | x = F.tanh(self.fc2(x)) 44 | return x.cpu().data 45 | 46 | def evaluate(self, weights, gamma=1.0, max_t=5000): 47 | self.set_weights(weights) 48 | episode_return = 0.0 49 | state = self.env.reset() 50 | for t in range(max_t): 51 | state = torch.from_numpy(state).float().to(device) 52 | action = self.forward(state) 53 | state, reward, done, _ = self.env.step(action) 54 | episode_return += reward * math.pow(gamma, t) 55 | if done: 56 | break 57 | return episode_return 58 | 59 | 60 | -------------------------------------------------------------------------------- /BlackBox_optimazation/cross_entropy_method/checkpoint.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/cross_entropy_method/checkpoint.pth -------------------------------------------------------------------------------- /BlackBox_optimazation/cross_entropy_method/main_cem.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import math 3 | import numpy as np 4 | from collections import deque 5 | import matplotlib.pyplot as plt 6 | import torch 7 | from MountCar_continuous.cross_entropy_method.agent_cem import Agent 8 | 9 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 10 | 11 | 12 | def cem(agent,n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, elite_frac=0.2, sigma=0.5): 13 | """PyTorch implementation of a cross-entropy method. 14 | 15 | Params 16 | ====== 17 | n_iterations (int): maximum number of training iterations 18 | max_t (int): maximum number of timesteps per episode 19 | gamma (float): discount rate 20 | print_every (int): how often to print average score (over last 100 episodes) 21 | pop_size (int): size of population at each iteration 22 | elite_frac (float): percentage of top performers to use in update 23 | sigma (float): standard deviation of additive noise 24 | """ 25 | n_elite = int(pop_size * elite_frac) 26 | 27 | scores_deque = deque(maxlen=100) 28 | scores = [] 29 | best_weight = sigma * np.random.randn(agent.get_weights_dim()) 30 | 31 | for i_iteration in range(1, n_iterations + 1): 32 | weights_pop = [best_weight + (sigma * np.random.randn(agent.get_weights_dim())) for i in range(pop_size)] 33 | rewards = np.array([agent.evaluate(weights, gamma, max_t) for weights in weights_pop]) 34 | 35 | elite_idxs = rewards.argsort()[-n_elite:] 36 | elite_weights = [weights_pop[i] for i in elite_idxs] 37 | best_weight = np.array(elite_weights).mean(axis=0) 38 | 39 | reward = agent.evaluate(best_weight, gamma=1.0) 40 | scores_deque.append(reward) 41 | scores.append(reward) 42 | 43 | torch.save(agent.state_dict(), 'checkpoint.pth') 44 | 45 | if i_iteration % print_every == 0: 46 | print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, np.mean(scores_deque))) 47 | 48 | if np.mean(scores_deque) >= 90.0: 49 | print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(i_iteration - 100, 50 | np.mean(scores_deque))) 51 | break 52 | return scores 53 | 54 | 55 | 56 | def watch_trained_agent(agent): 57 | # load the weights from file 58 | agent.load_state_dict(torch.load('checkpoint.pth')) 59 | 60 | state = env.reset() 61 | img = plt.imshow(env.render(mode='rgb_array')) 62 | while True: 63 | state = torch.from_numpy(state).float().to(device) 64 | with torch.no_grad(): 65 | action = agent(state) 66 | img.set_data(env.render(mode='rgb_array')) 67 | plt.axis('off') 68 | next_state, reward, done, _ = env.step(action) 69 | state = next_state 70 | if done: 71 | break 72 | 73 | env.close() 74 | 75 | 76 | if __name__=="__main__": 77 | env = gym.make('MountainCarContinuous-v0') 78 | env.seed(101) 79 | np.random.seed(101) 80 | agent = Agent(env).to(device) 81 | 82 | # --- train and plot scores --- # 83 | scores = cem(agent) 84 | 85 | # plot the scores 86 | fig = plt.figure() 87 | ax = fig.add_subplot(111) 88 | plt.plot(np.arange(1, len(scores) + 1), scores) 89 | plt.ylabel('Score') 90 | plt.xlabel('Episode #') 91 | plt.show() 92 | 93 | # --- watch a pre-trained agent --- # 94 | watch_trained_agent(agent) 95 | 96 | 97 | 
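A minimal standalone sketch (not a file from this repository): the sample-evaluate-select-average update that cem() above applies to agent weights, run here on a toy objective so the elite-selection step is easy to follow. The objective, variable names, and hyperparameter values are illustrative only, chosen to mirror pop_size, elite_frac, and sigma in main_cem.py.

import numpy as np

rng = np.random.default_rng(0)
best_w = np.zeros(5)                       # current mean of the search distribution
pop_size, elite_frac, sigma = 50, 0.2, 0.5
n_elite = int(pop_size * elite_frac)

def score(w):
    # toy stand-in for agent.evaluate(weights): maximized at w == 1
    return -np.sum((w - 1.0) ** 2)

for _ in range(20):
    pop = [best_w + sigma * rng.standard_normal(best_w.shape) for _ in range(pop_size)]
    rewards = np.array([score(w) for w in pop])
    elite_idxs = rewards.argsort()[-n_elite:]            # indices of the top performers
    best_w = np.array([pop[i] for i in elite_idxs]).mean(axis=0)

print(best_w)   # drifts toward the optimum [1, 1, 1, 1, 1]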
-------------------------------------------------------------------------------- /DDPGs/DDPG/DDPG_main.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import gym 5 | import torch 6 | from DDPGs.DDPG.DDPG_agent import DDPGAgent 7 | 8 | 9 | def ddpg(env,agent,n_episodes=2000, max_t=700): 10 | scores_deque = deque(maxlen=100) 11 | scores = [] 12 | 13 | for i_episode in range(1, n_episodes+1): 14 | state = env.reset() 15 | agent.reset() 16 | score = 0 17 | while True: 18 | # 智能体生成与当前 state 对应的 action (行动策略) 19 | action = agent.act(state) 20 | # 与环境交互,得到 sars' 21 | next_state, reward, done, _ = env.step(action) 22 | # 把当前时间步的经验元组传给 agent 23 | agent.step(i_episode,state, action, reward, next_state, done) 24 | state = next_state 25 | score += reward 26 | if done: 27 | break 28 | scores_deque.append(score) 29 | scores.append(score) 30 | 31 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}' 32 | .format(i_episode, np.mean(scores_deque), score),end="") 33 | if i_episode % 100 == 0: 34 | torch.save(agent.actor_local.state_dict(), 'model_save/actor2.pth') 35 | torch.save(agent.critic_local.state_dict(), 'model_save/critic2.pth') 36 | print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque))) 37 | 38 | return scores 39 | 40 | 41 | def watch_agent(agent,filename_actor,filename_crtic): 42 | agent.actor_local.load_state_dict(torch.load(filename_actor)) 43 | agent.critic_local.load_state_dict(torch.load(filename_crtic)) 44 | state = env.reset() 45 | for t in range(1000): 46 | action = agent.act(state, noise=False) 47 | print(action) 48 | env.render() 49 | state, reward, done, _ = env.step(action) 50 | if done: 51 | break 52 | env.close() 53 | 54 | 55 | def plot_scores(scores): 56 | fig = plt.figure() 57 | ax = fig.add_subplot(111) 58 | plt.plot(np.arange(1, len(scores.size()) + 1), scores) 59 | plt.ylabel('Score') 60 | plt.xlabel('Episode #') 61 | plt.show() 62 | 63 | 64 | if __name__=="__main__": 65 | env = gym.make('BipedalWalker-v2') 66 | env.seed(10) 67 | 68 | # 初始化 ddpg agent 69 | agent=DDPGAgent(state_size=env.observation_space.shape[0], action_size=env.action_space.shape[0], seed=10) 70 | # 训练并保存 scores 71 | scores=ddpg(env,agent) 72 | plot_scores(scores) 73 | 74 | # watch_agent(agent,"actor1.pth","critic1.pth") 75 | 76 | 77 | -------------------------------------------------------------------------------- /DDPGs/DDPG/DDPG_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | def hidden_init(layer): 9 | fan_in = layer.weight.data.size()[0] 10 | lim = 1. / np.sqrt(fan_in) 11 | return (-lim, lim) 12 | 13 | 14 | class Actor(nn.Module): 15 | """Actor (Policy) Model. 
16 | """ 17 | 18 | def __init__(self, state_size, action_size, seed, fc1_units=256,fc2_units=256): 19 | """ 20 | single layer MLP network 21 | ====== 22 | Input dim: state_size 23 | Output dim: action_size 24 | """ 25 | super(Actor, self).__init__() 26 | self.seed = torch.manual_seed(seed) 27 | self.fc1 = nn.Linear(state_size, fc1_units) 28 | self.fc2 = nn.Linear(fc1_units, fc2_units) 29 | self.fc3=nn.Linear(fc2_units,action_size) 30 | 31 | self.reset_parameters() 32 | 33 | def reset_parameters(self): 34 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1)) 35 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 36 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 37 | 38 | def forward(self, state): 39 | """Build an actor (policy) network that maps states -> actions.""" 40 | x = torch.relu(self.fc1(state)) 41 | x = torch.relu(self.fc2(x)) 42 | return torch.tanh(self.fc3(x)) 43 | 44 | 45 | class Critic(nn.Module): 46 | """Critic (Value) Model. 47 | """ 48 | 49 | def __init__(self, state_size, action_size, seed, fcs1_units=256, fc2_units=256,fc3_units=128): 50 | """ 51 | ====== 52 | Input dim: state_size 53 | Output dim: 1 54 | """ 55 | super(Critic, self).__init__() 56 | self.seed = torch.manual_seed(seed) 57 | self.fcs1 = nn.Linear(state_size, fcs1_units) 58 | self.fc2 = nn.Linear(fcs1_units+action_size, fc2_units) 59 | self.fc3 = nn.Linear(fc2_units, fc3_units) 60 | self.fc4 = nn.Linear(fc3_units, 1) 61 | self.reset_parameters() 62 | 63 | def reset_parameters(self): 64 | self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1)) 65 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 66 | self.fc3.weight.data.uniform_(*hidden_init(self.fc3)) 67 | self.fc4.weight.data.uniform_(-3e-3, 3e-3) 68 | 69 | def forward(self, state, action): 70 | """Build a critic (value) network that maps (state, action) pairs -> Q-values.""" 71 | xs = F.leaky_relu(self.fcs1(state)) 72 | x = torch.cat((xs, action), dim=1) 73 | x = F.leaky_relu(self.fc2(x)) 74 | x = F.leaky_relu(self.fc3(x)) 75 | return self.fc4(x) 76 | -------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/actor1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/actor1.pth -------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/actor2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/actor2.pth -------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/checkpoint_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/checkpoint_actor.pth -------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/checkpoint_critic.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/checkpoint_critic.pth 
-------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/critic1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/critic1.pth -------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/critic2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/critic2.pth -------------------------------------------------------------------------------- /DDPGs/TD3/TD3_main.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import gym 6 | import torch 7 | import arrow 8 | import os 9 | from DDPGs.TD3.TD3_new import TD3 10 | 11 | RESUME= True 12 | SAVE_MODEL_EVERY = 5 13 | load_checkpoint_patch=["models/checkpoint/actor_10.pth","models/checkpoint/critic_10.pth"] 14 | 15 | 16 | def output_scores(start_time,i_episode,scores_deque,score): 17 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}' 18 | .format(i_episode, np.mean(scores_deque), score), end="") 19 | if i_episode % 100 == 0: 20 | print('\rEpisode {}\tAverage Score: {:.2f}\t Running time til now :{}' 21 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 22 | if np.mean(scores_deque) >= 300: 23 | print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}\t Total running time :{}' 24 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 25 | return True 26 | 27 | return False 28 | 29 | 30 | def watch_smart_agent(agent,filename_actor,filename_crtic): 31 | agent.actor.load_state_dict(torch.load(filename_actor)) 32 | agent.critic.load_state_dict(torch.load(filename_crtic)) 33 | state = env.reset() 34 | for t in range(1000): 35 | action = agent.select_action(state) 36 | print(action) 37 | env.render() 38 | state, reward, done, _ = env.step(action) 39 | if done: 40 | break 41 | env.close() 42 | 43 | 44 | def watch_random_agent(): 45 | 46 | for _ in range(5): 47 | env.reset() 48 | while True: 49 | env.render() 50 | next_state, reward, done, _ =env.step(env.action_space.sample()) 51 | if done: 52 | break 53 | 54 | env.close() 55 | 56 | 57 | def plot_scores(scores,filename): 58 | fig = plt.figure() 59 | ax = fig.add_subplot(111) 60 | plt.plot(np.arange(1, len(scores) + 1), scores) 61 | plt.ylabel('Score') 62 | plt.xlabel('Episode #') 63 | plt.savefig(filename) 64 | plt.show() 65 | 66 | 67 | def save_check_point(agent,i_episode): 68 | # setting the check point for training 69 | checkpoint_actor = { 70 | "net": agent.actor.state_dict(), 71 | 'optimizer': agent.actor_optimizer.state_dict(), 72 | "epoch": i_episode 73 | } 74 | checkpoint_critic = { 75 | "net": agent.critic.state_dict(), 76 | "optimizer": agent.critic_optimizer.state_dict(), 77 | "epoch": i_episode 78 | } 79 | if not os.path.isdir("models/checkpoint"): 80 | os.mkdir("models/checkpoint") 81 | torch.save(checkpoint_actor, 'models/checkpoint/actor_%s.pth' % (str(i_episode))) 82 | torch.save(checkpoint_critic, 'models/checkpoint/critic_%s.pth' % (str(i_episode))) 83 | 84 | 85 | def load_check_point(agent): 86 | "load saved checkpoints to resume training" 87 
| checkpoint_actor = torch.load(load_checkpoint_patch[0]) # 加载断点 88 | checkpoint_critic = torch.load(load_checkpoint_patch[1]) 89 | 90 | agent.actor.load_state_dict(checkpoint_actor['net']) # 加载模型可学习参数 91 | agent.critic.load_state_dict(checkpoint_critic['net']) 92 | 93 | agent.actor_optimizer.load_state_dict(checkpoint_actor['optimizer']) # 加载优化器参数 94 | agent.critic_optimizer.load_state_dict(checkpoint_critic['optimizer']) # 加载优化器参数 95 | 96 | start_epoch = checkpoint_actor['epoch'] # 设置开始的epoch 97 | return start_epoch 98 | 99 | 100 | def train_td3(env,agent,n_episodes): 101 | start_epoch = 1 102 | 103 | if RESUME: # 加载 check point 中保存的模型参数继续训练 104 | start_epoch=load_check_point(agent) 105 | 106 | scores_deque = deque(maxlen=100) 107 | scores = [] 108 | start_time = arrow.now() 109 | for i_episode in range(start_epoch, n_episodes + 1): 110 | state = env.reset() 111 | total_reward = 0 112 | time_step = 0 113 | 114 | # loop over time steps 115 | while True: 116 | # 智能体选择动作(根据当前策略) 117 | action = agent.select_action(state) 118 | next_state, reward, done, _ = env.step(action) 119 | agent.save_exp(state, action, next_state, reward, done) 120 | if agent.mode==1: 121 | agent.train(time_step) 122 | time_step += 1 123 | state = next_state 124 | total_reward += reward 125 | if done: 126 | break 127 | 128 | # recording scores 129 | scores.append([i_episode,total_reward]) 130 | scores_deque.append(total_reward) 131 | finished = output_scores(start_time, i_episode, scores_deque, total_reward) 132 | if finished: 133 | agent.save('models', 'TD3_v2') 134 | break 135 | 136 | if i_episode% SAVE_MODEL_EVERY ==0: 137 | save_check_point(agent, i_episode) 138 | # 同时保存 scores,存为 scv 文件 139 | scores_df=pd.DataFrame(data=scores,columns=['episode','score']) 140 | scores_df.to_csv('scores_saved.csv',index=False) 141 | 142 | if agent.mode==0: 143 | agent.train(time_step) 144 | 145 | return scores 146 | 147 | 148 | if __name__=="__main__": 149 | env = gym.make('BipedalWalker-v3') 150 | env.seed(10) 151 | state_dim = env.observation_space.shape[0] 152 | action_dim = env.action_space.shape[0] 153 | max_action = float(env.action_space.high[0]) 154 | 155 | agent_0 = TD3(state_dim,action_dim,max_action,env,0) # mode=0:update per episode 156 | agent_1 = TD3(state_dim, action_dim, max_action, env, 1) # mode=1: update per time step 157 | # scores=train_td3(env,agent_1,1000) 158 | 159 | # 观察未经训练的随机智能体 160 | #watch_random_agent() 161 | watch_smart_agent(agent_0,"models/TD3_actor.pth","models/TD3_critic.pth") 162 | 163 | 164 | -------------------------------------------------------------------------------- /DDPGs/TD3/TD3_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | def hidden_init(layer): 8 | fan_in = layer.weight.data.size()[0] 9 | lim = 1. 
/ np.sqrt(fan_in) 10 | return (-lim, lim) 11 | 12 | # Actor Neural Network 13 | class Actor(nn.Module): 14 | def __init__(self, state_dim, action_dim, max_action): 15 | super(Actor, self).__init__() 16 | 17 | self.l1 = nn.Linear(state_dim, 400) 18 | self.l2 = nn.Linear(400, 300) 19 | self.l3 = nn.Linear(300, action_dim) 20 | 21 | self.max_action = max_action 22 | 23 | def forward(self, x): 24 | x = F.relu(self.l1(x)) 25 | x = F.relu(self.l2(x)) 26 | x = self.max_action * torch.tanh(self.l3(x)) 27 | return x 28 | 29 | 30 | # Q1-Q2-Critic Neural Network 31 | class Critic(nn.Module): 32 | def __init__(self, state_dim, action_dim): 33 | super(Critic, self).__init__() 34 | 35 | # Q1 architecture 36 | self.l1 = nn.Linear(state_dim + action_dim, 400) 37 | self.l2 = nn.Linear(400, 300) 38 | self.l3 = nn.Linear(300, 1) 39 | 40 | # Q2 architecture 41 | self.l4 = nn.Linear(state_dim + action_dim, 400) 42 | self.l5 = nn.Linear(400, 300) 43 | self.l6 = nn.Linear(300, 1) 44 | 45 | def forward(self, x, u): 46 | xu = torch.cat([x, u], 1) 47 | x1 = F.relu(self.l1(xu)) 48 | x1 = F.relu(self.l2(x1)) 49 | x1 = self.l3(x1) 50 | 51 | x2 = F.relu(self.l4(xu)) 52 | x2 = F.relu(self.l5(x2)) 53 | x2 = self.l6(x2) 54 | return x1, x2 55 | 56 | def Q1(self, x, u): 57 | xu = torch.cat([x, u], 1) 58 | 59 | x1 = F.relu(self.l1(xu)) 60 | x1 = F.relu(self.l2(x1)) 61 | x1 = self.l3(x1) 62 | return x1 63 | -------------------------------------------------------------------------------- /DDPGs/TD3/TD3_solved.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/TD3_solved.png -------------------------------------------------------------------------------- /DDPGs/TD3/__pycache__/TD3_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/__pycache__/TD3_model.cpython-38.pyc -------------------------------------------------------------------------------- /DDPGs/TD3/__pycache__/TD3_new.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/__pycache__/TD3_new.cpython-38.pyc -------------------------------------------------------------------------------- /DDPGs/TD3/models/TD3_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/models/TD3_actor.pth -------------------------------------------------------------------------------- /DDPGs/TD3/models/TD3_critic.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/models/TD3_critic.pth -------------------------------------------------------------------------------- /DDPGs/TD3/scores_saved.csv: -------------------------------------------------------------------------------- 1 | episode,score 2 | 10,-110.23470465514899 3 | 11,-99.79645054419306 4 | 12,-126.96190521016625 5 | 13,-145.1785128976382 6 | 14,-104.07223475725621 7 | 
15,-115.51990479428935 8 | 16,-121.89401543392783 9 | 17,-101.55811828293544 10 | 18,-99.89300219319254 11 | 19,-104.98346411872812 12 | 20,-106.23650016517124 13 | 21,-103.84864901668085 14 | 22,-121.26231449909034 15 | 23,-111.10637092719374 16 | 24,-114.66147317025639 17 | 25,-109.04674660607814 18 | 26,-106.12839938371035 19 | 27,-127.84833024115571 20 | 28,-112.4956240406665 21 | 29,-107.88297384519461 22 | 30,-99.92317202252215 23 | 31,-125.0179119318615 24 | 32,-100.36371986921576 25 | 33,-110.05038883724445 26 | 34,-132.021171753202 27 | 35,-133.880572424944 28 | 36,-100.11141411138209 29 | 37,-122.84088047947732 30 | 38,-103.55589092614429 31 | 39,-121.33897605525831 32 | 40,-230.96539978320874 33 | 41,-124.41352611894291 34 | 42,-119.04664112562759 35 | 43,-115.88990889360397 36 | 44,-116.13402150126659 37 | 45,-153.34740936961384 38 | 46,-107.6970551122332 39 | 47,-103.65690659839511 40 | 48,-107.19409725924586 41 | 49,-95.42425468958133 42 | 50,-103.23654880642972 43 | 51,-106.54915425148904 44 | 52,-114.40587754233572 45 | 53,-117.05561482745843 46 | 54,-113.98186717139261 47 | 55,-107.9192592421826 48 | 56,-100.75576811566108 49 | 57,-101.08801850037774 50 | 58,-132.08337249626365 51 | 59,-108.36035750377272 52 | 60,-111.05887943139943 53 | 61,-121.58251865927255 54 | 62,-141.45334827651286 55 | 63,-114.22801798708173 56 | 64,-119.39000501374326 57 | 65,-132.97221690919855 58 | 66,-123.59944656251093 59 | 67,-101.49795907883284 60 | 68,-103.72652107741114 61 | 69,-98.53701366942036 62 | 70,-100.89222275623676 63 | 71,-143.13936392812377 64 | 72,-101.36843625940055 65 | 73,-100.09964251577696 66 | 74,-98.95479499937969 67 | 75,-104.2556218599781 68 | -------------------------------------------------------------------------------- /DDPGs/TD3/test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | scores=np.array([[1,-1.2],[2,-3.4],[3,3.6]]) 5 | 6 | 7 | df=pd.DataFrame(data=scores,columns=['episode','score']) 8 | 9 | print(df) 10 | -------------------------------------------------------------------------------- /DQNs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/.DS_Store -------------------------------------------------------------------------------- /DQNs/DDQN/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/.DS_Store -------------------------------------------------------------------------------- /DQNs/DDQN/DQN_main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import arrow 3 | import torch 4 | import numpy as np 5 | from collections import deque 6 | # import matplotlib 7 | # matplotlib.use('TkAgg') 8 | import matplotlib.pyplot as plt 9 | from DQNs.DDQN.ddqn_v3 import AgentV3 10 | 11 | 12 | def dqn(agent,model_file,n_episodes=2000, max_t=1000, 13 | eps_start=1.0, eps_end=0.01, eps_decay=0.995, 14 | beta_start=0.4): 15 | """Deep Q-Learning. 
16 | 17 | 18 | Params 19 | ====== 20 | n_episodes (int): maximum number of training episodes 21 | max_t (int): maximum number of timesteps per episode 22 | eps_start (float): starting value of epsilon, for epsilon-greedy action selection 23 | eps_end (float): minimum value of epsilon 24 | eps_decay (float): multiplicative factor (per episode) for decreasing epsilon 25 | """ 26 | scores = [] # list containing scores from each episode 27 | scores_window = deque(maxlen=100) # last 100 scores 28 | eps = eps_start # initialize epsilon 29 | beta=beta_start 30 | 31 | start_time=arrow.now() 32 | for i_episode in range(1, n_episodes + 1): 33 | state = env.reset() 34 | score = 0 35 | episode_loss=[] 36 | for t in range(max_t): 37 | # 在当前状态下获取要采取的 action 38 | action = agent.act(state, eps) 39 | # 与环境交互获取 (s',r,done) 40 | next_state, reward, done, _ = env.step(action) 41 | # 构建 sarsa 序列,传给智能体 42 | loss=agent.step(state, action, reward, next_state, done) 43 | if loss is not None: 44 | episode_loss.append(loss) 45 | state = next_state 46 | score += reward 47 | if done: 48 | break 49 | scores_window.append(score) # save most recent score 50 | scores.append(score) # save most recent score 51 | eps = max(eps_end, eps_decay * eps) # decrease epsilon 52 | 53 | # beta = beta/beta_incre if beta= 200.0: 61 | print('\nEnvironment solved in {:d} episodes! \t Average Score: {:.2f}'.format(i_episode - 100, 62 | np.mean(scores_window))) 63 | torch.save(agent.qnetwork_local.state_dict(), model_file) 64 | print('\nTotal running time:{}'.format(arrow.now() - start_time)) 65 | break 66 | return scores 67 | 68 | 69 | def watch_agent(agent): 70 | 71 | state = env.reset() 72 | for j in range(500): 73 | action = agent.act(state) 74 | env.render() 75 | state, reward, done, _ = env.step(action) 76 | if done: 77 | break 78 | env.close() 79 | 80 | 81 | def watch_random_agent(): 82 | 83 | for _ in range(3): 84 | env.reset() 85 | while True: 86 | env.render() 87 | next_state, reward, done, _ =env.step(env.action_space.sample()) 88 | if done: 89 | break 90 | 91 | env.close() 92 | 93 | 94 | def trained_agent_test(filename,episode_num=500,max_t=1000,eps=0.01): 95 | """ 96 | :param filename: 97 | :param episode_num: 98 | :param max_t: 99 | :param eps: 100 | :return: 101 | """ 102 | # agent = Agent(state_size=8, action_size=4, seed=0) 103 | agent_v3 = AgentV3(state_size=8, action_size=4, seed=0) 104 | agent_v3.qnetwork_local.load_state_dict(torch.load(filename)) 105 | 106 | watch_agent(agent_v3) 107 | 108 | scores=[] 109 | scores_window = deque(maxlen=100) 110 | start_time=arrow.now() 111 | for i_episode in range(episode_num): 112 | state = env.reset() 113 | score = 0 114 | for t in range(max_t): 115 | # 直接采用贪婪策略 116 | action = agent_v3.act(state) 117 | next_state, reward, done, _ = env.step(action) 118 | state = next_state 119 | score += reward 120 | if done: 121 | break 122 | scores.append(score) 123 | scores_window.append(score) 124 | print('\rEpisode {}\t Average Score: {:.2f}'.format(i_episode, np.mean(scores_window)),end="") 125 | if i_episode % 100 == 0: 126 | print('\rEpisode {}\t Average Score: {:.2f}'.format(i_episode, np.mean(scores_window))) 127 | print('\rRunning time:{}\n'.format(arrow.now()-start_time)) 128 | return scores 129 | 130 | 131 | def plot_scores(scores,filename): 132 | # plot the scores 133 | fig = plt.figure() 134 | ax = fig.add_subplot(1, 1, 1) 135 | # ax.plot(np.arange(len(scores_1)), scores_1) 136 | ax.plot(np.arange(len(scores)), scores) 137 | # rolling_mean = pd.Series(scores).rolling(100).mean() 138 | 
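    # plot the raw per-episode scores, then label the axes and write the figure to filename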
plt.ylabel('Score') 139 | plt.xlabel('Episode #') 140 | plt.savefig(filename) 141 | 142 | 143 | if __name__=="__main__": 144 | env = gym.make('LunarLander-v2') 145 | env.seed(0) 146 | 147 | # 训练 ddqn agent 并获取平均累计奖励 148 | agent_v3 = AgentV3(state_size=8, action_size=4, seed=0) 149 | print("\n\nTraining ddqn agent:\n-------------------------------------------------------------\n") 150 | train_scores = dqn(agent_v3,'dueling_model.pth') 151 | # plot_scores(train_scores,'images/dueling-ddqn_training.png') 152 | 153 | # 观察未经训练的随机智能体 154 | #watch_random_agent() 155 | # 用训练好的智能体跑分并绘制奖励曲线 156 | # test_scores=trained_agent_test('models/dueling_model.pth') 157 | # plot_scores(test_scores,'images/dueling-ddqn_testing.png') 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /DQNs/DDQN/__pycache__/ddqn_v3.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/__pycache__/ddqn_v3.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DDQN/__pycache__/model_dueling.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/__pycache__/model_dueling.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DDQN/ddqn_v1.py: -------------------------------------------------------------------------------- 1 | """ 2 | 对经典 DQN 的改进 3 | 1. Double DQN 4 | """ 5 | 6 | import numpy as np 7 | import random 8 | from collections import namedtuple, deque 9 | import torch 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from LunarLander.DQN.model import QNetwork 13 | 14 | 15 | BUFFER_SIZE = int(1e5) # replay buffer size 16 | BATCH_SIZE = 64 # minibatch size 17 | GAMMA = 0.99 # discount factor 18 | TAU = 1e-3 # for soft update of target parameters 19 | LR = 5e-4 # learning rate 20 | UPDATE_EVERY = 4 # how often to update the network 21 | 22 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 23 | 24 | 25 | class AgentV2(): 26 | """Interacts with and learns from the environment.""" 27 | 28 | def __init__(self, state_size, action_size, seed): 29 | """Initialize an Agent object. 30 | 31 | Params 32 | ====== 33 | state_size (int): dimension of each state 34 | action_size (int): dimension of each action 35 | seed (int): random seed 36 | """ 37 | self.state_size = state_size 38 | self.action_size = action_size 39 | self.seed = random.seed(seed) 40 | 41 | # Q-Network 42 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) 43 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) 44 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) 45 | 46 | # Replay memory 47 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 48 | # Initialize time step (for updating every UPDATE_EVERY steps) 49 | self.t_step = 0 50 | 51 | def step(self, state, action, reward, next_state, done): 52 | # Save experience in replay memory 53 | self.memory.add(state, action, reward, next_state, done) 54 | 55 | # Learn every UPDATE_EVERY time steps. 
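        # the counter wraps modulo UPDATE_EVERY, so a learning step is triggered on every UPDATE_EVERY-th call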
56 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 57 | if self.t_step == 0: 58 | # If enough samples are available in memory, get random subset and learn 59 | if len(self.memory) > BATCH_SIZE: 60 | experiences = self.memory.sample() 61 | self.learn(experiences, GAMMA) 62 | 63 | def act(self, state, eps=0.): 64 | """Returns actions for given state as per current policy. 65 | 66 | Params 67 | ====== 68 | state (array_like): current state 69 | eps (float): epsilon, for epsilon-greedy action selection 70 | """ 71 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 72 | self.qnetwork_local.eval() 73 | with torch.no_grad(): 74 | action_values = self.qnetwork_local(state) 75 | self.qnetwork_local.train() 76 | 77 | # Epsilon-greedy action selection 78 | if random.random() > eps: 79 | return np.argmax(action_values.cpu().data.numpy()) 80 | else: 81 | return random.choice(np.arange(self.action_size)) 82 | 83 | def learn(self, experiences, gamma): 84 | """Update value parameters using given batch of experience tuples. 85 | 86 | Params 87 | ====== 88 | experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 89 | gamma (float): discount factor 90 | """ 91 | # 从 experiences 取得所有时间步的 (s,a,r,s',done)的序列,均为列向量 [BATCH_SIZE,1] 92 | states, actions, rewards, next_states, dones = experiences 93 | 94 | # ----------计算 Q targets------------------------------ # 95 | # 从local网络的 Q estimated 取最大值对应的动作序列 96 | Q_expected_next_max = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1) # shape:[BATCH_SIZE,1](.unsqueeze(1)转换成列向量) 97 | # Double TD3:这些动作序列输入target网络得到对应的 Q 估计值,而不是直接让 target 网络选取最大Q(避免了 overestimated 问题) 98 | Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_expected_next_max) 99 | # 根据公式计算 Q 目标 100 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) 101 | 102 | # --------- Get expected Q values from local model----------------- # 103 | # 找到每个 (state,action) 对应的q值,输出为一个q(s,a)序列 104 | # print(self.qnetwork_local(states)) # shape:[BATCH_SIZE,4] 105 | Q_expected = self.qnetwork_local(states).gather(1, actions) # shape:[BATCH_SIZE,1] 106 | 107 | # -------------训练 local网络-------------------------------- # 108 | # Compute loss 109 | loss = F.mse_loss(Q_expected, Q_targets) # 用Q估计值和Q目标计算均方差损失函数,都为列向量 110 | # Minimize the loss 111 | self.optimizer.zero_grad() # 先把原来的梯度清零 112 | loss.backward() 113 | self.optimizer.step() 114 | 115 | # ------------------- update target network ------------------- # 116 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 117 | 118 | def soft_update(self, local_model, target_model, tau): 119 | """Soft update model parameters. 120 | θ_target = τ*θ_local + (1 - τ)*θ_target 121 | 122 | Params 123 | ====== 124 | local_model (PyTorch model): weights will be copied from 125 | target_model (PyTorch model): weights will be copied to 126 | tau (float): interpolation parameter 127 | """ 128 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 129 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) 130 | 131 | 132 | class ReplayBuffer: 133 | """Fixed-size buffer to store experience tuples.""" 134 | 135 | def __init__(self, action_size, buffer_size, batch_size, seed): 136 | """Initialize a ReplayBuffer object. 
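        Experiences are kept in a deque with maxlen=buffer_size, so the oldest entries are dropped once the buffer is full.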
137 | 138 | Params 139 | ====== 140 | action_size (int): dimension of each action 141 | buffer_size (int): maximum size of buffer 142 | batch_size (int): size of each training batch 143 | seed (int): random seed 144 | """ 145 | self.action_size = action_size 146 | self.memory = deque(maxlen=buffer_size) 147 | self.batch_size = batch_size 148 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 149 | self.seed = random.seed(seed) 150 | 151 | def add(self, state, action, reward, next_state, done): 152 | """Add a new experience to memory.""" 153 | e = self.experience(state, action, reward, next_state, done) 154 | self.memory.append(e) 155 | 156 | def sample(self): 157 | """Randomly sample a batch of experiences from memory.""" 158 | experiences = random.sample(self.memory, k=self.batch_size) 159 | 160 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 161 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 162 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 163 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to( 164 | device) 165 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to( 166 | device) 167 | 168 | return (states, actions, rewards, next_states, dones) 169 | 170 | def __len__(self): 171 | """Return the current size of internal memory.""" 172 | return len(self.memory) 173 | -------------------------------------------------------------------------------- /DQNs/DDQN/ddqn_v3.py: -------------------------------------------------------------------------------- 1 | """ 2 | 对经典 DQN 的改进 3 | 1. Double DQN 4 | 2. Dueling Network 5 | """ 6 | import numpy as np 7 | import random 8 | from collections import namedtuple, deque 9 | import torch 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | 13 | from DQNs.DDQN.model_dueling import QNetwork 14 | 15 | BUFFER_SIZE = int(1e4) # replay buffer size 16 | BATCH_SIZE = 64 # minibatch size 17 | GAMMA = 0.99 # discount factor 18 | TAU = 1e-3 # for soft update of target parameters 19 | LR = 5e-3 # learning rate 20 | UPDATE_EVERY = 4 # how often to update the network 21 | E=1e-8 # small number to add to the priority of experience 22 | 23 | 24 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 25 | 26 | 27 | class AgentV3(): 28 | """Interacts with and learns from the environment.""" 29 | 30 | def __init__(self, state_size, action_size, seed): 31 | """Initialize an Agent object. 
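        Builds the local and target dueling Q-networks, the Adam optimizer and a uniform replay buffer.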
32 | 33 | Params 34 | ====== 35 | state_size (int): dimension of each state 36 | action_size (int): dimension of each action 37 | seed (int): random seed 38 | """ 39 | self.state_size = state_size 40 | self.action_size = action_size 41 | # self.seed = random.seed(seed) 42 | 43 | # Q-Network 44 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) 45 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) 46 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) 47 | 48 | # Replay memory 49 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 50 | # Initialize time step (for updating every UPDATE_EVERY steps) 51 | self.t_step = 0 52 | 53 | def step(self, state, action, reward, next_state, done): 54 | # Save experience in replay memory 55 | self.memory.add(state, action, reward, next_state, done) 56 | 57 | # Learn every UPDATE_EVERY time steps. 58 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 59 | if self.t_step == 0: 60 | # If enough samples are available in memory, get random subset and learn 61 | if len(self.memory) > BATCH_SIZE: 62 | experiences = self.memory.sample() 63 | 64 | loss=self.learn(experiences, GAMMA) 65 | return loss 66 | 67 | def act(self, state, eps=0.): 68 | """Returns actions for given state as per current policy. 69 | 70 | Params 71 | ====== 72 | state (array_like): current state 73 | eps (float): epsilon, for epsilon-greedy action selection 74 | """ 75 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 76 | self.qnetwork_local.eval() 77 | with torch.no_grad(): 78 | action_values = self.qnetwork_local(state) 79 | self.qnetwork_local.train() 80 | 81 | # Epsilon-greedy action selection 82 | if random.random() > eps: 83 | return np.argmax(action_values.cpu().data.numpy()) 84 | else: 85 | return random.choice(np.arange(self.action_size)) 86 | 87 | def learn(self, experiences, gamma): 88 | """Update value parameters using given batch of experience tuples. 
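        Double DQN target: the greedy next action is selected by the local network and evaluated by the target network; the MSE loss is returned as a numpy scalar.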
89 | 90 | Params 91 | ====== 92 | experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 93 | gamma (float): discount factor 94 | """ 95 | # 从 experiences 取得所有时间步的 (s,a,r,s',done)的序列 96 | states, actions, rewards, next_states, dones = experiences 97 | 98 | # ----------计算 Q targets------------------------------ # 99 | # 从local网络的 Q estimated 取最大值对应的动作序列 100 | Q_expected_next_max = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1) # shape:[BATCH_SIZE,1](.unsqueeze(1)转换成列向量) 101 | # Double TD3:这些动作序列输入target网络得到对应的 Q 估计值,而不是直接让 target 网络选取最大Q(避免了 overestimated 问题) 102 | Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_expected_next_max) 103 | # 根据公式计算 Q 目标 104 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) 105 | 106 | # --------- Get expected Q values from local model----------------- # 107 | # 找到每个 (state,action) 对应的q值,输出为一个q(s,a)序列 108 | # print(self.qnetwork_local(states)) # shape:[BATCH_SIZE,4] 109 | Q_expected = self.qnetwork_local(states).gather(1, actions) # shape:[BATCH_SIZE,1] 110 | 111 | # -------------训练 local网络-------------------------------- # 112 | # Compute loss 113 | loss = F.mse_loss(Q_expected, Q_targets) # 用Q估计值和Q目标计算均方差损失函数,都为列向量 114 | # Minimize the loss 115 | self.optimizer.zero_grad() # 先把原来的梯度清零 116 | loss.backward() 117 | self.optimizer.step() 118 | 119 | # ------------------- update target network ------------------- # 120 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 121 | 122 | return loss.cpu().detach().numpy() 123 | 124 | def soft_update(self, local_model, target_model, tau): 125 | """Soft update model parameters. 126 | θ_target = τ*θ_local + (1 - τ)*θ_target 127 | 128 | Params 129 | ====== 130 | local_model (PyTorch model): weights will be copied from 131 | target_model (PyTorch model): weights will be copied to 132 | tau (float): interpolation parameter 133 | """ 134 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 135 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) 136 | 137 | 138 | class ReplayBuffer: 139 | """Fixed-size buffer to store experience tuples.""" 140 | 141 | def __init__(self, action_size, buffer_size, batch_size, seed): 142 | """Initialize a ReplayBuffer object. 
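        Sampling is uniform over the stored experiences; prioritized replay is implemented separately under DQNs/DQN_PER.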
143 | 144 | Params 145 | ====== 146 | action_size (int): dimension of each action 147 | buffer_size (int): maximum size of buffer 148 | batch_size (int): size of each training batch 149 | seed (int): random seed 150 | """ 151 | # self.action_size = action_size 152 | self.memory = deque(maxlen=buffer_size) 153 | self.batch_size = batch_size 154 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 155 | # self.seed = random.seed(seed) 156 | 157 | def add(self, state, action, reward, next_state, done): 158 | """Add a new experience to memory.""" 159 | e = self.experience(state, action, reward, next_state, done) 160 | self.memory.append(e) 161 | 162 | def sample(self): 163 | """Randomly sample a batch of experiences from memory.""" 164 | experiences = random.sample(self.memory, k=self.batch_size) 165 | 166 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 167 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 168 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 169 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to( 170 | device) 171 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to( 172 | device) 173 | 174 | return (states, actions, rewards, next_states, dones) 175 | 176 | def __len__(self): 177 | """Return the current size of internal memory.""" 178 | return len(self.memory) -------------------------------------------------------------------------------- /DQNs/DDQN/dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import namedtuple, deque 4 | from LunarLander.DQN.model import QNetwork 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | 10 | BUFFER_SIZE = int(1e5) # replay buffer size 11 | BATCH_SIZE = 64 # minibatch size 12 | GAMMA = 0.99 # discount factor 13 | TAU = 1e-3 # for soft update of target parameters 14 | LR = 5e-4 # learning rate 15 | UPDATE_EVERY = 4 # how often to update the network 16 | 17 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 18 | 19 | 20 | class Agent(): 21 | """Interacts with and learns from the environment.""" 22 | 23 | def __init__(self, state_size, action_size, seed): 24 | """Initialize an Agent object. 25 | 26 | Params 27 | ====== 28 | state_size (int): dimension of each state 29 | action_size (int): dimension of each action 30 | seed (int): random seed 31 | """ 32 | self.state_size = state_size 33 | self.action_size = action_size 34 | self.seed = random.seed(seed) 35 | 36 | # Q-Network 37 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) 38 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) 39 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) 40 | 41 | # Replay memory 42 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 43 | # Initialize time step (for updating every UPDATE_EVERY steps) 44 | self.t_step = 0 45 | 46 | def step(self, state, action, reward, next_state, done): 47 | # Save experience in replay memory 48 | self.memory.add(state, action, reward, next_state, done) 49 | 50 | # Learn every UPDATE_EVERY time steps. 
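        # unlike the DDQN agents, learn() here does not return the training loss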
51 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 52 | if self.t_step == 0: 53 | # If enough samples are available in memory, get random subset and learn 54 | if len(self.memory) > BATCH_SIZE: 55 | experiences = self.memory.sample() 56 | self.learn(experiences, GAMMA) 57 | 58 | def act(self, state, eps=0.): 59 | """Returns actions for given state as per current policy. 60 | 61 | Params 62 | ====== 63 | state (array_like): current state 64 | eps (float): epsilon, for epsilon-greedy action selection 65 | """ 66 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 67 | self.qnetwork_local.eval() 68 | with torch.no_grad(): 69 | action_values = self.qnetwork_local(state) 70 | self.qnetwork_local.train() 71 | 72 | # Epsilon-greedy action selection 73 | if random.random() > eps: 74 | return np.argmax(action_values.cpu().data.numpy()) 75 | else: 76 | return random.choice(np.arange(self.action_size)) 77 | 78 | def learn(self, experiences, gamma): 79 | """Update value parameters using given batch of experience tuples. 80 | 81 | Params 82 | ====== 83 | experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 84 | gamma (float): discount factor 85 | """ 86 | # 从 experiences 取得所有时间步的 (s,a,r,s',done)的序列,均为列向量 [BATCH_SIZE,1] 87 | states, actions, rewards, next_states, dones = experiences 88 | # ------计算每个经验元组对应的Q目标序列 89 | # Get max predicted Q values (for next states) from target model 90 | # print(self.qnetwork_target(next_states)) # shape:[BATCH_SIZE,4] 91 | Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # shape:[BATCH_SIZE,1] 92 | # Compute Q targets for current states 93 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # shape:[BATCH_SIZE,1] 94 | 95 | # --------- Get expected Q values from local model 96 | # 找到每个 (state,action) 对应的q值,输出为一个q(s,a)序列 97 | # print(self.qnetwork_local(states)) # shape:[BATCH_SIZE,4] 98 | Q_expected = self.qnetwork_local(states).gather(1, actions) # shape:[BATCH_SIZE,1] 99 | 100 | # Compute loss 101 | loss = F.mse_loss(Q_expected, Q_targets) # 用Q估计值和Q目标计算均方差损失函数,都为列向量 102 | # Minimize the loss 103 | self.optimizer.zero_grad() # 先把原来的梯度清零 104 | loss.backward() 105 | self.optimizer.step() 106 | 107 | # ------------------- update target network ------------------- # 108 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 109 | 110 | def soft_update(self, local_model, target_model, tau): 111 | """Soft update model parameters. 112 | θ_target = τ*θ_local + (1 - τ)*θ_target 113 | 114 | Params 115 | ====== 116 | local_model (PyTorch model): weights will be copied from 117 | target_model (PyTorch model): weights will be copied to 118 | tau (float): interpolation parameter 119 | """ 120 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 121 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) 122 | 123 | 124 | class ReplayBuffer: 125 | """Fixed-size buffer to store experience tuples.""" 126 | 127 | def __init__(self, action_size, buffer_size, batch_size, seed): 128 | """Initialize a ReplayBuffer object. 
129 | 130 | Params 131 | ====== 132 | action_size (int): dimension of each action 133 | buffer_size (int): maximum size of buffer 134 | batch_size (int): size of each training batch 135 | seed (int): random seed 136 | """ 137 | self.action_size = action_size 138 | self.memory = deque(maxlen=buffer_size) 139 | self.batch_size = batch_size 140 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 141 | self.seed = random.seed(seed) 142 | 143 | def add(self, state, action, reward, next_state, done): 144 | """Add a new experience to memory.""" 145 | e = self.experience(state, action, reward, next_state, done) 146 | self.memory.append(e) 147 | 148 | def sample(self): 149 | """Randomly sample a batch of experiences from memory.""" 150 | experiences = random.sample(self.memory, k=self.batch_size) 151 | 152 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 153 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 154 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 155 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to( 156 | device) 157 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to( 158 | device) 159 | 160 | return (states, actions, rewards, next_states, dones) 161 | 162 | def __len__(self): 163 | """Return the current size of internal memory.""" 164 | return len(self.memory) -------------------------------------------------------------------------------- /DQNs/DDQN/images/Total Average reward scores plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/Total Average reward scores plot.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/ddqn_agent_scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/ddqn_agent_scores.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/ddqn_testing_scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/ddqn_testing_scores.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/double_dqn_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/double_dqn_v1.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/dueling-ddqn_testing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/dueling-ddqn_testing.png 
-------------------------------------------------------------------------------- /DQNs/DDQN/images/dueling-ddqn_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/dueling-ddqn_training.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/runningResult.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/runningResult.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/runningResult_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/runningResult_1.png -------------------------------------------------------------------------------- /DQNs/DDQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class QNetwork(nn.Module): 7 | """Actor (Policy) Model.""" 8 | 9 | def __init__(self, state_size, action_size, seed): 10 | """Initialize parameters and build model. 11 | Params 12 | ====== 13 | state_size (int): Dimension of each state 14 | action_size (int): Dimension of each action 15 | seed (int): Random seed 16 | """ 17 | super(QNetwork, self).__init__() 18 | self.seed = torch.manual_seed(seed) 19 | self.fc1=nn.Linear(state_size,64) 20 | self.fc2=nn.Linear(64,64) 21 | self.fc3=nn.Linear(64,action_size) 22 | 23 | 24 | def forward(self, state): 25 | """Build a network that maps state -> action values.""" 26 | out=self.fc1(state) 27 | out=F.relu(out) 28 | out=self.fc2(out) 29 | out=F.relu(out) 30 | q_a=self.fc3(out) 31 | 32 | return q_a 33 | -------------------------------------------------------------------------------- /DQNs/DDQN/model_dueling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | H_1=64 6 | H_2=64 7 | 8 | class QNetwork(nn.Module): 9 | """Dueling Architecture""" 10 | 11 | def __init__(self, state_size, action_size, seed): 12 | """Initialize parameters and build model. 
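        After the shared first layer the network splits into separate advantage and state-value streams, which forward() recombines into Q(s,a).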
13 | Params 14 | ====== 15 | state_size (int): Dimension of each state 16 | action_size (int): Dimension of each action 17 | seed (int): Random seed 18 | """ 19 | super(QNetwork, self).__init__() 20 | self.action_size=action_size 21 | self.seed = torch.manual_seed(seed) 22 | self.fc1=nn.Linear(state_size,H_1) 23 | 24 | self.fc2_adv = nn.Linear(H_1,H_2) 25 | self.fc2_v = nn.Linear(H_1, H_2) 26 | 27 | self.fc3_adv = nn.Linear(H_2,action_size) 28 | self.fc3_v = nn.Linear(H_2, 1) 29 | 30 | 31 | def forward(self, state): 32 | # first hidden layer 33 | h1=F.relu(self.fc1(state)) 34 | 35 | # dueling start in second layer 36 | h2_adv = F.relu(self.fc2_adv(h1)) 37 | h2_v = F.relu(self.fc2_v(h1)) 38 | 39 | # final advantage value 40 | adv = self.fc3_adv(h2_adv) 41 | # final state value 42 | v = self.fc3_v(h2_v).expand(state.size(0), self.action_size) # 从1维扩展到 action_size维 43 | 44 | # calculate final Q(s,a) value for output 45 | out_q=v+adv-adv.mean(1).unsqueeze(1).expand(state.size(0), self.action_size) 46 | 47 | return out_q 48 | 49 | 50 | -------------------------------------------------------------------------------- /DQNs/DDQN/models/checkpoint.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/models/checkpoint.pth -------------------------------------------------------------------------------- /DQNs/DDQN/models/dueling_model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/models/dueling_model.pth -------------------------------------------------------------------------------- /DQNs/DDQN/models/org_dqn.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/models/org_dqn.pth -------------------------------------------------------------------------------- /DQNs/DDQN/play_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import torch 4 | import numpy as np 5 | from collections import deque 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | env = gym.make('LunarLander-v2') 10 | env.seed(0) 11 | print('State shape: ', env.observation_space.shape) 12 | print('Number of actions: ', env.action_space.n) 13 | 14 | 15 | # 观察一个未经训练的随机智能体 16 | state = env.reset() 17 | for _ in range(10000): 18 | env.render() 19 | next_state, reward, done, _ =env.step(env.action_space.sample()) 20 | # print(reward) 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /DQNs/DDQN/test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import torch 4 | from collections import namedtuple, deque 5 | 6 | lst = [1,2,3,4,0,6] 7 | 8 | print( np.array(lst) / 2.5) 9 | print( np.array(lst) / 2.5 + 0.0001) 10 | 11 | s1 = np.array([[1,2,3,4]]) 12 | s2 = np.vstack([[3,4,5,6]]) 13 | print(np.vstack([s1,s2])) 14 | 15 | 16 | 17 | 18 | target_org=np.array([[ 0.0910, -0.0224, -0.0552, -0.0192], 19 | [ 0.0908, -0.0209, -0.0553, -0.0181], 20 | [ 0.0922, -0.0219, -0.0546, -0.0206], 21 | [ 0.0913, -0.0211, -0.0548, -0.0182], 22 | [ 0.0910, -0.0211, 
-0.0554, -0.0187]]) 23 | target_org=torch.tensor(target_org) 24 | # print(target_org.shape) 25 | # # 按行取最大值 26 | # print(target_org.detach().max(1)) 27 | # print(target_org.detach().max(1)[0]) 28 | # # 转换成列向量 29 | # print(target_org.detach().max(1)[0].unsqueeze(1)) 30 | 31 | 32 | local_org=np.array([[ 0.0936, -0.0768, -0.1730, -0.0238], 33 | [ 0.0930, -0.0620, -0.1845, -0.0077], 34 | [ 0.0986, -0.0473, -0.1868, 0.0110], 35 | [ 0.0946, -0.0752, -0.1726, -0.0264], 36 | [ 0.0979, -0.0497, -0.1886, 0.0097]]) 37 | local_org=torch.tensor(local_org) 38 | actions=torch.tensor(np.array( 39 | [[3], 40 | [1], 41 | [2], 42 | [0], 43 | [0]])) 44 | # print(actions) 45 | # print(local_org.shape) 46 | # print(local_org.gather(1, actions.long())) 47 | 48 | 49 | b=torch.tensor(np.array([ 0.0932, -0.0206, -0.0541, -0.0204])) 50 | action=torch.LongTensor([0]) 51 | print(b.gather(0,action)) 52 | 53 | memory=deque(maxlen=10) 54 | exp=namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 55 | e1=exp(0.34,1,3.56,0.56,False) 56 | memory.append(e1) 57 | e2=exp(3.34,0,8.56,-2.3,False) 58 | memory.append(e2) 59 | memory.append(exp(4.6,0,8.56,-2.3,False)) 60 | memory.append(exp(8.7,0,8.56,-4.3,False)) 61 | memory.append(exp(2.2,0,-0.8,-2.3,False)) 62 | 63 | 64 | 65 | # print(memory) 66 | # print(memory[0].state) 67 | # print(len(memory)) 68 | # # 69 | # sample_inds=np.random.choice(len(memory), 3, p=[0.1,0.2,0.2,0.4,0.1],replace=False) 70 | # print(sample_inds) 71 | 72 | 73 | # env = gym.make('LunarLander-v2') 74 | # env.seed(0) 75 | # print('State shape: ', env.observation_space.shape) 76 | # print('Number of actions: ', env.action_space.n) 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /DQNs/DQN_PER/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/.DS_Store -------------------------------------------------------------------------------- /DQNs/DQN_PER/PER_memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from SumTree import SumTree 4 | # from DQNs.DQN_PER.SumTree import SumTree 5 | 6 | 7 | class Memory: # stored as ( s, a, r, s_ ) in SumTree 8 | e = 0.0001 9 | alpha = 0.6 10 | beta = 0.4 11 | beta_increment_per_sampling = 0.001 12 | 13 | def __init__(self, capacity): 14 | self.tree = SumTree(capacity) 15 | self.capacity = capacity 16 | 17 | # 根据 TD-error 计算优先级 18 | def _get_priority(self, error): 19 | return (np.abs(error) + self.e) ** self.alpha 20 | 21 | # 存储一条经验和相应优先级 22 | def add(self, error, sample): 23 | p = self._get_priority(error) 24 | self.tree.add(p, sample) 25 | 26 | def batch_sample(self, n): 27 | batch = [] 28 | idxs = [] 29 | segment = self.tree.total() / n 30 | priorities = [] 31 | 32 | # beta 随着sample的次数增加而增大(??),上限为 1.0 33 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) 34 | 35 | # 把叶子节点分成n个采样区间(n为样本数量) 36 | for i in range(n): 37 | a = segment * i 38 | b = segment * (i + 1) 39 | s = random.uniform(a, b) 40 | (idx, p, data) = self.tree.get(s) 41 | priorities.append(p) 42 | batch.append(data) 43 | idxs.append(idx) 44 | 45 | # 采样概率 46 | sampling_probabilities = np.array(priorities) / self.tree.total() + self.e 47 | # 样本权重: IS weight 48 | is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta) 49 | 
is_weight /= is_weight.max() 50 | 51 | return batch, idxs, is_weight 52 | 53 | def update(self, idx, error): 54 | p = self._get_priority(error) 55 | self.tree.update(idx, p) 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/cnn_per.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/cnn_per.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_2.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_exp-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_exp-1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_exp-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_exp-2.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_exp-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_exp-3.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_linear-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_linear-1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_2.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_DQN_per.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_DQN_per.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_exp-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp-1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_exp-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp-2.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_exp-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp-3.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_exp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_linear-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_linear-1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/SumTree.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | 4 | # SumTree 5 | # a binary tree data structure where the parent’s value is the sum of its children 6 | class SumTree: 7 | write = 0 8 | 9 | def __init__(self, capacity): 10 | self.capacity = capacity 11 | self.tree = numpy.zeros(2 * capacity - 1) 12 | self.data = numpy.zeros(capacity, dtype=object) 13 | self.n_entries = 0 14 | 15 | def total(self): 16 | return self.tree[0] 17 | 18 | # 从叶子节点到根节点向上传播,更新整棵树 19 | def _propagate(self, idx, change): 20 | parent = (idx - 1) // 2 21 | self.tree[parent] += change 22 | 23 | if parent != 0: 24 | self._propagate(parent, change) 25 | 26 | # 更新目标节点的 priority 27 | def update(self, idx, p): 28 | change = p - self.tree[idx] 29 | 30 | self.tree[idx] = p 31 | self._propagate(idx, change) 32 | 33 | # 存储样本和对应节点的 priority (只有叶子节点可以存储,上面节点的值都是下层的求和) 34 | def add(self, p, data): 35 | # 计算叶子节点的 index 36 | idx = self.write + self.capacity - 1 37 | 38 | self.data[self.write] = data 39 | self.update(idx, p) 40 | 41 | # 如果叶子节点已满,则从第一个开始清空重新存储 42 | self.write += 1 43 | if self.write >= self.capacity: 44 | self.write = 0 45 | 46 | if self.n_entries < self.capacity: 47 | self.n_entries += 1 48 | 49 | # 从根节点开始搜索,找到对应的叶子节点 50 | def _retrieve(self, idx, s): 51 | left = 2 * idx + 1 52 | right = left + 1 53 | 54 | if left >= len(self.tree): 55 | return idx 56 | 57 | if s 
<= self.tree[left]: 58 | return self._retrieve(left, s) 59 | else: 60 | return self._retrieve(right, s - self.tree[left]) 61 | 62 | # 采样方法,取得样本和对应的 priority 63 | def get(self, s): 64 | # 找到叶子节点的索引 65 | idx = self._retrieve(0, s) # s:在每个区间随机取的值 66 | # 找到样本的索引 67 | dataIdx = idx - self.capacity + 1 68 | 69 | return (idx, self.tree[idx], self.data[dataIdx]) 70 | -------------------------------------------------------------------------------- /DQNs/DQN_PER/__pycache__/PER_memory.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/PER_memory.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DQN_PER/__pycache__/SumTree.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/SumTree.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DQN_PER/__pycache__/atari_wappers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/atari_wappers.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DQN_PER/__pycache__/dqn_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/dqn_model.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DQN_PER/__pycache__/dqn_per.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/dqn_per.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DQN_PER/atari_wappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import gym 3 | import gym.spaces 4 | import numpy as np 5 | import collections 6 | 7 | 8 | class MaxAndSkipEnv(gym.Wrapper): 9 | """ 10 | Combines the repetition of actions during k frames and pixels from two consecutive frames. 11 | """ 12 | def __init__(self, env=None, skip=4): 13 | super(MaxAndSkipEnv, self).__init__(env) 14 | self._obs_buffer = collections.deque(maxlen=2) 15 | self._skip = skip 16 | 17 | def step(self, action): 18 | total_reward = 0.0 19 | done = None 20 | for _ in range(self._skip): 21 | obs, reward, done, info = self.env.step(action) 22 | self._obs_buffer.append(obs) 23 | total_reward += reward 24 | if done: 25 | break 26 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 27 | return max_frame, total_reward, done, info 28 | 29 | def reset(self): 30 | self._obs_buffer.clear() 31 | obs = self.env.reset() 32 | self._obs_buffer.append(obs) 33 | return obs 34 | 35 | 36 | class FireResetEnv(gym.Wrapper): 37 | """ 38 | Presses fire button for environments that require it for the game to start. 
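    The overridden reset() presses FIRE (action 1) and, if the episode terminates immediately, retries with action 2 before returning the first observation.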
39 | Also checks for some corner cases in some games 40 | """ 41 | def __init__(self,env=None): 42 | """For environments where the user need to press FIRE for the game to start.""" 43 | super(FireResetEnv, self).__init__(env) 44 | assert env.unwrapped.get_action_meanings()[1]=="FIRE" 45 | assert len(env.unwrapped.get_action_meanings()) >= 3 46 | 47 | def step(self,action): 48 | 49 | return self.env.step(action) 50 | 51 | def reset(self): 52 | 53 | self.env.reset() 54 | 55 | obs,_,done,_ = self.env.step(1) 56 | if done: 57 | self.env.reset() 58 | obs, _, done, _ = self.env.step(2) 59 | if done: 60 | self.env.reset() 61 | return obs 62 | 63 | 64 | class ProcessFrame84(gym.ObservationWrapper): 65 | """ 66 | converts input image of 210x160 rgb to grayscale 84x84 67 | """ 68 | def __init__(self, env=None): 69 | super(ProcessFrame84, self).__init__(env) 70 | 71 | self.observation_space = gym.spaces.Box( 72 | low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) 73 | 74 | def observation(self, obs): 75 | 76 | return ProcessFrame84.process(obs) 77 | @staticmethod 78 | def process(frame): 79 | if frame.size == 210 * 160 * 3: 80 | img = np.reshape(frame, [210, 160, 3]).astype( 81 | np.float32) 82 | elif frame.size == 250 * 160 * 3: 83 | img = np.reshape(frame, [250, 160, 3]).astype( 84 | np.float32) 85 | else: 86 | assert False, "Unknown resolution." 87 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + \ 88 | img[:, :, 2] * 0.114 89 | resized_screen = cv2.resize( 90 | img, (84, 110), interpolation=cv2.INTER_AREA) 91 | x_t = resized_screen[18:102, :] 92 | x_t = np.reshape(x_t, [84, 84, 1]) 93 | return x_t.astype(np.uint8) 94 | 95 | 96 | class ImageToPyTorch(gym.ObservationWrapper): 97 | def __init__(self, env): 98 | super(ImageToPyTorch, self).__init__(env) 99 | old_shape = self.observation_space.shape 100 | new_shape = (old_shape[-1], old_shape[0], old_shape[1]) 101 | self.observation_space = gym.spaces.Box( 102 | low=0.0, high=1.0, shape=new_shape, dtype=np.float32) 103 | def observation(self, observation): 104 | return np.moveaxis(observation, 2, 0) 105 | 106 | 107 | class BufferWrapper(gym.ObservationWrapper): 108 | def __init__(self, env, n_steps, dtype=np.float32): 109 | super(BufferWrapper, self).__init__(env) 110 | self.dtype = dtype 111 | old_space = env.observation_space 112 | self.observation_space = gym.spaces.Box( 113 | old_space.low.repeat(n_steps, axis=0), 114 | old_space.high.repeat(n_steps, axis=0), dtype=dtype) 115 | def reset(self): 116 | self.buffer = np.zeros_like( 117 | self.observation_space.low, dtype=self.dtype) 118 | return self.observation(self.env.reset()) 119 | def observation(self, observation): 120 | self.buffer[:-1] = self.buffer[1:] 121 | self.buffer[-1] = observation 122 | return self.buffer 123 | 124 | 125 | class ScaledFloatFrame(gym.ObservationWrapper): 126 | def observation(self, obs): 127 | return np.array(obs).astype(np.float32) / 255.0 128 | 129 | 130 | def make_env(env_name): 131 | env = gym.make(env_name) 132 | env = MaxAndSkipEnv(env) 133 | env = FireResetEnv(env) 134 | env = ProcessFrame84(env) 135 | env = ImageToPyTorch(env) 136 | env = BufferWrapper(env, 4) 137 | env = ScaledFloatFrame(env) 138 | 139 | return env 140 | 141 | 142 | if __name__ == "__main__": 143 | env_name = "Pong-v0" 144 | 145 | env = make_env(env_name) 146 | print(env.reset().shape) 147 | print(env.observation_space) 148 | env.render() -------------------------------------------------------------------------------- /DQNs/DQN_PER/dqn_model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class MLP_Model(nn.Module): 8 | def __init__(self, state_size, action_size): 9 | super(MLP_Model, self).__init__() 10 | self.fc1=nn.Linear(state_size,128) 11 | self.fc2=nn.Linear(128,256) 12 | self.fc3=nn.Linear(256,action_size) 13 | 14 | def forward(self, state): 15 | """Build a network that maps state -> action values.""" 16 | out=self.fc1(state) 17 | out=F.relu(out) 18 | out=self.fc2(out) 19 | out=F.relu(out) 20 | q_a=self.fc3(out) 21 | 22 | return q_a 23 | 24 | 25 | class CNN_Model (nn.Module): 26 | def __init__(self, input_shape, n_actions): 27 | super(CNN_Model, self).__init__() 28 | self.conv = nn.Sequential( 29 | # input_shape 的第一个维度为 输入的 channel 数,比如输入为(4,84,84)时,channel = 4 30 | nn.Conv2d(input_shape[0], 128, kernel_size=8, stride=4), 31 | nn.ReLU(), 32 | nn.Conv2d(128, 256, kernel_size=4, stride=2), 33 | nn.ReLU(), 34 | nn.Conv2d(256, 128, kernel_size=3, stride=1), 35 | nn.ReLU() 36 | ) 37 | conv_out_size = self._get_conv_out(input_shape) 38 | self.fc = nn.Sequential( 39 | nn.Linear(conv_out_size, 512), 40 | nn.ReLU(), 41 | nn.Linear(512, n_actions) 42 | ) 43 | 44 | def _get_conv_out(self, input_shape): 45 | o = self.conv(torch.zeros((1, *input_shape))) 46 | return int(np.prod(o.size())) 47 | 48 | def forward(self, x): 49 | conv_out = self.conv(x) 50 | conv_out = conv_out.view(x.size()[0], -1) 51 | return self.fc(conv_out) 52 | -------------------------------------------------------------------------------- /DQNs/DQN_PER/main_dqn_per.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" 3 | 4 | import gym 5 | import arrow 6 | import torch 7 | import numpy as np 8 | from matplotlib import pyplot as plt 9 | from collections import deque 10 | from dqn_per import Agent_dqn 11 | import atari_wappers 12 | # from DQNs.DQN_PER.dqn_per import Agent_dqn 13 | # from DQNs.DQN_PER import atari_wappers 14 | 15 | 16 | def train_agent(agent,state_size,n_episodes ): 17 | scores_window = deque(maxlen=100) # last 100 scores 18 | scores , eps_lst = [],[] 19 | 20 | start_time = arrow.now() 21 | for i_episode in range(1, n_episodes + 1): 22 | state = env.reset() 23 | score = 0 24 | 25 | while True: 26 | action,epsilon = agent.act(state,i_episode) 27 | next_state, reward, done, _ = env.step(action) 28 | 29 | ## add sample and train agent 30 | sarsd = (state, action, reward, next_state, done) 31 | agent.step(sarsd) 32 | 33 | state = next_state 34 | score += reward 35 | if done: 36 | break 37 | 38 | scores_window.append(score) # save most recent score 39 | scores.append(score) # save most recent score 40 | eps_lst.append(epsilon) 41 | 42 | print('\rEpisode {} \t Average Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="") 43 | if i_episode % 100 == 0: 44 | print('\rEpisode {}\t Average Score: {:.2f}'.format(i_episode,np.mean(scores_window))) 45 | print('\rRunning time:{}\n'.format(arrow.now() - start_time)) 46 | # if np.mean(scores_window) >= 195.0: 47 | # print('\nEnvironment solved in {:d} episodes! 
\t Average Score: {:.2f}'.format(i_episode - 100, 48 | # np.mean(scores_window))) 49 | # # torch.save(agent.qnetwork_local.state_dict(), model_file) 50 | # print('\nTotal running time:{}'.format(arrow.now() - start_time)) 51 | # break 52 | 53 | return scores,eps_lst 54 | 55 | 56 | def plot_curves(data,plot_name,filename): 57 | fig = plt.figure() 58 | ax = fig.add_subplot(1, 1, 1) 59 | ax.plot(np.arange(len(data)), data) 60 | plt.ylabel(plot_name) 61 | plt.xlabel('Episode #') 62 | plt.savefig(filename) 63 | 64 | 65 | if __name__=="__main__": 66 | env = atari_wappers.make_env("SpaceInvaders-v0") 67 | state_size, action_size = env.observation_space.shape, env.action_space.n 68 | 69 | cnn_agent = Agent_dqn(state_size,action_size,'CNN','True','nonlinear') 70 | train_scores, _ = train_agent(cnn_agent, state_size, 2500) 71 | plot_curves(train_scores, 'Scores', 'Plots/cnn_per.png') 72 | 73 | # env = gym.make('CartPole-v0') 74 | # env.seed(0) 75 | # state_size, action_size = env.observation_space.shape[0], env.action_space.n 76 | # mlp_agent = Agent_dqn(state_size, action_size,'MLP','True','nonlinear') 77 | # train_scores,eps_lst = train_agent(mlp_agent,state_size,2500) 78 | # plot_curves(train_scores,'Scores','Plots/train_exp-3.png') 79 | # if mlp_agent.eps_decay: 80 | # plot_curves(eps_lst,'Epsilon', 'Plots/epsilon_exp-3.png') 81 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/.DS_Store -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/CNN_model|03-29#19:21.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-29#19:21.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/CNN_model|03-30#11:19.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-30#11:19.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/CNN_model|03-30#21:05.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-30#21:05.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/CNN_model|03-31#19:32.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-31#19:32.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/dqnCNN_model_0324.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/dqnCNN_model_0324.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/dqn_model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/dqn_model.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/test-score|03-25#20:00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/test-score|03-25#20:00.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/test-score|03-26#09:15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/test-score|03-26#09:15.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/test-score|03-26#09:45.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/test-score|03-26#09:45.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/train-score|03-29#19:21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-29#19:21.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/train-score|03-30#11:19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-30#11:19.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/train-score|03-30#21:05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-30#21:05.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/train-score|03-31#19:32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-31#19:32.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | ### To start the project 3 | 4 | #### 0. 
Basic settings 5 | * `env_name`: (str) name of the gym Atari env that you want to play with 6 | * `run_mode` : (str: train/test) 7 | 8 | #### 1. To train an agent using DDQN with a CNN network 9 | * `train_episode` 10 | * `learning_rate` 11 | * `buffer_size` 12 | * `batch_size` 13 | * `gamma` 14 | * `update_every` 15 | * `eps_decay` 16 | 17 | e.g. 18 | ``` 19 | python main_dqn_atari.py SpaceInvaders-v0 train --learning_rate 1e-3 20 | ``` 21 | 22 | To run in the background and save a log file: 23 | ``` 24 | nohup python -u main_dqn_atari.py SpaceInvaders-v0 train --learning_rate 1e-3 > train_20210326.log 2>&1 & 25 | ``` 26 | 27 | #### 2. To test a trained agent 28 | * `test_episode` (int) number of episodes you want to test the agent for 29 | * `test_model_file` (str) path of the model file corresponding to the trained agent you want to test 30 | * `test_video_play` (str: yes/no) whether you want to watch the gameplay video during testing 31 | 32 | e.g. 33 | ``` 34 | python main_dqn_atari.py SpaceInvaders-v0 test --test_episode 500 --test_model_file Models/dqnCNN_model_0324.pth --test_video_play no 35 | ``` 36 | 37 |
-------------------------------------------------------------------------------- /DQNs/DQN_cnn/__pycache__/atari_wappers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/__pycache__/atari_wappers.cpython-38.pyc
-------------------------------------------------------------------------------- /DQNs/DQN_cnn/__pycache__/cnn_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/__pycache__/cnn_model.cpython-38.pyc
-------------------------------------------------------------------------------- /DQNs/DQN_cnn/__pycache__/dqn_agent.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/__pycache__/dqn_agent.cpython-38.pyc
-------------------------------------------------------------------------------- /DQNs/DQN_cnn/atari_wappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import gym 3 | import gym.spaces 4 | import numpy as np 5 | import collections 6 | 7 | 8 | class MaxAndSkipEnv(gym.Wrapper): 9 | """ 10 | Repeats each action for `skip` frames and returns the pixel-wise max over the last two frames. 11 | """ 12 | def __init__(self, env=None, skip=4): 13 | super(MaxAndSkipEnv, self).__init__(env) 14 | self._obs_buffer = collections.deque(maxlen=2) 15 | self._skip = skip 16 | 17 | def step(self, action): 18 | total_reward = 0.0 19 | done = None 20 | for _ in range(self._skip): 21 | obs, reward, done, info = self.env.step(action) 22 | self._obs_buffer.append(obs) 23 | total_reward += reward 24 | if done: 25 | break 26 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 27 | return max_frame, total_reward, done, info 28 | 29 | def reset(self): 30 | self._obs_buffer.clear() 31 | obs = self.env.reset() 32 | self._obs_buffer.append(obs) 33 | return obs 34 | 35 | 36 | class FireResetEnv(gym.Wrapper): 37 | """ 38 | Presses the FIRE button in environments that require it for the game to start.
39 | Also checks for some corner cases in some games 40 | """ 41 | def __init__(self,env=None): 42 | """For environments where the user need to press FIRE for the game to start.""" 43 | super(FireResetEnv, self).__init__(env) 44 | assert env.unwrapped.get_action_meanings()[1]=="FIRE" 45 | assert len(env.unwrapped.get_action_meanings()) >= 3 46 | 47 | def step(self,action): 48 | 49 | return self.env.step(action) 50 | 51 | def reset(self): 52 | 53 | self.env.reset() 54 | 55 | obs,_,done,_ = self.env.step(1) 56 | if done: 57 | self.env.reset() 58 | obs, _, done, _ = self.env.step(2) 59 | if done: 60 | self.env.reset() 61 | return obs 62 | 63 | 64 | class ProcessFrame84(gym.ObservationWrapper): 65 | """ 66 | converts input image of 210x160 rgb to grayscale 84x84 67 | """ 68 | def __init__(self, env=None): 69 | super(ProcessFrame84, self).__init__(env) 70 | 71 | self.observation_space = gym.spaces.Box( 72 | low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) 73 | 74 | def observation(self, obs): 75 | 76 | return ProcessFrame84.process(obs) 77 | @staticmethod 78 | def process(frame): 79 | if frame.size == 210 * 160 * 3: 80 | img = np.reshape(frame, [210, 160, 3]).astype( 81 | np.float32) 82 | elif frame.size == 250 * 160 * 3: 83 | img = np.reshape(frame, [250, 160, 3]).astype( 84 | np.float32) 85 | else: 86 | assert False, "Unknown resolution." 87 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + \ 88 | img[:, :, 2] * 0.114 89 | resized_screen = cv2.resize( 90 | img, (84, 110), interpolation=cv2.INTER_AREA) 91 | x_t = resized_screen[18:102, :] 92 | x_t = np.reshape(x_t, [84, 84, 1]) 93 | return x_t.astype(np.uint8) 94 | 95 | 96 | class ImageToPyTorch(gym.ObservationWrapper): 97 | def __init__(self, env): 98 | super(ImageToPyTorch, self).__init__(env) 99 | old_shape = self.observation_space.shape 100 | new_shape = (old_shape[-1], old_shape[0], old_shape[1]) 101 | self.observation_space = gym.spaces.Box( 102 | low=0.0, high=1.0, shape=new_shape, dtype=np.float32) 103 | def observation(self, observation): 104 | return np.moveaxis(observation, 2, 0) 105 | 106 | 107 | class BufferWrapper(gym.ObservationWrapper): 108 | def __init__(self, env, n_steps, dtype=np.float32): 109 | super(BufferWrapper, self).__init__(env) 110 | self.dtype = dtype 111 | old_space = env.observation_space 112 | self.observation_space = gym.spaces.Box( 113 | old_space.low.repeat(n_steps, axis=0), 114 | old_space.high.repeat(n_steps, axis=0), dtype=dtype) 115 | def reset(self): 116 | self.buffer = np.zeros_like( 117 | self.observation_space.low, dtype=self.dtype) 118 | return self.observation(self.env.reset()) 119 | def observation(self, observation): 120 | self.buffer[:-1] = self.buffer[1:] 121 | self.buffer[-1] = observation 122 | return self.buffer 123 | 124 | 125 | class ScaledFloatFrame(gym.ObservationWrapper): 126 | def observation(self, obs): 127 | return np.array(obs).astype(np.float32) / 255.0 128 | 129 | 130 | def make_env(env_name): 131 | env = gym.make(env_name) 132 | env = MaxAndSkipEnv(env) 133 | env = FireResetEnv(env) 134 | env = ProcessFrame84(env) 135 | env = ImageToPyTorch(env) 136 | env = BufferWrapper(env, 4) 137 | env = ScaledFloatFrame(env) 138 | 139 | return env 140 | 141 | 142 | if __name__ == "__main__": 143 | env_name = "Pong-v0" 144 | 145 | env = make_env(env_name) 146 | print(env.reset().shape) 147 | print(env.observation_space) 148 | env.render() -------------------------------------------------------------------------------- /DQNs/DQN_cnn/cnn_model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import atari_wappers 6 | 7 | 8 | class CNN_Model (nn.Module): 9 | def __init__(self, input_shape, n_actions): 10 | super(CNN_Model, self).__init__() 11 | self.conv = nn.Sequential( 12 | # input_shape 的第一个维度为 输入的 channel 数,比如输入为(4,84,84)时,channel = 4 13 | nn.Conv2d(input_shape[0], 128, kernel_size=8, stride=4), 14 | nn.ReLU(), 15 | nn.Conv2d(128, 256, kernel_size=4, stride=2), 16 | nn.ReLU(), 17 | nn.Conv2d(256, 128, kernel_size=3, stride=1), 18 | nn.ReLU() 19 | ) 20 | conv_out_size = self._get_conv_out(input_shape) 21 | self.fc = nn.Sequential( 22 | nn.Linear(conv_out_size, 512), 23 | nn.ReLU(), 24 | nn.Linear(512, n_actions) 25 | ) 26 | 27 | def _get_conv_out(self, input_shape): 28 | o = self.conv(torch.zeros((1, *input_shape))) 29 | return int(np.prod(o.size())) 30 | 31 | def forward(self, x): 32 | conv_out = self.conv(x) 33 | conv_out = conv_out.view(x.size()[0], -1) 34 | return self.fc(conv_out) 35 | 36 | 37 | if __name__ == "__main__": 38 | env = atari_wappers.make_env("SpaceInvaders-v0") 39 | state_size, action_size = env.observation_space.shape, env.action_space.n 40 | print(state_size, action_size) 41 | model = CNN_Model(state_size, action_size) 42 | 43 | state = env.reset() 44 | obs = env.reset() 45 | obs1 = env.reset() 46 | t = torch.tensor([obs, obs1]) 47 | print("x.shape", t.shape) 48 | 49 | q_value = model.forward(t) 50 | actions = torch.tensor([[0,1]]) 51 | print(q_value) 52 | print(q_value.gather(1,actions)) 53 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import namedtuple, deque 4 | import torch 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from cnn_model import CNN_Model 8 | 9 | TAU = 1e-3 # for soft update of target parameters 10 | EPS_start=1.0 11 | EPS_end=0.01 12 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | class ReplayBuffer: 16 | """Fixed-size buffer to store experience tuples.""" 17 | 18 | def __init__(self, action_size, buffer_size, batch_size): 19 | """Initialize a ReplayBuffer object. 
20 | 21 | Params 22 | ====== 23 | action_size (int): dimension of each action 24 | buffer_size (int): maximum size of buffer 25 | batch_size (int): size of each training batch 26 | seed (int): random seed 27 | """ 28 | self.action_size = action_size 29 | self.memory = deque(maxlen=buffer_size) 30 | # 使用 deque(maxlen=N) 构造函数会新建一个固定大小的队列。当新的元素加入并且这个队列已满的时候, 最老的元素会自动被移除掉 31 | self.batch_size = batch_size 32 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 33 | 34 | def add(self, state, action, reward, next_state, done): 35 | """ 36 | Add a new experience to the memory 37 | :param state: 38 | :param p: sample probability for this experience 39 | :return: 40 | """ 41 | e = self.experience(state, action, reward, next_state, done) 42 | self.memory.append(e) 43 | 44 | def clean_buffer(self): 45 | self.memory.clear() 46 | 47 | def sample(self): 48 | """Randomly sample a batch of experiences from memory.""" 49 | experiences = random.sample(self.memory, k=self.batch_size) 50 | 51 | states = torch.tensor([e.state for e in experiences if e is not None]).float().to(device) 52 | actions = torch.tensor([[e.action for e in experiences if e is not None]]).long().to(device) 53 | rewards = torch.tensor([e.reward for e in experiences if e is not None]).float().to(device) 54 | next_states = torch.tensor([e.next_state for e in experiences if e is not None]).float().to( 55 | device) 56 | dones = torch.from_numpy(np.array([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to( 57 | device) 58 | return (states, actions, rewards, next_states, dones) 59 | 60 | def __len__(self): 61 | """Return the current size of internal memory.""" 62 | return len(self.memory) 63 | 64 | 65 | class Agent_dqn(): 66 | def __init__(self, input_channel,action_size,learning_rate=5e-3,buffer_size=int(1e4),batch_size=32): 67 | """Initialize an Agent object. 
68 | 69 | Params 70 | ====== 71 | state_size (int): dimension of each state 72 | action_size (int): dimension of each action 73 | seed (int): random seed 74 | """ 75 | self.action_size = action_size 76 | 77 | # Q-Network 78 | self.qnetwork_local = CNN_Model(input_channel,action_size).to(device) 79 | self.qnetwork_target = CNN_Model(input_channel,action_size).to(device) 80 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), learning_rate) 81 | 82 | # Replay memory 83 | self.batch_size = batch_size 84 | self.memory = ReplayBuffer(action_size, buffer_size,batch_size) 85 | # Initialize time step (for updating every UPDATE_EVERY steps) 86 | self.t_step = 0 87 | self.episode = 0 88 | self.epsilon = EPS_start 89 | 90 | def act(self,state,i_episode,eps_decay): 91 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 92 | self.qnetwork_local.eval() 93 | with torch.no_grad(): 94 | action_values = self.qnetwork_local(state) 95 | self.qnetwork_local.train() 96 | 97 | " Epsilon-greedy action selection" 98 | if i_episode>self.episode: 99 | # update EPS every new episode 100 | self.epsilon = max(EPS_end, eps_decay * self.epsilon) 101 | self.episode = i_episode 102 | # epsilon greedy policy 103 | if random.random() > self.epsilon: 104 | return np.argmax(action_values.cpu().data.numpy()) 105 | else: 106 | return random.choice(np.arange(self.action_size)) 107 | 108 | def act_greedy_policy(self,state): 109 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 110 | self.qnetwork_local.eval() 111 | with torch.no_grad(): 112 | action_values = self.qnetwork_local(state) 113 | return np.argmax(action_values.cpu().data.numpy()) 114 | 115 | def step(self,sarsd,gamma,update_every): 116 | state, action, reward, next_state, done = sarsd 117 | self.t_step += 1 118 | 119 | # add an experience for current time step 120 | self.memory.add(state, action, reward, next_state, done) 121 | 122 | # Learn every UPDATE_EVERY time steps 123 | if (self.t_step+1) % update_every==0: 124 | if self.memory.__len__()>self.batch_size: 125 | batch_exps = self.memory.sample() 126 | loss = self.learn(batch_exps,gamma) 127 | return loss 128 | 129 | def learn(self,exps,gamma): 130 | # fetch the batch (s,a,r,s',done) from experiences batch 131 | states,actions,rewards,next_states,dones = exps 132 | print(states.shape) 133 | 134 | # ------------------ calculate loss —------------------------- # 135 | 136 | # calculate Q targets 137 | expected_next_max_actions = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(0) 138 | Q_expected_next = self.qnetwork_target(next_states).gather(1, expected_next_max_actions) 139 | Q_targets = rewards + (gamma * Q_expected_next * (1 - dones)) 140 | 141 | # get expected Q for current state 142 | Q_expected = self.qnetwork_local(states).gather(1, actions) 143 | 144 | loss = F.mse_loss(Q_expected, Q_targets) 145 | 146 | # ---------------- update local Q net -------------------- # 147 | self.optimizer.zero_grad() 148 | loss.backward() 149 | self.optimizer.step() 150 | # print(next(self.qnetwork_local.parameters()).is_cuda) 151 | 152 | # ---------------- update target Q net -------------------- # 153 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 154 | 155 | return loss.cpu().detach().numpy() 156 | 157 | def soft_update(self, local_model, target_model, tau): 158 | """Soft update model parameters. 
159 | θ_target = τ*θ_local + (1 - τ)*θ_target 160 | 161 | Params 162 | ====== 163 | local_model (PyTorch model): weights will be copied from 164 | target_model (PyTorch model): weights will be copied to 165 | tau (float): interpolation parameter 166 | """ 167 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 168 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-0.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-100.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-100.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-140.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-140.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-152.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-152.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-167.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-167.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-185.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-185.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-200.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-200.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-204.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-204.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-227.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-227.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-300.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-300.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-400.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-400.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-500.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-500.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-600.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-600.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-674.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-674.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-683.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-683.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-696.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-696.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-700.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-700.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-714.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-714.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-733.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-733.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-756.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-756.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-800.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-800.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-900.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-900.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-902.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-902.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-909.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-909.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-920.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-920.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-936.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-936.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-956.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-956.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/log/train_20210326.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 2000 8 | Network update every 5 time step 9 | Replay buffer size : 5000 10 | Batch size : 32 11 | Learning rate : 0.001 12 | GAMMA : 0.99 
13 | Epsilon decay rate : 0.995 14 | 15 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/log/train_20210329.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 2000 8 | Network update every 5 time step 9 | Replay buffer size : 5000 10 | Batch size : 32 11 | Learning rate : 0.001 12 | GAMMA : 0.99 13 | Epsilon decay rate : 0.995 14 | 15 | Episode 100 Loss 38.24846267700195 Average Score: 136.55 16 | Running time till now :0:11:03.313392 17 | 18 | Episode 200 Loss 13.040627479553223 Average Score: 153.55 19 | Running time till now :0:22:44.250463 20 | 21 | Episode 300 Loss 15.12213134765625 Average Score: 193.75 22 | Running time till now :0:35:31.581827 23 | 24 | Episode 400 Loss 52.153236389160156 Average Score: 205.35 25 | Running time till now :0:49:19.173927 26 | 27 | Episode 500 Loss 21.199983596801758 Average Score: 230.80 28 | Running time till now :1:03:42.341828 29 | 30 | Episode 600 Loss 44.75456237792969 Average Score: 237.10 31 | Running time till now :1:18:31.195379 32 | 33 | Episode 700 Loss 71.21875762939453 Average Score: 279.30 34 | Running time till now :1:33:07.172067 35 | 36 | Episode 800 Loss 46.80872344970703 Average Score: 250.20 37 | Running time till now :1:45:39.041750 38 | 39 | Episode 900 Loss 69.49663543701172 Average Score: 270.55 40 | Running time till now :1:59:20.249699 41 | 42 | Episode 1000 Loss 97.96715545654297 Average Score: 270.80 43 | Running time till now :2:13:20.709739 44 | 45 | Episode 1100 Loss 82.20999145507812 Average Score: 280.75 46 | Running time till now :2:28:11.418844 47 | 48 | Episode 1200 Loss 29.77111053466797 Average Score: 270.15 49 | Running time till now :2:42:38.161003 50 | 51 | Episode 1300 Loss 33.50057601928711 Average Score: 263.00 52 | Running time till now :2:55:55.026575 53 | 54 | Episode 1400 Loss 32.226627349853516 Average Score: 296.65 55 | Running time till now :3:10:45.828023 56 | 57 | Episode 1500 Loss 30.3413143157959 Average Score: 280.10 58 | Running time till now :3:26:07.034734 59 | 60 | Episode 1600 Loss 30.96596336364746 Average Score: 271.00 61 | Running time till now :3:40:35.273112 62 | 63 | Episode 1700 Loss 32.25701904296875 Average Score: 255.85 64 | Running time till now :3:53:36.508000 65 | 66 | Episode 1800 Loss 28.328149795532227 Average Score: 293.50 67 | Running time till now :4:09:19.669146 68 | 69 | Episode 1900 Loss 29.688913345336914 Average Score: 259.10 70 | Running time till now :4:23:57.511495 71 | 72 | Episode 2000 Loss 27.258968353271484 Average Score: 261.15 73 | Running time till now :4:38:51.220961 74 | 75 | Training finished, total running time:4:38:51.232527. 76 | Model saved. 
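The run logged above decays epsilon once per episode via `epsilon = max(EPS_end, eps_decay * epsilon)` in `dqn_agent.py`, starting from EPS_start = 1.0 with a floor of EPS_end = 0.01. Below is a minimal sketch (not part of the repo) of how that schedule plays out for the decay rate 0.995 listed in the log header; the episode range and print interval are only illustrative:
```
EPS_START, EPS_END, EPS_DECAY = 1.0, 0.01, 0.995  # values from dqn_agent.py and the log header above

eps = EPS_START
for episode in range(1, 2001):
    # one multiplicative decay step per episode, clipped at the floor
    eps = max(EPS_END, EPS_DECAY * eps)
    if episode % 500 == 0:
        print(f"episode {episode}: epsilon = {eps:.3f}")

# epsilon is ~0.08 by episode 500 and reaches the 0.01 floor after roughly
# 920 episodes (0.995 ** 919 ~= 0.01), so the second half of a 2000-episode
# run is already almost purely greedy.
```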
77 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/log/train_20210329_1.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 1000 8 | Network update every 5 time step 9 | Replay buffer size : 5000 10 | Batch size : 64 11 | Learning rate : 0.005 12 | GAMMA : 0.99 13 | Epsilon decay rate : 0.995 14 | 15 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/log/train_20210330.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 1000 8 | Network update every 5 time step 9 | Replay buffer size : 3500 10 | Batch size : 64 11 | Learning rate : 0.005 12 | GAMMA : 0.99 13 | Epsilon decay rate : 0.995 14 | 15 | Episode 100 Loss 71.22970581054688 Average Score: 180.00 16 | Running time till now :0:22:27.094323 17 | 18 | Episode 200 Loss 29.8588924407959 Average Score: 152.60 19 | Running time till now :0:44:21.409593 20 | 21 | Episode 300 Loss 61.14106369018555 Average Score: 209.75 22 | Running time till now :1:09:07.883698 23 | 24 | Episode 400 Loss 30.508338928222656 Average Score: 218.05 25 | Running time till now :1:35:38.577875 26 | 27 | Episode 500 Loss 89.18991088867188 Average Score: 245.05 28 | Running time till now :2:03:52.850307 29 | 30 | Episode 600 Loss 21.991769790649414 Average Score: 262.35 31 | Running time till now :2:30:57.078511 32 | 33 | Episode 700 Loss 23.49405860900879 Average Score: 254.65 34 | Running time till now :2:56:23.053378 35 | 36 | Episode 800 Loss 81.24069213867188 Average Score: 263.45 37 | Running time till now :3:21:37.061215 38 | 39 | Episode 900 Loss 24.93558692932129 Average Score: 284.15 40 | Running time till now :3:50:02.799914 41 | 42 | Episode 1000 Loss 85.55946350097656 Average Score: 268.20 43 | Running time till now :4:16:22.288483 44 | 45 | Training finished, total running time:4:16:22.299132. 46 | Model saved. 
47 | Traceback (most recent call last): 48 | File "main_dqn_atari.py", line 178, in 49 | dqn_agent = Agent_dqn(state_size,action_size) 50 | File "/home/lesreg/Remote_Pros/DRL_pytorch/DQNs/DQN_cnn/dqn_agent.py", line 79, in __init__ 51 | self.qnetwork_local = CNN_Model(input_channel,action_size).to(device) 52 | File "/home/lesreg/Remote_Pros/DRL_pytorch/DQNs/DQN_cnn/cnn_model.py", line 20, in __init__ 53 | conv_out_size = self._get_conv_out(input_shape) 54 | File "/home/lesreg/Remote_Pros/DRL_pytorch/DQNs/DQN_cnn/cnn_model.py", line 28, in _get_conv_out 55 | o = self.conv(torch.zeros((1, *input_shape))) 56 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl 57 | result = self.forward(*input, **kwargs) 58 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/container.py", line 119, in forward 59 | input = module(input) 60 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl 61 | result = self.forward(*input, **kwargs) 62 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 399, in forward 63 | return self._conv_forward(input, self.weight, self.bias) 64 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 396, in _conv_forward 65 | self.padding, self.dilation, self.groups) 66 | RuntimeError: Calculated padded input size per channel: (160 x 3). Kernel size: (8 x 8). Kernel size can't be greater than actual input size 67 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/log/train_20210331.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 2000 8 | Network update every 10 time step 9 | Replay buffer size : 3500 10 | Batch size : 128 11 | Learning rate : 0.0005 12 | GAMMA : 0.99 13 | Epsilon decay rate : 0.995 14 | 15 | Episode 100 Loss 37.57309341430664 Average Score: 144.10 16 | Running time till now :0:20:57.373758 17 | 18 | Episode 200 Loss 45.944541931152344 Average Score: 149.85 19 | Running time till now :0:43:08.088817 20 | 21 | Episode 300 Loss 77.53382110595703 Average Score: 200.00 22 | Running time till now :1:08:21.291658 23 | 24 | Episode 400 Loss 94.59493255615234 Average Score: 200.65 25 | Running time till now :1:34:55.924278 26 | 27 | Episode 500 Loss 20.661224365234375 Average Score: 228.70 28 | Running time till now :2:02:23.143794 29 | 30 | Episode 600 Loss 38.10764694213867 Average Score: 259.20 31 | Running time till now :2:32:13.928401 32 | 33 | Episode 700 Loss 21.809246063232422 Average Score: 238.65 34 | Running time till now :2:58:31.344456 35 | 36 | Episode 800 Loss 27.276247024536133 Average Score: 252.15 37 | Running time till now :3:22:48.359756 38 | 39 | Episode 900 Loss 64.65150451660156 Average Score: 273.00 40 | Running time till now :3:50:41.210916 41 | 42 | Episode 1000 Loss 25.74323272705078 Average Score: 266.45 43 | Running time till now :4:16:18.361667 44 | 45 | Episode 1100 Loss 18.910884857177734 Average Score: 277.10 46 | Running time till now :4:44:24.187721 47 | 48 | Episode 1200 Loss 26.118581771850586 Average Score: 267.65 49 | Running time till now :5:09:37.085516 
50 | 51 | Episode 1300 Loss 25.362396240234375 Average Score: 251.85 52 | Running time till now :5:34:53.701273 53 | 54 | Episode 1400 Loss 26.500167846679688 Average Score: 281.05 55 | Running time till now :6:00:55.716864 56 | 57 | Episode 1500 Loss 32.66218185424805 Average Score: 257.55 58 | Running time till now :6:25:39.960819 59 | 60 | Episode 1600 Loss 52.91573715209961 Average Score: 268.40 61 | Running time till now :6:50:39.701043 62 | 63 | Episode 1700 Loss 43.722801208496094 Average Score: 265.95 64 | Running time till now :7:13:50.829814 65 | 66 | Episode 1800 Loss 49.69996643066406 Average Score: 262.20 67 | Running time till now :7:39:07.592524 68 | 69 | Episode 1900 Loss 84.68921661376953 Average Score: 266.25 70 | Running time till now :8:06:00.022165 71 | 72 | Episode 2000 Loss 24.432580947875977 Average Score: 259.90 73 | Running time till now :8:37:15.948752 74 | 75 | Training finished, total running time:8:37:15.969363. 76 | Model saved. 77 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/main_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import arrow 4 | import torch 5 | import gym 6 | import numpy as np 7 | from matplotlib import pyplot as plt 8 | from collections import deque 9 | from DQNs.DQN_cnn.dqn_agent import Agent_dqn 10 | from DQNs.DQN_cnn import atari_wappers 11 | 12 | 13 | def plot_scores(scores,filename): 14 | fig = plt.figure() 15 | ax = fig.add_subplot(1, 1, 1) 16 | ax.plot(np.arange(len(scores)), scores) 17 | plt.ylabel('Score') 18 | plt.xlabel('Episode #') 19 | plt.savefig(filename) 20 | 21 | 22 | 23 | def train_agent(env,agent,n_episode,eps_decay,gamma,update_every): 24 | scores = [] # list containing scores from each episode 25 | scores_window = deque(maxlen=100) # last 100 scores 26 | 27 | start_time = arrow.now() 28 | for i_episode in range(1, n_episode + 1): 29 | state = env.reset() 30 | print(state.shape) 31 | score = 0 32 | episode_loss=[] 33 | while True: 34 | # # check the memory usage of system, clean replay buffer if too high 35 | # if (sys_mem.used / sys_mem.total) >= 0.03: 36 | # agent.memory.clean_buffer() 37 | # print('Buffer cleaned on episode {}'.format(i_episode)) 38 | # get action 39 | action = agent.act(state,i_episode,eps_decay) 40 | # interact with env (one step) 41 | next_state, reward, done, _ = env.step(action) 42 | # train the agent 43 | sarsd = (state, action, reward, next_state,done) 44 | loss = agent.step(sarsd,gamma,update_every) 45 | # update status 46 | state = next_state 47 | score += reward 48 | # break the loop if current episode is over 49 | if done: 50 | break 51 | if loss is not None: 52 | episode_loss.append(loss) 53 | 54 | # update rewards and scores every episode 55 | scores_window.append(score) 56 | scores.append(score) 57 | 58 | # print('\rEpisode {}\t Loss {} \t Average Score: {:.2f}'.format(i_episode, np.mean(episode_loss), 59 | # np.mean(scores_window)), end="") 60 | # 61 | # if i_episode > 25: 62 | # print('Replay Buffer size: {}'.format(agent.memory.__len__())) 63 | # print('Memory used: ',sys_mem.used) 64 | # print('Memory used rate: ',sys_mem.used/sys_mem.total) 65 | 66 | if i_episode % 100 == 0: 67 | print('\rEpisode {}\t Loss {} \t Average Score: {:.2f}'.format(i_episode, np.mean(episode_loss), 68 | np.mean(scores_window))) 69 | print('\rRunning time till now :{}\n'.format(arrow.now() - start_time)) 70 | 71 | 72 | print("Training finished, total running time:{}. 
\n Model saved.".format(arrow.now()-start_time)) 73 | 74 | return scores 75 | 76 | 77 | 78 | 79 | 80 | if __name__ =="__main__": 81 | env = atari_wappers.make_env("SpaceInvaders-v0") 82 | state_size, action_size = env.observation_space.shape, env.action_space.n 83 | dqn_agent = Agent_dqn(state_size,action_size) 84 | train_agent(env,dqn_agent,1,0.98,0.995,5) 85 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/play_atari.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import deque 4 | import os 5 | import torch 6 | # from skimage import io 7 | from DQNs.DQN_cnn.dqn_agent import Agent_dqn 8 | from DQNs.DQN_cnn import atari_wappers 9 | 10 | 11 | # def save_films(state,step): 12 | # if not os.path.exists('./image'): 13 | # os.makedirs('./image') 14 | # img_name = './image/pic-%d.jpg' % step 15 | # io.imsave(img_name, state) 16 | 17 | 18 | def random_play(): 19 | for step in range(5000): 20 | env.render() 21 | action = 1 22 | state, reward, done, info = env.step(action) 23 | 24 | if step % 100 == 0: 25 | print(state.shape) 26 | # print(state) 27 | save_films(state, step) 28 | 29 | if reward > 0: 30 | print(reward, step) 31 | save_films(state, step) 32 | 33 | if done: 34 | print('dead in %d steps' % step) 35 | break 36 | 37 | 38 | def random_test(env): 39 | socres = [] 40 | scores_window = deque(maxlen=100) 41 | 42 | for i_episode in range(100): 43 | state = env.reset() 44 | score = 0 45 | while True: 46 | action = np.random.choice(env.action_space.n,1)[0] 47 | state, reward, done, info = env.step(action) 48 | score += reward 49 | if done: 50 | break 51 | socres.append(score) 52 | scores_window.append(score) 53 | 54 | if i_episode % 10 == 0: 55 | print('Episode {},\t Average score : {} '.format(i_episode, np.mean(scores_window))) 56 | 57 | 58 | def trained_agent_test(env,agent): 59 | socres = [] 60 | scores_window = deque(maxlen=100) 61 | 62 | for i_episode in range(5000): 63 | state = env.reset() 64 | score = 0 65 | 66 | while True: 67 | action = agent.act(state) 68 | env.render() 69 | state, reward, done, _ = env.step(action) 70 | score += reward 71 | if done: 72 | break 73 | socres.append(score) 74 | scores_window.append(score) 75 | 76 | if i_episode % 100 == 0: 77 | print('Episode {},\r Average score : {} '.format(i_episode,np.mean(scores_window))) 78 | 79 | 80 | if __name__ =="__main__": 81 | 82 | env = gym.make('SpaceInvaders-v0') 83 | random_test(env) 84 | 85 | # env = atari_wappers.make_env("SpaceInvaders-v0") 86 | # state_size, action_size = env.observation_space.shape, env.action_space.n 87 | # dqn_agent = Agent_dqn(state_size, action_size) 88 | # 89 | # dqn_agent.qnetwork_local.load_state_dict(torch.load("dqn_model.pth")) 90 | # trained_agent_test(env,dqn_agent) 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/train_20210401.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 5000 8 | Network update every 5 time step 9 | Replay buffer size : 3000 10 | Batch size : 128 11 | Learning rate : 0.0005 12 | GAMMA : 0.99 13 | Epsilon decay rate : 0.995 14 | 15 | 
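Not part of the original repo: a minimal sketch for turning training logs like the ones above back into an average-score curve when the raw scores list was not saved. It assumes the `Episode N  Loss L  Average Score: S` line format used by these logs; the input log path and output image path are placeholders:
```
import re
import matplotlib.pyplot as plt

# matches lines such as "Episode 100   Loss 37.57   Average Score: 144.10"
LINE_RE = re.compile(r"Episode\s+(\d+)\s+Loss\s+[\d.]+\s+Average Score:\s+([\d.]+)")

def parse_avg_scores(log_path):
    episodes, avg_scores = [], []
    with open(log_path) as f:
        for line in f:
            m = LINE_RE.search(line)
            if m:
                episodes.append(int(m.group(1)))
                avg_scores.append(float(m.group(2)))
    return episodes, avg_scores

if __name__ == "__main__":
    episodes, avg_scores = parse_avg_scores("log/train_20210331.log")  # placeholder path
    plt.plot(episodes, avg_scores)
    plt.xlabel("Episode #")
    plt.ylabel("Average Score (last 100 episodes)")
    plt.savefig("Plots/avg_score_from_log.png")  # placeholder output file
```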
-------------------------------------------------------------------------------- /Evaluation_Algorithms/CartPole.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluate agents trained with different algorithms on the CartPole environment and compare their reward curves. 3 | Algorithms to test: 1. PPO 4 | 2. DDPG/TD3 5 | 3. DQN 6 | 4. A3C/A2C 7 | """ 8 | 9 | import torch 10 | import gym 11 | import numpy as np 12 | import pandas as pd 13 | from collections import deque 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | def plot_scores(scores,file_name,multi_time=False): 18 | "Plot one reward curve per training run (multiple trials of the same agent)" 19 | if multi_time: 20 | x=np.arange(1, len(scores[0]) + 1) 21 | for n in range(len(scores)): 22 | rolling_mean = pd.Series(scores[n]).rolling(100).mean() 23 | plt.plot(x,rolling_mean,label="trial_"+str(n+1)) 24 | else: 25 | x = np.arange(1, len(scores) + 1) 26 | rolling_mean = pd.Series(scores).rolling(100).mean() 27 | plt.plot(x, rolling_mean) 28 | 29 | plt.ylabel('Score') 30 | plt.xlabel('Episode #') 31 | plt.legend() 32 | plt.savefig(file_name) 33 | plt.show() 34 | 35 | 36 | def plot_diff_agent(scores_2d,file_name): 37 | "Plot the training curves of several different agents in one figure" 38 | for name,scores in scores_2d: 39 | x = np.arange(1, len(scores) + 1) 40 | rolling_mean = pd.Series(scores).rolling(100).mean() 41 | plt.plot(x, rolling_mean,label=name) 42 | plt.ylabel('Score') 43 | plt.xlabel('Episode #') 44 | plt.legend() 45 | plt.savefig(file_name) 46 | plt.show() 47 | 48 |
-------------------------------------------------------------------------------- /Games_play_train/atari.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import os 3 | from skimage import io 4 | 5 | env = gym.make('SpaceInvaders-v0') 6 | #env = gym.make("PongDeterministic-v4") 7 | status = env.reset() 8 | 9 | 10 | print('observation space:', env.observation_space) 11 | print('action space:', env.action_space) 12 | 13 | 14 | def save_films(state,step): 15 | if not os.path.exists('./image'): 16 | os.makedirs('./image') 17 | img_name = './image/pic-%d.jpg' % step 18 | io.imsave(img_name, state) 19 | 20 | 21 | for step in range(5000): 22 | env.render() 23 | action = 1 24 | state, reward, done, info = env.step(action) 25 | 26 | if step % 100 == 0: 27 | print(state.shape) 28 | # print(state) 29 | save_films(state, step) 30 | 31 | if reward > 0: 32 | print(reward, step) 33 | save_films(state, step) 34 | 35 | if done: 36 | print('dead in %d steps' % step) 37 | break
-------------------------------------------------------------------------------- /Policy_Gradient/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/.DS_Store
-------------------------------------------------------------------------------- /Policy_Gradient/PGs/__pycache__/agent_PG.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/__pycache__/agent_PG.cpython-37.pyc
-------------------------------------------------------------------------------- /Policy_Gradient/PGs/__pycache__/model.cpython-36.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PGs/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PGs/agent_PG.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from collections import deque 4 | import torch 5 | import torch.optim as optim 6 | from CartPole.Policy_Gradient.model import Policy 7 | 8 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 9 | 10 | GAMMA=1.0 11 | LR=0.001 12 | 13 | 14 | class Agent_PG(): 15 | 16 | def __init__(self, state_size, action_size,type): 17 | self.policy=Policy(state_size,action_size).to(device) 18 | self.optimizer=optim.Adam(self.policy.parameters(), lr=LR) 19 | self.type=type 20 | 21 | def reinforce_loss(self,log_probs,rewards): 22 | "------根据 Reinforce 算法计算的损失函数---------" 23 | # calculate discount rewards 24 | discounts=[GAMMA**i for i in range(len(rewards))] 25 | R=sum([g*r for g,r in zip(discounts,rewards)]) 26 | 27 | loss_arr=[] 28 | for log_prob in log_probs: 29 | loss_arr.append(-log_prob * R) 30 | 31 | policy_loss=torch.cat(loss_arr).sum() # 把n个1d tensor 组成的list 拼接成一个完整的 tensor(1d,size:n) 32 | # print(policy_loss) 33 | return policy_loss 34 | 35 | def pg_loss(self,log_probs,rewards): 36 | """---- 37 | Reinforce 的改进版本: 38 | 1.Credit Assignment:对每个 a(t) 计算未来累积折扣回报 R 39 | 2.对每个t的回报R进行 batch normalization 40 | ------""" 41 | # calculate the (discounted) future rewards 42 | furRewards_dis = [] 43 | for i in range(len(rewards)): 44 | discount = [GAMMA ** i for i in range(len(rewards) - i)] 45 | f_rewards = rewards[i:] 46 | furRewards_dis.append(sum(d * f for d, f in zip(discount, f_rewards))) 47 | # print(furRewards_dis) 48 | 49 | # -- Normalize reward 50 | mean = np.mean(furRewards_dis) 51 | std = np.std(furRewards_dis) + 1.0e-10 52 | rewards_normalized = (furRewards_dis - mean) / std 53 | 54 | # -- calculate policy loss 55 | loss_arr = [] 56 | for i in range(len(rewards_normalized)): 57 | loss_arr.append(-log_probs[i]*rewards_normalized[i]) 58 | # print(loss_arr) 59 | 60 | policy_loss = torch.cat(loss_arr).sum() 61 | # print(policy_loss,"----------\n") 62 | 63 | return policy_loss 64 | 65 | def train(self,env): 66 | state = env.reset() 67 | log_probs = [] 68 | rewards = [] 69 | # --- collect log probs and rewards for a single trajectory 70 | while True: 71 | # convert state to tensor 72 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) # 升维 1d->2d 73 | result_dic = self.policy.act(state) 74 | next_state, reward, done, _ = env.step(result_dic['action']) 75 | log_probs.append(result_dic['log_prob']) 76 | rewards.append(reward) 77 | state = next_state 78 | if done: 79 | break 80 | total_reward = sum(rewards) 81 | 82 | # --- update policy after one completed trajectory 83 | # calculate loss 84 | loss = self.reinforce_loss(log_probs, rewards) 85 | if self.type=="reinforce": 86 | loss = self.reinforce_loss(log_probs, rewards) 87 | elif self.type=="pg": 88 | loss = 
self.pg_loss(log_probs, rewards) 89 | 90 | # backprop the loss to update policy network 91 | self.optimizer.zero_grad() 92 | loss.backward() 93 | self.optimizer.step() 94 | 95 | return total_reward 96 | 97 | 98 | if __name__=="__main__": 99 | env = gym.make('CartPole-v0') 100 | agent=Agent_PG(state_size=4,action_size=2,type='pg') 101 | n_episode=2000 102 | 103 | scores_deque = deque(maxlen=100) 104 | scores = [] 105 | for i_episode in range(1,n_episode+1): 106 | Reward=agent.train(env) 107 | 108 | scores_deque.append(Reward) 109 | scores.append(Reward) 110 | if i_episode % 100 == 0: 111 | print('Episode {}\t Average Score: {:.2f}'.format(i_episode, np.mean(scores_deque))) -------------------------------------------------------------------------------- /Policy_Gradient/PGs/main_PG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import pandas as pd 5 | from collections import deque 6 | import matplotlib.pyplot as plt 7 | from CartPole.Policy_Gradient.agent_PG import Agent_PG 8 | from CartPole.Policy_Gradient.PPO_with_R import PPO_v1 9 | from CartPole.Policy_Gradient.PPO_with_A import PPO_V2 10 | 11 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 12 | # model_file="models/pg_model_3.pth" 13 | # plot_file="results&plots/pg_3.png" 14 | 15 | 16 | def watch_smart_agent(agent,model_name): 17 | agent.policy.load_state_dict(torch.load(model_name)) 18 | state = env.reset() 19 | for t in range(1000): 20 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 21 | action,_ = agent.policy.act(state) 22 | env.render() 23 | state, reward, done, _ = env.step(action) 24 | if done: 25 | print("done in time step {}".format(t+1)) 26 | break 27 | env.close() 28 | 29 | 30 | def plot_scores(scores,file_name,multi_time=False): 31 | "绘制多次训练多条曲线" 32 | if multi_time: 33 | x=np.arange(1, len(scores[0]) + 1) 34 | for n in range(len(scores)): 35 | rolling_mean = pd.Series(scores[n]).rolling(100).mean() 36 | plt.plot(x,rolling_mean,label="trial_"+str(n+1)) 37 | else: 38 | x = np.arange(1, len(scores) + 1) 39 | rolling_mean = pd.Series(scores).rolling(100).mean() 40 | plt.plot(x, rolling_mean) 41 | 42 | plt.ylabel('Score') 43 | plt.xlabel('Episode #') 44 | plt.legend() 45 | plt.savefig(file_name) 46 | plt.show() 47 | 48 | 49 | def plot_diff_agent(scores_2d,file_name): 50 | " 绘制多种不同agent的训练曲线:多曲线图" 51 | for name,scores in scores_2d: 52 | x = np.arange(1, len(scores) + 1) 53 | rolling_mean = pd.Series(scores).rolling(100).mean() 54 | plt.plot(x, rolling_mean,label=name) 55 | plt.ylabel('Score') 56 | plt.xlabel('Episode #') 57 | plt.legend() 58 | plt.savefig(file_name) 59 | plt.show() 60 | 61 | 62 | def agent_test(agent,n_episode,model_name): 63 | agent.policy.load_state_dict(torch.load(model_name)) 64 | scores = [] 65 | for i_episode in range(1, n_episode + 1): 66 | rewards=[] 67 | state = env.reset() 68 | while True: 69 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) # 升维 1d->2d 70 | action, _ = agent.policy.act(state) 71 | state, reward, done, _ = env.step(action) 72 | rewards.append(reward) 73 | if done: 74 | break 75 | scores.append(sum(rewards)) 76 | 77 | return scores 78 | 79 | 80 | def train_agent(env,agent,n_episode,model_file): 81 | scores_deque = deque(maxlen=100) 82 | scores = [] 83 | 84 | for i_episode in range(1, n_episode + 1): 85 | total_reward=agent.train(env) 86 | # record scores(total rewards) per episode 87 | scores_deque.append(total_reward) 88 | 
scores.append(total_reward) 89 | 90 | print('\r Episode {}\tAverage Score: {:.2f}\tScore: {:.2f}' 91 | .format(i_episode, np.mean(scores_deque), total_reward), end="") 92 | if i_episode % 100 == 0: 93 | print('\n Episode {}\t Average Score: {:.2f}\n'.format(i_episode,np.mean(scores_deque))) 94 | if np.mean(scores_deque) >= 195.0: 95 | print('\n Environment solved in {:d} episodes!\tAverage Score: {:.2f}\n----------\n'.format(i_episode, 96 | np.mean(scores_deque))) 97 | torch.save(agent.policy.state_dict(),model_file) 98 | break 99 | 100 | return scores 101 | 102 | 103 | def train_agent_multi_times(env, agent, n_episode, train_time, file): 104 | " 一个 agent 训练多次并绘制所有的奖励曲线,考察特定 policy gradient 算法的稳定性" 105 | scores_2d = [] 106 | for n in range(train_time): 107 | scores = [] 108 | for i_episode in range(1, n_episode + 1): 109 | total_reward = agent.train(env) 110 | scores.append(total_reward) 111 | 112 | print('Trial {} finished. \t Avg score for the last 100 episode: {}' 113 | .format((n + 1), np.mean(scores[-100:]))) 114 | scores_2d.append(scores) 115 | 116 | plot_scores(scores_2d, file,multi_time=True) 117 | 118 | 119 | def train_diff_agents(env,agents,n_episode,file): 120 | " 训练多种算法的不同agent, 绘制奖励曲线对比性能 " 121 | scores_2d=[] 122 | for name in agents.keys(): 123 | scores = [] 124 | for i_episode in range(1, n_episode + 1): 125 | total_reward = agents[name].train(env) 126 | scores.append(total_reward) 127 | scores_2d.append((name,scores)) 128 | print('Training agent {} finished. \t Avg score for the last 100 episode: {}'\ 129 | .format(name,np.mean(scores[-100:]))) 130 | 131 | plot_diff_agent(scores_2d,file) 132 | 133 | 134 | if __name__=="__main__": 135 | env = gym.make('CartPole-v0') 136 | 137 | agent_pg = Agent_PG(state_size=4,action_size=2,type="pg") 138 | agent_rf=Agent_PG(state_size=4,action_size=2,type="reinforce") 139 | ppo_R=PPO_v1(state_size=4,action_size=2) 140 | 141 | ppo_without_entropy=PPO_V2(state_size=4,action_size=2,add_entropy=False) 142 | ppo_with_entropy=PPO_V2(state_size=4,action_size=2,add_entropy=True) 143 | 144 | #train_scores = train_agent(env, ppo_with_entropy, 2000, 'PGs/models/PPO_new.pth') 145 | #plot_scores(train_scores, 'PGs/results&plots/PPO_with_entropy_1.png') 146 | 147 | # agents={'PPO with R':ppo_R, 148 | # 'PPO with A':ppo_with_entropy, 149 | # 'Policy Gradient':agent_pg, 150 | # 'Reinforce':agent_rf} 151 | 152 | ppo_agents={'PPO_R':ppo_R,'PPO_A_org':ppo_without_entropy,'PPO_A_entropy':ppo_with_entropy} 153 | 154 | train_diff_agents(env, ppo_agents, 1500, '../results&plots/PPO_comparison_4.png') 155 | # train_agent_multi_times(env,ppo_with_entropy,1300,5,'PGs/results&plots/PPO-entropy_5times.png') 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | -------------------------------------------------------------------------------- /Policy_Gradient/PGs/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Categorical 5 | import numpy as np 6 | 7 | 8 | class Policy(nn.Module): 9 | "single Policy network for Reinforce and PG" 10 | 11 | def __init__(self,state_size,action_size): 12 | super(Policy, self).__init__() 13 | self.seed = torch.manual_seed(0) 14 | self.fc1 = nn.Linear(state_size, 24) 15 | self.fc2 = nn.Linear(24, 36) 16 | self.fc3 = nn.Linear(36, action_size) 17 | 18 | def forward(self, x): 19 | """ 20 | Build a network that maps state -> action probs. 
21 | """ 22 | 23 | out=F.relu(self.fc1(x)) 24 | out = F.relu(self.fc2(out)) 25 | out = F.softmax(self.fc3(out),dim=1) 26 | 27 | return out 28 | 29 | def act(self,state): 30 | # probs for each action (2d tensor) 31 | probs = self.forward(state) 32 | m = Categorical(probs) 33 | action = m.sample() 34 | # return action for current state, and the corresponding probability 35 | 36 | result_dic={"action":action.item(),"log_prob":m.log_prob(action) 37 | ,"prob":probs[:,action.item()].item()} 38 | return result_dic 39 | 40 | 41 | class Actor(nn.Module): 42 | """Policy network for PPO_R""" 43 | "Also used as the actor of the Actor-Critic model for PPO_A" 44 | 45 | def __init__(self,state_size,action_size): 46 | super(Actor, self).__init__() 47 | self.seed = torch.manual_seed(0) 48 | self.fc1 = nn.Linear(state_size, 128) 49 | # self.fc2 = nn.Linear(64,128) 50 | self.fc2= nn.Linear(128, action_size) 51 | 52 | def forward(self, x): 53 | """ 54 | Build a network that maps state -> action probs. 55 | """ 56 | 57 | x=F.relu(self.fc1(x)) 58 | out = F.softmax(self.fc2(x),dim=1) 59 | return out 60 | 61 | def act(self,state): 62 | # probs for each action (2d tensor) 63 | probs = self.forward(state) 64 | m = Categorical(probs) 65 | action = m.sample() 66 | # return action for current state, and the corresponding probability 67 | 68 | result_dic={"action":action.item(),"log_prob":m.log_prob(action) 69 | ,"prob":probs[:,action.item()].item()} 70 | return result_dic 71 | 72 | 73 | class Critic(nn.Module): 74 | " Critic network for PPO" 75 | 76 | def __init__(self,state_size): 77 | super(Critic, self).__init__() 78 | self.fc1=nn.Linear(state_size,128) 79 | # self.fc2=nn.Linear(64,128) 80 | self.fc2=nn.Linear(128,1) 81 | 82 | def forward(self,x): 83 | x=F.relu(self.fc1(x)) 84 | state_value = self.fc2(x) 85 | return state_value 86 | 87 | 88 | if __name__=="__main__": 89 | 90 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 91 | state = np.array([-0.04456399, 0.04653909, 0.01326909, -0.02099827]) 92 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 93 | 94 | policy=Policy(state_size=4,action_size=2).to(device) 95 | result_dic = policy.act(state) # act() returns a dict, not a tuple 96 | print(result_dic["action"], result_dic["log_prob"]) 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/PPO_model-1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/PPO_model-1.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/PPO_new.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/PPO_new.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/PPOv2_model-1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/PPOv2_model-1.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/pg_model_1.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_1.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/pg_model_2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_2.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/pg_model_3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_3.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/pg_model_4.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_4.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/reinforce_model_2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_2.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/reinforce_model_3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_3.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/reinforce_model_4.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_4.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/reinforce_model_5.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_5.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/reinforce_model_6.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_6.pth -------------------------------------------------------------------------------- /Policy_Gradient/PPO/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/.DS_Store -------------------------------------------------------------------------------- /Policy_Gradient/PPO/PPO_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Categorical 5 | from torch.distributions import Normal 6 | 7 | 8 | class ActorDiscrete(nn.Module): 9 | """ 10 | Policy network for a discrete action space 11 | """ 12 | def __init__(self,state_size,action_size): 13 | super(ActorDiscrete, self).__init__() 14 | self.seed = torch.manual_seed(0) 15 | self.fc1 = nn.Linear(state_size, 128) 16 | # self.fc2 = nn.Linear(64,128) 17 | self.fc2= nn.Linear(128, action_size) 18 | 19 | def forward(self, x): 20 | """ 21 | Build a network that maps state -> action probs. 22 | """ 23 | 24 | x=F.relu(self.fc1(x)) 25 | out = F.softmax(self.fc2(x),dim=1) 26 | return out 27 | 28 | def act(self,state): 29 | """ 30 | Return the action and its probability 31 | """ 32 | # probs for each action (2d tensor) 33 | probs = self.forward(state) 34 | m = Categorical(probs) 35 | action = m.sample() 36 | ## return action for current state, and the corresponding probability 37 | # result_dic={"action":action.item(),"log_prob":m.log_prob(action) 38 | # ,"prob":probs[:,action.item()].item()} 39 | 40 | return action.item(),probs[:,action.item()].item() 41 | 42 | 43 | class ActorContinous(nn.Module): 44 | """ 45 | Policy network for a continuous action space 46 | """ 47 | def __init__(self,state_size,action_size): 48 | super(ActorContinous, self).__init__() 49 | self.fc1 = nn.Linear(state_size, 128) 50 | self.fc2 = nn.Linear(128,128) 51 | self.mu_head = nn.Linear(128, action_size) 52 | self.sigma_head = nn.Linear(128, action_size) 53 | 54 | def forward(self, x): 55 | x = F.relu(self.fc1(x)) 56 | x = F.relu(self.fc2(x)) 57 | mu = 2.0 * torch.tanh(self.mu_head(x)) 58 | sigma = F.softplus(self.sigma_head(x)) 59 | return (mu, sigma) 60 | 61 | def act(self,state): 62 | """ 63 | Return the action and its log probability 64 | """ 65 | with torch.no_grad(): 66 | (mu, sigma) = self.forward(state) # 2d tensors 67 | dist = Normal(mu, sigma) 68 | action = dist.sample() 69 | action_log_prob = dist.log_prob(action) 70 | 71 | return action.numpy()[0], action_log_prob.numpy()[0] 72 | 73 | 74 | class Critic(nn.Module): 75 | " Critic network for PPO" 76 | 77 | def __init__(self,state_size): 78 | super(Critic, self).__init__() 79 | self.fc1=nn.Linear(state_size,128) 80 | # self.fc2=nn.Linear(64,128) 81 | self.fc2=nn.Linear(128,1) 82 | 83 | def forward(self,x): 84 | x=F.relu(self.fc1(x)) 85 | state_value = self.fc2(x) 86 | return state_value 87 | 88 | 89 | if __name__=="__main__": 90 | pass 91 | 92 | 93 | -------------------------------------------------------------------------------- /Policy_Gradient/PPO/PPO_v1.py: -------------------------------------------------------------------------------- 1 | """ 2 | PPO_v1: the loss is computed directly from discounted cumulative rewards; there is no critic, only a policy network 3 | """ 4 | import numpy as np 5 | import gym 6 | from collections import namedtuple 7 | from collections import deque 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn as nn 11 | from torch.distributions import Normal 12 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 13 | from Policy_Gradient.PPO.PPO_model import ActorContinous,ActorDiscrete,Critic 14 | 15 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 16 | 
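# Hyperparameters used below by PPO_v1:
# GAMMA - discount factor applied to future rewards
# LR - Adam learning rate for the policy network
# BATCH_SIZE - mini-batch size sampled from the stored trajectory
# CLIP - epsilon of the PPO clipped-surrogate range [1-CLIP, 1+CLIP]
# UPDATE_TIME - number of passes over the trajectory per learn() call
# max_grad_norm - gradient-norm clipping threshold for the policy update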
GAMMA=0.99 17 | LR=0.001 18 | BATCH_SIZE=32 19 | CLIP=0.2 20 | UPDATE_TIME=10 21 | max_grad_norm=0.5 22 | Transition = namedtuple('Transition', ['state', 'action', 'prob', 'reward']) 23 | 24 | 25 | class PPO_v1(): 26 | 27 | def __init__(self, state_size, action_size,continuous=False): 28 | self.policy = ActorDiscrete(state_size, action_size).to(device) 29 | self.continuous = continuous 30 | if self.continuous: 31 | self.policy = ActorContinous(state_size, action_size).to(device) 32 | self.optimizer=optim.Adam(self.policy.parameters(), lr=LR) 33 | self.trajectory=[] 34 | 35 | def update_policy(self,exps,i_episode): 36 | """ 37 | update the policy for each sampled transition group 38 | called by learn() multiple times for one episode 39 | """ 40 | states,actions,old_probs,f_Rewrds=exps 41 | # get action probs from new policy 42 | if self.continuous: 43 | (mus, sigmas) = self.policy(states) 44 | dists = Normal(mus, sigmas) 45 | new_probs = dists.log_prob(actions) 46 | ratios = torch.exp(new_probs - old_probs) 47 | else: 48 | new_probs = self.policy(states).gather(1, actions) 49 | ratios = new_probs / old_probs 50 | 51 | # calculate clipped surrogate function 52 | surr1 = ratios * f_Rewrds 53 | surr2 = torch.clamp(ratios, 1 - CLIP, 1 + CLIP) * f_Rewrds 54 | policy_loss=-torch.min(surr1,surr2).mean() 55 | 56 | # update policy network 57 | self.optimizer.zero_grad() 58 | policy_loss.backward() 59 | nn.utils.clip_grad_norm_(self.policy.parameters(), max_grad_norm) 60 | self.optimizer.step() 61 | 62 | # self.traintime_counter+=1 63 | 64 | def learn(self,i_episode): 65 | """ 66 | the agent learns after finishing each episode, 67 | from the experiences of this trajectory 68 | :return: 69 | """ 70 | states=torch.cat([t.state for t in self.trajectory]) 71 | actions=torch.tensor([t.action for t in self.trajectory],dtype=torch.long).view(-1,1) 72 | old_probs=torch.tensor([t.prob for t in self.trajectory],dtype=torch.float).view(-1,1) 73 | 74 | # -- calculate discount future rewards for every time step 75 | rewards = [t.reward for t in self.trajectory] 76 | fur_Rewards = [] 77 | for i in range(len(rewards)): 78 | discount = [GAMMA ** i for i in range(len(rewards) - i)] 79 | f_rewards = rewards[i:] 80 | fur_Rewards.append(sum(d * f for d, f in zip(discount, f_rewards))) 81 | fur_Rewards=torch.tensor(fur_Rewards,dtype=torch.float).view(-1,1) 82 | 83 | for i in range(UPDATE_TIME): 84 | # -- repeat the following update loop several times 85 | # shuffle the transitions in the trajectory into random mini-batches 86 | for index_set in BatchSampler(SubsetRandomSampler(range(len(self.trajectory))), BATCH_SIZE, False): 87 | exps=(states[index_set],actions[index_set],old_probs[index_set],fur_Rewards[index_set]) 88 | # -- update the policy network for every mini-batch 89 | self.update_policy(exps,i_episode) 90 | 91 | del self.trajectory[:] # clear trajectory 92 | 93 | 94 | def train(self,env,i_episode): 95 | state = env.reset() 96 | total_reward=0 97 | while True: 98 | # self.timesetp_counter+=1 99 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) # add batch dim: 1d -> 2d 100 | action, prob = self.policy.act(state) # discrete actions return the raw prob, continuous actions return the log prob 101 | next_state, reward, done, _ = env.step(action) 102 | 103 | # --store transition in this current trajectory 104 | self.trajectory.append(Transition(state,action,prob,reward)) 105 | state=next_state 106 | total_reward+=reward 107 | if done: 108 | break 109 | # -- agent learns after finishing the current episode, if there are enough transitions 110 | if BATCH_SIZE <= len(self.trajectory): 111 
| self.learn(i_episode) 112 | 113 | return total_reward 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /Policy_Gradient/PPO/PPO_v2.py: -------------------------------------------------------------------------------- 1 | """ 2 | PPO_v2: the loss is computed from the advantage function; includes a critic network 3 | """ 4 | import random 5 | import numpy as np 6 | import gym 7 | from collections import namedtuple 8 | from collections import deque 9 | import torch 10 | import torch.optim as optim 11 | from torch.distributions import Normal 12 | import torch.nn.functional as F 13 | import torch.nn as nn 14 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 15 | from Policy_Gradient.PPO.PPO_model import ActorContinous,ActorDiscrete,Critic 16 | from torch.utils.tensorboard import SummaryWriter 17 | 18 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 19 | writer = SummaryWriter('./board/logs') 20 | 21 | GAMMA=0.99 22 | LR_a=0.001 23 | LR_c=0.003 24 | BATCH_SIZE=32 25 | CLIP=0.2 26 | BETA=0.01 27 | UPDATE_TIME=10 28 | max_grad_norm=0.5 29 | 30 | Transition = namedtuple('Transition', ['state', 'action', 'prob', 'reward']) 31 | 32 | 33 | class Memory(): 34 | def __init__(self): 35 | self.trajectory=[] 36 | self.Transition = namedtuple('Transition', ['state', 'action', 'prob', 'reward']) 37 | 38 | def add(self,state,action,prob,reward): 39 | # state = torch.from_numpy(state).float().unsqueeze(0).to(device) 40 | self.trajectory.append(self.Transition(state,action,prob,reward)) 41 | 42 | def clean_buffer(self): 43 | del self.trajectory[:] 44 | 45 | def get_trajectory(self): 46 | states = torch.cat([t.state for t in self.trajectory]) 47 | actions = torch.tensor([t.action for t in self.trajectory], dtype=torch.long).view(-1, 1) 48 | probs = torch.tensor([t.prob for t in self.trajectory], dtype=torch.float).view(-1, 1) 49 | rewards = [t.reward for t in self.trajectory] 50 | return states,actions,probs,rewards 51 | 52 | def __len__(self): 53 | return len(self.trajectory) 54 | 55 | 56 | class PPO_v2(): 57 | def __init__(self,state_size, action_size,continuous=False,add_entropy=True): 58 | 59 | self.critic = Critic(state_size) 60 | self.policy = ActorDiscrete(state_size, action_size).to(device) 61 | self.continuous = continuous 62 | if self.continuous: 63 | self.policy = ActorContinous(state_size, action_size).to(device) 64 | 65 | self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=LR_a) 66 | self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=LR_c) 67 | 68 | self.memory = Memory() 69 | self.train_step = 0 70 | self.add_entropy=add_entropy 71 | 72 | def policy_loss(self,states,actions, 73 | old_probs,f_Rewrds,V): 74 | 75 | # get action probs from new policy and calculate the ratio 76 | if self.continuous: 77 | (mus, sigmas) = self.policy(states) 78 | dists = Normal(mus, sigmas) 79 | new_probs = dists.log_prob(actions) 80 | ratios = torch.exp(new_probs - old_probs) 81 | else: 82 | new_probs = self.policy(states).gather(1, actions) 83 | ratios = new_probs / old_probs 84 | 85 | # calculate the advantage from the critic's value estimate 86 | advantage = (f_Rewrds - V).detach() 87 | 88 | # calculate clipped surrogate function 89 | surr1 = ratios * advantage 90 | surr2 = torch.clamp(ratios, 1 - CLIP, 1 + CLIP) * advantage 91 | policy_loss = -torch.min(surr1, surr2) 92 | 93 | if self.add_entropy: 94 | # include a regularization term; this steers new_policy towards 0.5 95 | # add in 1.e-10 to avoid log(0) which gives nan 96 | entropy=
-(new_probs*torch.log(old_probs+1.e-10)+ (1.0-new_probs)*torch.log(1.0-old_probs+1.e-10)) 97 | policy_loss+=BETA*entropy 98 | 99 | policy_loss=torch.mean(policy_loss) 100 | 101 | return policy_loss 102 | 103 | def critic_loss(self,f_Rewrds, V): 104 | return F.mse_loss(f_Rewrds, V) 105 | 106 | def update_policy(self,exps,i_episode): 107 | states, actions, old_probs, f_Rewrds = exps 108 | V = self.critic(states) 109 | 110 | # -- update policy(actor) network -- # 111 | policy_loss = self.policy_loss(states,actions,old_probs,f_Rewrds,V) 112 | # self.writer.add_scalar('loss/policy_loss', policy_loss, global_step=self.train_step) 113 | # update parameters 114 | self.policy_optimizer.zero_grad() 115 | policy_loss.backward() 116 | # nn.utils.clip_grad_norm_(self.policy.parameters(), max_grad_norm) 117 | self.policy_optimizer.step() 118 | 119 | # -- update value(critic) network -- # 120 | value_loss = self.critic_loss(f_Rewrds,V) 121 | # self.writer.add_scalar('loss/value_loss', value_loss, global_step=self.train_step) 122 | self.critic_optimizer.zero_grad() 123 | value_loss.backward() 124 | # nn.utils.clip_grad_norm_(self.critic.parameters(), max_grad_norm) 125 | self.critic_optimizer.step() 126 | 127 | self.train_step+=1 128 | writer.add_scalar('loss/policy_loss',policy_loss.item(),i_episode) 129 | writer.add_scalar('loss/value_loss', value_loss.item(), i_episode) 130 | writer.flush() 131 | 132 | def learn(self,i_episode): 133 | """ 134 | the agent learns after finishing each episode, 135 | from the experiences of this trajectory 136 | :return: 137 | """ 138 | # states=torch.cat([t.state for t in self.memory.trajectory]) 139 | # actions=torch.tensor([t.action for t in self.memory.trajectory],dtype=torch.long).view(-1,1) 140 | # old_probs=torch.tensor([t.prob for t in self.memory.trajectory],dtype=torch.float).view(-1,1) 141 | # rewards = [t.reward for t in self.memory.trajectory] 142 | 143 | states, actions, old_probs, rewards = self.memory.get_trajectory() 144 | # -- calculate discount future rewards for every time step 145 | fur_Rewards = [] 146 | for i in range(len(rewards)): 147 | discount = [GAMMA ** i for i in range(len(rewards) - i)] 148 | f_rewards = rewards[i:] 149 | fur_Rewards.append(sum(d * f for d, f in zip(discount, f_rewards))) 150 | fur_Rewards=torch.tensor(fur_Rewards,dtype=torch.float).view(-1,1) 151 | 152 | for i in range(UPDATE_TIME): 153 | # -- repeat the following update loop several times 154 | # shuffle the transitions in the trajectory into random mini-batches 155 | for index_set in BatchSampler(SubsetRandomSampler(range(self.memory.__len__())), BATCH_SIZE, False): 156 | exps=(states[index_set],actions[index_set],old_probs[index_set],fur_Rewards[index_set]) 157 | # -- update the policy network for every mini-batch 158 | self.update_policy(exps,i_episode) 159 | 160 | self.memory.clean_buffer() 161 | 162 | def train(self,env,i_episode): 163 | state = env.reset() 164 | total_reward=0 165 | while True: 166 | # self.timesetp_counter+=1 167 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) # add batch dim: 1d -> 2d 168 | action,prob = self.policy.act(state) # discrete actions return the raw prob, continuous actions return the log prob 169 | next_state, reward, done, _ = env.step(action) 170 | # --store transition in this current trajectory 171 | self.memory.add(state,action,prob,reward) 172 | state=next_state 173 | total_reward+=reward 174 | if done: 175 | break 176 | # -- agent learns after finishing the current episode, if there are enough transitions 177 | if BATCH_SIZE <= self.memory.__len__(): 178 | self.learn(i_episode) 179 | 180 | return 
total_reward 181 | 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /Policy_Gradient/PPO/__pycache__/PPO_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_model.cpython-37.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PPO/__pycache__/PPO_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_model.cpython-38.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PPO/__pycache__/PPO_v1.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_v1.cpython-38.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PPO/__pycache__/PPO_v2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_v2.cpython-38.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/.DS_Store -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/.DS_Store -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608693869.bogon.80327.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608693869.bogon.80327.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608694041.bogon.80355.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608694041.bogon.80355.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608778854.bogon.82580.0: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608778854.bogon.82580.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779119.bogon.82611.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779119.bogon.82611.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779166.bogon.82627.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779166.bogon.82627.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779638.bogon.82655.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779638.bogon.82655.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779657.bogon.82666.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779657.bogon.82666.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780330.bogon.82692.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780330.bogon.82692.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780689.bogon.82718.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780689.bogon.82718.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/cartPole_ppo-v1_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/cartPole_ppo-v1_1.png -------------------------------------------------------------------------------- /Policy_Gradient/PPO/main_PPO.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import arrow 3 | import gym 4 | import numpy as np 5 | import pandas as pd 6 | from collections import deque 7 | import 
matplotlib.pyplot as plt 8 | from Policy_Gradient.PPO.PPO_v2 import PPO_v2 9 | from Policy_Gradient.PPO.PPO_v1 import PPO_v1 10 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 11 | 12 | 13 | def output_scores(start_time,i_episode,scores_deque,score,solve_limit): 14 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}' 15 | .format(i_episode, np.mean(scores_deque), score), end="") 16 | if i_episode % 100 == 0: 17 | print('\rEpisode {}\tAverage Score: {:.2f}\t Running time so far: {}' 18 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 19 | if np.mean(scores_deque) >= solve_limit: 20 | print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}\t Total running time: {}' 21 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 22 | return True 23 | 24 | return False 25 | 26 | 27 | def plot_scores(scores,filename): 28 | plt.plot(np.arange(1, len(scores) + 1), scores) 29 | plt.ylabel('Score') 30 | plt.xlabel('Episode #') 31 | plt.savefig(filename) 32 | plt.show() 33 | 34 | 35 | def get_env_prop(env_name, continuous): 36 | env = gym.make(env_name) 37 | state_dim = env.observation_space.shape[0] 38 | if continuous: 39 | action_dim = env.action_space.shape[0] 40 | else: 41 | action_dim = env.action_space.n 42 | 43 | return env,state_dim, action_dim 44 | 45 | 46 | def train_agent_for_env(env_name,continuous,n_episode,model_file,solve_limit): 47 | """ 48 | continuous: whether the action space is continuous (True/False) 49 | model_file: path for saving the trained model 50 | solve_limit: score threshold at which the environment counts as solved 51 | """ 52 | env, state_dim, action_dim = get_env_prop(env_name,continuous) 53 | 54 | agent = PPO_v1(state_dim,action_dim,continuous) 55 | scores_deque = deque(maxlen=100) 56 | scores = [] 57 | 58 | start_time = arrow.now() 59 | for i_episode in range(1, n_episode + 1): 60 | total_reward = agent.train(env,i_episode) 61 | # record scores(total rewards) per episode 62 | scores_deque.append(total_reward) 63 | scores.append(total_reward) 64 | solved = output_scores(start_time, i_episode, scores_deque, total_reward,solve_limit) 65 | if solved: 66 | torch.save(agent.policy.state_dict(), model_file) 67 | break 68 | 69 | return agent, scores 70 | 71 | 72 | def watch_random_agent(env_name,continuous): 73 | env, state_dim, action_dim = get_env_prop(env_name, continuous) 74 | for _ in range(5): 75 | env.reset() 76 | while True: 77 | env.render() 78 | next_state, reward, done, _ =env.step(env.action_space.sample()) 79 | if done: 80 | break 81 | 82 | env.close() 83 | 84 | 85 | def watch_smart_agent(env_name,continuous,model_name,n_episode): 86 | env,state_dim, action_dim = get_env_prop(env_name,continuous) 87 | agent=PPO_v1(state_dim,action_dim,continuous) 88 | agent.policy.load_state_dict(torch.load(model_name)) 89 | 90 | scores =[] 91 | for i_episode in range(1, n_episode + 1): 92 | rewards = [] 93 | state = env.reset() 94 | while True: 95 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 96 | action, _ = agent.policy.act(state) 97 | env.render() 98 | state, reward, done, _ = env.step(action) 99 | rewards.append(reward) 100 | if done: 101 | break 102 | scores.append(sum(rewards)) 103 | return scores 104 | 105 | 106 | if __name__=="__main__": 107 | """train PPO agent in CartPole (discrete action space)""" 108 | # agent_cartPole,scores_1 = train_agent_for_env('CartPole-v0',False,2000, 109 | # 'models/cartPole_ppo-v1_1.pth',195) 110 | # plot_scores(scores_1,'cartPole_ppo-v1_1.png') 111 | 112 | # watch an untrained random agent 113 | # watch_random_agent('CartPole-v0',False) 114 | # 
test the trained agent 115 | # test_scores=watch_smart_agent('CartPole-v0',False,'models/PPO_new.pth',100) 116 | # plot_scores(test_scores,"PPO_cartPole_test.png") 117 | 118 | """train PPO agent in MountainCarContinuous (continuous action space)""" 119 | agent_mCar, scores_2 = train_agent_for_env('MountainCarContinuous-v0', True, 2000, 120 | 'models/mCar_ppo-v1.pth',95) 121 | plot_scores(scores_2, 'mCar_ppo-v1_1.png') 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /Policy_Gradient/PPO/models/PPO_new.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/models/PPO_new.pth -------------------------------------------------------------------------------- /Policy_Gradient/PPO/models/cartPole_ppo.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/models/cartPole_ppo.pth -------------------------------------------------------------------------------- /Policy_Gradient/PPO/models/cartPole_ppo_20201222.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/models/cartPole_ppo_20201222.pth -------------------------------------------------------------------------------- /Policy_Gradient/PPO_cnn/cnn_ppo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class ActorCritic(torch.nn.Module): 8 | def __init__(self, input_shape, output_shape): 9 | super(ActorCritic, self).__init__() 10 | self.conv1 = nn.Conv2d(input_shape, 32, 3, stride=2, padding=1) 11 | self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 12 | self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 13 | self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 14 | 15 | self.critic_linear = nn.Linear(256, 1) 16 | self.actor_linear = nn.Linear(256, output_shape) 17 | # forward() is not implemented in this file; see the hedged completion sketch at the end of this document 18 | -------------------------------------------------------------------------------- /Policy_Gradient/envTest.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import deque 4 | import matplotlib.pyplot as plt 5 | 6 | env = gym.make('CartPole-v0') 7 | env.seed(0) 8 | 9 | print('observation space:', env.observation_space) 10 | print('action space:', env.action_space) 11 | print('action size:', env.action_space.n) 12 | 13 | # watch an untrained random agent 14 | state = env.reset() 15 | print(state) 16 | 17 | done=False 18 | for _ in range(5000): 19 | env.render() 20 | if not done: 21 | next_state, reward, done, _ =env.step(env.action_space.sample()) 22 | print(next_state, reward) 23 | else: 24 | break 25 | 26 | env.close() 27 | -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/.DS_Store -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-A vs. PPO-R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A vs. PPO-R.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-A vs.PPO-R_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A vs.PPO-R_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-A_train_5_times.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A_train_5_times.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-A_train_5_times1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A_train_5_times1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-A_train_5times_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A_train_5times_2.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-entropy_5times.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-entropy_5times.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_cartPole_20201222.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_cartPole_20201222.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_cartPole_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_cartPole_test.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_cartPole_train.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_cartPole_train.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_comparison_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_comparison_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_2.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_comparison_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_3.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_comparison_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_4.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_multiple_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_multiple_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_with_entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_with_entropy.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_with_entropy_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_with_entropy_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/cartpole_reinforce.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/cartpole_reinforce.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/diff_algorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/diff_algorithm.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/pg_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/pg_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/pg_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/pg_2.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/pg_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/pg_3.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_2.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_3.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_4.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_5.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_vs_pg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_vs_pg.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/rf-vs-pg_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/rf-vs-pg_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_2.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/rf-vs-pg_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_3.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/rf-vs-pg_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_4.png --------------------------------------------------------------------------------
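The ActorCritic skeleton in /Policy_Gradient/PPO_cnn/cnn_ppo.py above stops after declaring four stride-2 convolutions and two 256-unit heads; it has no flatten step, no forward(), and no action-sampling helper, so a PPO training loop cannot use it yet. The sketch below is one hedged way it could be finished, not the repository's implementation: the 84x84 input resolution, the 4-frame channel stack, the intermediate fc projection to 256 features, and the act() helper are all assumptions introduced here.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCriticSketch(nn.Module):
    """Hypothetical completion of the ActorCritic skeleton in cnn_ppo.py (assumptions noted above)."""

    def __init__(self, in_channels, n_actions, input_size=84):
        super(ActorCriticSketch, self).__init__()
        # same conv stack as cnn_ppo.py: four 3x3 convolutions with stride 2, padding 1
        self.conv1 = nn.Conv2d(in_channels, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        # infer the flattened feature size for the assumed input resolution
        with torch.no_grad():
            dummy = torch.zeros(1, in_channels, input_size, input_size)
            conv_out = self._convs(dummy).view(1, -1).size(1)
        self.fc = nn.Linear(conv_out, 256)
        # heads keep the shapes used in cnn_ppo.py
        self.critic_linear = nn.Linear(256, 1)           # state value V(s)
        self.actor_linear = nn.Linear(256, n_actions)    # action logits

    def _convs(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        return F.relu(self.conv4(x))

    def forward(self, x):
        x = self._convs(x).view(x.size(0), -1)
        x = F.relu(self.fc(x))
        return self.actor_linear(x), self.critic_linear(x)

    def act(self, state):
        # sample an action and return (action, prob, value), mirroring the
        # (action, prob) convention of ActorDiscrete in PPO_model.py
        logits, value = self.forward(state)
        probs = F.softmax(logits, dim=1)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), probs[0, action.item()].item(), value


if __name__ == "__main__":
    net = ActorCriticSketch(in_channels=4, n_actions=6)  # e.g. 4 stacked Atari frames
    obs = torch.zeros(1, 4, 84, 84)
    action, prob, value = net.act(obs)
    print(action, prob, value.item())

Inferring the flattened convolution size with a dummy forward pass avoids hard-wiring the 256-feature assumption to one specific input resolution, while the two heads keep the same shapes as the original file.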