├── common
    ├── __init__.py
    ├── schedule.py
    └── atari_wrapper.py
├── deepq
    ├── __init__.py
    ├── model.py
    ├── replay_buffer.py
    └── learn.py
├── img
    ├── pong-dqn-8hr.gif
    └── mario-dqn-16hr.gif
├── README_zh.md
├── train_pong.py
├── train_mario.py
└── README.md
/common/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/deepq/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/img/pong-dqn-8hr.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nailo2c/dqn-mario/HEAD/img/pong-dqn-8hr.gif
--------------------------------------------------------------------------------
/img/mario-dqn-16hr.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nailo2c/dqn-mario/HEAD/img/mario-dqn-16hr.gif
--------------------------------------------------------------------------------
/deepq/model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class DQN(nn.Module):
6 |     def __init__(self, in_channels=4, num_actions=18):
7 |         super(DQN, self).__init__()
8 |         self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
9 |         self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
10 |         self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
11 |         self.fc4 = nn.Linear(7 * 7 * 64, 512)
12 |         self.fc5 = nn.Linear(512, num_actions)
13 |
14 |     def forward(self, x):
15 |         x = F.relu(self.conv1(x))
16 |         x = F.relu(self.conv2(x))
17 |         x = F.relu(self.conv3(x))
18 |         x = F.relu(self.fc4(x.view(x.size(0), -1)))
19 |         return self.fc5(x)
--------------------------------------------------------------------------------
/common/schedule.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | def linear_interpolation(l, r, alpha):
3 |     return l + alpha * (r - l)
4 |
5 |
6 | class PiecewiseSchedule(object):
7 |     def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
8 |         """PiecewiseSchedule
9 |         endpoints: [(int, float)]
10 |             list of pairs `(time, value)`; the schedule takes the given value at time=t
11 |         interpolation: lambda float, float, float: float
12 |         outside_value: float
13 |         """
14 |         idxes = [e[0] for e in endpoints]
15 |         assert idxes == sorted(idxes)
16 |         self._interpolation = interpolation
17 |         self._outside_value = outside_value
18 |         self._endpoints = endpoints
19 |
20 |     def value(self, t):
21 |         """See Schedule.value"""
22 |         # if t lies between l_t and r_t, interpolate between their values
23 |         for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
24 |             if l_t <= t and t <= r_t:
25 |                 alpha = float(t - l_t) / (r_t - l_t)
26 |                 return self._interpolation(l, r, alpha)
27 |
28 |         # if t is not covered by any piece, return the outside value
29 |         assert self._outside_value is not None
30 |         return self._outside_value
31 |
32 |
33 |
34 | class LinearSchedule(object):
35 |     def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
36 |         self.schedule_timesteps = schedule_timesteps
37 |         self.final_p = final_p
38 |         self.initial_p = initial_p
39 |
40 |     def value(self, t):
41 |         fraction = min(float(t) / self.schedule_timesteps, 1.0)
42 |         return self.initial_p + fraction * (self.final_p - self.initial_p)
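# ---------------------------------------------------------------------------
# Illustrative usage (a sketch added for clarity, not part of the original
# module): train_pong.py and train_mario.py anneal the exploration rate with
# LinearSchedule(1000000, 0.1); PiecewiseSchedule works the same way across
# several segments. The endpoint numbers below are examples only.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    eps = LinearSchedule(schedule_timesteps=1000000, final_p=0.1)
    print(eps.value(0))         # 1.0   (initial_p)
    print(eps.value(500000))    # ~0.55 (halfway through the anneal)
    print(eps.value(2000000))   # 0.1   (clamped once t exceeds schedule_timesteps)

    pw = PiecewiseSchedule([(0, 1.0), (100000, 0.1), (200000, 0.01)],
                           outside_value=0.01)
    print(pw.value(150000))     # ~0.055 (linear interpolation inside a piece)
    print(pw.value(10 ** 7))    # 0.01   (falls back to outside_value)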
-------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | # DQN 2 | 3 | [English](README.md) | [中文](README_zh.md) 4 | 5 | 使用PyTorch實作DQN演算法,並訓練super-mario-bros以及atari-pong,整體架構參考openai/baselines。 6 | 7 | *Warning*:訓練DQN請開足夠的記憶體,Replay Buffer以預設值1000000為例至少會使用約8G的記憶體。 8 |   9 | # Dependencies 10 | 11 | * Python 3.6 12 | * Anaconda 13 | * PyTorch 14 | * gym 15 | * gym[atari] 16 | * ppaquette_gym_super_mario 17 | * fceux 18 | 19 | # Getting Started 20 | 21 | 以下以Ubuntu 16.04 LTS環境為準,安裝Anaconda時請一路Enter與Yes到底。 22 | 23 | ``` 24 | wget https://repo.continuum.io/archive/Anaconda3-4.4.0-Linux-x86_64.sh 25 | bash Anaconda3-4.4.0-Linux-x86_64.sh 26 | source .bashrc 27 | conda install pytorch torchvision -c soumith 28 | conda install libgcc 29 | pip install gym[Atari] 30 | sudo apt-get update 31 | sudo apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig 32 | sudo apt-get install fceux 33 | pip install git+https://github.com/ppaquette/gym-super-mario/ 34 | ``` 35 | 36 | # How to run 37 | 38 | * super-mario-bros 39 | ``` 40 | xvfb-run -s "-screen 0 1400x900x24" python train_mario.py 41 | ``` 42 | 43 | * atari-pong 44 | ``` 45 | python train_pong.py 46 | ``` 47 | 48 | # Result 49 | 50 | * Super-Mario-Bros 51 | 52 | 使用8顆cpu在GCP上跑16個小時,RAM開24G非常足夠,但很難收斂,無法穩定過關。 53 | 訓練的影像預設位置在/video/mario/。 54 | 55 | ![](img/mario-dqn-16hr.gif) 56 | 57 | * Atari-Pong 58 | 59 | 使用1張GPU(Nvidia Tesla K80)加4顆cpu在GCP上跑8個小時,能夠穩定大幅贏電腦。 60 | 訓練的影像預設位置在/video/gym-reslults/。 61 | 62 | ![](img/pong-dqn-8hr.gif) 63 | 64 | 65 | # References 66 | 67 | [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) 68 | [openai/baselines](https://github.com/openai/baselines) 69 | [transedward/pytorch-dqn](https://github.com/transedward/pytorch-dqn) 70 | [openai/gym](https://github.com/openai/gym) 71 | [ppaquette/gym-super-mario](https://github.com/ppaquette/gym-super-mario) 72 | -------------------------------------------------------------------------------- /train_pong.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | import gym 5 | import torch 6 | import torch.optim as optim 7 | 8 | from collections import namedtuple 9 | from gym import wrappers 10 | 11 | from deepq.learn import learning 12 | from deepq.model import DQN 13 | 14 | from common.atari_wrapper import wrap_dqn 15 | from common.schedule import LinearSchedule 16 | 17 | 18 | SEED = 0 19 | BATCH_SIZE = 32 20 | GAMMA = 0.99 21 | REPLAY_BUFFER_SIZE = 1000000 22 | LEARNING_STARTS = 50000 23 | LEARNING_FREQ = 4 24 | FRAME_HISTORY_LEN = 4 25 | TARGET_UPDATE_FREQ = 10000 26 | LEARNING_RATE = 0.00025 27 | ALPHA = 0.95 28 | EPS = 0.01 29 | 30 | 31 | def main(env): 32 | ### 首先要為隨時間改變的參數設定schedule 33 | # This is a just rough estimate 34 | num_iterations = float(40000000) / 4.0 35 | 36 | 37 | # define exploration schedule 38 | exploration_schedule = LinearSchedule(1000000, 0.1) 39 | 40 | 41 | # optimizer 42 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"]) 43 | 44 | optimizer = OptimizerSpec( 45 | constructor=optim.RMSprop, 46 | kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS), 47 | ) 48 | 49 | 50 | learning( 51 | env=env, 52 | q_func=DQN, 53 | optimizer_spec=optimizer, 54 | exploration=exploration_schedule, 55 | 
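        # The keyword arguments below follow the DQN hyper-parameters reported
        # in Mnih et al. (2015): replay buffer size, batch size, discount
        # factor, warm-up steps before learning starts, gradient-step
        # frequency, number of stacked frames per observation, and how often
        # the target network is synchronized.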
replay_buffer_size=REPLAY_BUFFER_SIZE, 56 | batch_size=BATCH_SIZE, 57 | gamma=GAMMA, 58 | learning_starts=LEARNING_STARTS, 59 | learning_freq=LEARNING_FREQ, 60 | frame_history_len=FRAME_HISTORY_LEN, 61 | target_update_freq=TARGET_UPDATE_FREQ 62 | ) 63 | 64 | 65 | if __name__ == '__main__': 66 | 67 | env = gym.make("PongNoFrameskip-v3") 68 | 69 | # set global seeds 70 | env.seed(SEED) 71 | torch.manual_seed(SEED) 72 | np.random.seed(SEED) 73 | random.seed(SEED) 74 | 75 | # monitor & wrap the game 76 | env = wrap_dqn(env) 77 | 78 | expt_dir = 'video/gym-reslults' 79 | env = wrappers.Monitor(env, expt_dir, force=True, video_callable=lambda count: count % 50 == 0) 80 | 81 | # main 82 | main(env) -------------------------------------------------------------------------------- /train_mario.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | import numpy as np 4 | from collections import namedtuple 5 | 6 | import gym 7 | import ppaquette_gym_super_mario 8 | from gym import wrappers 9 | 10 | import torch 11 | import torch.optim as optim 12 | 13 | from deepq.learn import mario_learning 14 | from deepq.model import DQN 15 | 16 | from common.atari_wrapper import wrap_mario 17 | from common.schedule import LinearSchedule 18 | 19 | 20 | SEED = 0 21 | BATCH_SIZE = 32 22 | GAMMA = 0.99 23 | REPLAY_BUFFER_SIZE = 1000000 24 | LEARNING_STARTS = 10000 25 | LEARNING_FREQ = 4 26 | FRAME_HISTORY_LEN = 4 27 | TARGET_UPDATE_FREQ = 3000 28 | LEARNING_RATE = 0.00025 29 | ALPHA = 0.95 30 | EPS = 0.01 31 | 32 | 33 | 34 | def main(env): 35 | ### 首先要為隨時間改變的參數設定schedule 36 | # This is a just rough estimate 37 | num_iterations = float(40000000) / 4.0 38 | 39 | 40 | # define exploration schedule 41 | exploration_schedule = LinearSchedule(1000000, 0.1) 42 | 43 | 44 | # optimizer 45 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"]) 46 | 47 | optimizer = OptimizerSpec( 48 | constructor=optim.RMSprop, 49 | kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS), 50 | ) 51 | 52 | 53 | mario_learning( 54 | env=env, 55 | q_func=DQN, 56 | optimizer_spec=optimizer, 57 | exploration=exploration_schedule, 58 | replay_buffer_size=REPLAY_BUFFER_SIZE, 59 | batch_size=BATCH_SIZE, 60 | gamma=GAMMA, 61 | learning_starts=LEARNING_STARTS, 62 | learning_freq=LEARNING_FREQ, 63 | frame_history_len=FRAME_HISTORY_LEN, 64 | target_update_freq=TARGET_UPDATE_FREQ 65 | ) 66 | 67 | if __name__ == '__main__': 68 | 69 | env = gym.make("ppaquette/SuperMarioBros-1-1-v0") 70 | 71 | 72 | # set global seeds 73 | env.seed(SEED) 74 | torch.manual_seed(SEED) 75 | np.random.seed(SEED) 76 | random.seed(SEED) 77 | 78 | 79 | # monitor & wrap the game 80 | env = wrap_mario(env) 81 | 82 | expt_dir = 'video/mario' 83 | env = wrappers.Monitor(env, expt_dir, force=True, video_callable=lambda count: count % 10 == 0) 84 | 85 | # main 86 | main(env) 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DQN 2 | 3 | [English](README.md) | [中文](README_zh.md) 4 | 5 | An implementation of the DQN algorithm in PyTorch, trained on Super Mario Bros and Atari Pong. The overall architecture is inspired by openai/baselines. 6 | 7 | Warning: Training DQN requires sufficient memory. With the default replay buffer size of 1,000,000, it will use about 8 GB of RAM. 
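As a rough back-of-envelope check (using the defaults in `train_pong.py` and `deepq/replay_buffer.py`, where each stored observation is a single 84×84 greyscale frame kept as `uint8`):

```
frames = 1000000            # REPLAY_BUFFER_SIZE
bytes_per_frame = 84 * 84   # one 84x84 greyscale frame stored as uint8
print(frames * bytes_per_frame / 1e9)   # ~7.1 GB, before actions/rewards/Python overhead
```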
8 | 9 | ## Dependencies 10 | 11 | - Python 3.6 12 | - Anaconda 13 | - PyTorch 14 | - `gym` 15 | - `gym[atari]` 16 | - `ppaquette_gym_super_mario` 17 | - `fceux` 18 | 19 | ## Getting Started 20 | 21 | The following steps assume Ubuntu 16.04 LTS. During Anaconda installation, press Enter and Yes to accept the defaults. 22 | 23 | ``` 24 | wget https://repo.continuum.io/archive/Anaconda3-4.4.0-Linux-x86_64.sh 25 | bash Anaconda3-4.4.0-Linux-x86_64.sh 26 | source .bashrc 27 | conda install pytorch torchvision -c soumith 28 | conda install libgcc 29 | pip install gym[Atari] 30 | sudo apt-get update 31 | sudo apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig 32 | sudo apt-get install fceux 33 | pip install git+https://github.com/ppaquette/gym-super-mario/ 34 | ``` 35 | 36 | ## How to Run 37 | 38 | - Super Mario Bros 39 | 40 | ``` 41 | xvfb-run -s "-screen 0 1400x900x24" python train_mario.py 42 | ``` 43 | 44 | - Atari Pong 45 | 46 | ``` 47 | python train_pong.py 48 | ``` 49 | 50 | ## Results 51 | 52 | - Super Mario Bros 53 | 54 | Trained on GCP for 16 hours using 8 CPUs. 24 GB of RAM is more than sufficient, but training is hard to converge and it does not consistently clear levels. Training videos are saved by default to `/video/mario/`. 55 | 56 | ![](img/mario-dqn-16hr.gif) 57 | 58 | - Atari Pong 59 | 60 | Trained for 8 hours on GCP with 1 GPU (Nvidia Tesla K80) and 4 CPUs. It can reliably and decisively beat the computer. Training videos are saved by default to `/video/gym-reslults/`. 61 | 62 | ![](img/pong-dqn-8hr.gif) 63 | 64 | ## References 65 | 66 | - [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) 67 | - [openai/baselines](https://github.com/openai/baselines) 68 | - [transedward/pytorch-dqn](https://github.com/transedward/pytorch-dqn) 69 | - [openai/gym](https://github.com/openai/gym) 70 | - [ppaquette/gym-super-mario](https://github.com/ppaquette/gym-super-mario) 71 | 72 | -------------------------------------------------------------------------------- /deepq/replay_buffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import random 4 | 5 | def sample_n_unique(sampling_f, n): 6 | res = [] 7 | while len(res) < n: 8 | candidate = sampling_f() 9 | if candidate not in res: 10 | res.append(candidate) 11 | return res 12 | 13 | 14 | class ReplayBuffer(object): 15 | def __init__(self, size, frame_history_len): # size 代表 replay buffer size 16 | self.size = size 17 | self.frame_history_len = frame_history_len 18 | 19 | self.next_idx = 0 20 | self.num_in_buffer = 0 21 | 22 | self.obs = None 23 | self.action = None 24 | self.reward = None 25 | self.done = None 26 | 27 | def can_sample(self, batch_size): # return True or False 28 | return batch_size + 1 <= self.num_in_buffer 29 | 30 | def _encode_sample(self, idxes): 31 | obs_batch = np.concatenate([self._encode_observation(idx)[np.newaxis, :] for idx in idxes], 0) 32 | act_batch = self.action[idxes] 33 | rew_batch = self.reward[idxes] 34 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[np.newaxis, :] for idx in idxes], 0) 35 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) # 將True, False轉為1, 0 36 | 37 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 38 | 39 | def sample(self, batch_size): # return batch of (obs, act, rew, next_obs) and 
done_mask 40 | # 先確認大小足夠能抽樣才進行抽樣 41 | assert self.can_sample(batch_size) 42 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 43 | return self._encode_sample(idxes) 44 | 45 | def encode_recent_observation(self): # return the most recent `frame_history_len` frames. 46 | assert self.num_in_buffer > 0 47 | return self._encode_observation((self.next_idx - 1) % self.size) 48 | 49 | def _encode_observation(self, idx): # return list of frame 50 | end_idx = idx + 1 51 | start_idx = end_idx - self.frame_history_len 52 | 53 | # check if it is low-dimensional obs, such as RAM 54 | if len(self.obs.shape) == 2: return self.obs[end_idx-1] 55 | 56 | # 假如buffer裡沒有足夠的frame時 57 | if start_idx < 0 and self.num_in_buffer != self.size: start_idx = 0 58 | 59 | # 標註每個start idx 60 | for idx in range(start_idx, end_idx - 1): 61 | if self.done[idx % self.size]: 62 | start_idx = idx + 1 63 | 64 | # 處理頭幾個frame,因為idx出現負數時沒有frame 65 | missing_context = self.frame_history_len - (end_idx - start_idx) 66 | if start_idx < 0 or missing_context > 0: 67 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 68 | # 拿別的frame來填補 69 | for idx in range(start_idx, end_idx): 70 | frames.append(self.obs[idx % self.size]) 71 | return np.concatenate(frames, 0) 72 | else: 73 | # 底下的處理可以節約30%的計算時間 74 | img_h, img_w = self.obs.shape[2], self.obs.shape[3] 75 | return self.obs[start_idx:end_idx].reshape(-1, img_h, img_w) 76 | 77 | def store_frame(self, frame): # return idx of frame 78 | if len(frame.shape) > 1: 79 | # transpose image frame into (img_c, img_h, img_w) 80 | frame = frame.transpose(2, 0, 1) 81 | 82 | if self.obs is None: 83 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 84 | self.action = np.empty([self.size], dtype=np.int32) 85 | self.reward = np.empty([self.size], dtype=np.float32) 86 | self.done = np.empty([self.size], dtype=np.bool) 87 | 88 | self.obs[self.next_idx] = frame 89 | 90 | ret = self.next_idx 91 | self.next_idx = (self.next_idx + 1) % self.size 92 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 93 | 94 | return ret 95 | 96 | def store_effect(self, idx, action, reward, done): 97 | self.action[idx] = action 98 | self.reward[idx] = reward 99 | self.done[idx] = done -------------------------------------------------------------------------------- /common/atari_wrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | from PIL import Image 7 | 8 | 9 | # 重新實作step與reset,主要是reset,以當lives當指標來當作episode真正的結束 10 | class EpisodicLifeEnv(gym.Wrapper): 11 | def __init__(self, env=None): 12 | super(EpisodicLifeEnv, self).__init__(env) 13 | self.lives = 0 14 | self.was_real_done = True 15 | self.was_real_reset = False 16 | 17 | def _step(self, action): 18 | obs, reward, done, info = self.env.step(action) 19 | self.was_real_done = done 20 | # 不太清楚使用這個lives是為了什麼 21 | lives = self.env.unwrapped.ale.lives() 22 | if lives < self.lives and lives > 0: 23 | done = True 24 | self.lives = lives 25 | return obs, reward, done, info 26 | 27 | def _reset(self): 28 | if self.was_real_done: 29 | obs = self.env.reset() 30 | self.was_real_reset = True 31 | else: 32 | obs, _, _, _ = self.env.step(0) 33 | self.was_real_reset = False 34 | self.lives = self.env.unwrapped.ale.lives() 35 | return obs 36 | 37 | 38 | 39 | # 先隨機進行N次noop(no opearation)當作初始obs,但不知道這樣的好處為何? 
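# (The benefit: starting each episode with a random number of no-op steps gives the
#  agent a slightly different initial state every time, so it cannot memorize one
#  fixed opening sequence. This is the random no-op starts trick used in the Nature
#  DQN paper, Mnih et al. 2015.)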
40 | class NoopResetEnv(gym.Wrapper): 41 | def __init__(self, env=None, noop_max=30): 42 | super(NoopResetEnv, self).__init__(env) 43 | self.noop_max = noop_max 44 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 45 | 46 | def _reset(self): 47 | self.env.reset() 48 | noops = np.random.randint(1, self.noop_max + 1) 49 | for _ in range(noops): 50 | obs, _, _, _ = self.env.step(0) 51 | return obs 52 | 53 | 54 | 55 | # 實作每4個frames當作一次sample 56 | class MaxAndSkipEnv(gym.Wrapper): 57 | def __init__(self, env=None, skip=4): 58 | """Return only every `skip`-th frame""" 59 | super(MaxAndSkipEnv, self).__init__(env) 60 | self._obs_buffer = deque(maxlen=2) 61 | self._skip = skip 62 | 63 | def _step(self, action): 64 | total_reward = 0.0 65 | done = None 66 | for _ in range(self._skip): 67 | obs, reward, done, info = self.env.step(action) 68 | self._obs_buffer.append(obs) 69 | total_reward += reward 70 | if done: 71 | break 72 | 73 | # 選倒數兩個frame中較大的那一個,但我不太清楚為何要這樣? 74 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 75 | 76 | return max_frame, total_reward, done, info 77 | 78 | def _reset(self): 79 | # 清掉buffer,並掛上初始obs當作deque的初始狀態 80 | self._obs_buffer.clear() 81 | obs = self.env.reset() 82 | self._obs_buffer.append(obs) 83 | return obs 84 | 85 | 86 | 87 | # 先Fire一發,再走一步action來當作初始obs,一樣還是不知道為何要這樣做 88 | class FireResetEnv(gym.Wrapper): 89 | def __init__(self, env=None): 90 | super(FireResetEnv, self).__init__(env) 91 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 92 | assert len(env.unwrapped.get_action_meanings()) >= 3 93 | 94 | def _reset(self): 95 | self.env.reset() 96 | obs, _, _, _ = self.env.step(1) 97 | obs, _, _, _ = self.env.step(2) 98 | return obs 99 | 100 | 101 | 102 | # 遊戲畫面前處理 103 | def _process_frame84(frame): 104 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 105 | # RGB轉灰階 106 | # https://en.wikipedia.org/wiki/Grayscale#Converting_color_to_grayscale 107 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 108 | # 轉為Image物件,使用BILINEAR插值 109 | img = Image.fromarray(img) 110 | resized_screen = img.resize((84, 110), Image.BILINEAR) 111 | resized_screen = np.array(resized_screen) 112 | x_t = resized_screen[18:102, :] 113 | x_t = np.reshape(x_t, [84, 84, 1]) 114 | return x_t.astype(np.uint8) 115 | 116 | 117 | 118 | # 重新實作經過前處理的step與reset 119 | class ProcessFrame84(gym.Wrapper): 120 | def __init__(self, env=None): 121 | super(ProcessFrame84, self).__init__(env) 122 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 123 | 124 | def _step(self, action): 125 | obs, reward, done, info = self.env.step(action) 126 | return _process_frame84(obs), reward, done, info 127 | 128 | def _reset(self): 129 | return _process_frame84(self.env.reset()) 130 | 131 | 132 | 133 | # 假如reward大於0 -> 1,等於0 -> 0,小於0 -> -1 134 | class ClippedRewardsWrapper(gym.Wrapper): 135 | def _step(self, action): 136 | obs, reward, done, info = self.env.step(action) 137 | return obs, np.sign(reward), done, info 138 | 139 | 140 | 141 | def wrap_dqn(env): 142 | assert 'NoFrameskip' in env.spec.id 143 | env = EpisodicLifeEnv(env) 144 | env = NoopResetEnv(env, noop_max=30) 145 | env = MaxAndSkipEnv(env, skip=4) 146 | if 'FIRE' in env.unwrapped.get_action_meanings(): 147 | env = FireResetEnv(env) 148 | env = ProcessFrame84(env) 149 | env = ClippedRewardsWrapper(env) 150 | return env 151 | 152 | 153 | 154 | 155 | # 針對mario去修改size 156 | def _process_frame_mario(frame): 157 | img = np.reshape(frame, [224, 256, 3]).astype(np.float32) 158 | # RGB轉灰階 159 | 
# https://en.wikipedia.org/wiki/Grayscale#Converting_color_to_grayscale 160 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 161 | # 轉為Image物件,使用BILINEAR插值 162 | img = Image.fromarray(img) 163 | resized_screen = img.resize((84, 110), Image.BILINEAR) 164 | resized_screen = np.array(resized_screen) 165 | x_t = resized_screen[18:102, :] 166 | x_t = np.reshape(x_t, [84, 84, 1]) 167 | return x_t.astype(np.uint8) 168 | 169 | 170 | 171 | # 重新實作經過前處理的step與reset 172 | class ProcessFrameMario(gym.Wrapper): 173 | def __init__(self, env=None): 174 | super(ProcessFrameMario, self).__init__(env) 175 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 176 | 177 | def _step(self, action): 178 | obs, reward, done, info = self.env.step(action) 179 | return _process_frame_mario(obs), reward, done, info 180 | 181 | def _reset(self): 182 | return _process_frame_mario(self.env.reset()) 183 | 184 | 185 | 186 | def wrap_mario(env): 187 | assert 'SuperMarioBros' in env.spec.id 188 | env = MaxAndSkipEnv(env, skip=4) 189 | env = ProcessFrameMario(env) 190 | return env -------------------------------------------------------------------------------- /deepq/learn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sys 4 | import random 5 | import numpy as np 6 | from itertools import count 7 | 8 | import gym 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.autograd as autograd 13 | import torch.nn.functional as F 14 | 15 | from .replay_buffer import ReplayBuffer 16 | 17 | 18 | # detect GPU 19 | USE_CUDA = torch.cuda.is_available() 20 | dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor 21 | 22 | 23 | class Variable(autograd.Variable): 24 | def __init__(self, data, *args, **kwargs): 25 | if USE_CUDA: 26 | data = data.cuda() 27 | super(Variable, self).__init__(data, *args, **kwargs) 28 | 29 | 30 | def learning( 31 | env, 32 | q_func, 33 | optimizer_spec, 34 | exploration, 35 | replay_buffer_size=1000000, 36 | batch_size=32, 37 | gamma=0.99, 38 | learning_starts=50000, 39 | learning_freq=4, 40 | frame_history_len=4, 41 | target_update_freq=10000 42 | ): 43 | 44 | assert type(env.observation_space) == gym.spaces.Box 45 | assert type(env.action_space) == gym.spaces.Discrete 46 | 47 | 48 | # 檢查是否是low-dimensional observations (e.g. 
RAM) 49 | if len(env.observation_space.shape) == 1: 50 | input_arg = env.observation_space.shape[0] 51 | else: 52 | img_h, img_w, img_c = env.observation_space.shape 53 | input_arg = frame_history_len * img_c # 實作論文中的每4 frame擷取一次 54 | 55 | num_actions = env.action_space.n 56 | 57 | 58 | # Construct an epilson greedy policy with given exploration schedule 59 | def select_epilson_greedy_action(model, obs, t): 60 | sample = random.random() 61 | eps_threshold = exploration.value(t) 62 | if sample > eps_threshold: 63 | obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 64 | return model(Variable(obs, volatile=True)).data.max(1)[1].view(1,1) 65 | else: 66 | return torch.IntTensor([[random.randrange(num_actions)]]) 67 | 68 | 69 | # Initialize target q function and q function 70 | Q = q_func(input_arg, num_actions).type(dtype) 71 | target_Q = q_func(input_arg, num_actions).type(dtype) 72 | 73 | # Check & load pretrain model 74 | if os.path.isfile('Q_params.pkl'): 75 | print('Load Q parametets ...') 76 | Q.load_state_dict(torch.load('Q_params.pkl')) 77 | 78 | if os.path.isfile('target_Q_params.pkl'): 79 | print('Load target Q parameters ...') 80 | target_Q.load_state_dict(torch.load('target_Q_params.pkl')) 81 | 82 | 83 | # Construct Q network optimizer function 84 | optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) 85 | 86 | # Construct the replay buffer 87 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 88 | 89 | 90 | ### RUN ENV 91 | num_param_updates = 0 92 | mean_episode_reward = -float('nan') 93 | best_mean_episode_reward = -float('inf') 94 | last_obs = env.reset() 95 | LOG_EVERY_N_STEPS = 10000 96 | 97 | 98 | for t in count(): 99 | ### Step the env and store the transition 100 | last_idx = replay_buffer.store_frame(last_obs) 101 | # 將最新的observation與最近的幾個frame concat在一起,才能丟進Q網路 102 | recent_observations = replay_buffer.encode_recent_observation() 103 | 104 | 105 | # buffer 收集到一定的量才開始學習 106 | if t > learning_starts: 107 | action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0] 108 | else: 109 | action = random.randrange(num_actions) 110 | 111 | obs, reward, done, _ = env.step(action) 112 | reward = max(-1.0, min(reward, 1.0)) 113 | replay_buffer.store_effect(last_idx, action, reward, done) # 將新的資訊存入buffer中 114 | 115 | if done: 116 | obs = env.reset() 117 | last_obs = obs 118 | 119 | ### 從buffer中抽樣並以target network的方式訓練 120 | if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): 121 | obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size) 122 | 123 | obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0) 124 | act_batch = Variable(torch.from_numpy(act_batch).long()) 125 | rew_batch = Variable(torch.from_numpy(rew_batch)) 126 | next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0) 127 | not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) # 如果下一個state是episode中的最後一個,則done_mask = 1 128 | 129 | if USE_CUDA: 130 | act_batch = act_batch.cuda() 131 | rew_batch = rew_batch.cuda() 132 | 133 | # 從抽出的batch observation中得出現在的Q值 134 | current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze() 135 | # 用next_obs_batch計算下一個Q值,detach代表將target network從graph中分離,不去計算它的gradient 136 | next_max_q = target_Q(next_obs_batch).detach().max(1)[0] 137 | next_Q_values = not_done_mask * next_max_q 138 | # TD value 139 | target_Q_values = rew_batch + (gamma * next_Q_values) 140 | # Compute Bellman 
error 141 | # bellman_error = target_Q_values - current_Q_values 142 | # clip the bellman error between [-1, 1] 143 | # clipped_bellman_error = bellman_error.clamp(-1, 1) 144 | # 要 * -1 才是正確的gradient,why? 145 | # d_error = clipped_bellman_error * -1.0 146 | 147 | # print(d_error) 148 | loss = F.smooth_l1_loss(current_Q_values, target_Q_values) 149 | # backward & update 150 | optimizer.zero_grad() 151 | # current_Q_values.backward(d_error.data)#.unsqueeze(1)) 152 | loss.backward() 153 | # Clip the gradients to lie between -1 and +1 154 | for params in Q.parameters(): 155 | params.grad.data.clamp_(-1, 1) 156 | 157 | optimizer.step() 158 | num_param_updates += 1 159 | 160 | # 每隔一段時間才更新target network 161 | if num_param_updates % target_update_freq == 0: 162 | target_Q.load_state_dict(Q.state_dict()) 163 | 164 | ### Log & track 165 | # 要用gym.wrappers中的Monitor將env包起來,才有get_episode_rewards屬性,返回值為list 166 | episode_rewards = env.get_episode_rewards() 167 | if len(episode_rewards) > 0: 168 | mean_episode_reward = np.mean(episode_rewards[-100:]) # 最近100次reward的平均 169 | if len(episode_rewards) > 100: 170 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 171 | 172 | if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: 173 | print("Timestep %d" % (t,)) 174 | print("mean reward (100 episodes) %f" % mean_episode_reward) 175 | print("best mean reward %f" % best_mean_episode_reward) 176 | print("episodes %d" % len(episode_rewards)) 177 | print("exploration %f" % exploration.value(t)) 178 | sys.stdout.flush() 179 | 180 | # Save the trained model 181 | torch.save(Q.state_dict(), 'Q_params.pkl') 182 | torch.save(target_Q.state_dict(), 'target_Q_params.pkl') 183 | 184 | 185 | ######### 186 | # Mario # 187 | ######### 188 | 189 | def mario_learning( 190 | env, 191 | q_func, 192 | optimizer_spec, 193 | exploration, 194 | replay_buffer_size=1000000, 195 | batch_size=32, 196 | gamma=0.99, 197 | learning_starts=50000, 198 | learning_freq=4, 199 | frame_history_len=4, 200 | target_update_freq=10000 201 | ): 202 | 203 | assert type(env.observation_space) == gym.spaces.Box 204 | assert type(env.action_space) == gym.spaces.MultiDiscrete 205 | 206 | 207 | # 檢查是否是low-dimensional observations (e.g. 
RAM) 208 | if len(env.observation_space.shape) == 1: 209 | input_arg = env.observation_space.shape[0] 210 | else: 211 | img_h, img_w, img_c = env.observation_space.shape 212 | input_arg = frame_history_len * img_c # 實作論文中的每4 frame擷取一次 213 | 214 | 215 | num_actions = env.action_space.shape 216 | 217 | 218 | # Construct an epilson greedy policy with given exploration schedule 219 | def select_epilson_greedy_action(model, obs, t): 220 | sample = random.random() 221 | eps_threshold = exploration.value(t) 222 | if sample > eps_threshold: 223 | obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 224 | return model(Variable(obs, volatile=True)).data.max(1)[1].cpu() 225 | else: 226 | return torch.IntTensor([[random.randrange(num_actions)]]) 227 | 228 | # to one hot 229 | def to_onehot(action, num_actions): 230 | action = action % num_actions 231 | if action == 0: 232 | # Move right while jumping 233 | action_onehot = np.array([0, 0, 0, 1, 1, 0]) 234 | else: 235 | action_onehot = np.zeros(num_actions, dtype=int) 236 | action_onehot[action] = 1 237 | return action_onehot 238 | 239 | 240 | # Initialize target q function and q function 241 | Q = q_func(input_arg, num_actions).type(dtype) 242 | target_Q = q_func(input_arg, num_actions).type(dtype) 243 | 244 | # Check & load pretrain model 245 | if os.path.isfile('mario_Q_params.pkl'): 246 | print('Load Q parametets ...') 247 | Q.load_state_dict(torch.load('mario_Q_params.pkl')) 248 | 249 | if os.path.isfile('mario_target_Q_params.pkl'): 250 | print('Load target Q parameters ...') 251 | target_Q.load_state_dict(torch.load('mario_target_Q_params.pkl')) 252 | 253 | 254 | # Construct Q network optimizer function 255 | optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) 256 | 257 | # Construct the replay buffer 258 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 259 | 260 | ### RUN ENV 261 | num_param_updates = 0 262 | mean_episode_reward = -float('nan') 263 | best_mean_episode_reward = -float('inf') 264 | last_obs = env.reset() 265 | LOG_EVERY_N_STEPS = 10000 266 | 267 | 268 | for t in count(): 269 | print('timestep:', t) 270 | ### Step the env and store the transition 271 | last_idx = replay_buffer.store_frame(last_obs) 272 | # 將最新的observation與最近的幾個frame concat在一起,才能丟進Q網路 273 | recent_observations = replay_buffer.encode_recent_observation() 274 | 275 | 276 | # buffer 收集到一定的量才開始學習 277 | if t > learning_starts: 278 | action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0] 279 | else: 280 | action = random.randrange(num_actions) 281 | 282 | # one hot encoding 283 | act_onehot = to_onehot(action, num_actions) 284 | 285 | obs, reward, done, _ = env.step(act_onehot) 286 | #reward = max(-1.0, min(reward, 1.0)) 287 | replay_buffer.store_effect(last_idx, action, reward, done) # 將新的資訊存入buffer中 288 | 289 | if done: 290 | obs = env.reset() 291 | last_obs = obs 292 | 293 | ### 從buffer中抽樣並以target network的方式訓練 294 | if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): 295 | obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size) 296 | 297 | obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0) 298 | act_batch = Variable(torch.from_numpy(act_batch).long()) 299 | rew_batch = Variable(torch.from_numpy(rew_batch)) 300 | next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0) 301 | not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) # 
如果下一個state是episode中的最後一個,則done_mask = 1 302 | 303 | if USE_CUDA: 304 | act_batch = act_batch.cuda() 305 | rew_batch = rew_batch.cuda() 306 | 307 | # 從抽出的batch observation中得出現在的Q值 308 | current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)) 309 | # 用next_obs_batch計算下一個Q值,detach代表將target network從graph中分離,不去計算它的gradient 310 | next_max_q = target_Q(next_obs_batch).detach().max(1)[0] 311 | next_Q_values = not_done_mask * next_max_q 312 | # TD value 313 | target_Q_values = rew_batch + (gamma * next_Q_values) 314 | # Compute Bellman error 315 | bellman_error = target_Q_values - current_Q_values 316 | # clip the bellman error between [-1, 1] 317 | clipped_bellman_error = bellman_error.clamp(-1, 1) 318 | # 要 * -1 才是正確的gradient,why? 319 | d_error = clipped_bellman_error * -1.0 320 | 321 | print(d_error) 322 | 323 | # backward & update 324 | optimizer.zero_grad() 325 | current_Q_values.backward(d_error.data.unsqueeze(1)) 326 | 327 | optimizer.step() 328 | num_param_updates += 1 329 | 330 | # 每隔一段時間才更新target network 331 | if num_param_updates % target_update_freq == 0: 332 | target_Q.load_state_dict(Q.state_dict()) 333 | 334 | ### Log & track 335 | # 要用gym.wrappers中的Monitor將env包起來,才有get_episode_rewards屬性,返回值為list 336 | episode_rewards = env.get_episode_rewards() 337 | if len(episode_rewards) > 0: 338 | mean_episode_reward = np.mean(episode_rewards[-100:]) # 最近100次reward的平均 339 | if len(episode_rewards) > 100: 340 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 341 | 342 | if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: 343 | print("Timestep %d" % (t,)) 344 | print("mean reward (100 episodes) %f" % mean_episode_reward) 345 | print("best mean reward %f" % best_mean_episode_reward) 346 | print("episodes %d" % len(episode_rewards)) 347 | print("exploration %f" % exploration.value(t)) 348 | sys.stdout.flush() 349 | 350 | # Save the trained model 351 | torch.save(Q.state_dict(), 'mario_Q_params.pkl') 352 | torch.save(target_Q.state_dict(), 'mario_target_Q_params.pkl') 353 | --------------------------------------------------------------------------------
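As a closing reference, the update step that `learning` and `mario_learning` build up over many lines can be condensed as follows. This is only an illustrative sketch: it uses the current PyTorch tensor API instead of the deprecated `Variable` wrapper above, and the batch tensors are random stand-ins for samples drawn from the replay buffer; it shows the TD target and the Huber loss used on the Pong code path.

```
import torch
import torch.nn.functional as F

# Stand-in batch (normally sampled from the replay buffer): 32 transitions, 6 actions.
batch_size, num_actions, gamma = 32, 6, 0.99
q_values      = torch.randn(batch_size, num_actions, requires_grad=True)  # Q(s, .)
next_q_target = torch.randn(batch_size, num_actions)                      # target net Q(s', .)
actions  = torch.randint(0, num_actions, (batch_size,))
rewards  = torch.randn(batch_size)
not_done = torch.ones(batch_size)   # 0.0 where the episode ended at s'

# Q(s, a) for the actions that were actually taken
current_q = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
# TD target: r + gamma * max_a' Q_target(s', a'), zeroed for terminal transitions
td_target = rewards + gamma * not_done * next_q_target.max(1)[0]
# Huber loss, followed in learn.py by clamping every gradient to [-1, 1]
loss = F.smooth_l1_loss(current_q, td_target)
loss.backward()
```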