├── common
    ├── __init__.py
    ├── schedule.py
    └── atari_wrapper.py
├── deepq
    ├── __init__.py
    ├── model.py
    ├── replay_buffer.py
    └── learn.py
├── img
    ├── pong-dqn-8hr.gif
    └── mario-dqn-16hr.gif
├── README_zh.md
├── train_pong.py
├── train_mario.py
└── README.md
/common/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/deepq/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/img/pong-dqn-8hr.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nailo2c/dqn-mario/HEAD/img/pong-dqn-8hr.gif
--------------------------------------------------------------------------------
/img/mario-dqn-16hr.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nailo2c/dqn-mario/HEAD/img/mario-dqn-16hr.gif
--------------------------------------------------------------------------------
/deepq/model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class DQN(nn.Module):
6 |     def __init__(self, in_channels=4, num_actions=18):
7 |         super(DQN, self).__init__()
8 |         self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
9 |         self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
10 |         self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
11 |         self.fc4 = nn.Linear(7 * 7 * 64, 512)
12 |         self.fc5 = nn.Linear(512, num_actions)
13 |
14 |     def forward(self, x):
15 |         x = F.relu(self.conv1(x))
16 |         x = F.relu(self.conv2(x))
17 |         x = F.relu(self.conv3(x))
18 |         x = F.relu(self.fc4(x.view(x.size(0), -1)))
19 |         return self.fc5(x)
--------------------------------------------------------------------------------
/common/schedule.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | def linear_interpolation(l, r, alpha):
3 |     return l + alpha * (r - l)
4 |
5 |
6 | class PiecewiseSchedule(object):
7 |     def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
8 |         """PiecewiseSchedule
9 |         endpoints: [(int, float)]
10 |             list of pairs `(time, value)`; the schedule takes the given value at time=t
11 |         interpolation: lambda float, float, float: float
12 |         outside_value: float
13 |         """
14 |         idxes = [e[0] for e in endpoints]
15 |         assert idxes == sorted(idxes)
16 |         self._interpolation = interpolation
17 |         self._outside_value = outside_value
18 |         self._endpoints = endpoints
19 |
20 |     def value(self, t):
21 |         """See Schedule.value"""
22 |         # if t lies between l_t and r_t, interpolate between their values
23 |         for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
24 |             if l_t <= t and t <= r_t:
25 |                 alpha = float(t - l_t) / (r_t - l_t)
26 |                 return self._interpolation(l, r, alpha)
27 |
28 |         # if t is not covered by any piece, return the outside value
29 |         assert self._outside_value is not None
30 |         return self._outside_value
31 |
32 |
33 |
34 | class LinearSchedule(object):
35 |     def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
36 |         self.schedule_timesteps = schedule_timesteps
37 |         self.final_p = final_p
38 |         self.initial_p = initial_p
39 |
40 |     def value(self, t):
41 |         fraction = min(float(t) / self.schedule_timesteps, 1.0)
42 |         return self.initial_p + fraction * (self.final_p - self.initial_p)
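# ---------------------------------------------------------------------------
# Illustrative usage (a sketch added for clarity, not part of the original
# module): train_pong.py and train_mario.py anneal the exploration rate with
# LinearSchedule(1000000, 0.1); PiecewiseSchedule works the same way across
# several segments. The endpoint numbers below are examples only.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    eps = LinearSchedule(schedule_timesteps=1000000, final_p=0.1)
    print(eps.value(0))         # 1.0   (initial_p)
    print(eps.value(500000))    # ~0.55 (halfway through the anneal)
    print(eps.value(2000000))   # 0.1   (clamped once t exceeds schedule_timesteps)

    pw = PiecewiseSchedule([(0, 1.0), (100000, 0.1), (200000, 0.01)],
                           outside_value=0.01)
    print(pw.value(150000))     # ~0.055 (linear interpolation inside a piece)
    print(pw.value(10 ** 7))    # 0.01   (falls back to outside_value)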
-------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | # DQN 2 | 3 | [English](README.md) | [中文](README_zh.md) 4 | 5 | 使用PyTorch實作DQN演算法,並訓練super-mario-bros以及atari-pong,整體架構參考openai/baselines。 6 | 7 | *Warning*:訓練DQN請開足夠的記憶體,Replay Buffer以預設值1000000為例至少會使用約8G的記憶體。 8 |   9 | # Dependencies 10 | 11 | * Python 3.6 12 | * Anaconda 13 | * PyTorch 14 | * gym 15 | * gym[atari] 16 | * ppaquette_gym_super_mario 17 | * fceux 18 | 19 | # Getting Started 20 | 21 | 以下以Ubuntu 16.04 LTS環境為準,安裝Anaconda時請一路Enter與Yes到底。 22 | 23 | ``` 24 | wget https://repo.continuum.io/archive/Anaconda3-4.4.0-Linux-x86_64.sh 25 | bash Anaconda3-4.4.0-Linux-x86_64.sh 26 | source .bashrc 27 | conda install pytorch torchvision -c soumith 28 | conda install libgcc 29 | pip install gym[Atari] 30 | sudo apt-get update 31 | sudo apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig 32 | sudo apt-get install fceux 33 | pip install git+https://github.com/ppaquette/gym-super-mario/ 34 | ``` 35 | 36 | # How to run 37 | 38 | * super-mario-bros 39 | ``` 40 | xvfb-run -s "-screen 0 1400x900x24" python train_mario.py 41 | ``` 42 | 43 | * atari-pong 44 | ``` 45 | python train_pong.py 46 | ``` 47 | 48 | # Result 49 | 50 | * Super-Mario-Bros 51 | 52 | 使用8顆cpu在GCP上跑16個小時,RAM開24G非常足夠,但很難收斂,無法穩定過關。 53 | 訓練的影像預設位置在/video/mario/。 54 | 55 | ![](img/mario-dqn-16hr.gif) 56 | 57 | * Atari-Pong 58 | 59 | 使用1張GPU(Nvidia Tesla K80)加4顆cpu在GCP上跑8個小時,能夠穩定大幅贏電腦。 60 | 訓練的影像預設位置在/video/gym-reslults/。 61 | 62 | ![](img/pong-dqn-8hr.gif) 63 | 64 | 65 | # References 66 | 67 | [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) 68 | [openai/baselines](https://github.com/openai/baselines) 69 | [transedward/pytorch-dqn](https://github.com/transedward/pytorch-dqn) 70 | [openai/gym](https://github.com/openai/gym) 71 | [ppaquette/gym-super-mario](https://github.com/ppaquette/gym-super-mario) 72 | -------------------------------------------------------------------------------- /train_pong.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | import gym 5 | import torch 6 | import torch.optim as optim 7 | 8 | from collections import namedtuple 9 | from gym import wrappers 10 | 11 | from deepq.learn import learning 12 | from deepq.model import DQN 13 | 14 | from common.atari_wrapper import wrap_dqn 15 | from common.schedule import LinearSchedule 16 | 17 | 18 | SEED = 0 19 | BATCH_SIZE = 32 20 | GAMMA = 0.99 21 | REPLAY_BUFFER_SIZE = 1000000 22 | LEARNING_STARTS = 50000 23 | LEARNING_FREQ = 4 24 | FRAME_HISTORY_LEN = 4 25 | TARGET_UPDATE_FREQ = 10000 26 | LEARNING_RATE = 0.00025 27 | ALPHA = 0.95 28 | EPS = 0.01 29 | 30 | 31 | def main(env): 32 | ### 首先要為隨時間改變的參數設定schedule 33 | # This is a just rough estimate 34 | num_iterations = float(40000000) / 4.0 35 | 36 | 37 | # define exploration schedule 38 | exploration_schedule = LinearSchedule(1000000, 0.1) 39 | 40 | 41 | # optimizer 42 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"]) 43 | 44 | optimizer = OptimizerSpec( 45 | constructor=optim.RMSprop, 46 | kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS), 47 | ) 48 | 49 | 50 | learning( 51 | env=env, 52 | q_func=DQN, 53 | optimizer_spec=optimizer, 54 | exploration=exploration_schedule, 55 | 
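        # The keyword arguments below follow the DQN hyper-parameters reported
        # in Mnih et al. (2015): replay buffer size, batch size, discount
        # factor, warm-up steps before learning starts, gradient-step
        # frequency, number of stacked frames per observation, and how often
        # the target network is synchronized.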
replay_buffer_size=REPLAY_BUFFER_SIZE, 56 | batch_size=BATCH_SIZE, 57 | gamma=GAMMA, 58 | learning_starts=LEARNING_STARTS, 59 | learning_freq=LEARNING_FREQ, 60 | frame_history_len=FRAME_HISTORY_LEN, 61 | target_update_freq=TARGET_UPDATE_FREQ 62 | ) 63 | 64 | 65 | if __name__ == '__main__': 66 | 67 | env = gym.make("PongNoFrameskip-v3") 68 | 69 | # set global seeds 70 | env.seed(SEED) 71 | torch.manual_seed(SEED) 72 | np.random.seed(SEED) 73 | random.seed(SEED) 74 | 75 | # monitor & wrap the game 76 | env = wrap_dqn(env) 77 | 78 | expt_dir = 'video/gym-reslults' 79 | env = wrappers.Monitor(env, expt_dir, force=True, video_callable=lambda count: count % 50 == 0) 80 | 81 | # main 82 | main(env) -------------------------------------------------------------------------------- /train_mario.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | import numpy as np 4 | from collections import namedtuple 5 | 6 | import gym 7 | import ppaquette_gym_super_mario 8 | from gym import wrappers 9 | 10 | import torch 11 | import torch.optim as optim 12 | 13 | from deepq.learn import mario_learning 14 | from deepq.model import DQN 15 | 16 | from common.atari_wrapper import wrap_mario 17 | from common.schedule import LinearSchedule 18 | 19 | 20 | SEED = 0 21 | BATCH_SIZE = 32 22 | GAMMA = 0.99 23 | REPLAY_BUFFER_SIZE = 1000000 24 | LEARNING_STARTS = 10000 25 | LEARNING_FREQ = 4 26 | FRAME_HISTORY_LEN = 4 27 | TARGET_UPDATE_FREQ = 3000 28 | LEARNING_RATE = 0.00025 29 | ALPHA = 0.95 30 | EPS = 0.01 31 | 32 | 33 | 34 | def main(env): 35 | ### 首先要為隨時間改變的參數設定schedule 36 | # This is a just rough estimate 37 | num_iterations = float(40000000) / 4.0 38 | 39 | 40 | # define exploration schedule 41 | exploration_schedule = LinearSchedule(1000000, 0.1) 42 | 43 | 44 | # optimizer 45 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"]) 46 | 47 | optimizer = OptimizerSpec( 48 | constructor=optim.RMSprop, 49 | kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS), 50 | ) 51 | 52 | 53 | mario_learning( 54 | env=env, 55 | q_func=DQN, 56 | optimizer_spec=optimizer, 57 | exploration=exploration_schedule, 58 | replay_buffer_size=REPLAY_BUFFER_SIZE, 59 | batch_size=BATCH_SIZE, 60 | gamma=GAMMA, 61 | learning_starts=LEARNING_STARTS, 62 | learning_freq=LEARNING_FREQ, 63 | frame_history_len=FRAME_HISTORY_LEN, 64 | target_update_freq=TARGET_UPDATE_FREQ 65 | ) 66 | 67 | if __name__ == '__main__': 68 | 69 | env = gym.make("ppaquette/SuperMarioBros-1-1-v0") 70 | 71 | 72 | # set global seeds 73 | env.seed(SEED) 74 | torch.manual_seed(SEED) 75 | np.random.seed(SEED) 76 | random.seed(SEED) 77 | 78 | 79 | # monitor & wrap the game 80 | env = wrap_mario(env) 81 | 82 | expt_dir = 'video/mario' 83 | env = wrappers.Monitor(env, expt_dir, force=True, video_callable=lambda count: count % 10 == 0) 84 | 85 | # main 86 | main(env) 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DQN 2 | 3 | [English](README.md) | [中文](README_zh.md) 4 | 5 | An implementation of the DQN algorithm in PyTorch, trained on Super Mario Bros and Atari Pong. The overall architecture is inspired by openai/baselines. 6 | 7 | Warning: Training DQN requires sufficient memory. With the default replay buffer size of 1,000,000, it will use about 8 GB of RAM. 
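As a rough back-of-envelope check (using the defaults in `train_pong.py` and `deepq/replay_buffer.py`, where each stored observation is a single 84×84 greyscale frame kept as `uint8`):

```
frames = 1000000            # REPLAY_BUFFER_SIZE
bytes_per_frame = 84 * 84   # one 84x84 greyscale frame stored as uint8
print(frames * bytes_per_frame / 1e9)   # ~7.1 GB, before actions/rewards/Python overhead
```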
8 | 9 | ## Dependencies 10 | 11 | - Python 3.6 12 | - Anaconda 13 | - PyTorch 14 | - `gym` 15 | - `gym[atari]` 16 | - `ppaquette_gym_super_mario` 17 | - `fceux` 18 | 19 | ## Getting Started 20 | 21 | The following steps assume Ubuntu 16.04 LTS. During Anaconda installation, press Enter and Yes to accept the defaults. 22 | 23 | ``` 24 | wget https://repo.continuum.io/archive/Anaconda3-4.4.0-Linux-x86_64.sh 25 | bash Anaconda3-4.4.0-Linux-x86_64.sh 26 | source .bashrc 27 | conda install pytorch torchvision -c soumith 28 | conda install libgcc 29 | pip install gym[Atari] 30 | sudo apt-get update 31 | sudo apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig 32 | sudo apt-get install fceux 33 | pip install git+https://github.com/ppaquette/gym-super-mario/ 34 | ``` 35 | 36 | ## How to Run 37 | 38 | - Super Mario Bros 39 | 40 | ``` 41 | xvfb-run -s "-screen 0 1400x900x24" python train_mario.py 42 | ``` 43 | 44 | - Atari Pong 45 | 46 | ``` 47 | python train_pong.py 48 | ``` 49 | 50 | ## Results 51 | 52 | - Super Mario Bros 53 | 54 | Trained on GCP for 16 hours using 8 CPUs. 24 GB of RAM is more than sufficient, but training is hard to converge and it does not consistently clear levels. Training videos are saved by default to `/video/mario/`. 55 | 56 | ![](img/mario-dqn-16hr.gif) 57 | 58 | - Atari Pong 59 | 60 | Trained for 8 hours on GCP with 1 GPU (Nvidia Tesla K80) and 4 CPUs. It can reliably and decisively beat the computer. Training videos are saved by default to `/video/gym-reslults/`. 61 | 62 | ![](img/pong-dqn-8hr.gif) 63 | 64 | ## References 65 | 66 | - [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) 67 | - [openai/baselines](https://github.com/openai/baselines) 68 | - [transedward/pytorch-dqn](https://github.com/transedward/pytorch-dqn) 69 | - [openai/gym](https://github.com/openai/gym) 70 | - [ppaquette/gym-super-mario](https://github.com/ppaquette/gym-super-mario) 71 | 72 | -------------------------------------------------------------------------------- /deepq/replay_buffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import random 4 | 5 | def sample_n_unique(sampling_f, n): 6 | res = [] 7 | while len(res) < n: 8 | candidate = sampling_f() 9 | if candidate not in res: 10 | res.append(candidate) 11 | return res 12 | 13 | 14 | class ReplayBuffer(object): 15 | def __init__(self, size, frame_history_len): # size 代表 replay buffer size 16 | self.size = size 17 | self.frame_history_len = frame_history_len 18 | 19 | self.next_idx = 0 20 | self.num_in_buffer = 0 21 | 22 | self.obs = None 23 | self.action = None 24 | self.reward = None 25 | self.done = None 26 | 27 | def can_sample(self, batch_size): # return True or False 28 | return batch_size + 1 <= self.num_in_buffer 29 | 30 | def _encode_sample(self, idxes): 31 | obs_batch = np.concatenate([self._encode_observation(idx)[np.newaxis, :] for idx in idxes], 0) 32 | act_batch = self.action[idxes] 33 | rew_batch = self.reward[idxes] 34 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[np.newaxis, :] for idx in idxes], 0) 35 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) # 將True, False轉為1, 0 36 | 37 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 38 | 39 | def sample(self, batch_size): # return batch of (obs, act, rew, next_obs) and 
done_mask 40 | # 先確認大小足夠能抽樣才進行抽樣 41 | assert self.can_sample(batch_size) 42 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 43 | return self._encode_sample(idxes) 44 | 45 | def encode_recent_observation(self): # return the most recent `frame_history_len` frames. 46 | assert self.num_in_buffer > 0 47 | return self._encode_observation((self.next_idx - 1) % self.size) 48 | 49 | def _encode_observation(self, idx): # return list of frame 50 | end_idx = idx + 1 51 | start_idx = end_idx - self.frame_history_len 52 | 53 | # check if it is low-dimensional obs, such as RAM 54 | if len(self.obs.shape) == 2: return self.obs[end_idx-1] 55 | 56 | # 假如buffer裡沒有足夠的frame時 57 | if start_idx < 0 and self.num_in_buffer != self.size: start_idx = 0 58 | 59 | # 標註每個start idx 60 | for idx in range(start_idx, end_idx - 1): 61 | if self.done[idx % self.size]: 62 | start_idx = idx + 1 63 | 64 | # 處理頭幾個frame,因為idx出現負數時沒有frame 65 | missing_context = self.frame_history_len - (end_idx - start_idx) 66 | if start_idx < 0 or missing_context > 0: 67 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 68 | # 拿別的frame來填補 69 | for idx in range(start_idx, end_idx): 70 | frames.append(self.obs[idx % self.size]) 71 | return np.concatenate(frames, 0) 72 | else: 73 | # 底下的處理可以節約30%的計算時間 74 | img_h, img_w = self.obs.shape[2], self.obs.shape[3] 75 | return self.obs[start_idx:end_idx].reshape(-1, img_h, img_w) 76 | 77 | def store_frame(self, frame): # return idx of frame 78 | if len(frame.shape) > 1: 79 | # transpose image frame into (img_c, img_h, img_w) 80 | frame = frame.transpose(2, 0, 1) 81 | 82 | if self.obs is None: 83 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 84 | self.action = np.empty([self.size], dtype=np.int32) 85 | self.reward = np.empty([self.size], dtype=np.float32) 86 | self.done = np.empty([self.size], dtype=np.bool) 87 | 88 | self.obs[self.next_idx] = frame 89 | 90 | ret = self.next_idx 91 | self.next_idx = (self.next_idx + 1) % self.size 92 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 93 | 94 | return ret 95 | 96 | def store_effect(self, idx, action, reward, done): 97 | self.action[idx] = action 98 | self.reward[idx] = reward 99 | self.done[idx] = done -------------------------------------------------------------------------------- /common/atari_wrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | from PIL import Image 7 | 8 | 9 | # 重新實作step與reset,主要是reset,以當lives當指標來當作episode真正的結束 10 | class EpisodicLifeEnv(gym.Wrapper): 11 | def __init__(self, env=None): 12 | super(EpisodicLifeEnv, self).__init__(env) 13 | self.lives = 0 14 | self.was_real_done = True 15 | self.was_real_reset = False 16 | 17 | def _step(self, action): 18 | obs, reward, done, info = self.env.step(action) 19 | self.was_real_done = done 20 | # 不太清楚使用這個lives是為了什麼 21 | lives = self.env.unwrapped.ale.lives() 22 | if lives < self.lives and lives > 0: 23 | done = True 24 | self.lives = lives 25 | return obs, reward, done, info 26 | 27 | def _reset(self): 28 | if self.was_real_done: 29 | obs = self.env.reset() 30 | self.was_real_reset = True 31 | else: 32 | obs, _, _, _ = self.env.step(0) 33 | self.was_real_reset = False 34 | self.lives = self.env.unwrapped.ale.lives() 35 | return obs 36 | 37 | 38 | 39 | # 先隨機進行N次noop(no opearation)當作初始obs,但不知道這樣的好處為何? 
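# (The benefit: starting each episode with a random number of no-op steps gives the
#  agent a slightly different initial state every time, so it cannot memorize one
#  fixed opening sequence. This is the random no-op starts trick used in the Nature
#  DQN paper, Mnih et al. 2015.)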
40 | class NoopResetEnv(gym.Wrapper): 41 | def __init__(self, env=None, noop_max=30): 42 | super(NoopResetEnv, self).__init__(env) 43 | self.noop_max = noop_max 44 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 45 | 46 | def _reset(self): 47 | self.env.reset() 48 | noops = np.random.randint(1, self.noop_max + 1) 49 | for _ in range(noops): 50 | obs, _, _, _ = self.env.step(0) 51 | return obs 52 | 53 | 54 | 55 | # 實作每4個frames當作一次sample 56 | class MaxAndSkipEnv(gym.Wrapper): 57 | def __init__(self, env=None, skip=4): 58 | """Return only every `skip`-th frame""" 59 | super(MaxAndSkipEnv, self).__init__(env) 60 | self._obs_buffer = deque(maxlen=2) 61 | self._skip = skip 62 | 63 | def _step(self, action): 64 | total_reward = 0.0 65 | done = None 66 | for _ in range(self._skip): 67 | obs, reward, done, info = self.env.step(action) 68 | self._obs_buffer.append(obs) 69 | total_reward += reward 70 | if done: 71 | break 72 | 73 | # 選倒數兩個frame中較大的那一個,但我不太清楚為何要這樣? 74 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 75 | 76 | return max_frame, total_reward, done, info 77 | 78 | def _reset(self): 79 | # 清掉buffer,並掛上初始obs當作deque的初始狀態 80 | self._obs_buffer.clear() 81 | obs = self.env.reset() 82 | self._obs_buffer.append(obs) 83 | return obs 84 | 85 | 86 | 87 | # 先Fire一發,再走一步action來當作初始obs,一樣還是不知道為何要這樣做 88 | class FireResetEnv(gym.Wrapper): 89 | def __init__(self, env=None): 90 | super(FireResetEnv, self).__init__(env) 91 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 92 | assert len(env.unwrapped.get_action_meanings()) >= 3 93 | 94 | def _reset(self): 95 | self.env.reset() 96 | obs, _, _, _ = self.env.step(1) 97 | obs, _, _, _ = self.env.step(2) 98 | return obs 99 | 100 | 101 | 102 | # 遊戲畫面前處理 103 | def _process_frame84(frame): 104 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 105 | # RGB轉灰階 106 | # https://en.wikipedia.org/wiki/Grayscale#Converting_color_to_grayscale 107 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 108 | # 轉為Image物件,使用BILINEAR插值 109 | img = Image.fromarray(img) 110 | resized_screen = img.resize((84, 110), Image.BILINEAR) 111 | resized_screen = np.array(resized_screen) 112 | x_t = resized_screen[18:102, :] 113 | x_t = np.reshape(x_t, [84, 84, 1]) 114 | return x_t.astype(np.uint8) 115 | 116 | 117 | 118 | # 重新實作經過前處理的step與reset 119 | class ProcessFrame84(gym.Wrapper): 120 | def __init__(self, env=None): 121 | super(ProcessFrame84, self).__init__(env) 122 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 123 | 124 | def _step(self, action): 125 | obs, reward, done, info = self.env.step(action) 126 | return _process_frame84(obs), reward, done, info 127 | 128 | def _reset(self): 129 | return _process_frame84(self.env.reset()) 130 | 131 | 132 | 133 | # 假如reward大於0 -> 1,等於0 -> 0,小於0 -> -1 134 | class ClippedRewardsWrapper(gym.Wrapper): 135 | def _step(self, action): 136 | obs, reward, done, info = self.env.step(action) 137 | return obs, np.sign(reward), done, info 138 | 139 | 140 | 141 | def wrap_dqn(env): 142 | assert 'NoFrameskip' in env.spec.id 143 | env = EpisodicLifeEnv(env) 144 | env = NoopResetEnv(env, noop_max=30) 145 | env = MaxAndSkipEnv(env, skip=4) 146 | if 'FIRE' in env.unwrapped.get_action_meanings(): 147 | env = FireResetEnv(env) 148 | env = ProcessFrame84(env) 149 | env = ClippedRewardsWrapper(env) 150 | return env 151 | 152 | 153 | 154 | 155 | # 針對mario去修改size 156 | def _process_frame_mario(frame): 157 | img = np.reshape(frame, [224, 256, 3]).astype(np.float32) 158 | # RGB轉灰階 159 | 
# https://en.wikipedia.org/wiki/Grayscale#Converting_color_to_grayscale 160 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 161 | # 轉為Image物件,使用BILINEAR插值 162 | img = Image.fromarray(img) 163 | resized_screen = img.resize((84, 110), Image.BILINEAR) 164 | resized_screen = np.array(resized_screen) 165 | x_t = resized_screen[18:102, :] 166 | x_t = np.reshape(x_t, [84, 84, 1]) 167 | return x_t.astype(np.uint8) 168 | 169 | 170 | 171 | # 重新實作經過前處理的step與reset 172 | class ProcessFrameMario(gym.Wrapper): 173 | def __init__(self, env=None): 174 | super(ProcessFrameMario, self).__init__(env) 175 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 176 | 177 | def _step(self, action): 178 | obs, reward, done, info = self.env.step(action) 179 | return _process_frame_mario(obs), reward, done, info 180 | 181 | def _reset(self): 182 | return _process_frame_mario(self.env.reset()) 183 | 184 | 185 | 186 | def wrap_mario(env): 187 | assert 'SuperMarioBros' in env.spec.id 188 | env = MaxAndSkipEnv(env, skip=4) 189 | env = ProcessFrameMario(env) 190 | return env -------------------------------------------------------------------------------- /deepq/learn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sys 4 | import random 5 | import numpy as np 6 | from itertools import count 7 | 8 | import gym 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.autograd as autograd 13 | import torch.nn.functional as F 14 | 15 | from .replay_buffer import ReplayBuffer 16 | 17 | 18 | # detect GPU 19 | USE_CUDA = torch.cuda.is_available() 20 | dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor 21 | 22 | 23 | class Variable(autograd.Variable): 24 | def __init__(self, data, *args, **kwargs): 25 | if USE_CUDA: 26 | data = data.cuda() 27 | super(Variable, self).__init__(data, *args, **kwargs) 28 | 29 | 30 | def learning( 31 | env, 32 | q_func, 33 | optimizer_spec, 34 | exploration, 35 | replay_buffer_size=1000000, 36 | batch_size=32, 37 | gamma=0.99, 38 | learning_starts=50000, 39 | learning_freq=4, 40 | frame_history_len=4, 41 | target_update_freq=10000 42 | ): 43 | 44 | assert type(env.observation_space) == gym.spaces.Box 45 | assert type(env.action_space) == gym.spaces.Discrete 46 | 47 | 48 | # 檢查是否是low-dimensional observations (e.g. 
RAM) 49 | if len(env.observation_space.shape) == 1: 50 | input_arg = env.observation_space.shape[0] 51 | else: 52 | img_h, img_w, img_c = env.observation_space.shape 53 | input_arg = frame_history_len * img_c # 實作論文中的每4 frame擷取一次 54 | 55 | num_actions = env.action_space.n 56 | 57 | 58 | # Construct an epilson greedy policy with given exploration schedule 59 | def select_epilson_greedy_action(model, obs, t): 60 | sample = random.random() 61 | eps_threshold = exploration.value(t) 62 | if sample > eps_threshold: 63 | obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 64 | return model(Variable(obs, volatile=True)).data.max(1)[1].view(1,1) 65 | else: 66 | return torch.IntTensor([[random.randrange(num_actions)]]) 67 | 68 | 69 | # Initialize target q function and q function 70 | Q = q_func(input_arg, num_actions).type(dtype) 71 | target_Q = q_func(input_arg, num_actions).type(dtype) 72 | 73 | # Check & load pretrain model 74 | if os.path.isfile('Q_params.pkl'): 75 | print('Load Q parametets ...') 76 | Q.load_state_dict(torch.load('Q_params.pkl')) 77 | 78 | if os.path.isfile('target_Q_params.pkl'): 79 | print('Load target Q parameters ...') 80 | target_Q.load_state_dict(torch.load('target_Q_params.pkl')) 81 | 82 | 83 | # Construct Q network optimizer function 84 | optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) 85 | 86 | # Construct the replay buffer 87 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 88 | 89 | 90 | ### RUN ENV 91 | num_param_updates = 0 92 | mean_episode_reward = -float('nan') 93 | best_mean_episode_reward = -float('inf') 94 | last_obs = env.reset() 95 | LOG_EVERY_N_STEPS = 10000 96 | 97 | 98 | for t in count(): 99 | ### Step the env and store the transition 100 | last_idx = replay_buffer.store_frame(last_obs) 101 | # 將最新的observation與最近的幾個frame concat在一起,才能丟進Q網路 102 | recent_observations = replay_buffer.encode_recent_observation() 103 | 104 | 105 | # buffer 收集到一定的量才開始學習 106 | if t > learning_starts: 107 | action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0] 108 | else: 109 | action = random.randrange(num_actions) 110 | 111 | obs, reward, done, _ = env.step(action) 112 | reward = max(-1.0, min(reward, 1.0)) 113 | replay_buffer.store_effect(last_idx, action, reward, done) # 將新的資訊存入buffer中 114 | 115 | if done: 116 | obs = env.reset() 117 | last_obs = obs 118 | 119 | ### 從buffer中抽樣並以target network的方式訓練 120 | if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): 121 | obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size) 122 | 123 | obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0) 124 | act_batch = Variable(torch.from_numpy(act_batch).long()) 125 | rew_batch = Variable(torch.from_numpy(rew_batch)) 126 | next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0) 127 | not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) # 如果下一個state是episode中的最後一個,則done_mask = 1 128 | 129 | if USE_CUDA: 130 | act_batch = act_batch.cuda() 131 | rew_batch = rew_batch.cuda() 132 | 133 | # 從抽出的batch observation中得出現在的Q值 134 | current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze() 135 | # 用next_obs_batch計算下一個Q值,detach代表將target network從graph中分離,不去計算它的gradient 136 | next_max_q = target_Q(next_obs_batch).detach().max(1)[0] 137 | next_Q_values = not_done_mask * next_max_q 138 | # TD value 139 | target_Q_values = rew_batch + (gamma * next_Q_values) 140 | # Compute Bellman 
error 141 | # bellman_error = target_Q_values - current_Q_values 142 | # clip the bellman error between [-1, 1] 143 | # clipped_bellman_error = bellman_error.clamp(-1, 1) 144 | # 要 * -1 才是正確的gradient,why? 145 | # d_error = clipped_bellman_error * -1.0 146 | 147 | # print(d_error) 148 | loss = F.smooth_l1_loss(current_Q_values, target_Q_values) 149 | # backward & update 150 | optimizer.zero_grad() 151 | # current_Q_values.backward(d_error.data)#.unsqueeze(1)) 152 | loss.backward() 153 | # Clip the gradients to lie between -1 and +1 154 | for params in Q.parameters(): 155 | params.grad.data.clamp_(-1, 1) 156 | 157 | optimizer.step() 158 | num_param_updates += 1 159 | 160 | # 每隔一段時間才更新target network 161 | if num_param_updates % target_update_freq == 0: 162 | target_Q.load_state_dict(Q.state_dict()) 163 | 164 | ### Log & track 165 | # 要用gym.wrappers中的Monitor將env包起來,才有get_episode_rewards屬性,返回值為list 166 | episode_rewards = env.get_episode_rewards() 167 | if len(episode_rewards) > 0: 168 | mean_episode_reward = np.mean(episode_rewards[-100:]) # 最近100次reward的平均 169 | if len(episode_rewards) > 100: 170 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 171 | 172 | if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: 173 | print("Timestep %d" % (t,)) 174 | print("mean reward (100 episodes) %f" % mean_episode_reward) 175 | print("best mean reward %f" % best_mean_episode_reward) 176 | print("episodes %d" % len(episode_rewards)) 177 | print("exploration %f" % exploration.value(t)) 178 | sys.stdout.flush() 179 | 180 | # Save the trained model 181 | torch.save(Q.state_dict(), 'Q_params.pkl') 182 | torch.save(target_Q.state_dict(), 'target_Q_params.pkl') 183 | 184 | 185 | ######### 186 | # Mario # 187 | ######### 188 | 189 | def mario_learning( 190 | env, 191 | q_func, 192 | optimizer_spec, 193 | exploration, 194 | replay_buffer_size=1000000, 195 | batch_size=32, 196 | gamma=0.99, 197 | learning_starts=50000, 198 | learning_freq=4, 199 | frame_history_len=4, 200 | target_update_freq=10000 201 | ): 202 | 203 | assert type(env.observation_space) == gym.spaces.Box 204 | assert type(env.action_space) == gym.spaces.MultiDiscrete 205 | 206 | 207 | # 檢查是否是low-dimensional observations (e.g. 
RAM) 208 | if len(env.observation_space.shape) == 1: 209 | input_arg = env.observation_space.shape[0] 210 | else: 211 | img_h, img_w, img_c = env.observation_space.shape 212 | input_arg = frame_history_len * img_c # 實作論文中的每4 frame擷取一次 213 | 214 | 215 | num_actions = env.action_space.shape 216 | 217 | 218 | # Construct an epilson greedy policy with given exploration schedule 219 | def select_epilson_greedy_action(model, obs, t): 220 | sample = random.random() 221 | eps_threshold = exploration.value(t) 222 | if sample > eps_threshold: 223 | obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 224 | return model(Variable(obs, volatile=True)).data.max(1)[1].cpu() 225 | else: 226 | return torch.IntTensor([[random.randrange(num_actions)]]) 227 | 228 | # to one hot 229 | def to_onehot(action, num_actions): 230 | action = action % num_actions 231 | if action == 0: 232 | # Move right while jumping 233 | action_onehot = np.array([0, 0, 0, 1, 1, 0]) 234 | else: 235 | action_onehot = np.zeros(num_actions, dtype=int) 236 | action_onehot[action] = 1 237 | return action_onehot 238 | 239 | 240 | # Initialize target q function and q function 241 | Q = q_func(input_arg, num_actions).type(dtype) 242 | target_Q = q_func(input_arg, num_actions).type(dtype) 243 | 244 | # Check & load pretrain model 245 | if os.path.isfile('mario_Q_params.pkl'): 246 | print('Load Q parametets ...') 247 | Q.load_state_dict(torch.load('mario_Q_params.pkl')) 248 | 249 | if os.path.isfile('mario_target_Q_params.pkl'): 250 | print('Load target Q parameters ...') 251 | target_Q.load_state_dict(torch.load('mario_target_Q_params.pkl')) 252 | 253 | 254 | # Construct Q network optimizer function 255 | optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) 256 | 257 | # Construct the replay buffer 258 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 259 | 260 | ### RUN ENV 261 | num_param_updates = 0 262 | mean_episode_reward = -float('nan') 263 | best_mean_episode_reward = -float('inf') 264 | last_obs = env.reset() 265 | LOG_EVERY_N_STEPS = 10000 266 | 267 | 268 | for t in count(): 269 | print('timestep:', t) 270 | ### Step the env and store the transition 271 | last_idx = replay_buffer.store_frame(last_obs) 272 | # 將最新的observation與最近的幾個frame concat在一起,才能丟進Q網路 273 | recent_observations = replay_buffer.encode_recent_observation() 274 | 275 | 276 | # buffer 收集到一定的量才開始學習 277 | if t > learning_starts: 278 | action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0] 279 | else: 280 | action = random.randrange(num_actions) 281 | 282 | # one hot encoding 283 | act_onehot = to_onehot(action, num_actions) 284 | 285 | obs, reward, done, _ = env.step(act_onehot) 286 | #reward = max(-1.0, min(reward, 1.0)) 287 | replay_buffer.store_effect(last_idx, action, reward, done) # 將新的資訊存入buffer中 288 | 289 | if done: 290 | obs = env.reset() 291 | last_obs = obs 292 | 293 | ### 從buffer中抽樣並以target network的方式訓練 294 | if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): 295 | obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size) 296 | 297 | obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0) 298 | act_batch = Variable(torch.from_numpy(act_batch).long()) 299 | rew_batch = Variable(torch.from_numpy(rew_batch)) 300 | next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0) 301 | not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) # 
如果下一個state是episode中的最後一個,則done_mask = 1 302 | 303 | if USE_CUDA: 304 | act_batch = act_batch.cuda() 305 | rew_batch = rew_batch.cuda() 306 | 307 | # 從抽出的batch observation中得出現在的Q值 308 | current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)) 309 | # 用next_obs_batch計算下一個Q值,detach代表將target network從graph中分離,不去計算它的gradient 310 | next_max_q = target_Q(next_obs_batch).detach().max(1)[0] 311 | next_Q_values = not_done_mask * next_max_q 312 | # TD value 313 | target_Q_values = rew_batch + (gamma * next_Q_values) 314 | # Compute Bellman error 315 | bellman_error = target_Q_values - current_Q_values 316 | # clip the bellman error between [-1, 1] 317 | clipped_bellman_error = bellman_error.clamp(-1, 1) 318 | # 要 * -1 才是正確的gradient,why? 319 | d_error = clipped_bellman_error * -1.0 320 | 321 | print(d_error) 322 | 323 | # backward & update 324 | optimizer.zero_grad() 325 | current_Q_values.backward(d_error.data.unsqueeze(1)) 326 | 327 | optimizer.step() 328 | num_param_updates += 1 329 | 330 | # 每隔一段時間才更新target network 331 | if num_param_updates % target_update_freq == 0: 332 | target_Q.load_state_dict(Q.state_dict()) 333 | 334 | ### Log & track 335 | # 要用gym.wrappers中的Monitor將env包起來,才有get_episode_rewards屬性,返回值為list 336 | episode_rewards = env.get_episode_rewards() 337 | if len(episode_rewards) > 0: 338 | mean_episode_reward = np.mean(episode_rewards[-100:]) # 最近100次reward的平均 339 | if len(episode_rewards) > 100: 340 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 341 | 342 | if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: 343 | print("Timestep %d" % (t,)) 344 | print("mean reward (100 episodes) %f" % mean_episode_reward) 345 | print("best mean reward %f" % best_mean_episode_reward) 346 | print("episodes %d" % len(episode_rewards)) 347 | print("exploration %f" % exploration.value(t)) 348 | sys.stdout.flush() 349 | 350 | # Save the trained model 351 | torch.save(Q.state_dict(), 'mario_Q_params.pkl') 352 | torch.save(target_Q.state_dict(), 'mario_target_Q_params.pkl') 353 | --------------------------------------------------------------------------------
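As a closing reference, the update step that `learning` and `mario_learning` build up over many lines can be condensed as follows. This is only an illustrative sketch: it uses the current PyTorch tensor API instead of the deprecated `Variable` wrapper above, and the batch tensors are random stand-ins for samples drawn from the replay buffer; it shows the TD target and the Huber loss used on the Pong code path.

```
import torch
import torch.nn.functional as F

# Stand-in batch (normally sampled from the replay buffer): 32 transitions, 6 actions.
batch_size, num_actions, gamma = 32, 6, 0.99
q_values      = torch.randn(batch_size, num_actions, requires_grad=True)  # Q(s, .)
next_q_target = torch.randn(batch_size, num_actions)                      # target net Q(s', .)
actions  = torch.randint(0, num_actions, (batch_size,))
rewards  = torch.randn(batch_size)
not_done = torch.ones(batch_size)   # 0.0 where the episode ended at s'

# Q(s, a) for the actions that were actually taken
current_q = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
# TD target: r + gamma * max_a' Q_target(s', a'), zeroed for terminal transitions
td_target = rewards + gamma * not_done * next_q_target.max(1)[0]
# Huber loss, followed in learn.py by clamping every gradient to [-1, 1]
loss = F.smooth_l1_loss(current_q, td_target)
loss.backward()
```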