├── 01-basics
│   └── linear_regression
├── A3C-DDPG.py
├── PER-and-ACQ
│   ├── AC-DQN.py
│   ├── net.py
│   ├── prioritized-DQN.py
│   ├── prioritized_memory.py
│   └── util.py
├── README.md
├── convolutional_neural_network
│   └── main.py
├── ddpg.py
├── double-DQN.py
├── dueling-DQN.py
├── feedforward_neural_network
│   └── main.py
├── ga
│   ├── bag.py
│   └── peak.py
├── gym_sample
│   └── demo.py
├── native-Qlearning.py
├── nature-DQN.py
├── neat
│   ├── Digraph.gv
│   ├── Digraph.gv.svg
│   ├── avg_fitness.svg
│   ├── cartpole.py
│   ├── config
│   ├── speciation.svg
│   └── visualize.py
├── net.py
├── nips-DQN.py
├── recurrent_neural_network
│   └── main.py
└── util.py
/01-basics/linear_regression: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | # Hyper-parameters 8 | input_size = 1 9 | output_size = 1 10 | num_epochs = 60 11 | learning_rate = 0.001 12 | 13 | # Toy dataset 14 | x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168], 15 | [9.779], [6.182], [7.59], [2.167], [7.042], 16 | [10.791], [5.313], [7.997], [3.1]], dtype=np.float32) 17 | 18 | y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573], 19 | [3.366], [2.596], [2.53], [1.221], [2.827], 20 | [3.465], [1.65], [2.904], [1.3]], dtype=np.float32) 21 | 22 | # Linear regression model 23 | model = nn.Linear(input_size, output_size) 24 | 25 | # Loss and optimizer 26 | criterion = nn.MSELoss() 27 | optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) 28 | 29 | # Train the model 30 | for epoch in range(num_epochs): 31 | # Convert numpy arrays to torch tensors 32 | inputs = torch.from_numpy(x_train) 33 | targets = torch.from_numpy(y_train) 34 | 35 | # Forward pass 36 | outputs = model(inputs) 37 | loss = criterion(outputs, targets) 38 | 39 | # Backward and optimize 40 | optimizer.zero_grad() 41 | loss.backward() 42 | optimizer.step() 43 | 44 | if (epoch+1) % 5 == 0: 45 | print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item())) 46 | 47 | # Plot the graph 48 | predicted = model(torch.from_numpy(x_train)).detach().numpy() 49 | plt.plot(x_train, y_train, 'ro', label='Original data') 50 | plt.plot(x_train, predicted, label='Fitted line') 51 | plt.legend() 52 | plt.show() 53 | 54 | # Save the model checkpoint 55 | torch.save(model.state_dict(), 'model.ckpt') 56 | -------------------------------------------------------------------------------- /A3C-DDPG.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch, time, gym, argparse, sys 3 | import numpy as np 4 | from scipy.signal import lfilter 5 | from scipy.misc import imresize 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.multiprocessing as mp 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env', default='Breakout-v4', type=str, help='gym environment') 12 | parser.add_argument('--processes', default=1, type=int, help='number of processes') 13 | parser.add_argument('--lr', default=1e-4, type=float, help='learning rate') 14 | parser.add_argument('--gamma', default=0.99, type=float, help='rewards discount factor') 15 | parser.add_argument('--seed', default=1, type=int, help='random seed') 16 | args = parser.parse_args() 17 | discount = lambda x, gamma: lfilter([1], [1, -gamma], x[::-1])[::-1] 18 | prepro = lambda img: imresize(img[35:195].mean(2), (80, 80)).astype(np.float32).reshape(1, 80, 80) / 255. 
19 | 20 | 21 | class NNPolicy(nn.Module): 22 | def __init__(self, num_actions): 23 | super(NNPolicy, self).__init__() 24 | self.conv1 = nn.Conv2d(1, 32, 3, stride=2, padding=1) 25 | self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 26 | self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 27 | self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 28 | self.gru = nn.GRUCell(32 * 5 * 5, 256) 29 | self.critic_net, self.actor_net = nn.Linear(256, 1), nn.Linear(256, num_actions) 30 | 31 | def forward(self, inputs, train=True, hard=False): 32 | inputs, hx = inputs 33 | x = F.elu(self.conv1(inputs)) 34 | x = F.elu(self.conv2(x)) 35 | x = F.elu(self.conv3(x)) 36 | x = F.elu(self.conv4(x)) 37 | hx = self.gru(x.view(-1, 32 * 5 * 5), (hx)) 38 | return self.critic_net(hx), self.actor_net(hx), hx 39 | 40 | 41 | class SharedAdam(torch.optim.Adam): 42 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 43 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 44 | for group in self.param_groups: 45 | for p in group['params']: 46 | state = self.state[p] 47 | state['shared_steps'], state['step'] = torch.zeros(1).share_memory_(), 0 48 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 49 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 50 | 51 | 52 | def loss_func(args, values, logps, actions, rewards): 53 | np_values = values.view(-1).data.numpy() 54 | 55 | delta_t = np.asarray(rewards) + args.gamma * np_values[1:] - np_values[:-1] 56 | logpys = logps.gather(1, torch.tensor(actions).view(-1, 1)) 57 | gen_adv_est = discount(delta_t, args.gamma) 58 | policy_loss = -(logpys.view(-1) * torch.FloatTensor(gen_adv_est.copy())).sum() 59 | 60 | rewards[-1] += args.gamma * np_values[-1] 61 | discounted_r = discount(np.asarray(rewards), args.gamma) 62 | discounted_r = torch.tensor(discounted_r.copy(), dtype=torch.float32) 63 | value_loss = .5 * (discounted_r - values[:-1, 0]).pow(2).sum() 64 | 65 | entropy_loss = -(-logps * torch.exp(logps)).sum() 66 | return policy_loss + 0.5 * value_loss + 0.01 * entropy_loss 67 | 68 | 69 | def worker(shared_model, shared_optimizer, rank, args, info): 70 | env = gym.make(args.env) 71 | env.seed(args.seed + rank) 72 | torch.manual_seed(args.seed + rank) 73 | model = NNPolicy(num_actions=args.num_actions) 74 | state = torch.tensor(prepro(env.reset())) 75 | 76 | start_time = last_disp_time = time.time() 77 | episode_length, epr, eploss, done = 0, 0, 0, True 78 | 79 | while info['frames'][0] <= 4e7: 80 | model.load_state_dict(shared_model.state_dict()) 81 | 82 | hx = torch.zeros(1, 256) if done else hx.detach() 83 | values, logps, actions, rewards = [], [], [], [] 84 | 85 | for step in range(4): 86 | episode_length += 1 87 | value, logit, hx = model((state.view(1, 1, 80, 80), hx)) 88 | logp = F.log_softmax(logit, dim=-1) 89 | 90 | action = torch.exp(logp).multinomial(num_samples=1).data[0] 91 | state, reward, done, _ = env.step(action.numpy()[0]) 92 | # env.render() 93 | 94 | state = torch.tensor(prepro(state)) 95 | epr += reward 96 | reward = np.clip(reward, -1, 1) 97 | done = done or episode_length >= 1e4 98 | 99 | info['frames'].add_(1) 100 | num_frames = int(info['frames'].item()) 101 | 102 | if done: 103 | info['episodes'] += 1 104 | interp = 1 if info['episodes'][0] == 1 else 0.01 105 | info['run_epr'].mul_(1 - interp).add_(interp * epr) 106 | info['run_loss'].mul_(1 - interp).add_(interp * eploss) 107 | 108 | if rank == 0 and time.time() - last_disp_time > 60: 109 
| elapsed = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)) 110 | print('time {}, episodes {:.0f}, frames {:.1f}M, mean epr {:.2f}, run loss {:.2f}' 111 | .format(elapsed, info['episodes'].item(), num_frames / 1e6, 112 | info['run_epr'].item(), info['run_loss'].item())) 113 | last_disp_time = time.time() 114 | 115 | if done: 116 | episode_length, epr, eploss = 0, 0, 0 117 | state = torch.tensor(prepro(env.reset())) 118 | 119 | values.append(value) 120 | logps.append(logp) 121 | actions.append(action) 122 | rewards.append(reward) 123 | 124 | next_value = torch.zeros(1, 1) if done else model((state.unsqueeze(0), hx))[0] 125 | values.append(next_value.detach()) 126 | 127 | loss = loss_func(args, torch.cat(values), torch.cat(logps), torch.cat(actions), np.asarray(rewards)) 128 | eploss += loss.item() 129 | shared_optimizer.zero_grad() 130 | loss.backward() 131 | torch.nn.utils.clip_grad_norm_(model.parameters(), 40) 132 | 133 | for param, shared_param in zip(model.parameters(), shared_model.parameters()): 134 | if shared_param.grad is None: 135 | shared_param._grad = param.grad 136 | shared_optimizer.step() 137 | 138 | 139 | if __name__ == "__main__": 140 | if sys.version_info[0] > 2: 141 | mp.set_start_method('spawn') 142 | elif sys.platform == 'linux' or sys.platform == 'linux2': 143 | raise "Must be using Python 3 with linux! Or else you get a deadlock in conv2d" 144 | 145 | args.num_actions = gym.make(args.env).action_space.n 146 | 147 | torch.manual_seed(args.seed) 148 | shared_model = NNPolicy(num_actions=args.num_actions).share_memory() 149 | shared_optimizer = SharedAdam(shared_model.parameters(), lr=args.lr) 150 | 151 | info = {k: torch.DoubleTensor([0]).share_memory_() for k in ['run_epr', 'run_loss', 'episodes', 'frames']} 152 | 153 | processes = [] 154 | for rank in range(args.processes): 155 | p = mp.Process(target=worker, args=(shared_model, shared_optimizer, rank, args, info)) 156 | p.start() 157 | processes.append(p) 158 | for p in processes: p.join() 159 | -------------------------------------------------------------------------------- /PER-and-ACQ/AC-DQN.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch, time, gym, argparse, sys 3 | import numpy as np 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.multiprocessing as mp 7 | 8 | from collections import deque 9 | import random 10 | from net import AtariNet 11 | from util import preprocess 12 | 13 | LR = 0.001 14 | EXPLORE = 1000000 15 | GAMMA = 0.99 16 | N_STEP = 4 17 | ENV = 'Pong-v0' 18 | ACTIONS_SIZE = gym.make(ENV).action_space.n 19 | PROCESSES = 1 20 | SEED = 1 21 | 22 | 23 | class Agent(object): 24 | def __init__(self, action_size): 25 | self.action_size = action_size 26 | self.EPSILON = 1.0 27 | self.network = AtariNet(action_size) 28 | self.memory = deque() 29 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR) 30 | self.loss_func = nn.MSELoss() 31 | 32 | def action(self, state, israndom): 33 | if israndom and random.random() < self.EPSILON: 34 | return np.random.randint(0, self.action_size) 35 | state = torch.unsqueeze(torch.FloatTensor(state), 0) 36 | actions_value = self.network.forward(state) 37 | return torch.max(actions_value, 1)[1].data.numpy()[0] 38 | 39 | def add(self, state, action, reward, next_state, done): 40 | if done: 41 | self.memory.append((state, action, reward, next_state, 0)) 42 | else: 43 | self.memory.append((state, action, reward, 
next_state, 1)) 44 | 45 | def learn(self, shared_optimizer, shared_model): 46 | batch_size = len(self.memory) 47 | batch = random.sample(self.memory, batch_size) 48 | state = torch.FloatTensor([x[0] for x in batch]) 49 | action = torch.LongTensor([[x[1]] for x in batch]) 50 | reward = torch.FloatTensor([[x[2]] for x in batch]) 51 | next_state = torch.FloatTensor([x[3] for x in batch]) 52 | done = torch.FloatTensor([[x[4]] for x in batch]) 53 | 54 | eval_q = self.network.forward(state).gather(1, action) 55 | next_q = self.network(next_state).detach() 56 | target_q = reward + GAMMA * next_q.max(1)[0].view(batch_size, 1) * done 57 | loss = self.loss_func(eval_q, target_q) 58 | 59 | shared_optimizer.zero_grad() 60 | loss.backward() 61 | for param, shared_param in zip(self.network.parameters(), shared_model.parameters()): 62 | if shared_param.grad is None: 63 | shared_param._grad = param.grad 64 | shared_optimizer.step() 65 | 66 | self.memory = deque() 67 | if self.EPSILON > 0.1: 68 | self.EPSILON -= (1.0 - 0.1) / EXPLORE 69 | 70 | 71 | class SharedAdam(torch.optim.Adam): 72 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 73 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 74 | for group in self.param_groups: 75 | for p in group['params']: 76 | state = self.state[p] 77 | state['shared_steps'], state['step'] = torch.zeros(1).share_memory_(), 0 78 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 79 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_().share_memory_() 80 | 81 | 82 | def worker(shared_model, shared_optimizer, rank, info): 83 | env = gym.make(ENV) 84 | env.seed(SEED + rank) 85 | torch.manual_seed(SEED + rank) 86 | agent = Agent(ACTIONS_SIZE) 87 | 88 | start_time = last_disp_time = time.time() 89 | episode_length, epr = 0, 0 90 | 91 | state = env.reset() 92 | state = preprocess(state) 93 | while info['frames'][0] <= 4e7: 94 | agent.network.load_state_dict(shared_model.state_dict()) 95 | 96 | for _ in range(N_STEP): 97 | # env.render() 98 | episode_length += 1 99 | 100 | action = agent.action(state, True) 101 | next_state, reward, done, ext = env.step(action) 102 | epr += reward 103 | done = done or episode_length >= 1e4 104 | info['frames'].add_(1) 105 | num_frames = int(info['frames'].item()) 106 | 107 | next_state = preprocess(next_state) 108 | agent.add(state, action, reward, next_state, done) 109 | 110 | state = next_state 111 | 112 | if done: 113 | info['episodes'] += 1 114 | interp = 1 if info['episodes'][0] == 1 else 0.01 115 | info['run_epr'].mul_(1 - interp).add_(interp * epr) 116 | 117 | if rank == 0 and time.time() - last_disp_time > 60: 118 | elapsed = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)) 119 | print('time {}, episodes {:.0f}, frames {:.1f}M, mean epr {:.2f}' 120 | .format(elapsed, info['episodes'].item(), num_frames / 1e6, 121 | info['run_epr'].item())) 122 | last_disp_time = time.time() 123 | 124 | if done: 125 | episode_length, epr, eploss = 0, 0, 0 126 | state = env.reset() 127 | state = preprocess(state) 128 | break 129 | 130 | agent.learn(shared_optimizer, shared_model) 131 | 132 | 133 | 134 | if __name__ == "__main__": 135 | if sys.version_info[0] > 2: 136 | mp.set_start_method('spawn') 137 | elif sys.platform == 'linux' or sys.platform == 'linux2': 138 | raise "Must be using Python 3 with linux! 
Or else you get a deadlock in conv2d" 139 | 140 | torch.manual_seed(SEED) 141 | shared_model = AtariNet(ACTIONS_SIZE).share_memory() 142 | shared_optimizer = SharedAdam(shared_model.parameters(), lr=LR) 143 | 144 | info = {k: torch.DoubleTensor([0]).share_memory_() for k in ['run_epr', 'episodes', 'frames']} 145 | 146 | processes = [] 147 | for rank in range(PROCESSES): 148 | p = mp.Process(target=worker, args=(shared_model, shared_optimizer, rank, info)) 149 | p.start() 150 | processes.append(p) 151 | for p in processes: p.join() 152 | -------------------------------------------------------------------------------- /PER-and-ACQ/net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class AtariNet(nn.Module): 6 | 7 | def __init__(self, num_actions): 8 | super(AtariNet, self).__init__() 9 | self.conv1 = nn.Sequential( 10 | nn.Conv2d(1, 32, kernel_size=8, stride=4), 11 | nn.ReLU() 12 | ) 13 | self.conv2 = nn.Sequential( 14 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 15 | nn.ReLU() 16 | ) 17 | self.conv3 = nn.Sequential( 18 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 19 | nn.ReLU() 20 | ) 21 | self.hidden = nn.Sequential( 22 | nn.Linear(64 * 7 * 7, 512, bias=True), 23 | nn.ReLU() 24 | ) 25 | self.out = nn.Sequential( 26 | nn.Linear(512, num_actions, bias=True) 27 | ) 28 | self.apply(self.init_weights) 29 | 30 | def init_weights(self, m): 31 | if type(m) == nn.Conv2d: 32 | m.weight.data.normal_(0.0, 0.02) 33 | if type(m) == nn.Linear: 34 | torch.nn.init.xavier_uniform_(m.weight) 35 | m.bias.data.fill_(0.01) 36 | 37 | def forward(self, x): 38 | x = self.conv1(x) 39 | x = self.conv2(x) 40 | x = self.conv3(x) 41 | x = x.view(x.size(0), -1) 42 | x = self.hidden(x) 43 | x = self.out(x) 44 | return x 45 | 46 | 47 | class CnnDQN(nn.Module): 48 | def __init__(self, inputs_shape, num_actions): 49 | super(CnnDQN, self).__init__() 50 | 51 | self.inut_shape = inputs_shape 52 | self.num_actions = num_actions 53 | 54 | self.features = nn.Sequential( 55 | nn.Conv2d(inputs_shape[0], 32, kernel_size=8, stride=4), 56 | nn.ReLU(), 57 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 58 | nn.ReLU(), 59 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 60 | nn.ReLU() 61 | ) 62 | 63 | self.fc = nn.Sequential( 64 | nn.Linear(self.features_size(), 512), 65 | nn.ReLU(), 66 | nn.Linear(512, self.num_actions) 67 | ) 68 | 69 | def forward(self, x): 70 | x = self.features(x) 71 | x = x.view(x.size(0), -1) 72 | x = self.fc(x) 73 | return x 74 | 75 | def features_size(self): 76 | return self.features(torch.zeros(1, *self.inut_shape)).view(1, -1).size(1) 77 | -------------------------------------------------------------------------------- /PER-and-ACQ/prioritized-DQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from prioritized_memory import Memory 4 | import numpy as np 5 | import gym 6 | import random 7 | from net import AtariNet 8 | from util import preprocess 9 | 10 | BATCH_SIZE = 32 11 | LR = 0.001 12 | START_EPSILON = 1.0 13 | FINAL_EPSILON = 0.1 14 | EPSILON = START_EPSILON 15 | EXPLORE = 1000000 16 | GAMMA = 0.99 17 | TOTAL_EPISODES = 10000000 18 | MEMORY_SIZE = 1000000 19 | MEMORY_THRESHOLD = 100000 20 | UPDATE_TIME = 10000 21 | TEST_FREQUENCY = 1000 22 | env = gym.make('Pong-v0') 23 | env = env.unwrapped 24 | ACTIONS_SIZE = env.action_space.n 25 | 26 | 27 | class Agent(object): 28 | def __init__(self): 29 | self.network, self.target_network = 
AtariNet(ACTIONS_SIZE), AtariNet(ACTIONS_SIZE) 30 | self.memory = Memory(MEMORY_SIZE) 31 | self.learning_count = 0 32 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR) 33 | self.loss_func = nn.MSELoss() 34 | 35 | def action(self, state, israndom): 36 | if israndom and random.random() < EPSILON: 37 | return np.random.randint(0, ACTIONS_SIZE) 38 | state = torch.unsqueeze(torch.FloatTensor(state), 0) 39 | actions_value = self.network.forward(state) 40 | return torch.max(actions_value, 1)[1].data.numpy()[0] 41 | 42 | def learn(self, state, action, reward, next_state, done): 43 | old_val = self.network.forward(torch.FloatTensor([state])).gather(1, torch.LongTensor([[action]]))[0] 44 | target_val = self.network.forward(torch.FloatTensor([state])) 45 | if done: 46 | done = 0 47 | target = reward 48 | else: 49 | done = 1 50 | target = reward + GAMMA * torch.max(target_val) 51 | error = abs(old_val[0] - target) 52 | self.memory.add(error.data, (state, action, reward, next_state, done)) 53 | if self.memory.tree.n_entries < MEMORY_THRESHOLD: 54 | return 55 | 56 | if self.learning_count % UPDATE_TIME == 0: 57 | self.target_network.load_state_dict(self.network.state_dict()) 58 | self.learning_count += 1 59 | 60 | batch, idxs, is_weights = self.memory.sample(BATCH_SIZE) 61 | state = torch.FloatTensor([x[0] for x in batch]) 62 | action = torch.LongTensor([[x[1]] for x in batch]) 63 | reward = torch.FloatTensor([[x[2]] for x in batch]) 64 | next_state = torch.FloatTensor([x[3] for x in batch]) 65 | done = torch.FloatTensor([[x[4]] for x in batch]) 66 | 67 | eval_q = self.network.forward(state).gather(1, action) 68 | next_q = self.target_network(next_state).detach() 69 | target_q = reward + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * done 70 | errors = torch.abs(eval_q - target_q).data.numpy().flatten() 71 | loss = self.loss_func(eval_q, target_q) 72 | 73 | for i in range(BATCH_SIZE): 74 | idx = idxs[i] 75 | self.memory.update(idx, errors[i]) 76 | 77 | self.optimizer.zero_grad() 78 | loss.backward() 79 | self.optimizer.step() 80 | 81 | 82 | agent = Agent() 83 | 84 | for i_episode in range(TOTAL_EPISODES): 85 | state = env.reset() 86 | state = preprocess(state) 87 | while True: 88 | # env.render() 89 | action = agent.action(state, True) 90 | next_state, reward, done, info = env.step(action) 91 | next_state = preprocess(next_state) 92 | agent.learn(state, action, reward, next_state, done) 93 | 94 | state = next_state 95 | if done: 96 | break 97 | if EPSILON > FINAL_EPSILON: 98 | EPSILON -= (START_EPSILON - FINAL_EPSILON) / EXPLORE 99 | 100 | # TEST 101 | if i_episode % TEST_FREQUENCY == 0: 102 | state = env.reset() 103 | state = preprocess(state) 104 | total_reward = 0 105 | while True: 106 | # env.render() 107 | action = agent.action(state, israndom=False) 108 | next_state, reward, done, info = env.step(action) 109 | next_state = preprocess(next_state) 110 | 111 | total_reward += reward 112 | 113 | state = next_state 114 | if done: 115 | break 116 | print('episode: {} , total_reward: {}'.format(i_episode, round(total_reward, 3))) 117 | 118 | env.close() 119 | -------------------------------------------------------------------------------- /PER-and-ACQ/prioritized_memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | 5 | class SumTree: 6 | write = 0 7 | 8 | def __init__(self, capacity): 9 | self.capacity = capacity 10 | self.tree = np.zeros(2 * capacity - 1) 11 | self.data = np.zeros(capacity, 
dtype=object) 12 | self.n_entries = 0 13 | 14 | def _propagate(self, idx, change): 15 | parent = (idx - 1) // 2 16 | self.tree[parent] += change 17 | if parent != 0: 18 | self._propagate(parent, change) 19 | 20 | def _retrieve(self, idx, s): 21 | left = 2 * idx + 1 22 | right = left + 1 23 | if left >= len(self.tree): 24 | return idx 25 | 26 | if s <= self.tree[left]: 27 | return self._retrieve(left, s) 28 | else: 29 | return self._retrieve(right, s - self.tree[left]) 30 | 31 | def total(self): 32 | return self.tree[0] 33 | 34 | def add(self, p, data): 35 | idx = self.write + self.capacity - 1 36 | self.data[self.write] = data 37 | self.update(idx, p) 38 | self.write += 1 39 | if self.write >= self.capacity: 40 | self.write = 0 41 | 42 | if self.n_entries < self.capacity: 43 | self.n_entries += 1 44 | 45 | def update(self, idx, p): 46 | change = p - self.tree[idx] 47 | self.tree[idx] = p 48 | self._propagate(idx, change) 49 | 50 | def get(self, s): 51 | idx = self._retrieve(0, s) 52 | dataIdx = idx - self.capacity + 1 53 | return (idx, self.tree[idx], self.data[dataIdx]) 54 | 55 | 56 | class Memory: 57 | e = 0.01 58 | a = 0.6 59 | beta = 0.4 60 | beta_increment_per_sampling = 0.001 61 | 62 | def __init__(self, capacity): 63 | self.tree = SumTree(capacity) 64 | self.capacity = capacity 65 | 66 | def _get_priority(self, error): 67 | return (error + self.e) ** self.a 68 | 69 | def add(self, error, sample): 70 | p = self._get_priority(error) 71 | self.tree.add(p, sample) 72 | 73 | def sample(self, n): 74 | batch = [] 75 | idxs = [] 76 | segment = self.tree.total() / n 77 | priorities = [] 78 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) 79 | for i in range(n): 80 | a = segment * i 81 | b = segment * (i + 1) 82 | s = random.uniform(a, b) 83 | (idx, p, data) = self.tree.get(s) 84 | priorities.append(p) 85 | batch.append(data) 86 | idxs.append(idx) 87 | sampling_probabilities = priorities / self.tree.total() 88 | is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta) 89 | is_weight /= is_weight.max() 90 | return batch, idxs, is_weight 91 | 92 | def update(self, idx, error): 93 | p = self._get_priority(error) 94 | self.tree.update(idx, p) -------------------------------------------------------------------------------- /PER-and-ACQ/util.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | 5 | def preprocess(observation): 6 | """ 7 | image preprocess 8 | :param observation: 9 | :return: 10 | """ 11 | observation = cv2.cvtColor(cv2.resize(observation, (84, 110)), cv2.COLOR_BGR2GRAY) 12 | observation = observation[26:110,:] 13 | ret, observation = cv2.threshold(observation,1,255,cv2.THRESH_BINARY) 14 | x = np.reshape(observation,(84,84,1)) 15 | return x.transpose((2, 0, 1)) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Code for 《白话强化学习与Pytorch》 (Reinforcement Learning and PyTorch in Plain Language) 2 | 3 | ## 5.2 Q-Learning: 4 | 5 | native-Qlearning.py 6 | 7 | ## 6 Deep Learning: 8 | 9 | feedforward_neural_network/ 10 | 11 | convolutional_neural_network/ 12 | 13 | recurrent_neural_network/ 14 | 15 | ## 8.1 NIPS DQN: 16 | 17 | nips-DQN.py 18 | 19 | ## 8.2 Nature DQN: 20 | 21 | nature-DQN.py 22 | 23 | ## 8.3 Double DQN: 24 | 25 | double-DQN.py 26 | 27 | ## 8.4 Dueling DQN: 28 | 29 | dueling-DQN.py 30 | 31 | ## 9.4 DDPG: 32 | 33 | ddpg.py 34 | 35 | ## 10.1.2 A3C DDPG: 36 | 37 | A3C-DDPG.py 38 | 39 | ## Contact the Author 
40 | 41 | ### Email: zhenbinye@gmail.com, 77232517@qq.com 42 | 43 | ### The code is updated continuously; if you have suggestions for improvement or questions, please contact the author 44 | 45 | ### Due to version changes, the code may differ slightly from what is printed in the book 46 | -------------------------------------------------------------------------------- /convolutional_neural_network/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision 4 | import torchvision.transforms as transforms 5 | 6 | 7 | # Device configuration 8 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 9 | 10 | # Hyper parameters 11 | num_epochs = 5 12 | num_classes = 10 13 | batch_size = 100 14 | learning_rate = 0.001 15 | 16 | # MNIST dataset 17 | train_dataset = torchvision.datasets.MNIST(root='../../data/', 18 | train=True, 19 | transform=transforms.ToTensor(), 20 | download=True) 21 | 22 | test_dataset = torchvision.datasets.MNIST(root='../../data/', 23 | train=False, 24 | transform=transforms.ToTensor()) 25 | 26 | # Data loader 27 | train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 28 | batch_size=batch_size, 29 | shuffle=True) 30 | 31 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 32 | batch_size=batch_size, 33 | shuffle=False) 34 | 35 | # Convolutional neural network (two convolutional layers) 36 | class ConvNet(nn.Module): 37 | def __init__(self, num_classes=10): 38 | super(ConvNet, self).__init__() 39 | self.layer1 = nn.Sequential( 40 | nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2), 41 | nn.BatchNorm2d(16), 42 | nn.ReLU(), 43 | nn.MaxPool2d(kernel_size=2, stride=2)) 44 | self.layer2 = nn.Sequential( 45 | nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2), 46 | nn.BatchNorm2d(32), 47 | nn.ReLU(), 48 | nn.MaxPool2d(kernel_size=2, stride=2)) 49 | self.fc = nn.Linear(7*7*32, num_classes) 50 | 51 | def forward(self, x): 52 | out = self.layer1(x) 53 | out = self.layer2(out) 54 | out = out.reshape(out.size(0), -1) 55 | out = self.fc(out) 56 | return out 57 | 58 | model = ConvNet(num_classes).to(device) 59 | 60 | # Loss and optimizer 61 | criterion = nn.CrossEntropyLoss() 62 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 63 | 64 | # Train the model 65 | total_step = len(train_loader) 66 | for epoch in range(num_epochs): 67 | for i, (images, labels) in enumerate(train_loader): 68 | images = images.to(device) 69 | labels = labels.to(device) 70 | 71 | # Forward pass 72 | outputs = model(images) 73 | loss = criterion(outputs, labels) 74 | 75 | # Backward and optimize 76 | optimizer.zero_grad() 77 | loss.backward() 78 | optimizer.step() 79 | 80 | if (i+1) % 100 == 0: 81 | print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 82 | .format(epoch+1, num_epochs, i+1, total_step, loss.item())) 83 | 84 | # Test the model 85 | model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance) 86 | with torch.no_grad(): 87 | correct = 0 88 | total = 0 89 | for images, labels in test_loader: 90 | images = images.to(device) 91 | labels = labels.to(device) 92 | outputs = model(images) 93 | _, predicted = torch.max(outputs.data, 1) 94 | total += labels.size(0) 95 | correct += (predicted == labels).sum().item() 96 | 97 | print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) 98 | 99 | # Save the model checkpoint 100 | torch.save(model.state_dict(), 'model.ckpt') 101 | -------------------------------------------------------------------------------- /ddpg.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch, gym, argparse 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | import torch.nn.functional as F 6 | 7 | 8 | class ReplayBuffer(object): 9 | def __init__(self, max_size=1e6): 10 | self.storage = [] 11 | self.max_size = max_size 12 | self.ptr = 0 13 | 14 | def add(self, data): 15 | if len(self.storage) == self.max_size: 16 | self.storage[int(self.ptr)] = data 17 | self.ptr = (self.ptr + 1) % self.max_size 18 | else: 19 | self.storage.append(data) 20 | 21 | def sample(self, batch_size): 22 | ind = np.random.randint(0, len(self.storage), size=batch_size) 23 | x, y, u, r, d = [], [], [], [], [] 24 | for i in ind: 25 | X, Y, U, R, D = self.storage[i] 26 | x.append(np.array(X, copy=False)) 27 | y.append(np.array(Y, copy=False)) 28 | u.append(np.array(U, copy=False)) 29 | r.append(np.array(R, copy=False)) 30 | d.append(np.array(D, copy=False)) 31 | return np.array(x), np.array(y), np.array(u), np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1) 32 | 33 | 34 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 35 | 36 | 37 | class Actor(nn.Module): 38 | def __init__(self, state_dim, action_dim, max_action): 39 | super(Actor, self).__init__() 40 | self.l1 = nn.Linear(state_dim, 400) 41 | self.l2 = nn.Linear(400, 300) 42 | self.l3 = nn.Linear(300, action_dim) 43 | self.max_action = max_action 44 | 45 | def forward(self, x): 46 | x = F.relu(self.l1(x)) 47 | x = F.relu(self.l2(x)) 48 | x = self.max_action * torch.tanh(self.l3(x)) 49 | return x 50 | 51 | 52 | class Critic(nn.Module): 53 | def __init__(self, state_dim, action_dim): 54 | super(Critic, self).__init__() 55 | self.l1 = nn.Linear(state_dim, 400) 56 | self.l2 = nn.Linear(400 + action_dim, 300) 57 | self.l3 = nn.Linear(300, 1) 58 | 59 | def forward(self, x, u): 60 | x = F.relu(self.l1(x)) 61 | x = F.relu(self.l2(torch.cat([x, u], 1))) 62 | x = self.l3(x) 63 | return x 64 | 65 | 66 | class DDPG(object): 67 | def __init__(self, state_dim, action_dim, max_action): 68 | self.actor = Actor(state_dim, action_dim, max_action).to(device) 69 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device) 70 | self.actor_target.load_state_dict(self.actor.state_dict()) 71 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4) 72 | self.critic = Critic(state_dim, action_dim).to(device) 73 | self.critic_target = Critic(state_dim, action_dim).to(device) 74 | self.critic_target.load_state_dict(self.critic.state_dict()) 75 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), weight_decay=1e-2) 76 | 77 | def select_action(self, state): 78 | state = torch.FloatTensor(state.reshape(1, -1)).to(device) 79 | return self.actor(state).cpu().data.numpy().flatten() 80 | 81 | def train(self, replay_buffer, iterations, batch_size=64, discount=0.99, tau=0.001): 82 | 83 | for _ in range(iterations): 84 | x, y, u, r, d = replay_buffer.sample(batch_size) 85 | state = torch.FloatTensor(x).to(device) 86 | action = torch.FloatTensor(u).to(device) 87 | next_state = torch.FloatTensor(y).to(device) 88 | done = torch.FloatTensor(1 - d).to(device) 89 | reward = torch.FloatTensor(r).to(device) 90 | 91 | target_Q = self.critic_target(next_state, self.actor_target(next_state)) 92 | target_Q = reward + (done * discount * target_Q).detach() 93 | current_Q = self.critic(state, action) 94 | 95 | critic_loss = F.mse_loss(current_Q, target_Q) 96 | self.critic_optimizer.zero_grad() 97 | 
critic_loss.backward() 98 | self.critic_optimizer.step() 99 | 100 | actor_loss = -self.critic(state, self.actor(state)).mean() 101 | self.actor_optimizer.zero_grad() 102 | actor_loss.backward() 103 | self.actor_optimizer.step() 104 | 105 | # Update model 106 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): 107 | target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) 108 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 109 | target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) 110 | 111 | 112 | if __name__ == "__main__": 113 | parser = argparse.ArgumentParser() 114 | parser.add_argument("--env_name", default="Pendulum-v0") 115 | parser.add_argument("--seed", default=0, type=int, help='Sets Gym, PyTorch and Numpy seeds') 116 | parser.add_argument("--start_timesteps", default=1e4, type=int, help='how many step random policy run') 117 | parser.add_argument("--max_timesteps", default=1e6, type=float, help='max_timesteps') 118 | parser.add_argument("--expl_noise", default=0.1, type=float, help='Gaussian exploration') 119 | parser.add_argument("--batch_size", default=100, type=int, help='Batch size') 120 | parser.add_argument("--GAMMA", default=0.99, type=float, help='Discount') 121 | parser.add_argument("--tau", default=0.005, type=float, help='DDPG update rate') 122 | parser.add_argument("--policy_noise", default=0.2, type=float, help='Noise to target policy during critic update') 123 | parser.add_argument("--noise_clip", default=0.5, type=float, help='Range to clip target policy noise') 124 | parser.add_argument("--policy_freq", default=2, type=int, help=' Frequency of delayed policy updates') 125 | args = parser.parse_args() 126 | 127 | env = gym.make(args.env_name) 128 | env.seed(args.seed) 129 | torch.manual_seed(args.seed) 130 | np.random.seed(args.seed) 131 | state_dim = env.observation_space.shape[0] 132 | action_dim = env.action_space.shape[0] 133 | max_action = float(env.action_space.high[0]) 134 | policy = DDPG(state_dim, action_dim, max_action) 135 | replay_buffer = ReplayBuffer() 136 | total_timesteps = 0 137 | timesteps_since_eval = 0 138 | episode_num = 0 139 | episode_reward = 0 140 | episode_timesteps = 0 141 | done = True 142 | 143 | while total_timesteps < args.max_timesteps: 144 | if done: 145 | if total_timesteps != 0: 146 | print(("Total T: %d Episode Num: %d Episode T: %d Reward: %f") % (total_timesteps, episode_num, episode_timesteps, episode_reward)) 147 | policy.train(replay_buffer, episode_timesteps, args.batch_size, args.GAMMA, args.tau) 148 | 149 | obs = env.reset() 150 | done = False 151 | episode_reward = 0 152 | episode_timesteps = 0 153 | episode_num += 1 154 | 155 | if total_timesteps < args.start_timesteps: 156 | action = env.action_space.sample() 157 | else: 158 | action = policy.select_action(np.array(obs)) 159 | if args.expl_noise != 0: 160 | action = (action + np.random.normal(0, args.expl_noise, size=env.action_space.shape[0])).clip( 161 | env.action_space.low, env.action_space.high) 162 | 163 | new_obs, reward, done, _ = env.step(action) 164 | done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done) 165 | episode_reward += reward 166 | 167 | replay_buffer.add((obs, new_obs, action, reward, done_bool)) 168 | obs = new_obs 169 | episode_timesteps += 1 170 | total_timesteps += 1 171 | timesteps_since_eval += 1 -------------------------------------------------------------------------------- /double-DQN.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import deque 4 | import numpy as np 5 | import gym 6 | import random 7 | from net import AtariNet 8 | from util import preprocess 9 | 10 | BATCH_SIZE = 32 11 | LR = 0.001 12 | START_EPSILON = 1.0 13 | FINAL_EPSILON = 0.1 14 | EPSILON = START_EPSILON 15 | EXPLORE = 1000000 16 | GAMMA = 0.99 17 | TOTAL_EPISODES = 10000000 18 | MEMORY_SIZE = 1000000 19 | MEMORY_THRESHOLD = 100000 20 | UPDATE_TIME = 10000 21 | TEST_FREQUENCY = 1000 22 | env = gym.make('Pong-v0') 23 | env = env.unwrapped 24 | ACTIONS_SIZE = env.action_space.n 25 | 26 | 27 | class Agent(object): 28 | def __init__(self): 29 | self.network, self.target_network = AtariNet(ACTIONS_SIZE), AtariNet(ACTIONS_SIZE) 30 | self.memory = deque() 31 | self.learning_count = 0 32 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR) 33 | self.loss_func = nn.MSELoss() 34 | 35 | def action(self, state, israndom): 36 | if israndom and random.random() < EPSILON: 37 | return np.random.randint(0, ACTIONS_SIZE) 38 | state = torch.unsqueeze(torch.FloatTensor(state), 0) 39 | actions_value = self.network.forward(state) 40 | return torch.max(actions_value, 1)[1].data.numpy()[0] 41 | 42 | def learn(self, state, action, reward, next_state, done): 43 | if done: 44 | self.memory.append((state, action, reward, next_state, 0)) 45 | else: 46 | self.memory.append((state, action, reward, next_state, 1)) 47 | if len(self.memory) > MEMORY_SIZE: 48 | self.memory.popleft() 49 | if len(self.memory) < MEMORY_THRESHOLD: 50 | return 51 | 52 | if self.learning_count % UPDATE_TIME == 0: 53 | self.target_network.load_state_dict(self.network.state_dict()) 54 | self.learning_count += 1 55 | 56 | batch = random.sample(self.memory, BATCH_SIZE) 57 | state = torch.FloatTensor([x[0] for x in batch]) 58 | action = torch.LongTensor([[x[1]] for x in batch]) 59 | reward = torch.FloatTensor([[x[2]] for x in batch]) 60 | next_state = torch.FloatTensor([x[3] for x in batch]) 61 | done = torch.FloatTensor([[x[4]] for x in batch]) 62 | 63 | actions_value = self.network.forward(next_state) 64 | next_action = torch.unsqueeze(torch.max(actions_value, 1)[1], 1) 65 | eval_q = self.network.forward(state).gather(1, action) 66 | next_q = self.target_network.forward(next_state).gather(1, next_action) 67 | target_q = reward + GAMMA * next_q * done 68 | loss = self.loss_func(eval_q, target_q) 69 | 70 | self.optimizer.zero_grad() 71 | loss.backward() 72 | self.optimizer.step() 73 | 74 | 75 | agent = Agent() 76 | 77 | for i_episode in range(TOTAL_EPISODES): 78 | state = env.reset() 79 | state = preprocess(state) 80 | while True: 81 | # env.render() 82 | action = agent.action(state, True) 83 | next_state, reward, done, info = env.step(action) 84 | next_state = preprocess(next_state) 85 | agent.learn(state, action, reward, next_state, done) 86 | 87 | state = next_state 88 | if done: 89 | break 90 | if EPSILON > FINAL_EPSILON: 91 | EPSILON -= (START_EPSILON - FINAL_EPSILON) / EXPLORE 92 | 93 | # TEST 94 | if i_episode % TEST_FREQUENCY == 0: 95 | state = env.reset() 96 | state = preprocess(state) 97 | total_reward = 0 98 | while True: 99 | # env.render() 100 | action = agent.action(state, israndom=False) 101 | next_state, reward, done, info = env.step(action) 102 | next_state = preprocess(next_state) 103 | 104 | total_reward += reward 105 | 106 | state = next_state 107 | if done: 108 | break 109 | print('episode: {} , total_reward: 
{}'.format(i_episode, round(total_reward, 3))) 110 | 111 | env.close() 112 | -------------------------------------------------------------------------------- /dueling-DQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import deque 4 | import numpy as np 5 | import gym 6 | import random 7 | import cv2 8 | 9 | BATCH_SIZE = 32 10 | LR = 0.001 11 | START_EPSILON = 1.0 12 | FINAL_EPSILON = 0.1 13 | EPSILON = START_EPSILON 14 | EXPLORE = 1000000 15 | GAMMA = 0.99 16 | TOTAL_EPISODES = 10000000 17 | MEMORY_SIZE = 1000000 18 | MEMORY_THRESHOLD = 100000 19 | TEST_FREQUENCY = 1000 20 | env = gym.make('Pong-v0') 21 | env = env.unwrapped 22 | ACTIONS_SIZE = env.action_space.n 23 | 24 | 25 | def preprocess(observation): 26 | observation = cv2.cvtColor(cv2.resize(observation, (84, 110)), cv2.COLOR_BGR2GRAY) 27 | observation = observation[26:110,:] 28 | ret, observation = cv2.threshold(observation,1,255,cv2.THRESH_BINARY) 29 | x = np.reshape(observation,(84,84,1)) 30 | return x.transpose((2, 0, 1)) 31 | 32 | 33 | class DuelingNet(nn.Module): 34 | 35 | def __init__(self, num_actions): 36 | super(DuelingNet, self).__init__() 37 | self.num_actions = num_actions 38 | self.conv1 = nn.Sequential( 39 | nn.Conv2d(1, 32, kernel_size=8, stride=4), 40 | nn.ReLU() 41 | ) 42 | self.conv2 = nn.Sequential( 43 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 44 | nn.ReLU() 45 | ) 46 | self.conv3 = nn.Sequential( 47 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 48 | nn.ReLU() 49 | ) 50 | self.hidden_adv = nn.Sequential( 51 | nn.Linear(64 * 7 * 7, 512, bias=True), 52 | nn.ReLU() 53 | ) 54 | self.hidden_val = nn.Sequential( 55 | nn.Linear(64 * 7 * 7, 512, bias=True), 56 | nn.ReLU() 57 | ) 58 | self.adv = nn.Sequential( 59 | nn.Linear(512, num_actions, bias=True) 60 | ) 61 | self.val = nn.Sequential( 62 | nn.Linear(512, 1, bias=True) 63 | ) 64 | self.apply(self.init_weights) 65 | 66 | def init_weights(self, m): 67 | if type(m) == nn.Conv2d: 68 | m.weight.data.normal_(0.0, 0.02) 69 | if type(m) == nn.Linear: 70 | torch.nn.init.xavier_uniform_(m.weight) 71 | m.bias.data.fill_(0.01) 72 | 73 | def forward(self, x): 74 | x = self.conv1(x) 75 | x = self.conv2(x) 76 | x = self.conv3(x) 77 | x = x.view(x.size(0), -1) 78 | adv = self.hidden_adv(x) 79 | val = self.hidden_val(x) 80 | 81 | adv = self.adv(adv) 82 | val = self.val(val).expand(x.size(0), self.num_actions) 83 | 84 | x = val + adv - adv.mean(1).unsqueeze(1).expand(x.size(0), self.num_actions) 85 | return x 86 | 87 | class Agent(object): 88 | def __init__(self): 89 | self.network = DuelingNet(ACTIONS_SIZE) 90 | self.memory = deque() 91 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR) 92 | self.loss_func = nn.MSELoss() 93 | 94 | def action(self, state, israndom): 95 | if israndom and random.random() < EPSILON: 96 | return np.random.randint(0, ACTIONS_SIZE) 97 | state = torch.unsqueeze(torch.FloatTensor(state), 0) 98 | actions_value = self.network.forward(state) 99 | return torch.max(actions_value, 1)[1].data.numpy()[0] 100 | 101 | def learn(self, state, action, reward, next_state, done): 102 | if done: 103 | self.memory.append((state, action, reward, next_state, 0)) 104 | else: 105 | self.memory.append((state, action, reward, next_state, 1)) 106 | if len(self.memory) > MEMORY_SIZE: 107 | self.memory.popleft() 108 | if len(self.memory) < MEMORY_THRESHOLD: 109 | return 110 | 111 | batch = random.sample(self.memory, BATCH_SIZE) 112 | state = 
torch.FloatTensor([x[0] for x in batch]) 113 | action = torch.LongTensor([[x[1]] for x in batch]) 114 | reward = torch.FloatTensor([[x[2]] for x in batch]) 115 | next_state = torch.FloatTensor([x[3] for x in batch]) 116 | done = torch.FloatTensor([[x[4]] for x in batch]) 117 | 118 | eval_q = self.network.forward(state).gather(1, action) 119 | next_q = self.network(next_state).detach() 120 | target_q = reward + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * done 121 | loss = self.loss_func(eval_q, target_q) 122 | 123 | self.optimizer.zero_grad() 124 | loss.backward() 125 | self.optimizer.step() 126 | 127 | 128 | agent = Agent() 129 | 130 | for i_episode in range(TOTAL_EPISODES): 131 | state = env.reset() 132 | state = preprocess(state) 133 | while True: 134 | # env.render() 135 | action = agent.action(state, True) 136 | next_state, reward, done, info = env.step(action) 137 | next_state = preprocess(next_state) 138 | agent.learn(state, action, reward, next_state, done) 139 | 140 | state = next_state 141 | if done: 142 | break 143 | if EPSILON > FINAL_EPSILON: 144 | EPSILON -= (START_EPSILON - FINAL_EPSILON) / EXPLORE 145 | 146 | # TEST 147 | if i_episode % TEST_FREQUENCY == 0: 148 | state = env.reset() 149 | state = preprocess(state) 150 | total_reward = 0 151 | while True: 152 | # env.render() 153 | action = agent.action(state, israndom=False) 154 | next_state, reward, done, info = env.step(action) 155 | next_state = preprocess(next_state) 156 | 157 | total_reward += reward 158 | 159 | state = next_state 160 | if done: 161 | break 162 | print('episode: {} , total_reward: {}'.format(i_episode, round(total_reward, 3))) 163 | 164 | env.close() 165 | -------------------------------------------------------------------------------- /feedforward_neural_network/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision 4 | import torchvision.transforms as transforms 5 | 6 | 7 | # Device configuration 8 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 9 | 10 | # Hyper-parameters 11 | input_size = 784 12 | hidden_size = 500 13 | num_classes = 10 14 | num_epochs = 5 15 | batch_size = 100 16 | learning_rate = 0.001 17 | 18 | # MNIST dataset 19 | train_dataset = torchvision.datasets.MNIST(root='../../data', 20 | train=True, 21 | transform=transforms.ToTensor(), 22 | download=True) 23 | 24 | test_dataset = torchvision.datasets.MNIST(root='../../data', 25 | train=False, 26 | transform=transforms.ToTensor()) 27 | 28 | # Data loader 29 | train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 30 | batch_size=batch_size, 31 | shuffle=True) 32 | 33 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 34 | batch_size=batch_size, 35 | shuffle=False) 36 | 37 | # Fully connected neural network with one hidden layer 38 | class NeuralNet(nn.Module): 39 | def __init__(self, input_size, hidden_size, num_classes): 40 | super(NeuralNet, self).__init__() 41 | self.fc1 = nn.Linear(input_size, hidden_size) 42 | self.relu = nn.ReLU() 43 | self.fc2 = nn.Linear(hidden_size, num_classes) 44 | 45 | def forward(self, x): 46 | out = self.fc1(x) 47 | out = self.relu(out) 48 | out = self.fc2(out) 49 | return out 50 | 51 | model = NeuralNet(input_size, hidden_size, num_classes).to(device) 52 | 53 | # Loss and optimizer 54 | criterion = nn.CrossEntropyLoss() 55 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 56 | 57 | # Train the model 58 | total_step = len(train_loader) 
59 | for epoch in range(num_epochs): 60 | for i, (images, labels) in enumerate(train_loader): 61 | # Move tensors to the configured device 62 | images = images.reshape(-1, 28*28).to(device) 63 | labels = labels.to(device) 64 | 65 | # Forward pass 66 | outputs = model(images) 67 | loss = criterion(outputs, labels) 68 | 69 | # Backward and optimize 70 | optimizer.zero_grad() 71 | loss.backward() 72 | optimizer.step() 73 | 74 | if (i+1) % 100 == 0: 75 | print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 76 | .format(epoch+1, num_epochs, i+1, total_step, loss.item())) 77 | 78 | # Test the model 79 | # In test phase, we don't need to compute gradients (for memory efficiency) 80 | with torch.no_grad(): 81 | correct = 0 82 | total = 0 83 | for images, labels in test_loader: 84 | images = images.reshape(-1, 28*28).to(device) 85 | labels = labels.to(device) 86 | outputs = model(images) 87 | _, predicted = torch.max(outputs.data, 1) 88 | total += labels.size(0) 89 | correct += (predicted == labels).sum().item() 90 | 91 | print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total)) 92 | 93 | # Save the model checkpoint 94 | torch.save(model.state_dict(), 'model.ckpt') 95 | -------------------------------------------------------------------------------- /ga/bag.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import random 3 | #背包问题 4 | #物品质量价格 5 | X = { 6 | 1: [10, 15], 7 | 2: [15, 25], 8 | 3: [20, 35], 9 | 4: [25, 45], 10 | 5: [30, 55], 11 | 6: [35, 70]} 12 | 13 | #终止界限 14 | FINISHED_LIMIT = 5 15 | 16 | #重量界限 17 | WEIGHT_LIMIT = 80 18 | 19 | #染色体长度 20 | CHROMOSOME_SIZE = 6 21 | 22 | #遴选次数 23 | SELECT_NUMBER = 4 24 | 25 | max_last = 0 26 | diff_last = 10000 27 | 28 | #判断退出 29 | def is_finished(fitnesses): 30 | global max_last 31 | global diff_last 32 | 33 | max_current = 0 34 | for v in fitnesses: 35 | if v[1] > max_current: 36 | max_current = v[1] 37 | 38 | diff = max_current - max_last 39 | if diff < FINISHED_LIMIT and diff_last < FINISHED_LIMIT: 40 | return True 41 | else: 42 | diff_last = diff 43 | max_last = max_current 44 | return False 45 | 46 | #初始染色体样态 47 | def init(): 48 | chromosome_state1 = '100100' 49 | chromosome_state2 = '101010' 50 | chromosome_state3 = '010101' 51 | chromosome_state4 = '101011' 52 | chromosome_states = [chromosome_state1, 53 | chromosome_state2, 54 | chromosome_state3, 55 | chromosome_state4] 56 | return chromosome_states 57 | 58 | 59 | #计算适应度 60 | def fitness(chromosome_states): 61 | fitnesses = [] 62 | for chromosome_state in chromosome_states: 63 | value_sum = 0 64 | weight_sum = 0 65 | for i, v in enumerate(chromosome_state): 66 | if int(v) == 1: 67 | weight_sum += X[i + 1][0] 68 | value_sum += X[i + 1][1] 69 | fitnesses.append([value_sum, weight_sum]) 70 | return fitnesses 71 | 72 | 73 | #筛选 74 | def filter(chromosome_states, fitnesses): 75 | #重量大于80的被淘汰 76 | index = len(fitnesses) - 1 77 | while index >= 0: 78 | index -= 1 79 | if fitnesses[index][1] > WEIGHT_LIMIT: 80 | chromosome_states.pop(index) 81 | fitnesses.pop(index) 82 | 83 | #遴选 84 | selected_index = [0] * len(chromosome_states) 85 | for i in range(SELECT_NUMBER): 86 | j = chromosome_states.index(random.choice(chromosome_states)) 87 | selected_index[j] += 1 88 | return selected_index 89 | 90 | 91 | #产生下一代 92 | def crossover(chromosome_states, selected_index): 93 | chromosome_states_new = [] 94 | index = len(chromosome_states) - 1 95 | while index >= 0: 96 | index -= 1 97 | chromosome_state = 
chromosome_states.pop(index) 98 | for i in range(selected_index[index]): 99 | chromosome_state_x = random.choice(chromosome_states) 100 | pos = random.choice(range(1, CHROMOSOME_SIZE - 1)) 101 | chromosome_states_new.append(chromosome_state[:pos] + chromosome_state_x[pos:]) 102 | chromosome_states.insert(index, chromosome_state) 103 | return chromosome_states_new 104 | 105 | 106 | if __name__ == '__main__': 107 | #初始群体 108 | chromosome_states = init() 109 | n = 100 110 | while n > 0: 111 | n -= 1 112 | #适应度计算 113 | fitnesses = fitness(chromosome_states) 114 | if is_finished(fitnesses): 115 | break 116 | print('1:', fitnesses) 117 | #遴选 118 | selected_index = filter(chromosome_states, fitnesses) 119 | print('2:', selected_index) 120 | #产生下一代 121 | chromosome_states = crossover(chromosome_states, selected_index) 122 | # print '3:', chromosome_states 123 | 124 | fitnesses = fitness(chromosome_states) 125 | print(chromosome_states) 126 | 127 | # 1: [[60, 35], [105, 60], [140, 75], [175, 95]] 128 | # 2: [1, 1, 2] 129 | 130 | # 1: [[60, 35], [105, 60], [80, 45], [90, 50]] 131 | # 2: [2, 1, 0, 1] 132 | 133 | # 1: [[95, 55], [115, 65], [70, 40], [90, 50]] 134 | # 2: [2, 0, 2, 0] 135 | 136 | # 1: [[70, 40], [70, 40], [150, 85], [115, 65]] 137 | # 2: [3, 0, 1] 138 | 139 | # 1: [[115, 65], [115, 65], [115, 65], [70, 40]] 140 | # 2: [2, 0, 0, 2] 141 | # ['100110', '100110', '100110', '100110'] 142 | -------------------------------------------------------------------------------- /ga/peak.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import random 3 | import math 4 | import numpy as np 5 | 6 | 7 | #极大值问题 8 | #染色体 基因X 基因Y 9 | X = [ 10 | [1, '000000100101001', '101010101010101'], 11 | [2, '011000100101100', '001100110011001'], 12 | [3, '001000100100101', '101010101010101'], 13 | [4, '000110100100100', '110011001100110'], 14 | [5, '100000100100101', '101010101010101'], 15 | [6, '101000100100100', '111100001111000'], 16 | [7, '101010100110100', '101010101010101'], 17 | [8, '100110101101000', '000011110000111']] 18 | 19 | 20 | #染色体长度 21 | CHROMOSOME_SIZE = 15 22 | 23 | 24 | #判断退出 25 | def is_finished(last_three): 26 | s = sorted(last_three) 27 | if s[0] and s[2] - s[0] < 0.01 * s[0]: 28 | return True 29 | else: 30 | return False 31 | 32 | #初始染色体样态 33 | def init(): 34 | chromosome_state1 = ['000000100101001', '101010101010101'] 35 | chromosome_state2 = ['011000100101100', '001100110011001'] 36 | chromosome_state3 = ['001000100100101', '101010101010101'] 37 | chromosome_state4 = ['000110100100100', '110011001100110'] 38 | chromosome_state5 = ['100000100100101', '101010101010101'] 39 | chromosome_state6 = ['101000100100100', '111100001111000'] 40 | chromosome_state7 = ['101010100110100', '101010101010101'] 41 | chromosome_state8 = ['100110101101000', '000011110000111'] 42 | chromosome_states = [chromosome_state1, 43 | chromosome_state2, 44 | chromosome_state3, 45 | chromosome_state4, 46 | chromosome_state5, 47 | chromosome_state6, 48 | chromosome_state7, 49 | chromosome_state8] 50 | return chromosome_states 51 | 52 | 53 | #计算适应度 54 | def fitness(chromosome_states): 55 | fitnesses = [] 56 | for chromosome_state in chromosome_states: 57 | if chromosome_state[0][0] == '1': 58 | x = 10 * (-float(int(chromosome_state[0][1:], 2) - 1)/16384) 59 | else: 60 | x = 10 * (float(int(chromosome_state[0], 2) + 1)/16384) 61 | if chromosome_state[1][0] == '1': 62 | y = 10 * (-float(int(chromosome_state[1][1:], 2) - 1)/16384) 63 | else: 64 | y = 10 * 
(float(int(chromosome_state[1], 2) + 1)/16384) 65 | z = y * math.sin(x) + x * math.cos(y) 66 | print(x, y, z) 67 | fitnesses.append(z) 68 | 69 | return fitnesses 70 | 71 | 72 | #筛选 73 | def filter(chromosome_states, fitnesses): 74 | #top 8 对应的索引值 75 | chromosome_states_new = [] 76 | top1_fitness_index = 0 77 | for i in np.argsort(fitnesses)[::-1][:8].tolist(): 78 | chromosome_states_new.append(chromosome_states[i]) 79 | top1_fitness_index = i 80 | return chromosome_states_new, top1_fitness_index 81 | 82 | 83 | #产生下一代 84 | def crossover(chromosome_states): 85 | chromosome_states_new = [] 86 | while chromosome_states: 87 | chromosome_state = chromosome_states.pop(0) 88 | for v in chromosome_states: 89 | pos = random.choice(range(8, CHROMOSOME_SIZE - 1)) 90 | chromosome_states_new.append([chromosome_state[0][:pos] + v[0][pos:], chromosome_state[1][:pos] + v[1][pos:]]) 91 | chromosome_states_new.append([v[0][:pos] + chromosome_state[1][pos:], v[0][:pos] + chromosome_state[1][pos:]]) 92 | return chromosome_states_new 93 | 94 | 95 | #基因突变 96 | def mutation(chromosome_states): 97 | n = int(5.0 / 100 * len(chromosome_states)) 98 | while n > 0: 99 | n -= 1 100 | chromosome_state = random.choice(chromosome_states) 101 | index = chromosome_states.index(chromosome_state) 102 | pos = random.choice(range(len(chromosome_state))) 103 | x = chromosome_state[0][:pos] + str(int(not int(chromosome_state[0][pos]))) + chromosome_state[0][pos+1:] 104 | y = chromosome_state[1][:pos] + str(int(not int(chromosome_state[1][pos]))) + chromosome_state[1][pos+1:] 105 | chromosome_states[index] = [x, y] 106 | 107 | 108 | if __name__ == '__main__': 109 | chromosome_states = init() 110 | last_three = [0] * 3 111 | last_num = 0 112 | n = 100 113 | while n > 0: 114 | n -= 1 115 | chromosome_states = crossover(chromosome_states) 116 | mutation(chromosome_states) 117 | fitnesses = fitness(chromosome_states) 118 | chromosome_states, top1_fitness_index = filter(chromosome_states, fitnesses) 119 | print('---------%d-----------' % n) 120 | print(chromosome_states) 121 | last_three[last_num] = fitnesses[top1_fitness_index] 122 | print(fitnesses[top1_fitness_index]) 123 | if is_finished(last_three): 124 | break 125 | if last_num >= 2: 126 | last_num = 0 127 | else: 128 | last_num += 1 129 | 130 | 131 | # ['100100', '101010', '010101', '101011'] 132 | 133 | # 1: [[60, 35], [105, 60], [140, 75], [175, 95]] 134 | # 2: [0, 2, 2] 135 | # 136 | # 1: [[60, 35], [60, 35], [80, 45], [125, 70]] 137 | # 2: [3, 0, 1, 0] 138 | # 139 | # 1: [[80, 45], [60, 35], [60, 35], [140, 80]] 140 | # 2: [1, 2, 0, 1] 141 | # 142 | # 1: [[70, 40], [70, 40], [70, 40], [85, 50]] 143 | # 2: [3, 0, 0, 1] 144 | # 145 | # 1: [[70, 40], [70, 40], [70, 40], [95, 55]] 146 | # 2: [4, 0, 0, 0] 147 | # 148 | # 1: [[70, 40], [70, 40], [70, 40], [70, 40]] 149 | # 2: [4, 0, 0, 0] 150 | # 151 | # ['100010', '100010', '100010', '100010'] 152 | # [[70, 40], [70, 40], [70, 40], [70, 40]] 153 | -------------------------------------------------------------------------------- /gym_sample/demo.py: -------------------------------------------------------------------------------- 1 | import gym 2 | env = gym.make('CartPole-v0') 3 | env.reset() 4 | for _ in range(1000): 5 | env.render() 6 | env.step(env.action_space.sample()) # take a random action 7 | -------------------------------------------------------------------------------- /native-Qlearning.py: -------------------------------------------------------------------------------- 1 | # 
Q-learning implementation of the path-finding problem from http://mnemstudio.org/path-finding-q-learning-tutorial.htm 2 | import numpy as np 3 | import random 4 | import matplotlib.pyplot as plt 5 | 6 | Q_fun = np.zeros((6, 6)) 7 | 8 | # Reward matrix: the reward for taking an action in state and moving to next_state; rows index state, columns index next_state 9 | reward = np.array([[-1, -1, -1, -1, 0, -1], 10 | [-1, -1, -1, 0, -1, 100], 11 | [-1, -1, -1, 0, -1, -1], 12 | [-1, 0, 0, -1, 0, -1], 13 | [0, -1, -1, 0, -1, 100], 14 | [-1, 0, -1, -1, 0, 100]]) 15 | 16 | legal_action = [[4], 17 | [3, 5], 18 | [3], 19 | [1, 2, 4], 20 | [0, 3, 5], 21 | [1, 4, 5]] 22 | 23 | GAMMA = 0.5 24 | TRAINING_STEP = 100 25 | LAYOUT = 221 26 | 27 | for i in range(1, TRAINING_STEP + 1): 28 | state = random.randint(0, 4) 29 | # 100% exploration: choose next_state at random 30 | next_state = random.choice(legal_action[state]) 31 | Q_fun[state, next_state] = reward[state, next_state] + GAMMA * Q_fun[next_state].max() 32 | 33 | if i % (TRAINING_STEP/4) == 0: 34 | plt.subplot(LAYOUT) 35 | plt.imshow(Q_fun, cmap='gray_r') 36 | LAYOUT += 1 37 | print(Q_fun) 38 | plt.show() 39 | 40 | 41 | -------------------------------------------------------------------------------- /nature-DQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import deque 4 | import numpy as np 5 | import gym 6 | import random 7 | from net import AtariNet 8 | from util import preprocess 9 | 10 | BATCH_SIZE = 32 11 | LR = 0.001 12 | START_EPSILON = 1.0 13 | FINAL_EPSILON = 0.1 14 | EPSILON = START_EPSILON 15 | EXPLORE = 1000000 16 | GAMMA = 0.99 17 | TOTAL_EPISODES = 10000000 18 | MEMORY_SIZE = 1000000 19 | MEMORY_THRESHOLD = 100000 20 | UPDATE_TIME = 10000 21 | TEST_FREQUENCY = 1000 22 | env = gym.make('Pong-v0') 23 | env = env.unwrapped 24 | ACTIONS_SIZE = env.action_space.n 25 | 26 | 27 | class Agent(object): 28 | def __init__(self): 29 | self.network, self.target_network = AtariNet(ACTIONS_SIZE), AtariNet(ACTIONS_SIZE) 30 | self.memory = deque() 31 | self.learning_count = 0 32 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR) 33 | self.loss_func = nn.MSELoss() 34 | 35 | def action(self, state, israndom): 36 | if israndom and random.random() < EPSILON: 37 | return np.random.randint(0, ACTIONS_SIZE) 38 | state = torch.unsqueeze(torch.FloatTensor(state), 0) 39 | actions_value = self.network.forward(state) 40 | return torch.max(actions_value, 1)[1].data.numpy()[0] 41 | 42 | def learn(self, state, action, reward, next_state, done): 43 | if done: 44 | self.memory.append((state, action, reward, next_state, 0)) 45 | else: 46 | self.memory.append((state, action, reward, next_state, 1)) 47 | if len(self.memory) > MEMORY_SIZE: 48 | self.memory.popleft() 49 | if len(self.memory) < MEMORY_THRESHOLD: 50 | return 51 | 52 | if self.learning_count % UPDATE_TIME == 0: 53 | self.target_network.load_state_dict(self.network.state_dict()) 54 | self.learning_count += 1 55 | 56 | batch = random.sample(self.memory, BATCH_SIZE) 57 | state = torch.FloatTensor([x[0] for x in batch]) 58 | action = torch.LongTensor([[x[1]] for x in batch]) 59 | reward = torch.FloatTensor([[x[2]] for x in batch]) 60 | next_state = torch.FloatTensor([x[3] for x in batch]) 61 | done = torch.FloatTensor([[x[4]] for x in batch]) 62 | 63 | eval_q = self.network.forward(state).gather(1, action) 64 | next_q = self.target_network(next_state).detach() 65 | target_q = reward + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * done 66 | loss = self.loss_func(eval_q, target_q) 67 | 68 | self.optimizer.zero_grad() 69 | 
69 |         loss.backward()
70 |         self.optimizer.step()
71 | 
72 | 
73 | agent = Agent()
74 | 
75 | for i_episode in range(TOTAL_EPISODES):
76 |     state = env.reset()
77 |     state = preprocess(state)
78 |     while True:
79 |         # env.render()
80 |         action = agent.action(state, True)
81 |         next_state, reward, done, info = env.step(action)
82 |         next_state = preprocess(next_state)
83 |         agent.learn(state, action, reward, next_state, done)
84 | 
85 |         state = next_state
86 |         if done:
87 |             break
88 |     if EPSILON > FINAL_EPSILON:
89 |         EPSILON -= (START_EPSILON - FINAL_EPSILON) / EXPLORE
90 | 
91 |     # TEST
92 |     if i_episode % TEST_FREQUENCY == 0:
93 |         state = env.reset()
94 |         state = preprocess(state)
95 |         total_reward = 0
96 |         while True:
97 |             # env.render()
98 |             action = agent.action(state, israndom=False)
99 |             next_state, reward, done, info = env.step(action)
100 |             next_state = preprocess(next_state)
101 | 
102 |             total_reward += reward
103 | 
104 |             state = next_state
105 |             if done:
106 |                 break
107 |         print('episode: {} , total_reward: {}'.format(i_episode, round(total_reward, 3)))
108 | 
109 | env.close()
110 | 
-------------------------------------------------------------------------------- /neat/Digraph.gv: --------------------------------------------------------------------------------
1 | digraph {
2 |     node [fontsize=9 height=0.2 shape=circle width=0.2]
3 |     In0 [fillcolor=lightgray shape=box style=filled]
4 |     In1 [fillcolor=lightgray shape=box style=filled]
5 |     In3 [fillcolor=lightgray shape=box style=filled]
6 |     In4 [fillcolor=lightgray shape=box style=filled]
7 |     act1 [fillcolor=lightblue style=filled]
8 |     act2 [fillcolor=lightblue style=filled]
9 |     137 [fillcolor=white style=filled]
10 |     714 [fillcolor=white style=filled]
11 |     626 [fillcolor=white style=filled]
12 |     404 [fillcolor=white style=filled]
13 |     246 [fillcolor=white style=filled]
14 |     442 [fillcolor=white style=filled]
15 |     540 [fillcolor=white style=filled]
16 |     In0 -> act2 [color=red penwidth=0.15464140610078286 style=dotted]
17 |     In1 -> act1 [color=green penwidth=0.20589896434649435 style=dotted]
18 |     In1 -> act2 [color=green penwidth=0.32201856748289415 style=solid]
19 |     In3 -> act1 [color=red penwidth=0.41018376786556177 style=dotted]
20 |     In3 -> act2 [color=green penwidth=1.049253720927758 style=solid]
21 |     In4 -> act1 [color=red penwidth=0.17560360127452074 style=dotted]
22 |     In4 -> act2 [color=green penwidth=0.4084716161702079 style=solid]
23 |     137 -> act1 [color=red penwidth=0.17428254929845405 style=dotted]
24 |     In0 -> 246 [color=green penwidth=0.4724790110694175 style=dotted]
25 |     In0 -> 404 [color=green penwidth=0.34381289726102127 style=dotted]
26 |     137 -> 442 [color=green penwidth=0.4006046348289356 style=solid]
27 |     442 -> act1 [color=green penwidth=0.5911627385006601 style=solid]
28 |     In0 -> 540 [color=red penwidth=0.4238873029754118 style=solid]
29 |     540 -> 404 [color=green penwidth=0.4927242488863397 style=solid]
30 |     In0 -> 626 [color=green penwidth=0.37161877250004893 style=solid]
31 |     626 -> 404 [color=green penwidth=0.1753711398635109 style=solid]
32 |     137 -> 714 [color=green penwidth=0.24745341023372266 style=solid]
33 |     714 -> act1 [color=red penwidth=0.3848767903179091 style=solid]
34 |     246 -> 714 [color=red penwidth=0.28561121470609185 style=solid]
35 | }
36 | 
-------------------------------------------------------------------------------- /neat/Digraph.gv.svg: --------------------------------------------------------------------------------
(Graphviz SVG rendering of the winner network defined in Digraph.gv above — input nodes In0, In1, In3, In4 and output nodes act1, act2; the SVG markup carries no additional information and is omitted here.)
-------------------------------------------------------------------------------- /neat/avg_fitness.svg: --------------------------------------------------------------------------------
(SVG plot of the population's average and best fitness per generation, produced by visualize.plot_stats; markup and raw coordinate data omitted.)
-------------------------------------------------------------------------------- /neat/cartpole.py: --------------------------------------------------------------------------------
1 | import neat
2 | import sys
3 | import numpy as np
4 | import gym
5 | import visualize
6 | 
7 | GAME = 'CartPole-v0'
8 | env = gym.make(GAME).unwrapped
9 | 
10 | CONFIG = "./config"
11 | EP_STEP = 300
12 | GENERATION_EP = 10
13 | CHECKPOINT = 9
14 | 
15 | def eval_genomes(genomes, config):
16 |     for genome_id, genome in genomes:
17 |         net = neat.nn.FeedForwardNetwork.create(genome, config)
18 |         ep_r = []
19 |         for ep in range(GENERATION_EP):
20 |             accumulative_r = 0
21 |             observation = env.reset()
22 |             for t in range(EP_STEP):
23 |                 action_values = net.activate(observation)
24 |                 action = np.argmax(action_values)
25 |                 observation_, reward, done, _ = env.step(action)
26 |                 accumulative_r += reward
27 |                 if done:
28 |                     break
29 |                 observation = observation_
30 |             ep_r.append(accumulative_r)
31 |         genome.fitness = np.min(ep_r)/float(EP_STEP)
32 | 
33 | def run():
34 |     config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
35 |                          neat.DefaultSpeciesSet, neat.DefaultStagnation, CONFIG)
36 |     pop = neat.Population(config)
37 | 
38 |     # record training history
39 |     stats = neat.StatisticsReporter()
40 |     pop.add_reporter(stats)
41 |     pop.add_reporter(neat.StdOutReporter(True))
42 |     pop.add_reporter(neat.Checkpointer(5))
43 | 
44 |     pop.run(eval_genomes, 10)
45 | 
46 |     # visualize training
47 |     visualize.plot_stats(stats, ylog=False, view=True)
48 |     visualize.plot_species(stats, view=True)
49 | 
50 | def evaluation():
51 |     p = neat.Checkpointer.restore_checkpoint('neat-checkpoint-%i' % CHECKPOINT)
52 |     winner = p.run(eval_genomes, 1)
53 | 
54 |     # show winner net
55 |     node_names = {-1: 'In0', -2: 'In1', -3: 'In3', -4: 'In4', 0: 'act1', 1: 'act2'}
56 |     visualize.draw_net(p.config, winner, True, node_names=node_names)
57 | 
58 |     net = neat.nn.FeedForwardNetwork.create(winner, p.config)
59 |     while True:
60 |         s = env.reset()
61 |         while True:
62 |             env.render()
63 |             a = np.argmax(net.activate(s))
64 |             s, r, done, _ = env.step(a)
65 |             if done: break
66 | 
67 | if __name__ == '__main__':
68 |     TRAINING = sys.argv[1]
69 | 
70 |     if TRAINING == 'TRAIN':
71 |         run()
72 |     elif TRAINING == 'EVAL':
73 |         evaluation()
74 |     else:
75 |         print('Please indicate TRAIN or EVAL')
76 | 
-------------------------------------------------------------------------------- /neat/config: --------------------------------------------------------------------------------
1 | # neat-python configuration for the CartPole-v0 environment on OpenAI Gym
2 | 
3 | [NEAT]
4 | pop_size              = 100
5 | # Note: the fitness threshold will never be reached because
6 | # we are controlling the termination ourselves based on simulation performance.
7 | fitness_criterion     = max
8 | fitness_threshold     = 2.
9 | reset_on_extinction   = 0
10 | 
11 | [DefaultGenome]
12 | # node activation options
13 | activation_default      = relu
14 | activation_mutate_rate  = 0.0
15 | activation_options      = relu
16 | 
17 | # node aggregation options
18 | aggregation_default     = sum
19 | aggregation_mutate_rate = 0.0
20 | aggregation_options     = sum
21 | 
22 | # node bias options
23 | bias_init_mean          = 0.0
24 | bias_init_stdev         = 1.0
25 | bias_max_value          = 30.0
26 | bias_min_value          = -30.0
27 | bias_mutate_power       = 0.5
28 | bias_mutate_rate        = 0.7
29 | bias_replace_rate       = 0.1
30 | 
31 | # genome compatibility options
32 | compatibility_disjoint_coefficient = 1.0
33 | compatibility_weight_coefficient   = 1.0
34 | 
35 | # connection add/remove rates
36 | conn_add_prob           = 0.9
37 | conn_delete_prob        = 0.2
38 | 
39 | # connection enable options
40 | enabled_default         = True
41 | enabled_mutate_rate     = 0.01
42 | 
43 | feed_forward            = True
44 | initial_connection      = full
45 | # options (unconnected, fs_neat, full)
46 | 
47 | # node add/remove rates
48 | node_add_prob           = 0.9
49 | node_delete_prob        = 0.2
50 | 
51 | # network parameters
52 | num_hidden              = 0
53 | num_inputs              = 4
54 | num_outputs             = 2
55 | 
56 | # node response options
57 | response_init_mean      = 1.0
58 | response_init_stdev     = 0.0
59 | response_max_value      = 30.0
60 | response_min_value      = -30.0
61 | response_mutate_power   = 0.0
62 | response_mutate_rate    = 0.0
63 | response_replace_rate   = 0.0
64 | 
65 | # connection weight options
66 | weight_init_mean        = 0.0
67 | weight_init_stdev       = 1.0
68 | weight_max_value        = 30.
69 | weight_min_value        = -30.
70 | weight_mutate_power     = 0.5
71 | weight_mutate_rate      = 0.8
72 | weight_replace_rate     = 0.1
73 | 
74 | [DefaultSpeciesSet]
75 | compatibility_threshold = 3.0
76 | 
77 | [DefaultStagnation]
78 | species_fitness_func = max
79 | max_stagnation       = 20
80 | species_elitism      = 4
81 | 
82 | [DefaultReproduction]
83 | elitism            = 2
84 | survival_threshold = 0.2
-------------------------------------------------------------------------------- /neat/speciation.svg: --------------------------------------------------------------------------------
(SVG stack plot of species sizes per generation, produced by visualize.plot_species; markup and raw coordinate data omitted.)
-------------------------------------------------------------------------------- /neat/visualize.py: --------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | 
3 | import copy
4 | import warnings
5 | 
6 | import graphviz
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | 
10 | 
11 | def plot_stats(statistics, ylog=False, view=False, filename='avg_fitness.svg'):
12 |     """ Plots the population's average and best fitness. """
13 |     if plt is None:
14 |         warnings.warn("This display is not available due to a missing optional dependency (matplotlib)")
15 |         return
16 | 
17 |     generation = range(len(statistics.most_fit_genomes))
18 |     best_fitness = [c.fitness for c in statistics.most_fit_genomes]
19 |     avg_fitness = np.array(statistics.get_fitness_mean())
20 |     stdev_fitness = np.array(statistics.get_fitness_stdev())
21 | 
22 |     plt.plot(generation, avg_fitness, 'b-', label="average")
23 |     #plt.plot(generation, avg_fitness - stdev_fitness, 'g-.', label="-1 sd")
24 |     plt.plot(generation, avg_fitness + stdev_fitness, 'g-.', label="+1 sd")
25 |     plt.plot(generation, best_fitness, 'r-', label="best")
26 | 
27 |     plt.title("Population's average and best fitness")
28 |     plt.xlabel("Generations")
29 |     plt.ylabel("Fitness")
30 |     plt.grid()
31 |     plt.legend(loc="best")
32 |     if ylog:
33 |         plt.gca().set_yscale('symlog')
34 | 
35 |     plt.savefig(filename)
36 |     if view:
37 |         plt.show()
38 | 
39 |     plt.close()
40 | 
41 | 
42 | def plot_species(statistics, view=False, filename='speciation.svg'):
43 |     """ Visualizes speciation throughout evolution. """
44 |     if plt is None:
45 |         warnings.warn("This display is not available due to a missing optional dependency (matplotlib)")
46 |         return
47 | 
48 |     species_sizes = statistics.get_species_sizes()
49 |     num_generations = len(species_sizes)
50 |     curves = np.array(species_sizes).T
51 | 
52 |     fig, ax = plt.subplots()
53 |     ax.stackplot(range(num_generations), *curves)
54 | 
55 |     plt.title("Speciation")
56 |     plt.ylabel("Size per Species")
57 |     plt.xlabel("Generations")
58 | 
59 |     plt.savefig(filename)
60 | 
61 |     if view:
62 |         plt.show()
63 | 
64 |     plt.close()
65 | 
66 | 
67 | def draw_net(config, genome, view=False, filename=None, node_names=None, show_disabled=True, prune_unused=False,
68 |              node_colors=None, fmt='svg'):
69 |     """ Receives a genome and draws a neural network with arbitrary topology. """
70 |     # Attributes for network nodes.
71 | if graphviz is None: 72 | warnings.warn("This display is not available due to a missing optional dependency (graphviz)") 73 | return 74 | 75 | if node_names is None: 76 | node_names = {} 77 | 78 | assert type(node_names) is dict 79 | 80 | if node_colors is None: 81 | node_colors = {} 82 | 83 | assert type(node_colors) is dict 84 | 85 | node_attrs = { 86 | 'shape': 'circle', 87 | 'fontsize': '9', 88 | 'height': '0.2', 89 | 'width': '0.2'} 90 | 91 | dot = graphviz.Digraph(format=fmt, node_attr=node_attrs) 92 | 93 | inputs = set() 94 | for k in config.genome_config.input_keys: 95 | inputs.add(k) 96 | name = node_names.get(k, str(k)) 97 | input_attrs = {'style': 'filled', 98 | 'shape': 'box'} 99 | input_attrs['fillcolor'] = node_colors.get(k, 'lightgray') 100 | dot.node(name, _attributes=input_attrs) 101 | 102 | outputs = set() 103 | for k in config.genome_config.output_keys: 104 | outputs.add(k) 105 | name = node_names.get(k, str(k)) 106 | node_attrs = {'style': 'filled'} 107 | node_attrs['fillcolor'] = node_colors.get(k, 'lightblue') 108 | 109 | dot.node(name, _attributes=node_attrs) 110 | 111 | if prune_unused: 112 | connections = set() 113 | for cg in genome.connections.values(): 114 | if cg.enabled or show_disabled: 115 | connections.add(cg.key) 116 | 117 | used_nodes = copy.copy(outputs) 118 | pending = copy.copy(outputs) 119 | while pending: 120 | #print(pending, used_nodes) 121 | new_pending = set() 122 | for a, b in connections: 123 | if b in pending and a not in used_nodes: 124 | new_pending.add(a) 125 | used_nodes.add(a) 126 | pending = new_pending 127 | else: 128 | used_nodes = set(genome.nodes.keys()) 129 | 130 | for n in used_nodes: 131 | if n in inputs or n in outputs: 132 | continue 133 | 134 | attrs = {'style': 'filled'} 135 | attrs['fillcolor'] = node_colors.get(n, 'white') 136 | dot.node(str(n), _attributes=attrs) 137 | 138 | for cg in genome.connections.values(): 139 | if cg.enabled or show_disabled: 140 | #if cg.input not in used_nodes or cg.output not in used_nodes: 141 | # continue 142 | input, output = cg.key 143 | a = node_names.get(input, str(input)) 144 | b = node_names.get(output, str(output)) 145 | style = 'solid' if cg.enabled else 'dotted' 146 | color = 'green' if cg.weight > 0 else 'red' 147 | width = str(0.1 + abs(cg.weight / 5.0)) 148 | dot.edge(a, b, _attributes={'style': style, 'color': color, 'penwidth': width}) 149 | 150 | dot.render(filename, view=view) 151 | 152 | return dot -------------------------------------------------------------------------------- /net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class AtariNet(nn.Module): 6 | 7 | def __init__(self, num_actions): 8 | super(AtariNet, self).__init__() 9 | self.conv1 = nn.Sequential( 10 | nn.Conv2d(1, 32, kernel_size=8, stride=4), 11 | nn.ReLU() 12 | ) 13 | self.conv2 = nn.Sequential( 14 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 15 | nn.ReLU() 16 | ) 17 | self.conv3 = nn.Sequential( 18 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 19 | nn.ReLU() 20 | ) 21 | self.hidden = nn.Sequential( 22 | nn.Linear(64 * 7 * 7, 512, bias=True), 23 | nn.ReLU() 24 | ) 25 | self.out = nn.Sequential( 26 | nn.Linear(512, num_actions, bias=True) 27 | ) 28 | self.apply(self.init_weights) 29 | 30 | def init_weights(self, m): 31 | if type(m) == nn.Conv2d: 32 | m.weight.data.normal_(0.0, 0.02) 33 | if type(m) == nn.Linear: 34 | torch.nn.init.xavier_uniform_(m.weight) 35 | m.bias.data.fill_(0.01) 36 | 37 | def forward(self, 
x): 38 | x = self.conv1(x) 39 | x = self.conv2(x) 40 | x = self.conv3(x) 41 | x = x.view(x.size(0), -1) 42 | x = self.hidden(x) 43 | x = self.out(x) 44 | return x 45 | 46 | 47 | class CnnDQN(nn.Module): 48 | def __init__(self, inputs_shape, num_actions): 49 | super(CnnDQN, self).__init__() 50 | 51 | self.inut_shape = inputs_shape 52 | self.num_actions = num_actions 53 | 54 | self.features = nn.Sequential( 55 | nn.Conv2d(inputs_shape[0], 32, kernel_size=8, stride=4), 56 | nn.ReLU(), 57 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 58 | nn.ReLU(), 59 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 60 | nn.ReLU() 61 | ) 62 | 63 | self.fc = nn.Sequential( 64 | nn.Linear(self.features_size(), 512), 65 | nn.ReLU(), 66 | nn.Linear(512, self.num_actions) 67 | ) 68 | 69 | def forward(self, x): 70 | x = self.features(x) 71 | x = x.view(x.size(0), -1) 72 | x = self.fc(x) 73 | return x 74 | 75 | def features_size(self): 76 | return self.features(torch.zeros(1, *self.inut_shape)).view(1, -1).size(1) 77 | -------------------------------------------------------------------------------- /nips-DQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import deque 4 | import numpy as np 5 | import gym 6 | import random 7 | from net import AtariNet 8 | from util import preprocess 9 | 10 | BATCH_SIZE = 32 11 | LR = 0.001 12 | START_EPSILON = 1.0 13 | FINAL_EPSILON = 0.1 14 | EPSILON = START_EPSILON 15 | EXPLORE = 1000000 16 | GAMMA = 0.99 17 | TOTAL_EPISODES = 10000000 18 | MEMORY_SIZE = 1000000 19 | MEMORY_THRESHOLD = 100000 20 | TEST_FREQUENCY = 1000 21 | env = gym.make('Pong-v0') 22 | env = env.unwrapped 23 | ACTIONS_SIZE = env.action_space.n 24 | 25 | 26 | class Agent(object): 27 | def __init__(self): 28 | self.network = AtariNet(ACTIONS_SIZE) 29 | self.memory = deque() 30 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR) 31 | self.loss_func = nn.MSELoss() 32 | 33 | def action(self, state, israndom): 34 | if israndom and random.random() < EPSILON: 35 | return np.random.randint(0, ACTIONS_SIZE) 36 | state = torch.unsqueeze(torch.FloatTensor(state), 0) 37 | actions_value = self.network.forward(state) 38 | return torch.max(actions_value, 1)[1].data.numpy()[0] 39 | 40 | def learn(self, state, action, reward, next_state, done): 41 | if done: 42 | self.memory.append((state, action, reward, next_state, 0)) 43 | else: 44 | self.memory.append((state, action, reward, next_state, 1)) 45 | if len(self.memory) > MEMORY_SIZE: 46 | self.memory.popleft() 47 | if len(self.memory) < MEMORY_THRESHOLD: 48 | return 49 | 50 | batch = random.sample(self.memory, BATCH_SIZE) 51 | state = torch.FloatTensor([x[0] for x in batch]) 52 | action = torch.LongTensor([[x[1]] for x in batch]) 53 | reward = torch.FloatTensor([[x[2]] for x in batch]) 54 | next_state = torch.FloatTensor([x[3] for x in batch]) 55 | done = torch.FloatTensor([[x[4]] for x in batch]) 56 | 57 | eval_q = self.network.forward(state).gather(1, action) 58 | next_q = self.network(next_state).detach() 59 | target_q = reward + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * done 60 | loss = self.loss_func(eval_q, target_q) 61 | 62 | self.optimizer.zero_grad() 63 | loss.backward() 64 | self.optimizer.step() 65 | 66 | 67 | agent = Agent() 68 | 69 | for i_episode in range(TOTAL_EPISODES): 70 | state = env.reset() 71 | state = preprocess(state) 72 | while True: 73 | # env.render() 74 | action = agent.action(state, True) 75 | next_state, reward, done, info = 
env.step(action) 76 | next_state = preprocess(next_state) 77 | agent.learn(state, action, reward, next_state, done) 78 | 79 | state = next_state 80 | if done: 81 | break 82 | if EPSILON > FINAL_EPSILON: 83 | EPSILON -= (START_EPSILON - FINAL_EPSILON) / EXPLORE 84 | 85 | # TEST 86 | if i_episode % TEST_FREQUENCY == 0: 87 | state = env.reset() 88 | state = preprocess(state) 89 | total_reward = 0 90 | while True: 91 | # env.render() 92 | action = agent.action(state, israndom=False) 93 | next_state, reward, done, info = env.step(action) 94 | next_state = preprocess(next_state) 95 | 96 | total_reward += reward 97 | 98 | state = next_state 99 | if done: 100 | break 101 | print('episode: {} , total_reward: {}'.format(i_episode, round(total_reward, 3))) 102 | 103 | env.close() 104 | -------------------------------------------------------------------------------- /recurrent_neural_network/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision 4 | import torchvision.transforms as transforms 5 | 6 | 7 | # Device configuration 8 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 9 | 10 | # Hyper-parameters 11 | sequence_length = 28 12 | input_size = 28 13 | hidden_size = 128 14 | num_layers = 2 15 | num_classes = 10 16 | batch_size = 100 17 | num_epochs = 2 18 | learning_rate = 0.01 19 | 20 | # MNIST dataset 21 | train_dataset = torchvision.datasets.MNIST(root='../../data/', 22 | train=True, 23 | transform=transforms.ToTensor(), 24 | download=True) 25 | 26 | test_dataset = torchvision.datasets.MNIST(root='../../data/', 27 | train=False, 28 | transform=transforms.ToTensor()) 29 | 30 | # Data loader 31 | train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 32 | batch_size=batch_size, 33 | shuffle=True) 34 | 35 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 36 | batch_size=batch_size, 37 | shuffle=False) 38 | 39 | # Recurrent neural network (many-to-one) 40 | class RNN(nn.Module): 41 | def __init__(self, input_size, hidden_size, num_layers, num_classes): 42 | super(RNN, self).__init__() 43 | self.hidden_size = hidden_size 44 | self.num_layers = num_layers 45 | self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True) 46 | self.fc = nn.Linear(hidden_size, num_classes) 47 | 48 | def forward(self, x): 49 | # Set initial hidden and cell states 50 | h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 51 | c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 52 | 53 | # Forward propagate LSTM 54 | out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size) 55 | 56 | # Decode the hidden state of the last time step 57 | out = self.fc(out[:, -1, :]) 58 | return out 59 | 60 | model = RNN(input_size, hidden_size, num_layers, num_classes).to(device) 61 | 62 | 63 | # Loss and optimizer 64 | criterion = nn.CrossEntropyLoss() 65 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 66 | 67 | # Train the model 68 | total_step = len(train_loader) 69 | for epoch in range(num_epochs): 70 | for i, (images, labels) in enumerate(train_loader): 71 | images = images.reshape(-1, sequence_length, input_size).to(device) 72 | labels = labels.to(device) 73 | 74 | # Forward pass 75 | outputs = model(images) 76 | loss = criterion(outputs, labels) 77 | 78 | # Backward and optimize 79 | optimizer.zero_grad() 80 | loss.backward() 81 | optimizer.step() 82 | 83 | if (i+1) % 100 == 
0: 84 | print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 85 | .format(epoch+1, num_epochs, i+1, total_step, loss.item())) 86 | 87 | # Test the model 88 | with torch.no_grad(): 89 | correct = 0 90 | total = 0 91 | for images, labels in test_loader: 92 | images = images.reshape(-1, sequence_length, input_size).to(device) 93 | labels = labels.to(device) 94 | outputs = model(images) 95 | _, predicted = torch.max(outputs.data, 1) 96 | total += labels.size(0) 97 | correct += (predicted == labels).sum().item() 98 | 99 | print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) 100 | 101 | # Save the model checkpoint 102 | torch.save(model.state_dict(), 'model.ckpt') 103 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | 5 | def preprocess(observation): 6 | """ 7 | image preprocess 8 | :param observation: 9 | :return: 10 | """ 11 | observation = cv2.cvtColor(cv2.resize(observation, (84, 110)), cv2.COLOR_BGR2GRAY) 12 | observation = observation[26:110,:] 13 | ret, observation = cv2.threshold(observation,1,255,cv2.THRESH_BINARY) 14 | x = np.reshape(observation,(84,84,1)) 15 | return x.transpose((2, 0, 1)) --------------------------------------------------------------------------------