├── 01-basics
│   └── linear_regression
├── A3C-DDPG.py
├── PER-and-ACQ
│   ├── AC-DQN.py
│   ├── net.py
│   ├── prioritized-DQN.py
│   ├── prioritized_memory.py
│   └── util.py
├── README.md
├── convolutional_neural_network
│   └── main.py
├── ddpg.py
├── double-DQN.py
├── dueling-DQN.py
├── feedforward_neural_network
│   └── main.py
├── ga
│   ├── bag.py
│   └── peak.py
├── gym_sample
│   └── demo.py
├── native-Qlearning.py
├── nature-DQN.py
├── neat
│   ├── Digraph.gv
│   ├── Digraph.gv.svg
│   ├── avg_fitness.svg
│   ├── cartpole.py
│   ├── config
│   ├── speciation.svg
│   └── visualize.py
├── net.py
├── nips-DQN.py
├── recurrent_neural_network
│   └── main.py
└── util.py
/01-basics/linear_regression:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 |
6 |
7 | # Hyper-parameters
8 | input_size = 1
9 | output_size = 1
10 | num_epochs = 60
11 | learning_rate = 0.001
12 |
13 | # Toy dataset
14 | x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
15 | [9.779], [6.182], [7.59], [2.167], [7.042],
16 | [10.791], [5.313], [7.997], [3.1]], dtype=np.float32)
17 |
18 | y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
19 | [3.366], [2.596], [2.53], [1.221], [2.827],
20 | [3.465], [1.65], [2.904], [1.3]], dtype=np.float32)
21 |
22 | # Linear regression model
23 | model = nn.Linear(input_size, output_size)
24 |
25 | # Loss and optimizer
26 | criterion = nn.MSELoss()
27 | optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
28 |
29 | # Train the model
30 | for epoch in range(num_epochs):
31 | # Convert numpy arrays to torch tensors
32 | inputs = torch.from_numpy(x_train)
33 | targets = torch.from_numpy(y_train)
34 |
35 | # Forward pass
36 | outputs = model(inputs)
37 | loss = criterion(outputs, targets)
38 |
39 | # Backward and optimize
40 | optimizer.zero_grad()
41 | loss.backward()
42 | optimizer.step()
43 |
44 | if (epoch+1) % 5 == 0:
45 | print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))
46 |
47 | # Plot the graph
48 | predicted = model(torch.from_numpy(x_train)).detach().numpy()
49 | plt.plot(x_train, y_train, 'ro', label='Original data')
50 | plt.plot(x_train, predicted, label='Fitted line')
51 | plt.legend()
52 | plt.show()
53 |
54 | # Save the model checkpoint
55 | torch.save(model.state_dict(), 'model.ckpt')
56 |
--------------------------------------------------------------------------------
/A3C-DDPG.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import torch, time, gym, argparse, sys
3 | import numpy as np
4 | from scipy.signal import lfilter
5 | from scipy.misc import imresize  # NOTE: scipy.misc.imresize was removed in SciPy 1.3; this script needs an older SciPy release
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | import torch.multiprocessing as mp
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env', default='Breakout-v4', type=str, help='gym environment')
12 | parser.add_argument('--processes', default=1, type=int, help='number of processes')
13 | parser.add_argument('--lr', default=1e-4, type=float, help='learning rate')
14 | parser.add_argument('--gamma', default=0.99, type=float, help='rewards discount factor')
15 | parser.add_argument('--seed', default=1, type=int, help='random seed')
16 | args = parser.parse_args()
17 | discount = lambda x, gamma: lfilter([1], [1, -gamma], x[::-1])[::-1]
18 | prepro = lambda img: imresize(img[35:195].mean(2), (80, 80)).astype(np.float32).reshape(1, 80, 80) / 255.
19 |
20 |
21 | class NNPolicy(nn.Module):
22 | def __init__(self, num_actions):
23 | super(NNPolicy, self).__init__()
24 | self.conv1 = nn.Conv2d(1, 32, 3, stride=2, padding=1)
25 | self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
26 | self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
27 | self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
28 | self.gru = nn.GRUCell(32 * 5 * 5, 256)
29 | self.critic_net, self.actor_net = nn.Linear(256, 1), nn.Linear(256, num_actions)
30 |
31 | def forward(self, inputs, train=True, hard=False):
32 | inputs, hx = inputs
33 | x = F.elu(self.conv1(inputs))
34 | x = F.elu(self.conv2(x))
35 | x = F.elu(self.conv3(x))
36 | x = F.elu(self.conv4(x))
37 | hx = self.gru(x.view(-1, 32 * 5 * 5), (hx))
38 | return self.critic_net(hx), self.actor_net(hx), hx
39 |
40 |
41 | class SharedAdam(torch.optim.Adam):
42 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
43 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
44 | for group in self.param_groups:
45 | for p in group['params']:
46 | state = self.state[p]
47 | state['shared_steps'], state['step'] = torch.zeros(1).share_memory_(), 0
48 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_().share_memory_()
49 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_().share_memory_()
50 |
51 |
52 | def loss_func(args, values, logps, actions, rewards):
53 | np_values = values.view(-1).data.numpy()
54 |
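    # One-step TD errors: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    # Discounting this sequence below yields the generalized advantage estimate
    # that weights each action's log-probability in the policy loss.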
55 | delta_t = np.asarray(rewards) + args.gamma * np_values[1:] - np_values[:-1]
56 | logpys = logps.gather(1, torch.tensor(actions).view(-1, 1))
57 | gen_adv_est = discount(delta_t, args.gamma)
58 | policy_loss = -(logpys.view(-1) * torch.FloatTensor(gen_adv_est.copy())).sum()
59 |
60 | rewards[-1] += args.gamma * np_values[-1]
61 | discounted_r = discount(np.asarray(rewards), args.gamma)
62 | discounted_r = torch.tensor(discounted_r.copy(), dtype=torch.float32)
63 | value_loss = .5 * (discounted_r - values[:-1, 0]).pow(2).sum()
64 |
65 | entropy_loss = -(-logps * torch.exp(logps)).sum()
66 | return policy_loss + 0.5 * value_loss + 0.01 * entropy_loss
67 |
68 |
69 | def worker(shared_model, shared_optimizer, rank, args, info):
70 | env = gym.make(args.env)
71 | env.seed(args.seed + rank)
72 | torch.manual_seed(args.seed + rank)
73 | model = NNPolicy(num_actions=args.num_actions)
74 | state = torch.tensor(prepro(env.reset()))
75 |
76 | start_time = last_disp_time = time.time()
77 | episode_length, epr, eploss, done = 0, 0, 0, True
78 |
79 | while info['frames'][0] <= 4e7:
80 | model.load_state_dict(shared_model.state_dict())
81 |
82 | hx = torch.zeros(1, 256) if done else hx.detach()
83 | values, logps, actions, rewards = [], [], [], []
84 |
85 | for step in range(4):
86 | episode_length += 1
87 | value, logit, hx = model((state.view(1, 1, 80, 80), hx))
88 | logp = F.log_softmax(logit, dim=-1)
89 |
90 | action = torch.exp(logp).multinomial(num_samples=1).data[0]
91 | state, reward, done, _ = env.step(action.numpy()[0])
92 | # env.render()
93 |
94 | state = torch.tensor(prepro(state))
95 | epr += reward
96 | reward = np.clip(reward, -1, 1)
97 | done = done or episode_length >= 1e4
98 |
99 | info['frames'].add_(1)
100 | num_frames = int(info['frames'].item())
101 |
102 | if done:
103 | info['episodes'] += 1
104 | interp = 1 if info['episodes'][0] == 1 else 0.01
105 | info['run_epr'].mul_(1 - interp).add_(interp * epr)
106 | info['run_loss'].mul_(1 - interp).add_(interp * eploss)
107 |
108 | if rank == 0 and time.time() - last_disp_time > 60:
109 | elapsed = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
110 | print('time {}, episodes {:.0f}, frames {:.1f}M, mean epr {:.2f}, run loss {:.2f}'
111 | .format(elapsed, info['episodes'].item(), num_frames / 1e6,
112 | info['run_epr'].item(), info['run_loss'].item()))
113 | last_disp_time = time.time()
114 |
115 | if done:
116 | episode_length, epr, eploss = 0, 0, 0
117 | state = torch.tensor(prepro(env.reset()))
118 |
119 | values.append(value)
120 | logps.append(logp)
121 | actions.append(action)
122 | rewards.append(reward)
123 |
124 | next_value = torch.zeros(1, 1) if done else model((state.unsqueeze(0), hx))[0]
125 | values.append(next_value.detach())
126 |
127 | loss = loss_func(args, torch.cat(values), torch.cat(logps), torch.cat(actions), np.asarray(rewards))
128 | eploss += loss.item()
129 | shared_optimizer.zero_grad()
130 | loss.backward()
131 | torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
132 |
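    # Point the shared model's .grad buffers at the local model's gradients
    # (only needed the first time, since they then alias the same tensors),
    # so the shared optimizer steps with the freshly computed gradients.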
133 | for param, shared_param in zip(model.parameters(), shared_model.parameters()):
134 | if shared_param.grad is None:
135 | shared_param._grad = param.grad
136 | shared_optimizer.step()
137 |
138 |
139 | if __name__ == "__main__":
140 | if sys.version_info[0] > 2:
141 | mp.set_start_method('spawn')
142 | elif sys.platform == 'linux' or sys.platform == 'linux2':
143 |         raise RuntimeError("Must be using Python 3 with Linux! Or else you get a deadlock in conv2d")
144 |
145 | args.num_actions = gym.make(args.env).action_space.n
146 |
147 | torch.manual_seed(args.seed)
148 | shared_model = NNPolicy(num_actions=args.num_actions).share_memory()
149 | shared_optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)
150 |
151 | info = {k: torch.DoubleTensor([0]).share_memory_() for k in ['run_epr', 'run_loss', 'episodes', 'frames']}
152 |
153 | processes = []
154 | for rank in range(args.processes):
155 | p = mp.Process(target=worker, args=(shared_model, shared_optimizer, rank, args, info))
156 | p.start()
157 | processes.append(p)
158 | for p in processes: p.join()
159 |
--------------------------------------------------------------------------------
/PER-and-ACQ/AC-DQN.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import torch, time, gym, argparse, sys
3 | import numpy as np
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | import torch.multiprocessing as mp
7 |
8 | from collections import deque
9 | import random
10 | from net import AtariNet
11 | from util import preprocess
12 |
13 | LR = 0.001
14 | EXPLORE = 1000000
15 | GAMMA = 0.99
16 | N_STEP = 4
17 | ENV = 'Pong-v0'
18 | ACTIONS_SIZE = gym.make(ENV).action_space.n
19 | PROCESSES = 1
20 | SEED = 1
21 |
22 |
23 | class Agent(object):
24 | def __init__(self, action_size):
25 | self.action_size = action_size
26 | self.EPSILON = 1.0
27 | self.network = AtariNet(action_size)
28 | self.memory = deque()
29 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR)
30 | self.loss_func = nn.MSELoss()
31 |
32 | def action(self, state, israndom):
33 | if israndom and random.random() < self.EPSILON:
34 | return np.random.randint(0, self.action_size)
35 | state = torch.unsqueeze(torch.FloatTensor(state), 0)
36 | actions_value = self.network.forward(state)
37 | return torch.max(actions_value, 1)[1].data.numpy()[0]
38 |
39 | def add(self, state, action, reward, next_state, done):
40 | if done:
41 | self.memory.append((state, action, reward, next_state, 0))
42 | else:
43 | self.memory.append((state, action, reward, next_state, 1))
44 |
45 | def learn(self, shared_optimizer, shared_model):
46 | batch_size = len(self.memory)
47 | batch = random.sample(self.memory, batch_size)
48 | state = torch.FloatTensor([x[0] for x in batch])
49 | action = torch.LongTensor([[x[1]] for x in batch])
50 | reward = torch.FloatTensor([[x[2]] for x in batch])
51 | next_state = torch.FloatTensor([x[3] for x in batch])
52 | done = torch.FloatTensor([[x[4]] for x in batch])
53 |
54 | eval_q = self.network.forward(state).gather(1, action)
55 | next_q = self.network(next_state).detach()
56 | target_q = reward + GAMMA * next_q.max(1)[0].view(batch_size, 1) * done
57 | loss = self.loss_func(eval_q, target_q)
58 |
59 | shared_optimizer.zero_grad()
60 | loss.backward()
61 | for param, shared_param in zip(self.network.parameters(), shared_model.parameters()):
62 | if shared_param.grad is None:
63 | shared_param._grad = param.grad
64 | shared_optimizer.step()
65 |
66 | self.memory = deque()
67 | if self.EPSILON > 0.1:
68 | self.EPSILON -= (1.0 - 0.1) / EXPLORE
69 |
70 |
71 | class SharedAdam(torch.optim.Adam):
72 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
73 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
74 | for group in self.param_groups:
75 | for p in group['params']:
76 | state = self.state[p]
77 | state['shared_steps'], state['step'] = torch.zeros(1).share_memory_(), 0
78 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_().share_memory_()
79 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_().share_memory_()
80 |
81 |
82 | def worker(shared_model, shared_optimizer, rank, info):
83 | env = gym.make(ENV)
84 | env.seed(SEED + rank)
85 | torch.manual_seed(SEED + rank)
86 | agent = Agent(ACTIONS_SIZE)
87 |
88 | start_time = last_disp_time = time.time()
89 | episode_length, epr = 0, 0
90 |
91 | state = env.reset()
92 | state = preprocess(state)
93 | while info['frames'][0] <= 4e7:
94 | agent.network.load_state_dict(shared_model.state_dict())
95 |
96 | for _ in range(N_STEP):
97 | # env.render()
98 | episode_length += 1
99 |
100 | action = agent.action(state, True)
101 | next_state, reward, done, ext = env.step(action)
102 | epr += reward
103 | done = done or episode_length >= 1e4
104 | info['frames'].add_(1)
105 | num_frames = int(info['frames'].item())
106 |
107 | next_state = preprocess(next_state)
108 | agent.add(state, action, reward, next_state, done)
109 |
110 | state = next_state
111 |
112 | if done:
113 | info['episodes'] += 1
114 | interp = 1 if info['episodes'][0] == 1 else 0.01
115 | info['run_epr'].mul_(1 - interp).add_(interp * epr)
116 |
117 | if rank == 0 and time.time() - last_disp_time > 60:
118 | elapsed = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
119 | print('time {}, episodes {:.0f}, frames {:.1f}M, mean epr {:.2f}'
120 | .format(elapsed, info['episodes'].item(), num_frames / 1e6,
121 | info['run_epr'].item()))
122 | last_disp_time = time.time()
123 |
124 | if done:
125 | episode_length, epr, eploss = 0, 0, 0
126 | state = env.reset()
127 | state = preprocess(state)
128 | break
129 |
130 | agent.learn(shared_optimizer, shared_model)
131 |
132 |
133 |
134 | if __name__ == "__main__":
135 | if sys.version_info[0] > 2:
136 | mp.set_start_method('spawn')
137 | elif sys.platform == 'linux' or sys.platform == 'linux2':
138 |         raise RuntimeError("Must be using Python 3 with Linux! Or else you get a deadlock in conv2d")
139 |
140 | torch.manual_seed(SEED)
141 | shared_model = AtariNet(ACTIONS_SIZE).share_memory()
142 | shared_optimizer = SharedAdam(shared_model.parameters(), lr=LR)
143 |
144 | info = {k: torch.DoubleTensor([0]).share_memory_() for k in ['run_epr', 'episodes', 'frames']}
145 |
146 | processes = []
147 | for rank in range(PROCESSES):
148 | p = mp.Process(target=worker, args=(shared_model, shared_optimizer, rank, info))
149 | p.start()
150 | processes.append(p)
151 | for p in processes: p.join()
152 |
--------------------------------------------------------------------------------
/PER-and-ACQ/net.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 |
5 | class AtariNet(nn.Module):
6 |
7 | def __init__(self, num_actions):
8 | super(AtariNet, self).__init__()
9 | self.conv1 = nn.Sequential(
10 | nn.Conv2d(1, 32, kernel_size=8, stride=4),
11 | nn.ReLU()
12 | )
13 | self.conv2 = nn.Sequential(
14 | nn.Conv2d(32, 64, kernel_size=4, stride=2),
15 | nn.ReLU()
16 | )
17 | self.conv3 = nn.Sequential(
18 | nn.Conv2d(64, 64, kernel_size=3, stride=1),
19 | nn.ReLU()
20 | )
21 | self.hidden = nn.Sequential(
22 | nn.Linear(64 * 7 * 7, 512, bias=True),
23 | nn.ReLU()
24 | )
25 | self.out = nn.Sequential(
26 | nn.Linear(512, num_actions, bias=True)
27 | )
28 | self.apply(self.init_weights)
29 |
30 | def init_weights(self, m):
31 | if type(m) == nn.Conv2d:
32 | m.weight.data.normal_(0.0, 0.02)
33 | if type(m) == nn.Linear:
34 | torch.nn.init.xavier_uniform_(m.weight)
35 | m.bias.data.fill_(0.01)
36 |
37 | def forward(self, x):
38 | x = self.conv1(x)
39 | x = self.conv2(x)
40 | x = self.conv3(x)
41 | x = x.view(x.size(0), -1)
42 | x = self.hidden(x)
43 | x = self.out(x)
44 | return x
45 |
46 |
47 | class CnnDQN(nn.Module):
48 | def __init__(self, inputs_shape, num_actions):
49 | super(CnnDQN, self).__init__()
50 |
51 |         self.input_shape = inputs_shape
52 | self.num_actions = num_actions
53 |
54 | self.features = nn.Sequential(
55 | nn.Conv2d(inputs_shape[0], 32, kernel_size=8, stride=4),
56 | nn.ReLU(),
57 | nn.Conv2d(32, 64, kernel_size=4, stride=2),
58 | nn.ReLU(),
59 | nn.Conv2d(64, 64, kernel_size=3, stride=1),
60 | nn.ReLU()
61 | )
62 |
63 | self.fc = nn.Sequential(
64 | nn.Linear(self.features_size(), 512),
65 | nn.ReLU(),
66 | nn.Linear(512, self.num_actions)
67 | )
68 |
69 | def forward(self, x):
70 | x = self.features(x)
71 | x = x.view(x.size(0), -1)
72 | x = self.fc(x)
73 | return x
74 |
75 | def features_size(self):
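        # Infer the flattened size of the conv stack by passing a dummy
        # all-zero input of the expected shape through it once.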
76 |         return self.features(torch.zeros(1, *self.input_shape)).view(1, -1).size(1)
77 |
--------------------------------------------------------------------------------
/PER-and-ACQ/prioritized-DQN.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from prioritized_memory import Memory
4 | import numpy as np
5 | import gym
6 | import random
7 | from net import AtariNet
8 | from util import preprocess
9 |
10 | BATCH_SIZE = 32
11 | LR = 0.001
12 | START_EPSILON = 1.0
13 | FINAL_EPSILON = 0.1
14 | EPSILON = START_EPSILON
15 | EXPLORE = 1000000
16 | GAMMA = 0.99
17 | TOTAL_EPISODES = 10000000
18 | MEMORY_SIZE = 1000000
19 | MEMORY_THRESHOLD = 100000
20 | UPDATE_TIME = 10000
21 | TEST_FREQUENCY = 1000
22 | env = gym.make('Pong-v0')
23 | env = env.unwrapped
24 | ACTIONS_SIZE = env.action_space.n
25 |
26 |
27 | class Agent(object):
28 | def __init__(self):
29 | self.network, self.target_network = AtariNet(ACTIONS_SIZE), AtariNet(ACTIONS_SIZE)
30 | self.memory = Memory(MEMORY_SIZE)
31 | self.learning_count = 0
32 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR)
33 | self.loss_func = nn.MSELoss()
34 |
35 | def action(self, state, israndom):
36 | if israndom and random.random() < EPSILON:
37 | return np.random.randint(0, ACTIONS_SIZE)
38 | state = torch.unsqueeze(torch.FloatTensor(state), 0)
39 | actions_value = self.network.forward(state)
40 | return torch.max(actions_value, 1)[1].data.numpy()[0]
41 |
42 | def learn(self, state, action, reward, next_state, done):
43 | old_val = self.network.forward(torch.FloatTensor([state])).gather(1, torch.LongTensor([[action]]))[0]
44 | target_val = self.network.forward(torch.FloatTensor([state]))
45 | if done:
46 | done = 0
47 | target = reward
48 | else:
49 | done = 1
50 | target = reward + GAMMA * torch.max(target_val)
51 | error = abs(old_val[0] - target)
52 | self.memory.add(error.data, (state, action, reward, next_state, done))
53 | if self.memory.tree.n_entries < MEMORY_THRESHOLD:
54 | return
55 |
56 | if self.learning_count % UPDATE_TIME == 0:
57 | self.target_network.load_state_dict(self.network.state_dict())
58 | self.learning_count += 1
59 |
60 | batch, idxs, is_weights = self.memory.sample(BATCH_SIZE)
61 | state = torch.FloatTensor([x[0] for x in batch])
62 | action = torch.LongTensor([[x[1]] for x in batch])
63 | reward = torch.FloatTensor([[x[2]] for x in batch])
64 | next_state = torch.FloatTensor([x[3] for x in batch])
65 | done = torch.FloatTensor([[x[4]] for x in batch])
66 |
67 | eval_q = self.network.forward(state).gather(1, action)
68 | next_q = self.target_network(next_state).detach()
69 | target_q = reward + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * done
70 | errors = torch.abs(eval_q - target_q).data.numpy().flatten()
71 | loss = self.loss_func(eval_q, target_q)
72 |
73 | for i in range(BATCH_SIZE):
74 | idx = idxs[i]
75 | self.memory.update(idx, errors[i])
76 |
77 | self.optimizer.zero_grad()
78 | loss.backward()
79 | self.optimizer.step()
80 |
81 |
82 | agent = Agent()
83 |
84 | for i_episode in range(TOTAL_EPISODES):
85 | state = env.reset()
86 | state = preprocess(state)
87 | while True:
88 | # env.render()
89 | action = agent.action(state, True)
90 | next_state, reward, done, info = env.step(action)
91 | next_state = preprocess(next_state)
92 | agent.learn(state, action, reward, next_state, done)
93 |
94 | state = next_state
95 | if done:
96 | break
97 | if EPSILON > FINAL_EPSILON:
98 | EPSILON -= (START_EPSILON - FINAL_EPSILON) / EXPLORE
99 |
100 | # TEST
101 | if i_episode % TEST_FREQUENCY == 0:
102 | state = env.reset()
103 | state = preprocess(state)
104 | total_reward = 0
105 | while True:
106 | # env.render()
107 | action = agent.action(state, israndom=False)
108 | next_state, reward, done, info = env.step(action)
109 | next_state = preprocess(next_state)
110 |
111 | total_reward += reward
112 |
113 | state = next_state
114 | if done:
115 | break
116 | print('episode: {} , total_reward: {}'.format(i_episode, round(total_reward, 3)))
117 |
118 | env.close()
119 |
--------------------------------------------------------------------------------
/PER-and-ACQ/prioritized_memory.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import numpy as np
4 |
5 | class SumTree:
6 | write = 0
7 |
8 | def __init__(self, capacity):
9 | self.capacity = capacity
10 | self.tree = np.zeros(2 * capacity - 1)
11 | self.data = np.zeros(capacity, dtype=object)
12 | self.n_entries = 0
13 |
14 | def _propagate(self, idx, change):
15 | parent = (idx - 1) // 2
16 | self.tree[parent] += change
17 | if parent != 0:
18 | self._propagate(parent, change)
19 |
20 | def _retrieve(self, idx, s):
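        # Walk down the tree: go left if the sampled value s falls within the
        # left subtree's priority mass, otherwise go right with s reduced by
        # that mass; the leaf reached corresponds to a stored transition.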
21 | left = 2 * idx + 1
22 | right = left + 1
23 | if left >= len(self.tree):
24 | return idx
25 |
26 | if s <= self.tree[left]:
27 | return self._retrieve(left, s)
28 | else:
29 | return self._retrieve(right, s - self.tree[left])
30 |
31 | def total(self):
32 | return self.tree[0]
33 |
34 | def add(self, p, data):
35 | idx = self.write + self.capacity - 1
36 | self.data[self.write] = data
37 | self.update(idx, p)
38 | self.write += 1
39 | if self.write >= self.capacity:
40 | self.write = 0
41 |
42 | if self.n_entries < self.capacity:
43 | self.n_entries += 1
44 |
45 | def update(self, idx, p):
46 | change = p - self.tree[idx]
47 | self.tree[idx] = p
48 | self._propagate(idx, change)
49 |
50 | def get(self, s):
51 | idx = self._retrieve(0, s)
52 | dataIdx = idx - self.capacity + 1
53 | return (idx, self.tree[idx], self.data[dataIdx])
54 |
55 |
56 | class Memory:
57 | e = 0.01
58 | a = 0.6
59 | beta = 0.4
60 | beta_increment_per_sampling = 0.001
61 |
62 | def __init__(self, capacity):
63 | self.tree = SumTree(capacity)
64 | self.capacity = capacity
65 |
66 | def _get_priority(self, error):
67 | return (error + self.e) ** self.a
68 |
69 | def add(self, error, sample):
70 | p = self._get_priority(error)
71 | self.tree.add(p, sample)
72 |
73 | def sample(self, n):
74 | batch = []
75 | idxs = []
76 | segment = self.tree.total() / n
77 | priorities = []
78 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
79 | for i in range(n):
80 | a = segment * i
81 | b = segment * (i + 1)
82 | s = random.uniform(a, b)
83 | (idx, p, data) = self.tree.get(s)
84 | priorities.append(p)
85 | batch.append(data)
86 | idxs.append(idx)
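        # Importance-sampling weights w_i = (N * P(i))^(-beta), normalized by
        # the largest weight; beta is annealed towards 1 over the course of sampling.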
87 | sampling_probabilities = priorities / self.tree.total()
88 | is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
89 | is_weight /= is_weight.max()
90 | return batch, idxs, is_weight
91 |
92 | def update(self, idx, error):
93 | p = self._get_priority(error)
94 | self.tree.update(idx, p)
--------------------------------------------------------------------------------
/PER-and-ACQ/util.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 |
4 |
5 | def preprocess(observation):
6 | """
7 | image preprocess
8 | :param observation:
9 | :return:
10 | """
11 | observation = cv2.cvtColor(cv2.resize(observation, (84, 110)), cv2.COLOR_BGR2GRAY)
12 | observation = observation[26:110,:]
13 | ret, observation = cv2.threshold(observation,1,255,cv2.THRESH_BINARY)
14 | x = np.reshape(observation,(84,84,1))
15 | return x.transpose((2, 0, 1))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Code for the book 《白话强化学习与Pytorch》 (Plain-Language Reinforcement Learning with PyTorch)
2 |
3 | ## 5.2 Q-Learning:
4 |
5 | native-Qlearning.py
6 |
7 | ## 6 Deep Learning:
8 |
9 | feedforward_neural_network/
10 |
11 | convolutional_neural_network/
12 |
13 | recurrent_neural_network/
14 |
15 | ## 8.1 NIPS DQN:
16 |
17 | nips-DQN.py
18 |
19 | ## 8.2 Nature DQN:
20 |
21 | nature-DQN.py
22 |
23 | ## 8.3 Double DQN:
24 |
25 | double-DQN.py
26 |
27 | ## 8.4 Dueling DQN:
28 |
29 | dueling-DQN.py
30 |
31 | ## 9.4 DDPG:
32 |
33 | ddpg.py
34 |
35 | ## 10.1.2 A3C DDPG:
36 |
37 | A3C-DDPG.py
38 |
39 | ## Contact the Authors
40 |
41 | ### Email: zhenbinye@gmail.com, 77232517@qq.com
42 |
43 | ### The code is continuously updated; if you have suggestions for improvement or run into problems, please contact the authors
44 |
45 | ### Due to version changes, the code may differ slightly from what is printed in the book; thank you for your understanding
46 |
--------------------------------------------------------------------------------
/convolutional_neural_network/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torchvision
4 | import torchvision.transforms as transforms
5 |
6 |
7 | # Device configuration
8 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
9 |
10 | # Hyper parameters
11 | num_epochs = 5
12 | num_classes = 10
13 | batch_size = 100
14 | learning_rate = 0.001
15 |
16 | # MNIST dataset
17 | train_dataset = torchvision.datasets.MNIST(root='../../data/',
18 | train=True,
19 | transform=transforms.ToTensor(),
20 | download=True)
21 |
22 | test_dataset = torchvision.datasets.MNIST(root='../../data/',
23 | train=False,
24 | transform=transforms.ToTensor())
25 |
26 | # Data loader
27 | train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
28 | batch_size=batch_size,
29 | shuffle=True)
30 |
31 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
32 | batch_size=batch_size,
33 | shuffle=False)
34 |
35 | # Convolutional neural network (two convolutional layers)
36 | class ConvNet(nn.Module):
37 | def __init__(self, num_classes=10):
38 | super(ConvNet, self).__init__()
39 | self.layer1 = nn.Sequential(
40 | nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
41 | nn.BatchNorm2d(16),
42 | nn.ReLU(),
43 | nn.MaxPool2d(kernel_size=2, stride=2))
44 | self.layer2 = nn.Sequential(
45 | nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
46 | nn.BatchNorm2d(32),
47 | nn.ReLU(),
48 | nn.MaxPool2d(kernel_size=2, stride=2))
49 | self.fc = nn.Linear(7*7*32, num_classes)
50 |
51 | def forward(self, x):
52 | out = self.layer1(x)
53 | out = self.layer2(out)
54 | out = out.reshape(out.size(0), -1)
55 | out = self.fc(out)
56 | return out
57 |
58 | model = ConvNet(num_classes).to(device)
59 |
60 | # Loss and optimizer
61 | criterion = nn.CrossEntropyLoss()
62 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
63 |
64 | # Train the model
65 | total_step = len(train_loader)
66 | for epoch in range(num_epochs):
67 | for i, (images, labels) in enumerate(train_loader):
68 | images = images.to(device)
69 | labels = labels.to(device)
70 |
71 | # Forward pass
72 | outputs = model(images)
73 | loss = criterion(outputs, labels)
74 |
75 | # Backward and optimize
76 | optimizer.zero_grad()
77 | loss.backward()
78 | optimizer.step()
79 |
80 | if (i+1) % 100 == 0:
81 | print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
82 | .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
83 |
84 | # Test the model
85 | model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
86 | with torch.no_grad():
87 | correct = 0
88 | total = 0
89 | for images, labels in test_loader:
90 | images = images.to(device)
91 | labels = labels.to(device)
92 | outputs = model(images)
93 | _, predicted = torch.max(outputs.data, 1)
94 | total += labels.size(0)
95 | correct += (predicted == labels).sum().item()
96 |
97 | print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
98 |
99 | # Save the model checkpoint
100 | torch.save(model.state_dict(), 'model.ckpt')
101 |
--------------------------------------------------------------------------------
/ddpg.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch, gym, argparse
3 | import torch.nn as nn
4 | from torch.autograd import Variable
5 | import torch.nn.functional as F
6 |
7 |
8 | class ReplayBuffer(object):
9 | def __init__(self, max_size=1e6):
10 | self.storage = []
11 | self.max_size = max_size
12 | self.ptr = 0
13 |
14 | def add(self, data):
15 | if len(self.storage) == self.max_size:
16 | self.storage[int(self.ptr)] = data
17 | self.ptr = (self.ptr + 1) % self.max_size
18 | else:
19 | self.storage.append(data)
20 |
21 | def sample(self, batch_size):
22 | ind = np.random.randint(0, len(self.storage), size=batch_size)
23 | x, y, u, r, d = [], [], [], [], []
24 | for i in ind:
25 | X, Y, U, R, D = self.storage[i]
26 | x.append(np.array(X, copy=False))
27 | y.append(np.array(Y, copy=False))
28 | u.append(np.array(U, copy=False))
29 | r.append(np.array(R, copy=False))
30 | d.append(np.array(D, copy=False))
31 | return np.array(x), np.array(y), np.array(u), np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1)
32 |
33 |
34 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
35 |
36 |
37 | class Actor(nn.Module):
38 | def __init__(self, state_dim, action_dim, max_action):
39 | super(Actor, self).__init__()
40 | self.l1 = nn.Linear(state_dim, 400)
41 | self.l2 = nn.Linear(400, 300)
42 | self.l3 = nn.Linear(300, action_dim)
43 | self.max_action = max_action
44 |
45 | def forward(self, x):
46 | x = F.relu(self.l1(x))
47 | x = F.relu(self.l2(x))
48 | x = self.max_action * torch.tanh(self.l3(x))
49 | return x
50 |
51 |
52 | class Critic(nn.Module):
53 | def __init__(self, state_dim, action_dim):
54 | super(Critic, self).__init__()
55 | self.l1 = nn.Linear(state_dim, 400)
56 | self.l2 = nn.Linear(400 + action_dim, 300)
57 | self.l3 = nn.Linear(300, 1)
58 |
59 | def forward(self, x, u):
60 | x = F.relu(self.l1(x))
61 | x = F.relu(self.l2(torch.cat([x, u], 1)))
62 | x = self.l3(x)
63 | return x
64 |
65 |
66 | class DDPG(object):
67 | def __init__(self, state_dim, action_dim, max_action):
68 | self.actor = Actor(state_dim, action_dim, max_action).to(device)
69 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
70 | self.actor_target.load_state_dict(self.actor.state_dict())
71 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
72 | self.critic = Critic(state_dim, action_dim).to(device)
73 | self.critic_target = Critic(state_dim, action_dim).to(device)
74 | self.critic_target.load_state_dict(self.critic.state_dict())
75 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), weight_decay=1e-2)
76 |
77 | def select_action(self, state):
78 | state = torch.FloatTensor(state.reshape(1, -1)).to(device)
79 | return self.actor(state).cpu().data.numpy().flatten()
80 |
81 | def train(self, replay_buffer, iterations, batch_size=64, discount=0.99, tau=0.001):
82 |
83 | for _ in range(iterations):
84 | x, y, u, r, d = replay_buffer.sample(batch_size)
85 | state = torch.FloatTensor(x).to(device)
86 | action = torch.FloatTensor(u).to(device)
87 | next_state = torch.FloatTensor(y).to(device)
88 | done = torch.FloatTensor(1 - d).to(device)
89 | reward = torch.FloatTensor(r).to(device)
90 |
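            # Bellman target for the critic: r + gamma * Q'(s', pi'(s')),
            # masked by the episode-termination flag (`done` already holds 1 - d).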
91 | target_Q = self.critic_target(next_state, self.actor_target(next_state))
92 | target_Q = reward + (done * discount * target_Q).detach()
93 | current_Q = self.critic(state, action)
94 |
95 | critic_loss = F.mse_loss(current_Q, target_Q)
96 | self.critic_optimizer.zero_grad()
97 | critic_loss.backward()
98 | self.critic_optimizer.step()
99 |
100 | actor_loss = -self.critic(state, self.actor(state)).mean()
101 | self.actor_optimizer.zero_grad()
102 | actor_loss.backward()
103 | self.actor_optimizer.step()
104 |
105 |             # Soft-update the target networks: theta_target <- tau * theta + (1 - tau) * theta_target
106 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
107 | target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
108 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
109 | target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
110 |
111 |
112 | if __name__ == "__main__":
113 | parser = argparse.ArgumentParser()
114 | parser.add_argument("--env_name", default="Pendulum-v0")
115 | parser.add_argument("--seed", default=0, type=int, help='Sets Gym, PyTorch and Numpy seeds')
116 | parser.add_argument("--start_timesteps", default=1e4, type=int, help='how many step random policy run')
117 | parser.add_argument("--max_timesteps", default=1e6, type=float, help='max_timesteps')
118 | parser.add_argument("--expl_noise", default=0.1, type=float, help='Gaussian exploration')
119 | parser.add_argument("--batch_size", default=100, type=int, help='Batch size')
120 | parser.add_argument("--GAMMA", default=0.99, type=float, help='Discount')
121 | parser.add_argument("--tau", default=0.005, type=float, help='DDPG update rate')
122 | parser.add_argument("--policy_noise", default=0.2, type=float, help='Noise to target policy during critic update')
123 | parser.add_argument("--noise_clip", default=0.5, type=float, help='Range to clip target policy noise')
124 | parser.add_argument("--policy_freq", default=2, type=int, help=' Frequency of delayed policy updates')
125 | args = parser.parse_args()
126 |
127 | env = gym.make(args.env_name)
128 | env.seed(args.seed)
129 | torch.manual_seed(args.seed)
130 | np.random.seed(args.seed)
131 | state_dim = env.observation_space.shape[0]
132 | action_dim = env.action_space.shape[0]
133 | max_action = float(env.action_space.high[0])
134 | policy = DDPG(state_dim, action_dim, max_action)
135 | replay_buffer = ReplayBuffer()
136 | total_timesteps = 0
137 | timesteps_since_eval = 0
138 | episode_num = 0
139 | episode_reward = 0
140 | episode_timesteps = 0
141 | done = True
142 |
143 | while total_timesteps < args.max_timesteps:
144 | if done:
145 | if total_timesteps != 0:
146 | print(("Total T: %d Episode Num: %d Episode T: %d Reward: %f") % (total_timesteps, episode_num, episode_timesteps, episode_reward))
147 | policy.train(replay_buffer, episode_timesteps, args.batch_size, args.GAMMA, args.tau)
148 |
149 | obs = env.reset()
150 | done = False
151 | episode_reward = 0
152 | episode_timesteps = 0
153 | episode_num += 1
154 |
155 | if total_timesteps < args.start_timesteps:
156 | action = env.action_space.sample()
157 | else:
158 | action = policy.select_action(np.array(obs))
159 | if args.expl_noise != 0:
160 | action = (action + np.random.normal(0, args.expl_noise, size=env.action_space.shape[0])).clip(
161 | env.action_space.low, env.action_space.high)
162 |
163 | new_obs, reward, done, _ = env.step(action)
164 | done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
165 | episode_reward += reward
166 |
167 | replay_buffer.add((obs, new_obs, action, reward, done_bool))
168 | obs = new_obs
169 | episode_timesteps += 1
170 | total_timesteps += 1
171 | timesteps_since_eval += 1
--------------------------------------------------------------------------------
/double-DQN.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from collections import deque
4 | import numpy as np
5 | import gym
6 | import random
7 | from net import AtariNet
8 | from util import preprocess
9 |
10 | BATCH_SIZE = 32
11 | LR = 0.001
12 | START_EPSILON = 1.0
13 | FINAL_EPSILON = 0.1
14 | EPSILON = START_EPSILON
15 | EXPLORE = 1000000
16 | GAMMA = 0.99
17 | TOTAL_EPISODES = 10000000
18 | MEMORY_SIZE = 1000000
19 | MEMORY_THRESHOLD = 100000
20 | UPDATE_TIME = 10000
21 | TEST_FREQUENCY = 1000
22 | env = gym.make('Pong-v0')
23 | env = env.unwrapped
24 | ACTIONS_SIZE = env.action_space.n
25 |
26 |
27 | class Agent(object):
28 | def __init__(self):
29 | self.network, self.target_network = AtariNet(ACTIONS_SIZE), AtariNet(ACTIONS_SIZE)
30 | self.memory = deque()
31 | self.learning_count = 0
32 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR)
33 | self.loss_func = nn.MSELoss()
34 |
35 | def action(self, state, israndom):
36 | if israndom and random.random() < EPSILON:
37 | return np.random.randint(0, ACTIONS_SIZE)
38 | state = torch.unsqueeze(torch.FloatTensor(state), 0)
39 | actions_value = self.network.forward(state)
40 | return torch.max(actions_value, 1)[1].data.numpy()[0]
41 |
42 | def learn(self, state, action, reward, next_state, done):
43 | if done:
44 | self.memory.append((state, action, reward, next_state, 0))
45 | else:
46 | self.memory.append((state, action, reward, next_state, 1))
47 | if len(self.memory) > MEMORY_SIZE:
48 | self.memory.popleft()
49 | if len(self.memory) < MEMORY_THRESHOLD:
50 | return
51 |
52 | if self.learning_count % UPDATE_TIME == 0:
53 | self.target_network.load_state_dict(self.network.state_dict())
54 | self.learning_count += 1
55 |
56 | batch = random.sample(self.memory, BATCH_SIZE)
57 | state = torch.FloatTensor([x[0] for x in batch])
58 | action = torch.LongTensor([[x[1]] for x in batch])
59 | reward = torch.FloatTensor([[x[2]] for x in batch])
60 | next_state = torch.FloatTensor([x[3] for x in batch])
61 | done = torch.FloatTensor([[x[4]] for x in batch])
62 |
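        # Double DQN: the online network selects the greedy next action and the
        # target network evaluates it, reducing the overestimation bias of a
        # plain max over the target network's Q-values.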
63 | actions_value = self.network.forward(next_state)
64 | next_action = torch.unsqueeze(torch.max(actions_value, 1)[1], 1)
65 | eval_q = self.network.forward(state).gather(1, action)
66 | next_q = self.target_network.forward(next_state).gather(1, next_action)
67 | target_q = reward + GAMMA * next_q * done
68 | loss = self.loss_func(eval_q, target_q)
69 |
70 | self.optimizer.zero_grad()
71 | loss.backward()
72 | self.optimizer.step()
73 |
74 |
75 | agent = Agent()
76 |
77 | for i_episode in range(TOTAL_EPISODES):
78 | state = env.reset()
79 | state = preprocess(state)
80 | while True:
81 | # env.render()
82 | action = agent.action(state, True)
83 | next_state, reward, done, info = env.step(action)
84 | next_state = preprocess(next_state)
85 | agent.learn(state, action, reward, next_state, done)
86 |
87 | state = next_state
88 | if done:
89 | break
90 | if EPSILON > FINAL_EPSILON:
91 | EPSILON -= (START_EPSILON - FINAL_EPSILON) / EXPLORE
92 |
93 | # TEST
94 | if i_episode % TEST_FREQUENCY == 0:
95 | state = env.reset()
96 | state = preprocess(state)
97 | total_reward = 0
98 | while True:
99 | # env.render()
100 | action = agent.action(state, israndom=False)
101 | next_state, reward, done, info = env.step(action)
102 | next_state = preprocess(next_state)
103 |
104 | total_reward += reward
105 |
106 | state = next_state
107 | if done:
108 | break
109 | print('episode: {} , total_reward: {}'.format(i_episode, round(total_reward, 3)))
110 |
111 | env.close()
112 |
--------------------------------------------------------------------------------
/dueling-DQN.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from collections import deque
4 | import numpy as np
5 | import gym
6 | import random
7 | import cv2
8 |
9 | BATCH_SIZE = 32
10 | LR = 0.001
11 | START_EPSILON = 1.0
12 | FINAL_EPSILON = 0.1
13 | EPSILON = START_EPSILON
14 | EXPLORE = 1000000
15 | GAMMA = 0.99
16 | TOTAL_EPISODES = 10000000
17 | MEMORY_SIZE = 1000000
18 | MEMORY_THRESHOLD = 100000
19 | TEST_FREQUENCY = 1000
20 | env = gym.make('Pong-v0')
21 | env = env.unwrapped
22 | ACTIONS_SIZE = env.action_space.n
23 |
24 |
25 | def preprocess(observation):
26 | observation = cv2.cvtColor(cv2.resize(observation, (84, 110)), cv2.COLOR_BGR2GRAY)
27 | observation = observation[26:110,:]
28 | ret, observation = cv2.threshold(observation,1,255,cv2.THRESH_BINARY)
29 | x = np.reshape(observation,(84,84,1))
30 | return x.transpose((2, 0, 1))
31 |
32 |
33 | class DuelingNet(nn.Module):
34 |
35 | def __init__(self, num_actions):
36 | super(DuelingNet, self).__init__()
37 | self.num_actions = num_actions
38 | self.conv1 = nn.Sequential(
39 | nn.Conv2d(1, 32, kernel_size=8, stride=4),
40 | nn.ReLU()
41 | )
42 | self.conv2 = nn.Sequential(
43 | nn.Conv2d(32, 64, kernel_size=4, stride=2),
44 | nn.ReLU()
45 | )
46 | self.conv3 = nn.Sequential(
47 | nn.Conv2d(64, 64, kernel_size=3, stride=1),
48 | nn.ReLU()
49 | )
50 | self.hidden_adv = nn.Sequential(
51 | nn.Linear(64 * 7 * 7, 512, bias=True),
52 | nn.ReLU()
53 | )
54 | self.hidden_val = nn.Sequential(
55 | nn.Linear(64 * 7 * 7, 512, bias=True),
56 | nn.ReLU()
57 | )
58 | self.adv = nn.Sequential(
59 | nn.Linear(512, num_actions, bias=True)
60 | )
61 | self.val = nn.Sequential(
62 | nn.Linear(512, 1, bias=True)
63 | )
64 | self.apply(self.init_weights)
65 |
66 | def init_weights(self, m):
67 | if type(m) == nn.Conv2d:
68 | m.weight.data.normal_(0.0, 0.02)
69 | if type(m) == nn.Linear:
70 | torch.nn.init.xavier_uniform_(m.weight)
71 | m.bias.data.fill_(0.01)
72 |
73 | def forward(self, x):
74 | x = self.conv1(x)
75 | x = self.conv2(x)
76 | x = self.conv3(x)
77 | x = x.view(x.size(0), -1)
78 | adv = self.hidden_adv(x)
79 | val = self.hidden_val(x)
80 |
81 | adv = self.adv(adv)
82 | val = self.val(val).expand(x.size(0), self.num_actions)
83 |
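        # Dueling aggregation: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a);
        # subtracting the mean advantage keeps V and A identifiable.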
84 | x = val + adv - adv.mean(1).unsqueeze(1).expand(x.size(0), self.num_actions)
85 | return x
86 |
87 | class Agent(object):
88 | def __init__(self):
89 | self.network = DuelingNet(ACTIONS_SIZE)
90 | self.memory = deque()
91 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR)
92 | self.loss_func = nn.MSELoss()
93 |
94 | def action(self, state, israndom):
95 | if israndom and random.random() < EPSILON:
96 | return np.random.randint(0, ACTIONS_SIZE)
97 | state = torch.unsqueeze(torch.FloatTensor(state), 0)
98 | actions_value = self.network.forward(state)
99 | return torch.max(actions_value, 1)[1].data.numpy()[0]
100 |
101 | def learn(self, state, action, reward, next_state, done):
102 | if done:
103 | self.memory.append((state, action, reward, next_state, 0))
104 | else:
105 | self.memory.append((state, action, reward, next_state, 1))
106 | if len(self.memory) > MEMORY_SIZE:
107 | self.memory.popleft()
108 | if len(self.memory) < MEMORY_THRESHOLD:
109 | return
110 |
111 | batch = random.sample(self.memory, BATCH_SIZE)
112 | state = torch.FloatTensor([x[0] for x in batch])
113 | action = torch.LongTensor([[x[1]] for x in batch])
114 | reward = torch.FloatTensor([[x[2]] for x in batch])
115 | next_state = torch.FloatTensor([x[3] for x in batch])
116 | done = torch.FloatTensor([[x[4]] for x in batch])
117 |
118 | eval_q = self.network.forward(state).gather(1, action)
119 | next_q = self.network(next_state).detach()
120 | target_q = reward + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * done
121 | loss = self.loss_func(eval_q, target_q)
122 |
123 | self.optimizer.zero_grad()
124 | loss.backward()
125 | self.optimizer.step()
126 |
127 |
128 | agent = Agent()
129 |
130 | for i_episode in range(TOTAL_EPISODES):
131 | state = env.reset()
132 | state = preprocess(state)
133 | while True:
134 | # env.render()
135 | action = agent.action(state, True)
136 | next_state, reward, done, info = env.step(action)
137 | next_state = preprocess(next_state)
138 | agent.learn(state, action, reward, next_state, done)
139 |
140 | state = next_state
141 | if done:
142 | break
143 | if EPSILON > FINAL_EPSILON:
144 | EPSILON -= (START_EPSILON - FINAL_EPSILON) / EXPLORE
145 |
146 | # TEST
147 | if i_episode % TEST_FREQUENCY == 0:
148 | state = env.reset()
149 | state = preprocess(state)
150 | total_reward = 0
151 | while True:
152 | # env.render()
153 | action = agent.action(state, israndom=False)
154 | next_state, reward, done, info = env.step(action)
155 | next_state = preprocess(next_state)
156 |
157 | total_reward += reward
158 |
159 | state = next_state
160 | if done:
161 | break
162 | print('episode: {} , total_reward: {}'.format(i_episode, round(total_reward, 3)))
163 |
164 | env.close()
165 |
--------------------------------------------------------------------------------
/feedforward_neural_network/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torchvision
4 | import torchvision.transforms as transforms
5 |
6 |
7 | # Device configuration
8 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
9 |
10 | # Hyper-parameters
11 | input_size = 784
12 | hidden_size = 500
13 | num_classes = 10
14 | num_epochs = 5
15 | batch_size = 100
16 | learning_rate = 0.001
17 |
18 | # MNIST dataset
19 | train_dataset = torchvision.datasets.MNIST(root='../../data',
20 | train=True,
21 | transform=transforms.ToTensor(),
22 | download=True)
23 |
24 | test_dataset = torchvision.datasets.MNIST(root='../../data',
25 | train=False,
26 | transform=transforms.ToTensor())
27 |
28 | # Data loader
29 | train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
30 | batch_size=batch_size,
31 | shuffle=True)
32 |
33 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
34 | batch_size=batch_size,
35 | shuffle=False)
36 |
37 | # Fully connected neural network with one hidden layer
38 | class NeuralNet(nn.Module):
39 | def __init__(self, input_size, hidden_size, num_classes):
40 | super(NeuralNet, self).__init__()
41 | self.fc1 = nn.Linear(input_size, hidden_size)
42 | self.relu = nn.ReLU()
43 | self.fc2 = nn.Linear(hidden_size, num_classes)
44 |
45 | def forward(self, x):
46 | out = self.fc1(x)
47 | out = self.relu(out)
48 | out = self.fc2(out)
49 | return out
50 |
51 | model = NeuralNet(input_size, hidden_size, num_classes).to(device)
52 |
53 | # Loss and optimizer
54 | criterion = nn.CrossEntropyLoss()
55 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
56 |
57 | # Train the model
58 | total_step = len(train_loader)
59 | for epoch in range(num_epochs):
60 | for i, (images, labels) in enumerate(train_loader):
61 | # Move tensors to the configured device
62 | images = images.reshape(-1, 28*28).to(device)
63 | labels = labels.to(device)
64 |
65 | # Forward pass
66 | outputs = model(images)
67 | loss = criterion(outputs, labels)
68 |
69 | # Backward and optimize
70 | optimizer.zero_grad()
71 | loss.backward()
72 | optimizer.step()
73 |
74 | if (i+1) % 100 == 0:
75 | print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
76 | .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
77 |
78 | # Test the model
79 | # In test phase, we don't need to compute gradients (for memory efficiency)
80 | with torch.no_grad():
81 | correct = 0
82 | total = 0
83 | for images, labels in test_loader:
84 | images = images.reshape(-1, 28*28).to(device)
85 | labels = labels.to(device)
86 | outputs = model(images)
87 | _, predicted = torch.max(outputs.data, 1)
88 | total += labels.size(0)
89 | correct += (predicted == labels).sum().item()
90 |
91 | print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))
92 |
93 | # Save the model checkpoint
94 | torch.save(model.state_dict(), 'model.ckpt')
95 |
--------------------------------------------------------------------------------
/ga/bag.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import random
3 | # Knapsack problem
4 | # Items: X[i] = [weight, value]
5 | X = {
6 | 1: [10, 15],
7 | 2: [15, 25],
8 | 3: [20, 35],
9 | 4: [25, 45],
10 | 5: [30, 55],
11 | 6: [35, 70]}
12 |
13 | # Termination threshold
14 | FINISHED_LIMIT = 5
15 |
16 | # Weight limit
17 | WEIGHT_LIMIT = 80
18 |
19 | # Chromosome length (one bit per item)
20 | CHROMOSOME_SIZE = 6
21 |
22 | # Number of selection draws
23 | SELECT_NUMBER = 4
24 |
25 | max_last = 0
26 | diff_last = 10000
27 |
28 | # Check whether to stop
29 | def is_finished(fitnesses):
30 | global max_last
31 | global diff_last
32 |
33 | max_current = 0
34 | for v in fitnesses:
35 | if v[1] > max_current:
36 | max_current = v[1]
37 |
38 | diff = max_current - max_last
39 | if diff < FINISHED_LIMIT and diff_last < FINISHED_LIMIT:
40 | return True
41 | else:
42 | diff_last = diff
43 | max_last = max_current
44 | return False
45 |
46 | # Initial chromosome states
47 | def init():
48 | chromosome_state1 = '100100'
49 | chromosome_state2 = '101010'
50 | chromosome_state3 = '010101'
51 | chromosome_state4 = '101011'
52 | chromosome_states = [chromosome_state1,
53 | chromosome_state2,
54 | chromosome_state3,
55 | chromosome_state4]
56 | return chromosome_states
57 |
58 |
59 | # Compute fitness: [total value, total weight] for each chromosome
60 | def fitness(chromosome_states):
61 | fitnesses = []
62 | for chromosome_state in chromosome_states:
63 | value_sum = 0
64 | weight_sum = 0
65 | for i, v in enumerate(chromosome_state):
66 | if int(v) == 1:
67 | weight_sum += X[i + 1][0]
68 | value_sum += X[i + 1][1]
69 | fitnesses.append([value_sum, weight_sum])
70 | return fitnesses
71 |
72 |
73 | # Filtering and selection
74 | def filter(chromosome_states, fitnesses):
75 |     # Chromosomes whose total weight exceeds WEIGHT_LIMIT (80) are eliminated
76 | index = len(fitnesses) - 1
77 | while index >= 0:
78 | index -= 1
79 | if fitnesses[index][1] > WEIGHT_LIMIT:
80 | chromosome_states.pop(index)
81 | fitnesses.pop(index)
82 |
83 |     # Selection: randomly draw SELECT_NUMBER chromosomes and count how often each is picked
84 | selected_index = [0] * len(chromosome_states)
85 | for i in range(SELECT_NUMBER):
86 | j = chromosome_states.index(random.choice(chromosome_states))
87 | selected_index[j] += 1
88 | return selected_index
89 |
90 |
91 | # Produce the next generation (single-point crossover)
92 | def crossover(chromosome_states, selected_index):
93 | chromosome_states_new = []
94 | index = len(chromosome_states) - 1
95 | while index >= 0:
96 | index -= 1
97 | chromosome_state = chromosome_states.pop(index)
98 | for i in range(selected_index[index]):
99 | chromosome_state_x = random.choice(chromosome_states)
100 | pos = random.choice(range(1, CHROMOSOME_SIZE - 1))
101 | chromosome_states_new.append(chromosome_state[:pos] + chromosome_state_x[pos:])
102 | chromosome_states.insert(index, chromosome_state)
103 | return chromosome_states_new
104 |
105 |
106 | if __name__ == '__main__':
107 |     # Initial population
108 | chromosome_states = init()
109 | n = 100
110 | while n > 0:
111 | n -= 1
112 |         # Compute fitness
113 | fitnesses = fitness(chromosome_states)
114 | if is_finished(fitnesses):
115 | break
116 | print('1:', fitnesses)
117 |         # Selection
118 | selected_index = filter(chromosome_states, fitnesses)
119 | print('2:', selected_index)
120 |         # Produce the next generation
121 | chromosome_states = crossover(chromosome_states, selected_index)
122 | # print '3:', chromosome_states
123 |
124 | fitnesses = fitness(chromosome_states)
125 | print(chromosome_states)
126 |
127 | # 1: [[60, 35], [105, 60], [140, 75], [175, 95]]
128 | # 2: [1, 1, 2]
129 |
130 | # 1: [[60, 35], [105, 60], [80, 45], [90, 50]]
131 | # 2: [2, 1, 0, 1]
132 |
133 | # 1: [[95, 55], [115, 65], [70, 40], [90, 50]]
134 | # 2: [2, 0, 2, 0]
135 |
136 | # 1: [[70, 40], [70, 40], [150, 85], [115, 65]]
137 | # 2: [3, 0, 1]
138 |
139 | # 1: [[115, 65], [115, 65], [115, 65], [70, 40]]
140 | # 2: [2, 0, 0, 2]
141 | # ['100110', '100110', '100110', '100110']
142 |
--------------------------------------------------------------------------------
/ga/peak.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import random
3 | import math
4 | import numpy as np
5 |
6 |
7 | # Peak-finding (maximization) problem
8 | # Chromosome: [gene X, gene Y]
9 | X = [
10 | [1, '000000100101001', '101010101010101'],
11 | [2, '011000100101100', '001100110011001'],
12 | [3, '001000100100101', '101010101010101'],
13 | [4, '000110100100100', '110011001100110'],
14 | [5, '100000100100101', '101010101010101'],
15 | [6, '101000100100100', '111100001111000'],
16 | [7, '101010100110100', '101010101010101'],
17 | [8, '100110101101000', '000011110000111']]
18 |
19 |
20 | # Chromosome (gene) length in bits
21 | CHROMOSOME_SIZE = 15
22 |
23 |
24 | # Check whether to stop
25 | def is_finished(last_three):
26 | s = sorted(last_three)
27 | if s[0] and s[2] - s[0] < 0.01 * s[0]:
28 | return True
29 | else:
30 | return False
31 |
32 | # Initial chromosome states
33 | def init():
34 | chromosome_state1 = ['000000100101001', '101010101010101']
35 | chromosome_state2 = ['011000100101100', '001100110011001']
36 | chromosome_state3 = ['001000100100101', '101010101010101']
37 | chromosome_state4 = ['000110100100100', '110011001100110']
38 | chromosome_state5 = ['100000100100101', '101010101010101']
39 | chromosome_state6 = ['101000100100100', '111100001111000']
40 | chromosome_state7 = ['101010100110100', '101010101010101']
41 | chromosome_state8 = ['100110101101000', '000011110000111']
42 | chromosome_states = [chromosome_state1,
43 | chromosome_state2,
44 | chromosome_state3,
45 | chromosome_state4,
46 | chromosome_state5,
47 | chromosome_state6,
48 | chromosome_state7,
49 | chromosome_state8]
50 | return chromosome_states
51 |
52 |
53 | # Compute fitness: z = y * sin(x) + x * cos(y)
54 | def fitness(chromosome_states):
55 | fitnesses = []
56 | for chromosome_state in chromosome_states:
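        # Decode each 15-bit gene into a real value in approximately (-10, 10]:
        # the leading bit acts as a sign flag and the remaining bits as a
        # magnitude scaled by 16384 (2 ** 14).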
57 | if chromosome_state[0][0] == '1':
58 | x = 10 * (-float(int(chromosome_state[0][1:], 2) - 1)/16384)
59 | else:
60 | x = 10 * (float(int(chromosome_state[0], 2) + 1)/16384)
61 | if chromosome_state[1][0] == '1':
62 | y = 10 * (-float(int(chromosome_state[1][1:], 2) - 1)/16384)
63 | else:
64 | y = 10 * (float(int(chromosome_state[1], 2) + 1)/16384)
65 | z = y * math.sin(x) + x * math.cos(y)
66 | print(x, y, z)
67 | fitnesses.append(z)
68 |
69 | return fitnesses
70 |
71 |
72 | # Filtering
73 | def filter(chromosome_states, fitnesses):
74 |     # Keep the chromosomes with the top-8 fitness values
75 | chromosome_states_new = []
76 |     top_indices = np.argsort(fitnesses)[::-1][:8].tolist()
77 |     top1_fitness_index = top_indices[0]
78 |     for i in top_indices:
79 |         chromosome_states_new.append(chromosome_states[i])
80 |     return chromosome_states_new, top1_fitness_index
81 |
82 |
83 | # Produce the next generation (crossover)
84 | def crossover(chromosome_states):
85 | chromosome_states_new = []
86 | while chromosome_states:
87 | chromosome_state = chromosome_states.pop(0)
88 | for v in chromosome_states:
89 | pos = random.choice(range(8, CHROMOSOME_SIZE - 1))
90 | chromosome_states_new.append([chromosome_state[0][:pos] + v[0][pos:], chromosome_state[1][:pos] + v[1][pos:]])
91 |             chromosome_states_new.append([v[0][:pos] + chromosome_state[0][pos:], v[1][:pos] + chromosome_state[1][pos:]])  # second child: v's prefixes with chromosome_state's suffixes
92 | return chromosome_states_new
93 |
94 |
95 | # Mutation
96 | def mutation(chromosome_states):
97 | n = int(5.0 / 100 * len(chromosome_states))
98 | while n > 0:
99 | n -= 1
100 | chromosome_state = random.choice(chromosome_states)
101 | index = chromosome_states.index(chromosome_state)
102 |         pos = random.choice(range(CHROMOSOME_SIZE))  # bit position within each gene string
103 | x = chromosome_state[0][:pos] + str(int(not int(chromosome_state[0][pos]))) + chromosome_state[0][pos+1:]
104 | y = chromosome_state[1][:pos] + str(int(not int(chromosome_state[1][pos]))) + chromosome_state[1][pos+1:]
105 | chromosome_states[index] = [x, y]
106 |
107 |
108 | if __name__ == '__main__':
109 | chromosome_states = init()
110 | last_three = [0] * 3
111 | last_num = 0
112 | n = 100
113 | while n > 0:
114 | n -= 1
115 | chromosome_states = crossover(chromosome_states)
116 | mutation(chromosome_states)
117 | fitnesses = fitness(chromosome_states)
118 | chromosome_states, top1_fitness_index = filter(chromosome_states, fitnesses)
119 | print('---------%d-----------' % n)
120 | print(chromosome_states)
121 | last_three[last_num] = fitnesses[top1_fitness_index]
122 | print(fitnesses[top1_fitness_index])
123 | if is_finished(last_three):
124 | break
125 | if last_num >= 2:
126 | last_num = 0
127 | else:
128 | last_num += 1
129 |
130 |
131 | # ['100100', '101010', '010101', '101011']
132 |
133 | # 1: [[60, 35], [105, 60], [140, 75], [175, 95]]
134 | # 2: [0, 2, 2]
135 | #
136 | # 1: [[60, 35], [60, 35], [80, 45], [125, 70]]
137 | # 2: [3, 0, 1, 0]
138 | #
139 | # 1: [[80, 45], [60, 35], [60, 35], [140, 80]]
140 | # 2: [1, 2, 0, 1]
141 | #
142 | # 1: [[70, 40], [70, 40], [70, 40], [85, 50]]
143 | # 2: [3, 0, 0, 1]
144 | #
145 | # 1: [[70, 40], [70, 40], [70, 40], [95, 55]]
146 | # 2: [4, 0, 0, 0]
147 | #
148 | # 1: [[70, 40], [70, 40], [70, 40], [70, 40]]
149 | # 2: [4, 0, 0, 0]
150 | #
151 | # ['100010', '100010', '100010', '100010']
152 | # [[70, 40], [70, 40], [70, 40], [70, 40]]
153 |
--------------------------------------------------------------------------------
/gym_sample/demo.py:
--------------------------------------------------------------------------------
1 | import gym
2 | env = gym.make('CartPole-v0')
3 | env.reset()
4 | for _ in range(1000):
5 | env.render()
6 | env.step(env.action_space.sample()) # take a random action
7 |
--------------------------------------------------------------------------------
/native-Qlearning.py:
--------------------------------------------------------------------------------
1 | # Q-learning implementation of the path-finding problem from http://mnemstudio.org/path-finding-q-learning-tutorial.htm
2 | import numpy as np
3 | import random
4 | import matplotlib.pyplot as plt
5 |
6 | Q_fun = np.zeros((6, 6))
7 |
8 | # Reward function: the reward for moving from state to next_state; rows are indexed by state, columns by next_state
9 | reward = np.array([[-1, -1, -1, -1, 0, -1],
10 | [-1, -1, -1, 0, -1, 100],
11 | [-1, -1, -1, 0, -1, -1],
12 | [-1, 0, 0, -1, 0, -1],
13 | [0, -1, -1, 0, -1, 100],
14 | [-1, 0, -1, -1, 0, 100]])
15 |
16 | legal_action = [[4],
17 | [3, 5],
18 | [3],
19 | [1, 2, 4],
20 | [0, 3, 5],
21 | [1, 4, 5]]
22 |
23 | GAMMA = 0.5
24 | TRAINING_STEP = 100
25 | LAYOUT = 221
26 |
27 | for i in range(1, TRAINING_STEP + 1):
28 |     state = random.randint(0, 4)  # start each update from a random non-goal state (state 5 is the goal)
29 |     # pure exploration: pick next_state uniformly at random from the legal moves
30 | next_state = random.choice(legal_action[state])
31 | Q_fun[state, next_state] = reward[state, next_state] + GAMMA * Q_fun[next_state].max()
32 |
33 | if i % (TRAINING_STEP/4) == 0:
34 | plt.subplot(LAYOUT)
35 | plt.imshow(Q_fun, cmap='gray_r')
36 | LAYOUT += 1
37 | print(Q_fun)
38 | plt.show()
39 |
40 |
41 |
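42 | # Not part of the original script: a minimal sketch of how, once Q_fun has
43 | # converged, a greedy path to the goal state 5 could be read off by following
44 | # the argmax of each row of the Q table:
45 | #
46 | #   def greedy_path(Q, start, goal=5, max_steps=10):
47 | #       path = [start]
48 | #       while path[-1] != goal and len(path) < max_steps:
49 | #           path.append(int(np.argmax(Q[path[-1]])))
50 | #       return path
51 | #
52 | #   print(greedy_path(Q_fun, 2))  # should end at the goal state 5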
--------------------------------------------------------------------------------
/nature-DQN.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from collections import deque
4 | import numpy as np
5 | import gym
6 | import random
7 | from net import AtariNet
8 | from util import preprocess
9 |
10 | BATCH_SIZE = 32
11 | LR = 0.001
12 | START_EPSILON = 1.0
13 | FINAL_EPSILON = 0.1
14 | EPSILON = START_EPSILON
15 | EXPLORE = 1000000
16 | GAMMA = 0.99
17 | TOTAL_EPISODES = 10000000
18 | MEMORY_SIZE = 1000000
19 | MEMORY_THRESHOLD = 100000
20 | UPDATE_TIME = 10000
21 | TEST_FREQUENCY = 1000
22 | env = gym.make('Pong-v0')
23 | env = env.unwrapped
24 | ACTIONS_SIZE = env.action_space.n
25 |
26 |
27 | class Agent(object):
28 | def __init__(self):
29 | self.network, self.target_network = AtariNet(ACTIONS_SIZE), AtariNet(ACTIONS_SIZE)
30 | self.memory = deque()
31 | self.learning_count = 0
32 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR)
33 | self.loss_func = nn.MSELoss()
34 |
35 | def action(self, state, israndom):
36 | if israndom and random.random() < EPSILON:
37 | return np.random.randint(0, ACTIONS_SIZE)
38 | state = torch.unsqueeze(torch.FloatTensor(state), 0)
39 | actions_value = self.network.forward(state)
40 | return torch.max(actions_value, 1)[1].data.numpy()[0]
41 |
42 | def learn(self, state, action, reward, next_state, done):
43 | if done:
44 | self.memory.append((state, action, reward, next_state, 0))
45 | else:
46 | self.memory.append((state, action, reward, next_state, 1))
47 | if len(self.memory) > MEMORY_SIZE:
48 | self.memory.popleft()
49 | if len(self.memory) < MEMORY_THRESHOLD:
50 | return
51 |
52 |         if self.learning_count % UPDATE_TIME == 0:  # periodically copy the online network's weights into the target network
53 | self.target_network.load_state_dict(self.network.state_dict())
54 | self.learning_count += 1
55 |
56 | batch = random.sample(self.memory, BATCH_SIZE)
57 | state = torch.FloatTensor([x[0] for x in batch])
58 | action = torch.LongTensor([[x[1]] for x in batch])
59 | reward = torch.FloatTensor([[x[2]] for x in batch])
60 | next_state = torch.FloatTensor([x[3] for x in batch])
61 | done = torch.FloatTensor([[x[4]] for x in batch])
62 |
63 | eval_q = self.network.forward(state).gather(1, action)
64 | next_q = self.target_network(next_state).detach()
65 | target_q = reward + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * done
66 | loss = self.loss_func(eval_q, target_q)
67 |
68 | self.optimizer.zero_grad()
69 | loss.backward()
70 | self.optimizer.step()
71 |
72 |
73 | agent = Agent()
74 |
75 | for i_episode in range(TOTAL_EPISODES):
76 | state = env.reset()
77 | state = preprocess(state)
78 | while True:
79 | # env.render()
80 | action = agent.action(state, True)
81 | next_state, reward, done, info = env.step(action)
82 | next_state = preprocess(next_state)
83 | agent.learn(state, action, reward, next_state, done)
84 |
85 | state = next_state
86 | if done:
87 | break
88 | if EPSILON > FINAL_EPSILON:
89 | EPSILON -= (START_EPSILON - FINAL_EPSILON) / EXPLORE
90 |
91 | # TEST
92 | if i_episode % TEST_FREQUENCY == 0:
93 | state = env.reset()
94 | state = preprocess(state)
95 | total_reward = 0
96 | while True:
97 | # env.render()
98 | action = agent.action(state, israndom=False)
99 | next_state, reward, done, info = env.step(action)
100 | next_state = preprocess(next_state)
101 |
102 | total_reward += reward
103 |
104 | state = next_state
105 | if done:
106 | break
107 | print('episode: {} , total_reward: {}'.format(i_episode, round(total_reward, 3)))
108 |
109 | env.close()
110 |
--------------------------------------------------------------------------------
/neat/Digraph.gv:
--------------------------------------------------------------------------------
1 | digraph {
2 | node [fontsize=9 height=0.2 shape=circle width=0.2]
3 | In0 [fillcolor=lightgray shape=box style=filled]
4 | In1 [fillcolor=lightgray shape=box style=filled]
5 | In3 [fillcolor=lightgray shape=box style=filled]
6 | In4 [fillcolor=lightgray shape=box style=filled]
7 | act1 [fillcolor=lightblue style=filled]
8 | act2 [fillcolor=lightblue style=filled]
9 | 137 [fillcolor=white style=filled]
10 | 714 [fillcolor=white style=filled]
11 | 626 [fillcolor=white style=filled]
12 | 404 [fillcolor=white style=filled]
13 | 246 [fillcolor=white style=filled]
14 | 442 [fillcolor=white style=filled]
15 | 540 [fillcolor=white style=filled]
16 | In0 -> act2 [color=red penwidth=0.15464140610078286 style=dotted]
17 | In1 -> act1 [color=green penwidth=0.20589896434649435 style=dotted]
18 | In1 -> act2 [color=green penwidth=0.32201856748289415 style=solid]
19 | In3 -> act1 [color=red penwidth=0.41018376786556177 style=dotted]
20 | In3 -> act2 [color=green penwidth=1.049253720927758 style=solid]
21 | In4 -> act1 [color=red penwidth=0.17560360127452074 style=dotted]
22 | In4 -> act2 [color=green penwidth=0.4084716161702079 style=solid]
23 | 137 -> act1 [color=red penwidth=0.17428254929845405 style=dotted]
24 | In0 -> 246 [color=green penwidth=0.4724790110694175 style=dotted]
25 | In0 -> 404 [color=green penwidth=0.34381289726102127 style=dotted]
26 | 137 -> 442 [color=green penwidth=0.4006046348289356 style=solid]
27 | 442 -> act1 [color=green penwidth=0.5911627385006601 style=solid]
28 | In0 -> 540 [color=red penwidth=0.4238873029754118 style=solid]
29 | 540 -> 404 [color=green penwidth=0.4927242488863397 style=solid]
30 | In0 -> 626 [color=green penwidth=0.37161877250004893 style=solid]
31 | 626 -> 404 [color=green penwidth=0.1753711398635109 style=solid]
32 | 137 -> 714 [color=green penwidth=0.24745341023372266 style=solid]
33 | 714 -> act1 [color=red penwidth=0.3848767903179091 style=solid]
34 | 246 -> 714 [color=red penwidth=0.28561121470609185 style=solid]
35 | }
36 |
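37 | // Added note: edge styling follows visualize.draw_net in this repo:
38 | // green = positive weight, red = negative weight, solid = enabled connection,
39 | // dotted = disabled connection, penwidth proportional to the weight magnitude.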
--------------------------------------------------------------------------------
/neat/Digraph.gv.svg:
--------------------------------------------------------------------------------
(SVG markup omitted — Graphviz rendering of the winner network described in Digraph.gv)
--------------------------------------------------------------------------------
/neat/avg_fitness.svg:
--------------------------------------------------------------------------------
(SVG markup omitted — plot of the population's average and best fitness per generation, produced by visualize.plot_stats)
--------------------------------------------------------------------------------
/neat/cartpole.py:
--------------------------------------------------------------------------------
1 | import neat
2 | import sys
3 | import numpy as np
4 | import gym
5 | import visualize
6 |
7 | GAME = 'CartPole-v0'
8 | env = gym.make(GAME).unwrapped
9 |
10 | CONFIG = "./config"
11 | EP_STEP = 300
12 | GENERATION_EP = 10
13 | CHECKPOINT = 9
14 |
15 | def eval_genomes(genomes, config):
16 | for genome_id, genome in genomes:
17 | net = neat.nn.FeedForwardNetwork.create(genome, config)
18 | ep_r = []
19 | for ep in range(GENERATION_EP):
20 | accumulative_r = 0
21 | observation = env.reset()
22 | for t in range(EP_STEP):
23 | action_values = net.activate(observation)
24 | action = np.argmax(action_values)
25 | observation_, reward, done, _ = env.step(action)
26 | accumulative_r += reward
27 | if done:
28 | break
29 | observation = observation_
30 | ep_r.append(accumulative_r)
31 | genome.fitness = np.min(ep_r)/float(EP_STEP)
32 |
33 | def run():
34 | config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
35 | neat.DefaultSpeciesSet, neat.DefaultStagnation, CONFIG)
36 | pop = neat.Population(config)
37 |
38 |     # record the training history
39 | stats = neat.StatisticsReporter()
40 | pop.add_reporter(stats)
41 | pop.add_reporter(neat.StdOutReporter(True))
42 | pop.add_reporter(neat.Checkpointer(5))
43 |
44 | pop.run(eval_genomes, 10)
45 |
46 | # visualize training
47 | visualize.plot_stats(stats, ylog=False, view=True)
48 | visualize.plot_species(stats, view=True)
49 |
50 | def evaluation():
51 | p = neat.Checkpointer.restore_checkpoint('neat-checkpoint-%i' % CHECKPOINT)
52 | winner = p.run(eval_genomes, 1)
53 |
54 | # show winner net
55 | node_names = {-1: 'In0', -2: 'In1', -3: 'In3', -4: 'In4', 0: 'act1', 1: 'act2'}
56 | visualize.draw_net(p.config, winner, True, node_names=node_names)
57 |
58 | net = neat.nn.FeedForwardNetwork.create(winner, p.config)
59 | while True:
60 | s = env.reset()
61 | while True:
62 | env.render()
63 | a = np.argmax(net.activate(s))
64 | s, r, done, _ = env.step(a)
65 | if done: break
66 |
67 | if __name__ == '__main__':
68 | TRAINING = sys.argv[1]
69 |
70 | if TRAINING == 'TRAIN':
71 | run()
72 | elif TRAINING == 'EVAL':
73 | evaluation()
74 | else:
75 | print('Please indicate TRAIN or EVAL')
76 |
--------------------------------------------------------------------------------
/neat/config:
--------------------------------------------------------------------------------
1 | # neat-python configuration for the CartPole-v0 environment on OpenAI Gym
2 |
3 | [NEAT]
4 | pop_size = 100
5 | # Note: the fitness threshold will never be reached because
6 | # we are controlling the termination ourselves based on simulation performance.
7 | fitness_criterion = max
8 | fitness_threshold = 2.
9 | reset_on_extinction = 0
10 |
11 | [DefaultGenome]
12 | # node activation options
13 | activation_default = relu
14 | activation_mutate_rate = 0.0
15 | activation_options = relu
16 |
17 | # node aggregation options
18 | aggregation_default = sum
19 | aggregation_mutate_rate = 0.0
20 | aggregation_options = sum
21 |
22 | # node bias options
23 | bias_init_mean = 0.0
24 | bias_init_stdev = 1.0
25 | bias_max_value = 30.0
26 | bias_min_value = -30.0
27 | bias_mutate_power = 0.5
28 | bias_mutate_rate = 0.7
29 | bias_replace_rate = 0.1
30 |
31 | # genome compatibility options
32 | compatibility_disjoint_coefficient = 1.0
33 | compatibility_weight_coefficient = 1.0
34 |
35 | # connection add/remove rates
36 | conn_add_prob = 0.9
37 | conn_delete_prob = 0.2
38 |
39 | # connection enable options
40 | enabled_default = True
41 | enabled_mutate_rate = 0.01
42 |
43 | feed_forward = True
44 | initial_connection = full
45 | # options (unconnected, fs_neat, full)
46 |
47 | # node add/remove rates
48 | node_add_prob = 0.9
49 | node_delete_prob = 0.2
50 |
51 | # network parameters
52 | num_hidden = 0
53 | num_inputs = 4
54 | num_outputs = 2
55 |
56 | # node response options
57 | response_init_mean = 1.0
58 | response_init_stdev = 0.0
59 | response_max_value = 30.0
60 | response_min_value = -30.0
61 | response_mutate_power = 0.0
62 | response_mutate_rate = 0.0
63 | response_replace_rate = 0.0
64 |
65 | # connection weight options
66 | weight_init_mean = 0.0
67 | weight_init_stdev = 1.0
68 | weight_max_value = 30.
69 | weight_min_value = -30.
70 | weight_mutate_power = 0.5
71 | weight_mutate_rate = 0.8
72 | weight_replace_rate = 0.1
73 |
74 | [DefaultSpeciesSet]
75 | compatibility_threshold = 3.0
76 |
77 | [DefaultStagnation]
78 | species_fitness_func = max
79 | max_stagnation = 20
80 | species_elitism = 4
81 |
82 | [DefaultReproduction]
83 | elitism = 2
84 | survival_threshold = 0.2
--------------------------------------------------------------------------------
/neat/speciation.svg:
--------------------------------------------------------------------------------
(SVG markup omitted — stacked plot of species sizes per generation, produced by visualize.plot_species)
--------------------------------------------------------------------------------
/neat/visualize.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import copy
4 | import warnings
5 |
6 | import graphviz
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 |
10 |
11 | def plot_stats(statistics, ylog=False, view=False, filename='avg_fitness.svg'):
12 | """ Plots the population's average and best fitness. """
13 | if plt is None:
14 | warnings.warn("This display is not available due to a missing optional dependency (matplotlib)")
15 | return
16 |
17 | generation = range(len(statistics.most_fit_genomes))
18 | best_fitness = [c.fitness for c in statistics.most_fit_genomes]
19 | avg_fitness = np.array(statistics.get_fitness_mean())
20 | stdev_fitness = np.array(statistics.get_fitness_stdev())
21 |
22 | plt.plot(generation, avg_fitness, 'b-', label="average")
23 | #plt.plot(generation, avg_fitness - stdev_fitness, 'g-.', label="-1 sd")
24 | plt.plot(generation, avg_fitness + stdev_fitness, 'g-.', label="+1 sd")
25 | plt.plot(generation, best_fitness, 'r-', label="best")
26 |
27 | plt.title("Population's average and best fitness")
28 | plt.xlabel("Generations")
29 | plt.ylabel("Fitness")
30 | plt.grid()
31 | plt.legend(loc="best")
32 | if ylog:
33 | plt.gca().set_yscale('symlog')
34 |
35 | plt.savefig(filename)
36 | if view:
37 | plt.show()
38 |
39 | plt.close()
40 |
41 |
42 | def plot_species(statistics, view=False, filename='speciation.svg'):
43 | """ Visualizes speciation throughout evolution. """
44 | if plt is None:
45 | warnings.warn("This display is not available due to a missing optional dependency (matplotlib)")
46 | return
47 |
48 | species_sizes = statistics.get_species_sizes()
49 | num_generations = len(species_sizes)
50 | curves = np.array(species_sizes).T
51 |
52 | fig, ax = plt.subplots()
53 | ax.stackplot(range(num_generations), *curves)
54 |
55 | plt.title("Speciation")
56 | plt.ylabel("Size per Species")
57 | plt.xlabel("Generations")
58 |
59 | plt.savefig(filename)
60 |
61 | if view:
62 | plt.show()
63 |
64 | plt.close()
65 |
66 |
67 | def draw_net(config, genome, view=False, filename=None, node_names=None, show_disabled=True, prune_unused=False,
68 | node_colors=None, fmt='svg'):
69 | """ Receives a genome and draws a neural network with arbitrary topology. """
70 | # Attributes for network nodes.
71 | if graphviz is None:
72 | warnings.warn("This display is not available due to a missing optional dependency (graphviz)")
73 | return
74 |
75 | if node_names is None:
76 | node_names = {}
77 |
78 | assert type(node_names) is dict
79 |
80 | if node_colors is None:
81 | node_colors = {}
82 |
83 | assert type(node_colors) is dict
84 |
85 | node_attrs = {
86 | 'shape': 'circle',
87 | 'fontsize': '9',
88 | 'height': '0.2',
89 | 'width': '0.2'}
90 |
91 | dot = graphviz.Digraph(format=fmt, node_attr=node_attrs)
92 |
93 | inputs = set()
94 | for k in config.genome_config.input_keys:
95 | inputs.add(k)
96 | name = node_names.get(k, str(k))
97 | input_attrs = {'style': 'filled',
98 | 'shape': 'box'}
99 | input_attrs['fillcolor'] = node_colors.get(k, 'lightgray')
100 | dot.node(name, _attributes=input_attrs)
101 |
102 | outputs = set()
103 | for k in config.genome_config.output_keys:
104 | outputs.add(k)
105 | name = node_names.get(k, str(k))
106 | node_attrs = {'style': 'filled'}
107 | node_attrs['fillcolor'] = node_colors.get(k, 'lightblue')
108 |
109 | dot.node(name, _attributes=node_attrs)
110 |
111 | if prune_unused:
112 | connections = set()
113 | for cg in genome.connections.values():
114 | if cg.enabled or show_disabled:
115 | connections.add(cg.key)
116 |
117 | used_nodes = copy.copy(outputs)
118 | pending = copy.copy(outputs)
119 | while pending:
120 | #print(pending, used_nodes)
121 | new_pending = set()
122 | for a, b in connections:
123 | if b in pending and a not in used_nodes:
124 | new_pending.add(a)
125 | used_nodes.add(a)
126 | pending = new_pending
127 | else:
128 | used_nodes = set(genome.nodes.keys())
129 |
130 | for n in used_nodes:
131 | if n in inputs or n in outputs:
132 | continue
133 |
134 | attrs = {'style': 'filled'}
135 | attrs['fillcolor'] = node_colors.get(n, 'white')
136 | dot.node(str(n), _attributes=attrs)
137 |
138 | for cg in genome.connections.values():
139 | if cg.enabled or show_disabled:
140 | #if cg.input not in used_nodes or cg.output not in used_nodes:
141 | # continue
142 | input, output = cg.key
143 | a = node_names.get(input, str(input))
144 | b = node_names.get(output, str(output))
145 | style = 'solid' if cg.enabled else 'dotted'
146 | color = 'green' if cg.weight > 0 else 'red'
147 | width = str(0.1 + abs(cg.weight / 5.0))
148 | dot.edge(a, b, _attributes={'style': style, 'color': color, 'penwidth': width})
149 |
150 | dot.render(filename, view=view)
151 |
152 | return dot
--------------------------------------------------------------------------------
/net.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 |
5 | class AtariNet(nn.Module):
6 |
7 | def __init__(self, num_actions):
8 | super(AtariNet, self).__init__()
9 | self.conv1 = nn.Sequential(
10 | nn.Conv2d(1, 32, kernel_size=8, stride=4),
11 | nn.ReLU()
12 | )
13 | self.conv2 = nn.Sequential(
14 | nn.Conv2d(32, 64, kernel_size=4, stride=2),
15 | nn.ReLU()
16 | )
17 | self.conv3 = nn.Sequential(
18 | nn.Conv2d(64, 64, kernel_size=3, stride=1),
19 | nn.ReLU()
20 | )
21 | self.hidden = nn.Sequential(
22 | nn.Linear(64 * 7 * 7, 512, bias=True),
23 | nn.ReLU()
24 | )
25 | self.out = nn.Sequential(
26 | nn.Linear(512, num_actions, bias=True)
27 | )
28 | self.apply(self.init_weights)
29 |
30 | def init_weights(self, m):
31 | if type(m) == nn.Conv2d:
32 | m.weight.data.normal_(0.0, 0.02)
33 | if type(m) == nn.Linear:
34 | torch.nn.init.xavier_uniform_(m.weight)
35 | m.bias.data.fill_(0.01)
36 |
37 | def forward(self, x):
38 | x = self.conv1(x)
39 | x = self.conv2(x)
40 | x = self.conv3(x)
41 | x = x.view(x.size(0), -1)
42 | x = self.hidden(x)
43 | x = self.out(x)
44 | return x
45 |
46 |
47 | class CnnDQN(nn.Module):
48 | def __init__(self, inputs_shape, num_actions):
49 | super(CnnDQN, self).__init__()
50 |
51 |         self.input_shape = inputs_shape
52 | self.num_actions = num_actions
53 |
54 | self.features = nn.Sequential(
55 | nn.Conv2d(inputs_shape[0], 32, kernel_size=8, stride=4),
56 | nn.ReLU(),
57 | nn.Conv2d(32, 64, kernel_size=4, stride=2),
58 | nn.ReLU(),
59 | nn.Conv2d(64, 64, kernel_size=3, stride=1),
60 | nn.ReLU()
61 | )
62 |
63 | self.fc = nn.Sequential(
64 | nn.Linear(self.features_size(), 512),
65 | nn.ReLU(),
66 | nn.Linear(512, self.num_actions)
67 | )
68 |
69 | def forward(self, x):
70 | x = self.features(x)
71 | x = x.view(x.size(0), -1)
72 | x = self.fc(x)
73 | return x
74 |
75 | def features_size(self):
76 |         return self.features(torch.zeros(1, *self.input_shape)).view(1, -1).size(1)
77 |
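78 | # Not part of the original file: with the 84x84 single-channel frames produced
79 | # by util.preprocess, the three conv layers reduce the feature map to
80 | # 64 x 7 x 7, which is why AtariNet's hidden layer expects 64 * 7 * 7 inputs.
81 | # A quick shape check (num_actions=6 is just an illustrative value):
82 | #
83 | #   net = AtariNet(num_actions=6)
84 | #   out = net(torch.zeros(1, 1, 84, 84))
85 | #   print(out.shape)  # torch.Size([1, 6])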
--------------------------------------------------------------------------------
/nips-DQN.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from collections import deque
4 | import numpy as np
5 | import gym
6 | import random
7 | from net import AtariNet
8 | from util import preprocess
9 |
10 | BATCH_SIZE = 32
11 | LR = 0.001
12 | START_EPSILON = 1.0
13 | FINAL_EPSILON = 0.1
14 | EPSILON = START_EPSILON
15 | EXPLORE = 1000000
16 | GAMMA = 0.99
17 | TOTAL_EPISODES = 10000000
18 | MEMORY_SIZE = 1000000
19 | MEMORY_THRESHOLD = 100000
20 | TEST_FREQUENCY = 1000
21 | env = gym.make('Pong-v0')
22 | env = env.unwrapped
23 | ACTIONS_SIZE = env.action_space.n
24 |
25 |
26 | class Agent(object):
27 | def __init__(self):
28 | self.network = AtariNet(ACTIONS_SIZE)
29 | self.memory = deque()
30 | self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR)
31 | self.loss_func = nn.MSELoss()
32 |
33 | def action(self, state, israndom):
34 | if israndom and random.random() < EPSILON:
35 | return np.random.randint(0, ACTIONS_SIZE)
36 | state = torch.unsqueeze(torch.FloatTensor(state), 0)
37 | actions_value = self.network.forward(state)
38 | return torch.max(actions_value, 1)[1].data.numpy()[0]
39 |
40 | def learn(self, state, action, reward, next_state, done):
41 | if done:
42 | self.memory.append((state, action, reward, next_state, 0))
43 | else:
44 | self.memory.append((state, action, reward, next_state, 1))
45 | if len(self.memory) > MEMORY_SIZE:
46 | self.memory.popleft()
47 | if len(self.memory) < MEMORY_THRESHOLD:
48 | return
49 |
50 | batch = random.sample(self.memory, BATCH_SIZE)
51 | state = torch.FloatTensor([x[0] for x in batch])
52 | action = torch.LongTensor([[x[1]] for x in batch])
53 | reward = torch.FloatTensor([[x[2]] for x in batch])
54 | next_state = torch.FloatTensor([x[3] for x in batch])
55 | done = torch.FloatTensor([[x[4]] for x in batch])
56 |
57 | eval_q = self.network.forward(state).gather(1, action)
58 |         next_q = self.network(next_state).detach()  # NIPS-style DQN bootstraps from the same online network (no separate target network)
59 | target_q = reward + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * done
60 | loss = self.loss_func(eval_q, target_q)
61 |
62 | self.optimizer.zero_grad()
63 | loss.backward()
64 | self.optimizer.step()
65 |
66 |
67 | agent = Agent()
68 |
69 | for i_episode in range(TOTAL_EPISODES):
70 | state = env.reset()
71 | state = preprocess(state)
72 | while True:
73 | # env.render()
74 | action = agent.action(state, True)
75 | next_state, reward, done, info = env.step(action)
76 | next_state = preprocess(next_state)
77 | agent.learn(state, action, reward, next_state, done)
78 |
79 | state = next_state
80 | if done:
81 | break
82 | if EPSILON > FINAL_EPSILON:
83 | EPSILON -= (START_EPSILON - FINAL_EPSILON) / EXPLORE
84 |
85 | # TEST
86 | if i_episode % TEST_FREQUENCY == 0:
87 | state = env.reset()
88 | state = preprocess(state)
89 | total_reward = 0
90 | while True:
91 | # env.render()
92 | action = agent.action(state, israndom=False)
93 | next_state, reward, done, info = env.step(action)
94 | next_state = preprocess(next_state)
95 |
96 | total_reward += reward
97 |
98 | state = next_state
99 | if done:
100 | break
101 | print('episode: {} , total_reward: {}'.format(i_episode, round(total_reward, 3)))
102 |
103 | env.close()
104 |
--------------------------------------------------------------------------------
/recurrent_neural_network/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torchvision
4 | import torchvision.transforms as transforms
5 |
6 |
7 | # Device configuration
8 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
9 |
10 | # Hyper-parameters
11 | sequence_length = 28
12 | input_size = 28
13 | hidden_size = 128
14 | num_layers = 2
15 | num_classes = 10
16 | batch_size = 100
17 | num_epochs = 2
18 | learning_rate = 0.01
19 |
20 | # MNIST dataset
21 | train_dataset = torchvision.datasets.MNIST(root='../../data/',
22 | train=True,
23 | transform=transforms.ToTensor(),
24 | download=True)
25 |
26 | test_dataset = torchvision.datasets.MNIST(root='../../data/',
27 | train=False,
28 | transform=transforms.ToTensor())
29 |
30 | # Data loader
31 | train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
32 | batch_size=batch_size,
33 | shuffle=True)
34 |
35 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
36 | batch_size=batch_size,
37 | shuffle=False)
38 |
39 | # Recurrent neural network (many-to-one)
40 | class RNN(nn.Module):
41 | def __init__(self, input_size, hidden_size, num_layers, num_classes):
42 | super(RNN, self).__init__()
43 | self.hidden_size = hidden_size
44 | self.num_layers = num_layers
45 | self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
46 | self.fc = nn.Linear(hidden_size, num_classes)
47 |
48 | def forward(self, x):
49 | # Set initial hidden and cell states
50 | h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
51 | c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
52 |
53 | # Forward propagate LSTM
54 | out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size)
55 |
56 | # Decode the hidden state of the last time step
57 | out = self.fc(out[:, -1, :])
58 | return out
59 |
60 | model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
61 |
62 |
63 | # Loss and optimizer
64 | criterion = nn.CrossEntropyLoss()
65 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
66 |
67 | # Train the model
68 | total_step = len(train_loader)
69 | for epoch in range(num_epochs):
70 | for i, (images, labels) in enumerate(train_loader):
71 | images = images.reshape(-1, sequence_length, input_size).to(device)
72 | labels = labels.to(device)
73 |
74 | # Forward pass
75 | outputs = model(images)
76 | loss = criterion(outputs, labels)
77 |
78 | # Backward and optimize
79 | optimizer.zero_grad()
80 | loss.backward()
81 | optimizer.step()
82 |
83 | if (i+1) % 100 == 0:
84 | print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
85 | .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
86 |
87 | # Test the model
88 | with torch.no_grad():
89 | correct = 0
90 | total = 0
91 | for images, labels in test_loader:
92 | images = images.reshape(-1, sequence_length, input_size).to(device)
93 | labels = labels.to(device)
94 | outputs = model(images)
95 | _, predicted = torch.max(outputs.data, 1)
96 | total += labels.size(0)
97 | correct += (predicted == labels).sum().item()
98 |
99 | print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
100 |
101 | # Save the model checkpoint
102 | torch.save(model.state_dict(), 'model.ckpt')
103 |
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 |
4 |
5 | def preprocess(observation):
6 |     """
7 |     Convert a raw RGB Atari frame into a binary 84x84 single-channel image.
8 |     :param observation: RGB frame returned by the gym environment
9 |     :return: numpy array of shape (1, 84, 84)
10 |     """
11 | observation = cv2.cvtColor(cv2.resize(observation, (84, 110)), cv2.COLOR_BGR2GRAY)
12 | observation = observation[26:110,:]
13 | ret, observation = cv2.threshold(observation,1,255,cv2.THRESH_BINARY)
14 | x = np.reshape(observation,(84,84,1))
15 | return x.transpose((2, 0, 1))
--------------------------------------------------------------------------------