├── requirements.txt
├── README.md
├── model.py
├── .gitignore
├── policy.py
└── main.py

/requirements.txt:
--------------------------------------------------------------------------------
gym[all]==0.10.5
numpy==1.14.5
ray==0.5.0
torch==0.4.0
torchvision==0.2.1

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# evolution-strategies

Distributed Natural Evolution Strategies built with PyTorch and Ray. This project was created because I could not find a simple distributed evolution strategies implementation. The project's goal is to be minimalistic (simple to understand) and modular (easy to implement your own strategy).

### Built with

- PyTorch
- NumPy
- Ray
- OpenAI Gym

## Contributing

1. Fork it! Star it?
2. Create your feature branch: `git checkout -b my-new-feature`
3. Commit your changes: `git commit -am 'Add some feature'`
4. Push to the branch: `git push origin my-new-feature`
5. Submit a pull request :D

## Authors

* **Jorge Ceja** - *Initial work* - [Account](https://github.com/JorgeCeja)

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch.nn as nn


class Model(nn.Module):

    def __init__(self, action_space, in_channels=3, num_features=4):

        super(Model, self).__init__()
        self.action_space = action_space

        self.main = nn.Sequential(
            # in_channels, out_channels, kernel_size, stride, padding
            nn.Conv2d(in_channels, num_features, 4,
                      stride=2, padding=1, bias=False),
            nn.ELU(inplace=True),

            nn.Conv2d(num_features, num_features * 2, 4,
                      stride=2, padding=1, bias=False),
            nn.ELU(inplace=True),

            nn.Conv2d(num_features * 2, num_features * 4, 4,
                      stride=2, padding=1, bias=False),
            nn.ELU(inplace=True),

            nn.Conv2d(num_features * 4, num_features * 8, 4,
                      stride=2, padding=1, bias=False),
            nn.ELU(inplace=True),

            nn.Conv2d(num_features * 8, num_features * 16, 4,
                      stride=2, padding=1, bias=False),
            nn.ELU(inplace=True),

            nn.Conv2d(num_features * 16, self.action_space, 4,
                      stride=1, padding=0, bias=False),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        # (batch, in_channels, 128, 128) -> (batch, action_space, 1, 1)
        return self.main(x)

    def count_parameters(self):
        # Total number of parameters; this is the dimensionality the ES perturbs
        return sum(p.numel() for p in self.parameters())

--------------------------------------------------------------------------------
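Note (not part of the repository): a quick sanity check of the convolutional stack, assuming the 3×128×128 input produced by the transform in main.py. Each of the five stride-2 convolutions halves the spatial size (128 → 64 → 32 → 16 → 8 → 4) and the final 4×4 convolution with no padding collapses it to 1×1, so the model outputs one softmax score per action.

import torch

from model import Model

# Sketch only: the action count (6) is an arbitrary example value.
model = Model(action_space=6)
dummy = torch.zeros(1, 3, 128, 128)       # (batch, channels, height, width)
out = model(dummy)
print(out.shape)                          # torch.Size([1, 6, 1, 1])
print(model.count_parameters())           # size of the parameter vector the ES perturbs
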
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# saved model
best-model.pth

--------------------------------------------------------------------------------
/policy.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
from torch.nn.utils import parameters_to_vector, vector_to_parameters

from model import Model


class Policy(object):

    def __init__(self, sigma=0.03, learning_rate=0.001, action_space=None, num_features=4):
        self.model = Model(action_space, num_features=num_features)
        self.sigma = sigma
        self.learning_rate = learning_rate
        self.num_params = self.model.count_parameters()

        # Remove grad from the model; the ES never backpropagates
        for param in self.model.parameters():
            param.requires_grad = False

    def get_parameters(self):
        # Return the flat (1D) model parameters as a NumPy array
        # (Ray's object store works with NumPy arrays, not torch tensors)
        # No detach needed: grad was removed from the model in __init__
        parameters = parameters_to_vector(self.model.parameters())

        return parameters.numpy()

    def set_parameters(self, parameters, perturbation):
        # Load theta + sigma * perturbation into the model
        perturb_parameters = torch.from_numpy(
            parameters + (self.sigma * perturbation))

        vector_to_parameters(perturb_parameters, self.model.parameters())

    def evaluate(self, state):
        # No need for `with torch.no_grad():`
        # grad was removed from the model in __init__
        prediction = self.model(state)
        action = np.argmax(prediction.data.numpy())

        return action

    def update(self, theta, all_rewards, population):
        rewards = np.asarray(all_rewards, dtype=np.float32)

        # Normalize rewards; clip the std at 1e-5 to prevent division by zero
        normalized_rewards = (rewards - np.mean(rewards)) / \
            np.clip(np.std(rewards), 1e-5, None)

        # NES gradient ascent step:
        # theta <- theta + learning_rate / (n * sigma) * sum_i(reward_i * noise_i)
        step = self.learning_rate / (len(population) * self.sigma) * \
            np.dot(np.asarray(population).T, normalized_rewards)
        new_weights = torch.from_numpy(theta + step).float()

        # Load the new parameters into the model, for later retrieval
        vector_to_parameters(new_weights, self.model.parameters())

    def save_model(self, output_path):
        torch.save(self.model.state_dict(), output_path)

--------------------------------------------------------------------------------
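Note (not part of the repository): Policy.update implements the natural evolution strategies estimate theta <- theta + alpha / (n * sigma) * sum_i F_i * eps_i, where F_i is the normalized reward earned with perturbation eps_i. A minimal sketch of one perturb/evaluate/update round against a toy reward function (the reward and the action count of 6 are illustrative stand-ins for a real environment rollout):

import numpy as np

from policy import Policy

policy = Policy(sigma=0.1, learning_rate=0.001, action_space=6)

theta = policy.get_parameters()              # flat NumPy parameter vector
population, rewards = [], []

for _ in range(8):                           # toy population of 8 perturbations
    epsilon = np.random.randn(policy.num_params).astype(np.float32)
    policy.set_parameters(theta, epsilon)    # theta + sigma * epsilon is loaded into the model
    rewards.append(-np.sum(epsilon ** 2))    # stand-in for an environment rollout reward
    population.append(epsilon)

policy.update(theta, rewards, population)    # theta <- theta + lr / (n * sigma) * sum(F_i * eps_i)
print(policy.get_parameters()[:5])           # the parent parameters have moved
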
/main.py:
--------------------------------------------------------------------------------
import gym
import numpy as np
import ray
from torchvision import transforms

from policy import Policy


@ray.remote
def create_shared_noise():
    """Create a large array of noise to be shared by all workers."""
    noise = np.random.RandomState(123).randn(250000000).astype(np.float32)
    return noise


class SharedNoiseTable(object):
    def __init__(self, noise):
        self.noise = noise
        assert self.noise.dtype == np.float32

    def get(self, index, dim):
        # Slice of length `dim` starting at `index`
        return self.noise[index:index + dim]

    def sample_index(self, dim):
        # Random start index such that a slice of length `dim` fits in the table
        return np.random.randint(0, len(self.noise) - dim + 1)


@ray.remote
class Worker(object):

    def __init__(self, policy_params, env_name, noise):
        self.env = gym.make(env_name)
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((128, 128)),
            # transforms.Grayscale(),
            transforms.ToTensor()
        ])
        self.noise = SharedNoiseTable(noise)
        self.policy = Policy(**policy_params)

    def do_rollouts(self, parameters, render=False):
        # Run one episode with a perturbed policy and return its reward
        state = self.env.reset()
        done = False
        rollout_reward = 0

        # Sample a single perturbation for the whole rollout, so the returned
        # noise index corresponds to the reward it produced
        noise_index = self.noise.sample_index(self.policy.num_params)
        perturbation = self.noise.get(noise_index, self.policy.num_params)

        # The model is only used for forward passes during the rollout
        self.policy.set_parameters(parameters, perturbation)

        while not done:
            if render:
                self.env.render()

            state = self.transform(state).unsqueeze(0)

            # Act with the perturbed policy
            action = self.policy.evaluate(state)

            state, reward, done, _ = self.env.step(action)

            rollout_reward += reward

        # Return the noise index and the reward it earned
        return {"noise_index": noise_index, "rollout_reward": rollout_reward}


# Strategy
batch_size = 32  # aka population size
policy_params = {
    "sigma": 0.1,
    "learning_rate": 0.001
}

# Model
num_features = 8

# Distributed
num_workers = 8

# Training
steps = 1000

env_name = "SpaceInvaders-v0"

env = gym.make(env_name)
action_space = env.action_space.n

policy_params["action_space"] = action_space
policy_params["num_features"] = num_features

ray.init()

noise_id = create_shared_noise.remote()
noise = SharedNoiseTable(ray.get(noise_id))

# Instantiate the parent policy
policy = Policy(**policy_params)

# Create the actors/workers
workers = [Worker.remote(policy_params, env_name, noise_id)
           for _ in range(num_workers)]

highest_reward = 0
for i in range(steps):

    # Get the current policy weights and put them in the object store once per batch
    theta = policy.get_parameters()
    theta_id = ray.put(theta)

    # Loop to fill the batch based on the number of workers
    rollout_ids = []
    for j in range(batch_size // num_workers):
        # Use the actors to do rollouts;
        # note that we pass in the ID of the policy weights
        rollout_ids += [worker.do_rollouts.remote(theta_id)
                        for worker in workers]

    # Get the results of the rollouts
    results = ray.get(rollout_ids)

    # Collect the rewards and rebuild each perturbation from its noise index
    all_rollout_rewards, population = [], []
    for result in results:
        all_rollout_rewards.append(result["rollout_reward"])

        _noise = noise.get(result["noise_index"], policy.num_params)
        population.append(_noise)

    avg_reward = np.average(np.asarray(all_rollout_rewards))

    print("average reward in episode ", i + 1, ": ", avg_reward)

    # Update the parent parameters
    policy.update(theta, all_rollout_rewards, population)

    # Save the model whenever the average reward reaches a new high
    if avg_reward > highest_reward:
        highest_reward = avg_reward
        policy.save_model('./best-model.pth')
        print("saved model at episode: ", i + 1)

    print("\n")

--------------------------------------------------------------------------------
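Note (not part of the repository): a worker returns only a noise index rather than the full perturbation because the driver holds the same shared noise table, so the exact perturbation can be rebuilt from (index, num_params) without shipping millions of floats back through the object store. A standalone sketch of that round trip, using a much smaller table than the 250,000,000-entry one built by create_shared_noise:

import numpy as np

# A tiny stand-in for the shared noise table (same fixed-seed trick as create_shared_noise)
noise = np.random.RandomState(123).randn(1000000).astype(np.float32)

num_params = 5000
index = np.random.randint(0, len(noise) - num_params + 1)  # what SharedNoiseTable.sample_index does
perturbation_on_worker = noise[index:index + num_params]   # what SharedNoiseTable.get returns on the worker

# The driver receives only `index`, yet recovers an identical slice from its own copy of the table
perturbation_on_driver = noise[index:index + num_params]
assert np.array_equal(perturbation_on_worker, perturbation_on_driver)
print("perturbation of length", num_params, "reconstructed from index", index)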