├── requirements.txt
├── README.md
├── model.py
├── .gitignore
├── policy.py
└── main.py

/requirements.txt:
--------------------------------------------------------------------------------
gym[all]==0.10.5
numpy==1.14.5
ray==0.5.0
torch==0.4.0
torchvision==0.2.1

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# evolution-strategies

Distributed Natural Evolution Strategies built with PyTorch and Ray. This project was created because I could not find a simple distributed evolution strategies implementation. The project's goal is to be minimalistic (simple to understand) and modular (easy to implement your own strategy).

### Built with

- PyTorch
- NumPy
- Ray
- OpenAI Gym

## Contributing

1. Fork it! Star it?
2. Create your feature branch: `git checkout -b my-new-feature`
3. Commit your changes: `git commit -am 'Add some feature'`
4. Push to the branch: `git push origin my-new-feature`
5. Submit a pull request :D

## Authors

* **Jorge Ceja** - *Initial work* - [Account](https://github.com/JorgeCeja)

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch.nn as nn


class Model(nn.Module):

    def __init__(self, action_space, in_channels=3, num_features=4):

        super(Model, self).__init__()
        self.action_space = action_space

        self.main = nn.Sequential(
            # in_channels, out_channels, kernel_size, stride, padding
            nn.Conv2d(in_channels, num_features, 4,
                      stride=2, padding=1, bias=False),
            nn.ELU(inplace=True),

            nn.Conv2d(num_features, num_features * 2, 4,
                      stride=2, padding=1, bias=False),
            nn.ELU(inplace=True),

            nn.Conv2d(num_features * 2, num_features * 4, 4,
                      stride=2, padding=1, bias=False),
            nn.ELU(inplace=True),

            nn.Conv2d(num_features * 4, num_features * 8, 4,
                      stride=2, padding=1, bias=False),
            nn.ELU(inplace=True),

            nn.Conv2d(num_features * 8, num_features * 16, 4,
                      stride=2, padding=1, bias=False),
            nn.ELU(inplace=True),

            nn.Conv2d(num_features * 16, self.action_space, 4,
                      stride=1, padding=0, bias=False),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        # (batch, in_channels, 128, 128) -> (batch, action_space, 1, 1)
        return self.main(x)

    def count_parameters(self):
        # Total number of parameters; this is the dimensionality the ES perturbs
        return sum(p.numel() for p in self.parameters())

--------------------------------------------------------------------------------
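Note (not part of the repository): a quick sanity check of the convolutional stack, assuming the 3×128×128 input produced by the transform in main.py. Each of the five stride-2 convolutions halves the spatial size (128 → 64 → 32 → 16 → 8 → 4) and the final 4×4 convolution with no padding collapses it to 1×1, so the model outputs one softmax score per action.

import torch

from model import Model

# Sketch only: the action count (6) is an arbitrary example value.
model = Model(action_space=6)
dummy = torch.zeros(1, 3, 128, 128)       # (batch, channels, height, width)
out = model(dummy)
print(out.shape)                          # torch.Size([1, 6, 1, 1])
print(model.count_parameters())           # size of the parameter vector the ES perturbs
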
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# saved model
best-model.pth

--------------------------------------------------------------------------------
/policy.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
from torch.nn.utils import parameters_to_vector, vector_to_parameters

from model import Model


class Policy(object):

    def __init__(self, sigma=0.03, learning_rate=0.001, action_space=None, num_features=4):
        self.model = Model(action_space, num_features=num_features)
        self.sigma = sigma
        self.learning_rate = learning_rate
        self.num_params = self.model.count_parameters()

        # Remove grad from the model; the ES never backpropagates
        for param in self.model.parameters():
            param.requires_grad = False

    def get_parameters(self):
        # Return the flat (1D) model parameters as a NumPy array
        # (Ray's object store works with NumPy arrays, not torch tensors)
        # No detach needed: grad was removed from the model in __init__
        parameters = parameters_to_vector(self.model.parameters())

        return parameters.numpy()

    def set_parameters(self, parameters, perturbation):
        # Load theta + sigma * perturbation into the model
        perturb_parameters = torch.from_numpy(
            parameters + (self.sigma * perturbation))

        vector_to_parameters(perturb_parameters, self.model.parameters())

    def evaluate(self, state):
        # No need for `with torch.no_grad():`
        # grad was removed from the model in __init__
        prediction = self.model(state)
        action = np.argmax(prediction.data.numpy())

        return action

    def update(self, theta, all_rewards, population):
        rewards = np.asarray(all_rewards, dtype=np.float32)

        # Normalize rewards; clip the std at 1e-5 to prevent division by zero
        normalized_rewards = (rewards - np.mean(rewards)) / \
            np.clip(np.std(rewards), 1e-5, None)

        # NES gradient ascent step:
        # theta <- theta + learning_rate / (n * sigma) * sum_i(reward_i * noise_i)
        step = self.learning_rate / (len(population) * self.sigma) * \
            np.dot(np.asarray(population).T, normalized_rewards)
        new_weights = torch.from_numpy(theta + step).float()

        # Load the new parameters into the model, for later retrieval
        vector_to_parameters(new_weights, self.model.parameters())

    def save_model(self, output_path):
        torch.save(self.model.state_dict(), output_path)

--------------------------------------------------------------------------------
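Note (not part of the repository): Policy.update implements the natural evolution strategies estimate theta <- theta + alpha / (n * sigma) * sum_i F_i * eps_i, where F_i is the normalized reward earned with perturbation eps_i. A minimal sketch of one perturb/evaluate/update round against a toy reward function (the reward and the action count of 6 are illustrative stand-ins for a real environment rollout):

import numpy as np

from policy import Policy

policy = Policy(sigma=0.1, learning_rate=0.001, action_space=6)

theta = policy.get_parameters()              # flat NumPy parameter vector
population, rewards = [], []

for _ in range(8):                           # toy population of 8 perturbations
    epsilon = np.random.randn(policy.num_params).astype(np.float32)
    policy.set_parameters(theta, epsilon)    # theta + sigma * epsilon is loaded into the model
    rewards.append(-np.sum(epsilon ** 2))    # stand-in for an environment rollout reward
    population.append(epsilon)

policy.update(theta, rewards, population)    # theta <- theta + lr / (n * sigma) * sum(F_i * eps_i)
print(policy.get_parameters()[:5])           # the parent parameters have moved
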
/main.py:
--------------------------------------------------------------------------------
import gym
import numpy as np
import ray
from torchvision import transforms

from policy import Policy


@ray.remote
def create_shared_noise():
    """Create a large array of noise to be shared by all workers."""
    noise = np.random.RandomState(123).randn(250000000).astype(np.float32)
    return noise


class SharedNoiseTable(object):
    def __init__(self, noise):
        self.noise = noise
        assert self.noise.dtype == np.float32

    def get(self, index, dim):
        # Slice of length `dim` starting at `index`
        return self.noise[index:index + dim]

    def sample_index(self, dim):
        # Random start index such that a slice of length `dim` fits in the table
        return np.random.randint(0, len(self.noise) - dim + 1)


@ray.remote
class Worker(object):

    def __init__(self, policy_params, env_name, noise):
        self.env = gym.make(env_name)
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((128, 128)),
            # transforms.Grayscale(),
            transforms.ToTensor()
        ])
        self.noise = SharedNoiseTable(noise)
        self.policy = Policy(**policy_params)

    def do_rollouts(self, parameters, render=False):
        # Run one episode with a perturbed policy and return its reward
        state = self.env.reset()
        done = False
        rollout_reward = 0

        # Sample a single perturbation for the whole rollout, so the returned
        # noise index corresponds to the reward it produced
        noise_index = self.noise.sample_index(self.policy.num_params)
        perturbation = self.noise.get(noise_index, self.policy.num_params)

        # The model is only used for forward passes during the rollout
        self.policy.set_parameters(parameters, perturbation)

        while not done:
            if render:
                self.env.render()

            state = self.transform(state).unsqueeze(0)

            # Act with the perturbed policy
            action = self.policy.evaluate(state)

            state, reward, done, _ = self.env.step(action)

            rollout_reward += reward

        # Return the noise index and the reward it earned
        return {"noise_index": noise_index, "rollout_reward": rollout_reward}


# Strategy
batch_size = 32  # aka population size
policy_params = {
    "sigma": 0.1,
    "learning_rate": 0.001
}

# Model
num_features = 8

# Distributed
num_workers = 8

# Training
steps = 1000

env_name = "SpaceInvaders-v0"

env = gym.make(env_name)
action_space = env.action_space.n

policy_params["action_space"] = action_space
policy_params["num_features"] = num_features

ray.init()

noise_id = create_shared_noise.remote()
noise = SharedNoiseTable(ray.get(noise_id))

# Instantiate the parent policy
policy = Policy(**policy_params)

# Create the actors/workers
workers = [Worker.remote(policy_params, env_name, noise_id)
           for _ in range(num_workers)]

highest_reward = 0
for i in range(steps):

    # Get the current policy weights and put them in the object store once per batch
    theta = policy.get_parameters()
    theta_id = ray.put(theta)

    # Loop to fill the batch based on the number of workers
    rollout_ids = []
    for j in range(batch_size // num_workers):
        # Use the actors to do rollouts;
        # note that we pass in the ID of the policy weights
        rollout_ids += [worker.do_rollouts.remote(theta_id)
                        for worker in workers]

    # Get the results of the rollouts
    results = ray.get(rollout_ids)

    # Collect the rewards and rebuild each perturbation from its noise index
    all_rollout_rewards, population = [], []
    for result in results:
        all_rollout_rewards.append(result["rollout_reward"])

        _noise = noise.get(result["noise_index"], policy.num_params)
        population.append(_noise)

    avg_reward = np.average(np.asarray(all_rollout_rewards))

    print("average reward in episode ", i + 1, ": ", avg_reward)

    # Update the parent parameters
    policy.update(theta, all_rollout_rewards, population)

    # Save the model whenever the average reward reaches a new high
    if avg_reward > highest_reward:
        highest_reward = avg_reward
        policy.save_model('./best-model.pth')
        print("saved model at episode: ", i + 1)

    print("\n")

--------------------------------------------------------------------------------
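Note (not part of the repository): a worker returns only a noise index rather than the full perturbation because the driver holds the same shared noise table, so the exact perturbation can be rebuilt from (index, num_params) without shipping millions of floats back through the object store. A standalone sketch of that round trip, using a much smaller table than the 250,000,000-entry one built by create_shared_noise:

import numpy as np

# A tiny stand-in for the shared noise table (same fixed-seed trick as create_shared_noise)
noise = np.random.RandomState(123).randn(1000000).astype(np.float32)

num_params = 5000
index = np.random.randint(0, len(noise) - num_params + 1)  # what SharedNoiseTable.sample_index does
perturbation_on_worker = noise[index:index + num_params]   # what SharedNoiseTable.get returns on the worker

# The driver receives only `index`, yet recovers an identical slice from its own copy of the table
perturbation_on_driver = noise[index:index + num_params]
assert np.array_equal(perturbation_on_worker, perturbation_on_driver)
print("perturbation of length", num_params, "reconstructed from index", index)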