├── platoon ├── tests │ ├── __init__.py │ ├── unit │ │ ├── __init__.py │ │ ├── test_configparser.py │ │ ├── test_util.py │ │ └── test_controller.py │ └── functional │ │ ├── README.md │ │ ├── time_worker.py │ │ ├── test_worker.py │ │ ├── test_global_dynamics_worker.py │ │ └── test_ops_worker.py ├── training │ ├── __init__.py │ └── global_dynamics.py ├── __init__.py ├── channel │ └── __init__.py ├── configparser.py ├── param_sync.py ├── util.py └── ops.py ├── example ├── data │ ├── .gitignore │ └── readme.txt ├── simple_batched_pixel_sum │ ├── README.txt │ ├── batched_pixel_sum.py │ ├── batched_pixel_sum_worker.py │ └── batched_pixel_sum_controller.py ├── lstm │ ├── README.txt │ ├── lstm_controller.py │ ├── imdb.py │ └── lstm_worker.py └── synchronous_lstm │ ├── README.txt │ ├── imdb.py │ ├── lstm_controller.py │ └── lstm_worker.py ├── setup.py ├── .gitignore ├── LICENSE ├── platoonrc.conf ├── README.md ├── scripts └── platoon-launcher └── doc └── platoon └── control_request.svg /platoon/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /platoon/training/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /platoon/tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !readme.txt 4 | -------------------------------------------------------------------------------- /example/data/readme.txt: -------------------------------------------------------------------------------- 1 | This file is here so the directory exists. 2 | -------------------------------------------------------------------------------- /platoon/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .channel.controller import Controller 3 | from .channel.worker import Worker 4 | -------------------------------------------------------------------------------- /platoon/channel/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`channel` -- Platoon's communication backend 3 | ================================================= 4 | 5 | .. module:: channel 6 | :platform: Unix 7 | :synopsis: Contains controller and worker modules which compose Platoon's 8 | communication architecture. 9 | 10 | This file serves as a backwards compatibility layer for Platoon v0.5.0. 
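For example, both of the following imports resolve to the same classes; the first is the path used throughout the bundled examples, the second is the newer top-level path:

    from platoon.channel import Controller, Worker
    from platoon import Controller, Worker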
11 | 12 | """ 13 | from __future__ import absolute_import 14 | from .worker import Worker 15 | from .controller import Controller 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from setuptools import setup 3 | 4 | setup( 5 | name='platoon', 6 | version='0.6.1', 7 | author='MILA', 8 | packages=['platoon', 'platoon.channel', 'platoon.training'], 9 | scripts=['scripts/platoon-launcher'], 10 | url='https://github.com/mila-udem/platoon/', 11 | license='MIT', 12 | description='Experimental multi-GPU mini-framework for Theano', 13 | long_description=open('README.md').read(), 14 | install_requires=['numpy', 'cffi', 'pyzmq', 'posix_ipc', 'six'] 15 | ) 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Platoon log folders 60 | PLATOON_LOGS/ 61 | *.prof 62 | *.out 63 | *.err 64 | *.conf 65 | .platoonrc 66 | conf_from_topo.py 67 | test.sh 68 | test_scripts/ 69 | !./platoonrc.conf 70 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 mila-udem 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /example/simple_batched_pixel_sum/README.txt: -------------------------------------------------------------------------------- 1 | ## GOAL ## 2 | The goal of this example is to showcase Platoon's functionality in the simplest way possible. 3 | 4 | 5 | ## CONTENT ## 6 | - README.txt : This file! 7 | - batched_pixel_sum.py : A simple Theano pixel-wise sum on MNIST 8 | - batched_pixel_sum_controller.py : A platoon implementation of batched_pixel_sum.py 9 | batched_pixel_sum_worker.py 10 | 11 | 12 | ## HOW TO USE ## 13 | # USING THE LAUNCHER 14 | 1) Assuming you are in the simple_batched_pixel_sum folder. 15 | `cd platoon/example/simple_batched_pixel_sum/` 16 | 17 | 2) Launch the experiment on 1 gpu using the platoon-launcher script. 18 | All the outputs will be saved in a newly created `PLATOON_LOGS` folder. 19 | `platoon-launcher batched_pixel_sum gpu0` 20 | 21 | # MANUALLY 22 | 1) Assuming you are in the simple_batched_pixel_sum folder. 23 | `cd platoon/example/simple_batched_pixel_sum/` 24 | 25 | 2) Start the controller. 26 | `THEANO_FLAGS='device=cpu' python -u batched_pixel_sum_controller.py` 27 | 28 | 3) Start the worker. 29 | `THEANO_FLAGS='device=gpu0' python -u batched_pixel_sum_worker.py` 30 | 31 | 32 | ## NOTE ## 33 | - Using more than 1 worker causes problem at the moment for THIS particular example. 34 | The reason is that we are using the "dataset handled by the controller" feature which is not quite ready yet. 35 | -------------------------------------------------------------------------------- /platoon/tests/functional/README.md: -------------------------------------------------------------------------------- 1 | To functional test the *all_reduce* worker interface, you need to: 2 | 3 | 1. Export the environmental variable `PLATOON_TEST_WORKERS_NUM` to be equal to 4 | the total number of workers (GPUs) to be spawned across hosts in the 5 | functional test. 6 | 2. Call `platoon-launcher test` to start the test while being in the same 7 | directory as `test_worker.py` file. You can configure the multi-GPU/node 8 | procedure in any possible way as long as the total number of workers, which 9 | was set in the previous step, is respected. 10 | 11 | The procedure exits with 0 for success. If this does not hold, please check 12 | `platoon-launcher`, in order to see a high-level description of the return 13 | code, and `PLATOON_LOGS` of the late procedure in current directory. 14 | 15 | To profile and benchmark the new worker interface, you need to run 16 | `platoon-launcher time` in current directory. Results are written in 17 | `PLATOON_LOGS`. 18 | 19 | To test and profile the Theano Ops of worker interface, you need to run 20 | `platoon-launcher test_ops` in current directory. 21 | 22 | To test implementations of global dynamics, please run 23 | `platoon-launcher test_global_dynamics` in current directory. 24 | 25 | **Note**: Depending on your hardware configuration, launching on defaults 26 | Platoon may not suffice for a successful execution. Please check the 27 | documentation and *platoonrc.conf* on how to configure Platoon. 28 | -------------------------------------------------------------------------------- /platoon/tests/unit/test_configparser.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import os 3 | import unittest 4 | from six.moves import reload_module as reload 5 | 6 | from ... 
import configparser as cfgp 7 | 8 | 9 | def test_fetch_hosts_from_envs(): 10 | if os.getenv("PLATOON_HOSTS"): 11 | os.environ.pop("PLATOON_HOSTS") 12 | true_hosts = ["test0", "tes1", "te2"] 13 | os.environ["PLATOON_HOSTS"] = "test0,tes1,te2" 14 | reload(cfgp) 15 | hosts = cfgp.fetch_hosts() 16 | assert hosts == true_hosts, (hosts) 17 | 18 | 19 | def test_fetch_hosts_from_rc(): 20 | if os.getenv("PLATOON_HOSTS"): 21 | os.environ.pop("PLATOON_HOSTS") 22 | os.environ["PLATOONRC"] = "../../../platoonrc.conf" 23 | reload(cfgp) 24 | hosts = cfgp.fetch_hosts() 25 | assert hosts == ["lisa0", "lisa1", "lisa3"], (hosts) 26 | 27 | 28 | def test_fetch_devices_from_envs(): 29 | if os.getenv("PLATOON_DEVICES"): 30 | os.environ.pop("PLATOON_DEVICES") 31 | os.environ["PLATOON_DEVICES"] = "cuda0,opencl0:1" 32 | reload(cfgp) 33 | devices = cfgp.fetch_devices_for_host("asfasfa") 34 | assert devices == ["cuda0", "opencl0:1"], (devices) 35 | 36 | 37 | def test_fetch_devices_from_rc(): 38 | if os.getenv("PLATOON_DEVICES"): 39 | os.environ.pop("PLATOON_DEVICES") 40 | os.environ["PLATOON_DEVICES"] = "" 41 | os.environ["PLATOONRC"] = "../../../platoonrc.conf" 42 | reload(cfgp) 43 | devs = cfgp.fetch_devices_for_host("lisa0") 44 | assert devs == ["cuda0", "cuda1"], (devs) 45 | devs = cfgp.fetch_devices_for_host("lisa1") 46 | assert devs == ["cuda3", "cuda0"], (devs) 47 | devs = cfgp.fetch_devices_for_host("lisa3") 48 | assert devs == ["cuda"], (devs) 49 | keyerror = False 50 | try: 51 | devs = cfgp.fetch_devices_for_host("asfasfa") 52 | except KeyError: 53 | keyerror = True 54 | except: 55 | pass 56 | assert keyerror 57 | -------------------------------------------------------------------------------- /example/lstm/README.txt: -------------------------------------------------------------------------------- 1 | ## GOAL ## 2 | LSTM example using Platoon *param sync* interface 3 | 4 | 5 | ## CONTENT ## 6 | - README.txt 7 | - lstm_controller.py 8 | - lstm_worker.py 9 | - imdb.py 10 | 11 | 12 | ## HOW TO USE ## 13 | # USING THE LAUNCHER 14 | 1) Assuming you are in the lstm folder. 15 | `cd platoon/example/lstm/` 16 | 17 | 1) Launch the experiment on 2 GPUs using the platoon-launcher script. 18 | `platoon-launcher lstm -D cuda0 cuda2` 19 | 20 | To see all controller parameters do: `python lstm_controller.py -h` 21 | To pass them via the platoon-launcher script: `platoon-launcher lstm -D cuda0 cuda2 -c=...` 22 | 23 | To see all worker parameters do: `python lstm_worker.py -h` 24 | To pass them via the platoon-launcher script: `platoon-launcher lstm -D cuda0 cuda2 -w=...` 25 | 26 | # MANUALLY 27 | 1) Assuming you are in the lstm folder. 28 | `cd platoon/example/lstm/` 29 | 30 | 2) Start the controller. 31 | `THEANO_FLAGS='device=cpu' python -u lstm_controller.py` 32 | 33 | 3) Start the worker. Repeat as needed changing the GPU id. 34 | `THEANO_FLAGS='device=gpu0' python -u lstm_worker.py` 35 | 36 | 37 | ## NOTE ## 38 | If you use the MANUAL way, you may want to run them in different windows of screen or tmux. 39 | They all expect to be in the foreground. 40 | 41 | 42 | ## Timing ## 43 | This timing was done with 2 k80. 44 | The timing is about efficiency of computation, not efficiency of 45 | training. So the parameter alpha is constant. The number of mini-batches 46 | is fixed as the hyper-parameter. The sync is also fixed to be after 10 47 | mini-batch of computation. 48 | 49 | With 1 worker, Platoon does not give you any advantage. This is 50 | there just to show the overhead of the EASGD implementation. 
Normal 51 | is without this framework and with SGD, also there for overhead evaluation. 52 | 53 | Normal | 1 GPU | 2 GPUs | 3 GPUs | 4 GPUs 54 | -------|-------|--------|--------|------- 55 | 870s | 912s | 477s | 329s | 254s 56 | 1.00x | 0.95x | 1.82x | 2.65x | 3.42x 57 | -------------------------------------------------------------------------------- /example/synchronous_lstm/README.txt: -------------------------------------------------------------------------------- 1 | ## GOAL ## 2 | LSTM example using Platoon *all reduce* interface 3 | 4 | 5 | ## CONTENT ## 6 | - README.txt 7 | - lstm_controller.py 8 | - lstm_worker.py 9 | - imdb.py 10 | It is assumed that imdb.pkl is in the same foler, otherwise it will be downloaded. 11 | 12 | 13 | ## HOW TO USE ## 14 | # USING THE LAUNCHER 15 | When the launcher is used, the outputs and errors of controller and workers are automatically 16 | stored in an auto-generated folder of PLATOON_LOGS/lstm/DATE_TIME. 17 | 18 | 1) Assuming you are in the synchronous_lstm folder. 19 | `cd platoon/example/synchronous_lstm/` 20 | 21 | 2) Launch the experiment on 2 GPUs using the platoon-launcher script. 22 | `platoon-launcher lstm -D cuda0 cuda1` 23 | 24 | To see all controller parameters do: `python lstm_controller.py -h` 25 | To pass them via the platoon-launcher script: `platoon-launcher lstm -D cuda0 cuda1 -c=...` 26 | 27 | To see all worker parameters do: `python lstm_worker.py -h` 28 | To pass them via the platoon-launcher script: `platoon-launcher lstm -D cuda0 cuda1 -w=...` 29 | 30 | 31 | For setting THEANO_FLAGS for the workers, you can use the 32 | following command which sets floatX to float32 for all the workers: 33 | `THEANO_FLAGS=floatX=float32 platoon-launcher lstm -D cuda0 cuda1` 34 | 35 | # USING THE SCRIPTS 36 | When the scripts are used the path to store the outputs can be given. 37 | 38 | 1) Assuming you are in the synchronous_lstm folder. 39 | `cd platoon/example/synchronous_lstm/` 40 | 41 | 2) Launch the experiment. Platoon will automatically find all the available GPUs 42 | and run the workers on them: 43 | THEANO_FLAGS=floatX=float32 python lstm_controller.py --single lstm PATH/TO/OUTPUT 44 | 45 | --single indicates the GPUs are all on the same machine. 46 | lstm is the name of the experiment. It will look for an lstm_worker.py to run the workers. 47 | THEANO_FLAGS are set for all the workers and not the controller. The controller should use 48 | the CPU. 49 | 50 | 51 | ## TIMING ## 52 | These timings were done using the Nvidia DGX-1 and by averaging results from 53 | two runs for each setup. 
54 | 1 GPU : 5.698 seconds / epoch 55 | 2 GPU : 2.230 seconds / epoch 56 | -------------------------------------------------------------------------------- /example/simple_batched_pixel_sum/batched_pixel_sum.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import gzip 3 | from six.moves import cPickle 4 | import numpy as np 5 | from numpy.testing import assert_allclose 6 | import theano 7 | from theano import tensor as T 8 | from theano.compat.python2x import OrderedDict 9 | 10 | 11 | class BatchedPixelSum(object): 12 | 13 | def __init__(self, dataset, batch_size): 14 | self._batch_size = batch_size 15 | self._dataset = dataset 16 | 17 | self._computed_sum = theano.shared(value=np.zeros(dataset.shape[1], dtype=theano.config.floatX), name='sum', borrow=True) 18 | 19 | input = T.matrix(dtype=theano.config.floatX) 20 | batch_sum = T.sum(input, axis=0, dtype=theano.config.floatX) 21 | 22 | updates = OrderedDict() 23 | updates[self._computed_sum] = (self._computed_sum + batch_sum) 24 | 25 | self._update_sum = theano.function(name='learn', 26 | inputs=[input], 27 | updates=updates) 28 | 29 | def get_sum(self): 30 | for i in xrange(self._dataset.shape[0]/self._batch_size): 31 | batch_start = i*self._batch_size 32 | batch_stop = (i + 1)*self._batch_size 33 | print("Summing from {} to {}.".format(batch_start, batch_stop)) 34 | self._update_sum(self._dataset[batch_start:batch_stop]) 35 | return self._computed_sum.get_value() 36 | 37 | 38 | def parse_arguments(): 39 | import argparse 40 | 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument('--batch-size', default=1000, type=int, required=False, help='Size of the batches.') 43 | 44 | return parser.parse_args() 45 | 46 | 47 | def get_mnist(path): 48 | import os 49 | import urllib 50 | 51 | if not os.path.exists(path): 52 | print("Downloading mnist ...", end=' ') 53 | url = "http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz" 54 | 55 | urllib.urlretrieve(url, path) 56 | print("Done") 57 | 58 | if __name__ == '__main__': 59 | args = parse_arguments() 60 | 61 | mnist_path = "../data/mnist.pkl.gz" 62 | 63 | get_mnist(mnist_path) 64 | 65 | with gzip.open(mnist_path, 'rb') as f: 66 | train_set, _, _ = cPickle.load(f) 67 | 68 | bps = BatchedPixelSum(train_set[0], args.batch_size) 69 | 70 | computed_sum = bps.get_sum() 71 | 72 | # Get actual answer for testing 73 | real_sum = train_set[0].sum(axis=0, dtype=theano.config.floatX) 74 | assert_allclose(computed_sum, real_sum) 75 | -------------------------------------------------------------------------------- /platoonrc.conf: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Platoon Configuration Options # 3 | ################################################################################ 4 | # 5 | # Configuring hosts for multi-node training (in decreasing order of priority): 6 | # 1. Use `-H` option of `platoon2-launcher`. 7 | # 2. Use `PLATOON_HOSTS` environmental variable to be a list of comma-separated 8 | # hostnames. 9 | # e.g. PLATOON_HOSTS="lisa1,lisa3" 10 | # 3. Use `PLATOONRC` environmental variable to point to paths of 11 | # configuration files, like this. Files have decreasing order of priority 12 | # from right to left. 13 | # e.g. PLATOONRC="~/platoon.conf.d/morn.conf:~/platoon.conf.d/even.conf" 14 | # 4. Use of `./.platoonrc`. 15 | # 5. Use of `~/.platoonrc`. 
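#
# As a small illustration (the complete template is at the bottom of this
# file), a minimal .platoonrc that only names the participating hosts could
# look like:
#
#   [platoon]
#   hosts : lisa0,lisa1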
16 | # 17 | # If no hosts can be infered, then single-node training is assumed on the host 18 | # on which `platoon2-launcher` is executed. If a single host can be infered, 19 | # then that host must be the one on which `platoon2-launcher` is executed. 20 | # Currently, starting single-node training on other hosts, than the one which 21 | # `platoon2-launcher` is executed, is not supported. 22 | # 23 | # Configuring devices for multi-gpu/node training (in decreasing order of 24 | # priority): 25 | # 1. Use `-D` option of `platoon2-launcher` [NOTE: for single-node training!] 26 | # 2. Use `PLATOON_DEVICES` environmental variable to be a list of 27 | # comma-separated Theano device names [NOTE: Must be set separately for every 28 | # host for multi-node] 29 | # e.g. PLATOON_DEVICES="cuda0,cuda3" 30 | # 3. Use `PLATOONRC` environmental variable to point to paths of 31 | # configuration files, like this. Files have decreasing order of priority 32 | # from right to left. 33 | # e.g. PLATOONRC="~/platoon.conf.d/morn.conf:~/platoon.conf.d/even.conf" 34 | # 4. Use of `./.platoonrc` 35 | # 5. Use of `~/.platoonrc` 36 | # 37 | # If no devices can be inferred from the above, then a query to use all 38 | # compatible devices (currently CUDA GPUs) on a host will start using pygpu 39 | # interface, if available. If this fails (e.g. due to absence of pygpu package), 40 | # an error will be reported and processes will exit. 41 | # 42 | # This file serves as a template for configuring Platoon through a .platoonrc 43 | # file. 44 | # 45 | 46 | # Three hosts: lisa0, lisa1, lisa3 47 | [platoon] 48 | hosts : lisa0 49 | lisa1, lisa3 50 | 51 | # Use cuda0 and cuda1 on lisa0, cuda3 and cuda0 on lisa1, cuda on lisa3 52 | [devices] 53 | lisa0 : cuda0 54 | cuda1 55 | lisa1 : cuda3,cuda0 56 | lisa3 : cuda 57 | -------------------------------------------------------------------------------- /platoon/tests/unit/test_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import unittest 3 | from six.moves import reload_module as reload 4 | 5 | import numpy as np 6 | 7 | from ... 
import util 8 | 9 | try: 10 | from mpi4py import MPI 11 | MPI_IMPORTED = True 12 | except: 13 | MPI_IMPORTED = False 14 | 15 | 16 | class TestOpToMPI(unittest.TestCase): 17 | @unittest.skipUnless(MPI_IMPORTED, "Needs mpi4py module") 18 | def test_op_to_mpi(self): 19 | reload(util) 20 | assert util.op_to_mpi('+') == MPI.SUM 21 | assert util.op_to_mpi("sum") == MPI.SUM 22 | assert util.op_to_mpi("add") == MPI.SUM 23 | assert util.op_to_mpi('*') == MPI.PROD 24 | assert util.op_to_mpi("prod") == MPI.PROD 25 | assert util.op_to_mpi("product") == MPI.PROD 26 | assert util.op_to_mpi("mul") == MPI.PROD 27 | assert util.op_to_mpi("max") == MPI.MAX 28 | assert util.op_to_mpi("maximum") == MPI.MAX 29 | assert util.op_to_mpi("min") == MPI.MIN 30 | assert util.op_to_mpi("minimum") == MPI.MIN 31 | 32 | def test_op_to_mpi_import_fail(self): 33 | util.MPI = None 34 | with self.assertRaises(AttributeError): 35 | util.op_to_mpi('+') 36 | 37 | @unittest.skipUnless(MPI_IMPORTED, "Needs mpi4py module") 38 | def test_op_to_mpi_op_fail(self): 39 | reload(util) 40 | with self.assertRaises(ValueError): 41 | util.op_to_mpi('asdfasfda') 42 | with self.assertRaises(ValueError): 43 | util.op_to_mpi('-') 44 | 45 | 46 | class TestDtypeToMPI(unittest.TestCase): 47 | @unittest.skipUnless(MPI_IMPORTED, "Needs mpi4py module") 48 | def test_dtype_to_mpi(self): 49 | reload(util) 50 | assert util.dtype_to_mpi(np.dtype('bool')) == MPI.C_BOOL 51 | assert util.dtype_to_mpi(np.dtype('int8')) == MPI.INT8_T 52 | assert util.dtype_to_mpi(np.dtype('uint8')) == MPI.UINT8_T 53 | assert util.dtype_to_mpi(np.dtype('int16')) == MPI.INT16_T 54 | assert util.dtype_to_mpi(np.dtype('uint16')) == MPI.UINT16_T 55 | assert util.dtype_to_mpi(np.dtype('int32')) == MPI.INT32_T 56 | assert util.dtype_to_mpi(np.dtype('uint32')) == MPI.UINT32_T 57 | assert util.dtype_to_mpi(np.dtype('int64')) == MPI.INT64_T 58 | assert util.dtype_to_mpi(np.dtype('uint64')) == MPI.UINT64_T 59 | assert util.dtype_to_mpi(np.dtype('float32')) == MPI.FLOAT 60 | assert util.dtype_to_mpi(np.dtype('float64')) == MPI.DOUBLE 61 | assert util.dtype_to_mpi(np.dtype('complex64')) == MPI.C_FLOAT_COMPLEX 62 | assert util.dtype_to_mpi(np.dtype('complex128')) == MPI.C_DOUBLE_COMPLEX 63 | 64 | def test_dtype_to_mpi_import_fail(self): 65 | util.MPI = None 66 | with self.assertRaises(AttributeError): 67 | util.dtype_to_mpi('int8') 68 | 69 | @unittest.skipUnless(MPI_IMPORTED, "Needs mpi4py module") 70 | def test_dtype_to_mpi_dtype_fail(self): 71 | reload(util) 72 | with self.assertRaises(TypeError): 73 | util.dtype_to_mpi('sadfa') 74 | with self.assertRaises(TypeError): 75 | util.dtype_to_mpi('') 76 | # TODO Find how to convert from half type to MPI dtype 77 | # and use in collectives 78 | with self.assertRaises(TypeError): 79 | util.dtype_to_mpi('float16') 80 | -------------------------------------------------------------------------------- /platoon/tests/functional/time_worker.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function, division 2 | import os 3 | import sys 4 | import cProfile 5 | import pstats 6 | from timeit import default_timer as timer 7 | 8 | from six.moves import range 9 | 10 | from pygpu import gpuarray 11 | import numpy as np 12 | from numpy.testing import assert_allclose 13 | 14 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..')) 15 | from platoon import Worker 16 | 17 | SEED = 567 18 | np.random.seed(SEED) 19 | 20 | worker = Worker(control_port=5567) 21 
| 22 | 23 | def profile(shape=(1000, 1000), dtype='float64', rng=(-1, 1)): 24 | print("\n### Profiling worker") 25 | print() 26 | print("### shape =", shape) 27 | print("### dtype =", dtype) 28 | print("### range =", sorted(rng)) 29 | 30 | rang = abs(rng[1] - rng[0]) 31 | inp = np.random.random(shape) * rang + min(rng) 32 | inp = inp.astype(dtype) 33 | sinp = gpuarray.asarray(inp, context=worker.gpuctx) 34 | out = np.empty_like(inp) 35 | sout = gpuarray.asarray(out, context=worker.gpuctx) 36 | 37 | print("\n### Profiling worker.all_reduce") 38 | print("## First call to worker.all_reduce") 39 | cProfile.runctx("worker.all_reduce(sinp, '+', sout)", globals(), locals(), 40 | filename="worker.prof") 41 | s = pstats.Stats("worker.prof") 42 | s.strip_dirs().sort_stats("time").print_stats() 43 | assert_allclose(inp * worker.global_size, np.asarray(sout)) 44 | 45 | print("## Second call to worker.all_reduce") 46 | cProfile.runctx("worker.all_reduce(sinp, '+', sout)", globals(), locals(), 47 | filename="worker.prof") 48 | s = pstats.Stats("worker.prof") 49 | s.strip_dirs().sort_stats("time").print_stats() 50 | assert_allclose(inp * worker.global_size, np.asarray(sout)) 51 | if worker._multinode: 52 | print("## Note that there must be difference between the first and") 53 | print("## the second call as a result of the extra call to worker.shared") 54 | print("## during the first time.") 55 | 56 | 57 | def benchmark(shape=(1000, 1000), dtype='float64', rng=(-1, 1), number=10): 58 | print("\n### Benchmarking worker") 59 | print() 60 | print("### shape =", shape) 61 | print("### dtype =", dtype) 62 | print("### range =", sorted(rng)) 63 | print("### num of iterations =", number) 64 | 65 | rang = abs(rng[1] - rng[0]) 66 | inp = np.random.random(shape) * rang + min(rng) 67 | inp = inp.astype(dtype) 68 | sinp = gpuarray.asarray(inp, context=worker.gpuctx) 69 | out = np.empty_like(inp) 70 | sout = gpuarray.asarray(out, context=worker.gpuctx) 71 | 72 | print("\n## Benchmarking worker.shared") 73 | print("# First call") 74 | start = timer() 75 | worker.shared(sinp) 76 | end = timer() 77 | print("Time:", end - start) 78 | print("# Second call") 79 | start = timer() 80 | worker.shared(sinp) 81 | end = timer() 82 | print("Time:", end - start) 83 | 84 | print("\n## Benchmarking worker.all_reduce") 85 | print("# Timing worker.all_reduce w/o calls to worker.shared") 86 | ttime = 0 87 | for _ in range(number): 88 | start = timer() 89 | worker.all_reduce(sinp, '+', sout) 90 | end = timer() 91 | ttime += end - start 92 | assert_allclose(inp * worker.global_size, np.asarray(sout)) 93 | print("Mean time:", ttime / number) 94 | 95 | 96 | if __name__ == '__main__': 97 | try: 98 | benchmark() 99 | profile() 100 | except Exception as exc: 101 | print(exc, file=sys.stderr) 102 | finally: 103 | worker.close() 104 | -------------------------------------------------------------------------------- /platoon/configparser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | import sys 4 | 5 | from six.moves import configparser as ConfigParser 6 | 7 | # The PLATOON_DEVICES environment variable should be a list of comma-separated 8 | # device name entries, e.g. PLATOON_DEVICES=cuda0,cuda2,cuda3 9 | PLATOON_DEVICES = os.getenv("PLATOON_DEVICES", "") 10 | 11 | # The PLATOON_HOSTS environment variable should be a list of comma-separated 12 | # host machine entries, e.g. 
PLATOON_HOSTS=lisa1,ceylon 13 | PLATOON_HOSTS = os.getenv("PLATOON_HOSTS", "") 14 | 15 | 16 | def config_files_from_platoonrc(): 17 | if sys.platform != "win32": 18 | rval = [os.path.expanduser('~/.platoonrc')] 19 | rval.append(os.path.join(os.getcwd(), '.platoonrc')) 20 | else: 21 | rval = [os.path.expanduser('~/.platoonrc.txt')] 22 | rval.append(os.path.join(os.getcwd(), '.platoonrc.txt')) 23 | if os.getenv('PLATOONRC') is not None: 24 | rval.extend([os.path.expanduser(s) for s in 25 | os.getenv('PLATOONRC').split(os.pathsep)]) 26 | return rval 27 | 28 | config_files = config_files_from_platoonrc() 29 | platoon_cfg = ConfigParser.SafeConfigParser( 30 | {'USER': os.getenv("USER", os.path.split(os.path.expanduser('~'))[-1]), 31 | 'LSCRATCH': os.getenv("LSCRATCH", ""), 32 | 'TMPDIR': os.getenv("TMPDIR", ""), 33 | 'TEMP': os.getenv("TEMP", ""), 34 | 'TMP': os.getenv("TMP", ""), 35 | 'PID': str(os.getpid()), 36 | } 37 | ) 38 | platoon_cfg.optionxform = str 39 | platoon_cfg.read(config_files) 40 | # Having a raw version of the config around as well enables us to pass 41 | # through config values that contain format strings. 42 | # The time required to parse the config twice is negligible. 43 | platoon_raw_cfg = ConfigParser.RawConfigParser() 44 | platoon_raw_cfg.optionxform = str 45 | platoon_raw_cfg.read(config_files) 46 | 47 | 48 | def fetch_devices_for_host(host): 49 | """A successful search returns a list of theano devices' string values. 50 | An unsuccessful search raises a KeyError. 51 | 52 | The (decreasing) priority order is: 53 | - PLATOON_DEVICES 54 | - PLATOONRC files (if they exist) from right to left 55 | - working directory's ./.platoonrc 56 | - ~/.platoonrc 57 | 58 | """ 59 | # first try to have PLATOON_DEVICES 60 | if PLATOON_DEVICES: 61 | splitter = shlex.shlex(PLATOON_DEVICES, posix=True) 62 | splitter.whitespace += ',' 63 | splitter.whitespace_split = True 64 | return list(splitter) 65 | 66 | # next try to find it in the config file 67 | try: 68 | try: 69 | devices = platoon_cfg.get("devices", host) 70 | except ConfigParser.InterpolationError: 71 | devices = platoon_raw_cfg.get("devices", host) 72 | except (ConfigParser.NoOptionError, ConfigParser.NoSectionError): 73 | raise KeyError(host) 74 | splitter = shlex.shlex(devices, posix=True) 75 | splitter.whitespace += ',' 76 | splitter.whitespace_split = True 77 | return list(splitter) 78 | 79 | 80 | def fetch_hosts(): 81 | """A successful search returns a list of host to participate in a multi-node 82 | platoon. An unsuccessful search raises a KeyError. 
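    For example, with the template platoonrc.conf at the repository root
    (hosts lisa0, lisa1 and lisa3), this returns ['lisa0', 'lisa1', 'lisa3'],
    which is what tests/unit/test_configparser.py expects.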
83 | 84 | The (decreasing) priority order is: 85 | - PLATOON_HOSTS 86 | - PLATOONRC files (if they exist) from right to left 87 | - working directory's ./.platoonrc 88 | - ~/.platoonrc 89 | 90 | """ 91 | # first try to have PLATOON_HOSTS 92 | if PLATOON_HOSTS: 93 | splitter = shlex.shlex(PLATOON_HOSTS, posix=True) 94 | splitter.whitespace += ',' 95 | splitter.whitespace_split = True 96 | return list(splitter) 97 | 98 | # next try to find it in the config file 99 | try: 100 | try: 101 | hosts = platoon_cfg.get("platoon", "hosts") 102 | except ConfigParser.InterpolationError: 103 | hosts = platoon_raw_cfg.get("platoon", "hosts") 104 | except (ConfigParser.NoOptionError, ConfigParser.NoSectionError): 105 | raise KeyError("hosts") 106 | splitter = shlex.shlex(hosts, posix=True) 107 | splitter.whitespace += ',' 108 | splitter.whitespace_split = True 109 | return list(splitter) 110 | -------------------------------------------------------------------------------- /example/simple_batched_pixel_sum/batched_pixel_sum_worker.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function 2 | import os 3 | import sys 4 | import gzip 5 | import six 6 | from six.moves import cPickle 7 | 8 | import numpy as np 9 | from numpy.testing import assert_allclose 10 | 11 | import theano 12 | from theano import tensor as T 13 | from theano.compat.python2x import OrderedDict 14 | 15 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) 16 | from platoon.channel import Worker 17 | from platoon.param_sync import ParamSyncRule 18 | 19 | 20 | class SUMSync(ParamSyncRule): 21 | 22 | def update_params(self, local_params, master_params): 23 | """ 24 | Update the master params and reset to local params. 
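        The worker's partial sum is added into the master copy and the local
        buffer is then zeroed, so each processed batch contributes to the
        global sum exactly once across successive syncs.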
25 | """ 26 | master_params[0] += local_params[0] 27 | local_params[0].fill(0) 28 | 29 | 30 | class BatchedPixelSum(object): 31 | 32 | def __init__(self, control_port, batch_port): 33 | self._worker = Worker(control_port=control_port, data_port=batch_port) 34 | 35 | data_shape = self._worker.send_req('get_data_shape') 36 | 37 | self._computed_sum = theano.shared( 38 | value=np.zeros(data_shape, dtype=theano.config.floatX), 39 | name='sum', borrow=True) 40 | 41 | self._worker.init_shared_params(params=[self._computed_sum], 42 | param_sync_rule=SUMSync()) 43 | 44 | input = T.matrix(dtype=theano.config.floatX) 45 | batch_sum = T.sum(input, axis=0, dtype=theano.config.floatX) 46 | 47 | updates = OrderedDict() 48 | updates[self._computed_sum] = (self._computed_sum + batch_sum) 49 | 50 | self._update_sum = theano.function(name='learn', 51 | inputs=[input], 52 | updates=updates) 53 | 54 | def get_sum(self): 55 | nb_batches_before_sync = 10 56 | 57 | while True: 58 | step = self._worker.send_req('next') 59 | print("# Command received: {}".format(step)) 60 | 61 | if step == 'train': 62 | print("# Training", end=' ') 63 | # TODO: Having a fix number of MB before sync can cause 64 | # problems 65 | for i in range(nb_batches_before_sync): 66 | data = np.asarray(self._worker.recv_mb()) 67 | print(".", end=' ') 68 | self._update_sum(data) 69 | print("Done") 70 | import time 71 | time.sleep(1) 72 | step = self._worker.send_req('done', 73 | dict(num_batches=nb_batches_before_sync)) 74 | 75 | print("Syncing with global params.") 76 | self._worker.sync_params(synchronous=True) 77 | 78 | if step == 'stop': 79 | break 80 | 81 | print("All computation done.") 82 | return self._worker.shared_params[0] # Return global params 83 | 84 | 85 | def parse_arguments(): 86 | import argparse 87 | 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument('--batch_port', default=5566, type=int, required=False, 90 | help='Port on which the batches will be transfered.') 91 | parser.add_argument('--control_port', default=5567, type=int, 92 | required=False, help='Port on which the control ' 93 | 'commands will be sent.') 94 | 95 | return parser.parse_args() 96 | 97 | if __name__ == '__main__': 98 | args = parse_arguments() 99 | 100 | print("Init ...", end=' ') 101 | bps = BatchedPixelSum(control_port=args.control_port, 102 | batch_port=args.batch_port) 103 | print("Done") 104 | 105 | computed_sum = bps.get_sum() 106 | 107 | # Get actual answer for testing 108 | with gzip.open("../data/mnist.pkl.gz", 'rb') as f: 109 | kwargs = {} 110 | if six.PY3: 111 | kwargs['encoding'] = 'latin1' 112 | train_set, _, _ = cPickle.load(f, **kwargs) 113 | real_sum = train_set[0].sum(axis=0, dtype=theano.config.floatX) 114 | assert_allclose(computed_sum, real_sum) 115 | -------------------------------------------------------------------------------- /example/simple_batched_pixel_sum/batched_pixel_sum_controller.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function 2 | import os 3 | import sys 4 | import gzip 5 | import time 6 | import six 7 | from six.moves import cPickle 8 | from multiprocessing import Process 9 | 10 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) 11 | from platoon.channel import Controller 12 | 13 | 14 | class BatchedPixelSumController(Controller): 15 | 16 | def __init__(self, batch_port, dataset, batch_size, default_args): 17 | super(BatchedPixelSumController, self).__init__(**default_args) 18 | # The data 
socket should be initialized in the process that will handle 19 | # the batch. 20 | # That is why it's not initialized in the parent constructor. Second 21 | # param = None 22 | self._batch_port = batch_port 23 | 24 | self._start_time = None 25 | self._should_stop = False 26 | 27 | self._batch_size = batch_size 28 | self._dataset = dataset 29 | 30 | self._nb_batch_processed = 0 31 | self._nb_batch_to_process = (dataset.shape[0] // batch_size) 32 | 33 | def start_batch_server(self): 34 | self.p = Process(target=self._send_mb) 35 | self.p.start() 36 | 37 | def _send_mb(self): 38 | self.init_data(self._batch_port) 39 | 40 | for i in range(self._dataset.shape[0] // self._batch_size): 41 | batch_start = i * self._batch_size 42 | batch_stop = (i + 1) * self._batch_size 43 | self.send_mb(self._dataset[batch_start:batch_stop]) 44 | 45 | self.asocket.close() 46 | print("Done Sending MB.") 47 | 48 | # TODO: Find a solution for this 49 | # Sleeping to give the chance to the worker to empty the queue before 50 | # the MB process dies 51 | import time 52 | time.sleep(2) 53 | 54 | def handle_control(self, req, worker_id, req_info): 55 | print("# Handling req: {}".format(req)) 56 | control_response = '' 57 | 58 | if req == 'next': 59 | if not self._should_stop: 60 | # Start a global execution timer 61 | if self._start_time is None: 62 | self._start_time = time.time() 63 | control_response = 'train' 64 | else: 65 | control_response = 'stop' 66 | elif req == 'get_data_shape': 67 | control_response = self._dataset[0].shape 68 | elif req == 'done': 69 | self._nb_batch_processed += req_info['num_batches'] 70 | print("{} batches processed by worker so far." 71 | .format(self._nb_batch_processed)) 72 | 73 | if self._nb_batch_processed >= self._nb_batch_to_process: 74 | if not self._should_stop: 75 | print("Training time {:.4f}s".format( 76 | time.time() - self._start_time)) 77 | self._should_stop = True 78 | 79 | return control_response 80 | 81 | 82 | def parse_arguments(): 83 | parser = Controller.default_parser() 84 | parser.add_argument('--batch_port', default=5566, type=int, required=False, 85 | help='Port on which the batches will be transfered.') 86 | parser.add_argument('--batch-size', default=1000, type=int, required=False, 87 | help='Size of the batches.') 88 | 89 | return parser.parse_args() 90 | 91 | 92 | def get_mnist(path): 93 | import os 94 | from six.moves import urllib 95 | 96 | if not os.path.exists(path): 97 | print("Downloading mnist ...", end=' ') 98 | url = "http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz" 99 | 100 | urllib.request.urlretrieve(url, path) 101 | print("Done") 102 | 103 | 104 | def spawn_controller(): 105 | args = parse_arguments() 106 | 107 | mnist_path = "../data/mnist.pkl.gz" 108 | 109 | get_mnist(mnist_path) 110 | 111 | with gzip.open(mnist_path, 'rb') as f: 112 | kwargs = {} 113 | if six.PY3: 114 | kwargs['encoding'] = 'latin1' 115 | train_set, _, _ = cPickle.load(f, **kwargs) 116 | 117 | controller = BatchedPixelSumController(batch_port=args.batch_port, 118 | dataset=train_set[0], 119 | batch_size=args.batch_size, 120 | default_args=Controller.default_arguments(args)) 121 | controller.start_batch_server() 122 | return controller.serve() 123 | 124 | if __name__ == '__main__': 125 | rcode = spawn_controller() 126 | if rcode != 0: 127 | sys.exit(rcode) 128 | -------------------------------------------------------------------------------- /platoon/tests/functional/test_worker.py: -------------------------------------------------------------------------------- 
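# Functional test for the all_reduce worker interface. As described in the
# README.md next to this file, export PLATOON_TEST_WORKERS_NUM with the total
# number of workers to be spawned and launch the test with
# `platoon-launcher test` from this directory.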
1 | from __future__ import absolute_import, print_function 2 | import os 3 | import sys 4 | 5 | import unittest 6 | 7 | from pygpu import gpuarray 8 | import numpy as np 9 | 10 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..')) 11 | from platoon import Worker 12 | 13 | 14 | class TestWorker(unittest.TestCase): 15 | @classmethod 16 | def setUpClass(cls): 17 | try: 18 | cls.total_nw = int(os.environ['PLATOON_TEST_WORKERS_NUM']) 19 | cls.worker = Worker(control_port=5567) 20 | cls.ctx = cls.worker.gpuctx 21 | except Exception as exc: 22 | print(exc, file=sys.stderr) 23 | raise exc 24 | 25 | def test_is_singleton(self): 26 | inst = Worker() 27 | assert inst is self.worker 28 | print("The following warning is produced by testing procedure:", file=sys.stderr) 29 | inst = Worker(123413) 30 | assert inst is self.worker 31 | 32 | def test_global_size(self): 33 | assert self.worker.global_size == self.total_nw 34 | 35 | def test_interface1(self): 36 | inp = np.arange(32, dtype='float64') 37 | sinp = gpuarray.asarray(inp, context=self.ctx) 38 | out = np.empty_like(inp) 39 | sout = gpuarray.asarray(out, context=self.ctx) 40 | self.worker.all_reduce(sinp, '+', sout) 41 | expected = self.total_nw * inp 42 | actual = np.asarray(sout) 43 | assert np.allclose(expected, actual) 44 | 45 | def test_interface2(self): 46 | inp = np.arange(32, dtype='float64') 47 | sinp = gpuarray.asarray(inp, context=self.ctx) 48 | self.worker.all_reduce(sinp, '+', sinp) 49 | expected = self.total_nw * inp 50 | actual = np.asarray(sinp) 51 | assert np.allclose(expected, actual) 52 | 53 | def test_interface3(self): 54 | inp = np.arange(32, dtype='float64') 55 | sinp = gpuarray.asarray(inp, context=self.ctx) 56 | sout = self.worker.all_reduce(sinp, '+') 57 | expected = self.total_nw * inp 58 | actual = np.asarray(sout) 59 | assert np.allclose(expected, actual) 60 | 61 | def test_linked_shared(self): 62 | inp = np.arange(32, dtype='float64') 63 | sinp = gpuarray.asarray(inp, context=self.ctx) 64 | insize = sinp.size * sinp.itemsize 65 | out = np.empty_like(inp) 66 | sout = gpuarray.asarray(out, context=self.ctx) 67 | outsize = sout.size * sout.itemsize 68 | 69 | if self.worker._multinode: 70 | try: 71 | self.worker.shared_arrays[outsize] 72 | self.fail("'sout''s size has not been linked yet to a shared buffer") 73 | except KeyError: 74 | pass 75 | try: 76 | self.worker.shared_arrays[insize] 77 | self.fail("'sinp''s size has not been linked yet to a shared buffer") 78 | except KeyError: 79 | pass 80 | 81 | self.worker.all_reduce(sinp, '+', sout) 82 | 83 | if self.worker._multinode: 84 | try: 85 | self.worker.shared_arrays[outsize] 86 | except KeyError: 87 | self.fail("`sout`'s size should have been linked to a shared buffer") 88 | try: 89 | self.worker.shared_arrays[insize] 90 | except KeyError: 91 | self.fail("`sinp`'s size should have been linked to a shared buffer") 92 | 93 | expected = self.total_nw * inp 94 | actual = np.asarray(sout) 95 | assert np.allclose(expected, actual) 96 | 97 | self.worker.all_reduce(sout, '*', sout) 98 | 99 | if self.worker._multinode: 100 | try: 101 | self.worker.shared_arrays[outsize] 102 | except KeyError: 103 | self.fail("`sout`'s size should have been linked to a shared buffer") 104 | try: 105 | self.worker.shared_arrays[insize] 106 | except KeyError: 107 | self.fail("`sinp`'s size should have been linked to a shared buffer") 108 | 109 | expected = expected ** self.total_nw 110 | actual = np.asarray(sout) 111 | assert np.allclose(expected, actual) 112 | 113 | 
@classmethod 114 | def tearDownClass(cls): 115 | cls.worker.close() 116 | 117 | if __name__ == '__main__': 118 | print("### Beginning Worker's tests...") 119 | suite = unittest.TestLoader().loadTestsFromTestCase(TestWorker) 120 | res = unittest.TextTestRunner(verbosity=1).run(suite) 121 | if len(res.failures) != 0 or len(res.errors) != 0: 122 | sys.exit(1) 123 | -------------------------------------------------------------------------------- /platoon/param_sync.py: -------------------------------------------------------------------------------- 1 | class ParamSyncRule(object): 2 | """ 3 | Abstract parameter synchronisation rule. 4 | 5 | This abstract class defines the interface that should be followed by 6 | implementations of parameter synchronization rules for distributed 7 | training. 8 | """ 9 | 10 | def make_update_function(self, local_params): 11 | """Return a function that will be called with the current value of the 12 | master parameters and should update them inplace. This 13 | function must also update the values of local_params (that are 14 | shared values) as a side effect. 15 | """ 16 | try: 17 | f = self.theano_update(local_params) 18 | 19 | def update(master_params, f=f): 20 | new_master_values = f(*master_params) 21 | for p, v in zip(master_params, new_master_values): 22 | p[:] = v 23 | except NotImplementedError: 24 | def update(master_params, local_params=local_params, 25 | update_params=self.update_params): 26 | local_param_values = [p.get_value() for p in local_params] 27 | update_params(local_param_values, master_params) 28 | for p, v in zip(local_params, local_param_values): 29 | p.set_value(v) 30 | return update 31 | 32 | def theano_update(self, local_params): 33 | """Compile and return a theano function that will update the local 34 | params and return new values for the master params. 35 | 36 | This function is preferred to update_params below. 37 | """ 38 | raise NotImplementedError() 39 | 40 | def update_params(self, local_params, master_params): 41 | """Perform an inplace update of the local and master params according 42 | to some update rule. 43 | 44 | This function need not be implemented if theano_update is 45 | overridden. 46 | 47 | """ 48 | raise NotImplementedError() 49 | 50 | 51 | class EASGD(ParamSyncRule): 52 | """ 53 | Implementation of the EASGD parameter sync rule. 54 | 55 | According to this rule, every N iterations, a worker synchronises his 56 | parameters with the master parameters. This is done by moving each set of 57 | parameters toward the other by an amount proportional to the difference 58 | between the individual params (this proportion is parametrized by `alpha`). 59 | 60 | The sync equations are as follow: 61 | diff = w_worker - w_master 62 | w_worker = w_worker - alpha * diff 63 | w_master = w_master + alpha * diff 64 | 65 | NOTE : if alpha=0 is used, there is no synchronization of the 66 | parameters meaning that each worker is independently training using SGD. 67 | 68 | This algorithm is described in more details in the following paper: 69 | http://arxiv.org/abs/1412.6651 70 | """ 71 | 72 | def __init__(self, alpha): 73 | self.set_alpha(alpha) 74 | 75 | def get_alpha(self): 76 | return self.alpha 77 | 78 | def set_alpha(self, alpha): 79 | self.alpha = alpha 80 | 81 | def theano_update(self, local_params): 82 | # Theano is imported here to avoid a strong dependancy on it. 
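        # A quick numeric check of the EASGD rule described in the class
        # docstring: with alpha = 0.5, a worker value of 4.0 and a master
        # value of 0.0 give diff = 0.5 * (4.0 - 0.0) = 2.0, so the worker
        # moves to 4.0 - 2.0 = 2.0 and the master to 0.0 + 2.0 = 2.0; with
        # this alpha the two copies meet halfway.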
83 | import theano 84 | master_inps = [l.type() for l in local_params] 85 | master_ups = [] 86 | local_ups = [] 87 | for p_local, p_master in zip(local_params, master_inps): 88 | diff = self.alpha * (p_local - p_master) 89 | local_ups.append(p_local - diff) 90 | master_ups.append(p_master + diff) 91 | return theano.function(master_inps, master_ups, 92 | updates=list(zip(local_params, local_ups))) 93 | 94 | def update_params(self, local_params, master_params): 95 | for p_local, p_master in zip(local_params, master_params): 96 | diff = self.alpha * (p_local - p_master) 97 | p_local -= diff 98 | p_master += diff 99 | 100 | 101 | class ASGD(ParamSyncRule): 102 | def theano_update(self, local_params): 103 | import theano 104 | 105 | local_vals = [p.get_value(borrow=True, return_internal_type=True) 106 | for p in local_params] 107 | master_inps = [l.type() for l in local_params] 108 | self.old_locals = [theano.shared(l) for l in local_vals] 109 | # This updates the global params with the difference between 110 | # old and current (aka the gradients). 111 | ret = [m + (p - o) for (m, p, o) in zip(master_inps, local_params, 112 | self.old_locals)] 113 | # This keeps values before the update for the local params 114 | ups = list(zip(self.old_locals, ret)) 115 | # This updates the local params to be the same as the global 116 | ups += list(zip(local_params, ret)) 117 | return theano.function(master_inps, ret, updates=ups) 118 | -------------------------------------------------------------------------------- /platoon/tests/functional/test_global_dynamics_worker.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function, division 2 | import os 3 | import sys 4 | 5 | import unittest 6 | 7 | import theano 8 | from theano import config 9 | import numpy as np 10 | 11 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..')) 12 | from platoon.training import global_dynamics as gd 13 | from platoon.channel.worker import Worker 14 | 15 | 16 | class TestGlobalDynamicsWorker(unittest.TestCase): 17 | @classmethod 18 | def setUpClass(cls): 19 | try: 20 | cls.worker = Worker(control_port=5567) 21 | cls.total_nw = cls.worker.global_size 22 | cls.rank = cls.worker.global_rank 23 | except Exception as exc: 24 | print(exc, file=sys.stderr) 25 | raise exc 26 | 27 | def setUp(self): 28 | super(TestGlobalDynamicsWorker, self).setUp() 29 | SEED = 567 30 | np.random.seed(SEED) 31 | self.inp1 = 30 * np.random.random((8, 10, 5)).astype(config.floatX) 32 | self.sinp1 = theano.shared(self.inp1) 33 | self.inp2 = 50 * np.random.random((5, 20)).astype(config.floatX) 34 | self.sinp2 = theano.shared(self.inp2) 35 | 36 | def test_sumSGD_object(self): 37 | sumsgd = gd.SumSGD() 38 | sumsgd.make_rule(self.sinp1) 39 | sumsgd() 40 | expected = self.inp1 * self.total_nw 41 | actual = self.sinp1.get_value() 42 | assert np.allclose(expected, actual) 43 | 44 | def test_sumSGD_list(self): 45 | sumsgd = gd.SumSGD() 46 | sumsgd.make_rule([self.sinp1, self.sinp2]) 47 | sumsgd() 48 | expected = self.inp1 * self.total_nw 49 | actual = self.sinp1.get_value() 50 | assert np.allclose(expected, actual) 51 | expected = self.inp2 * self.total_nw 52 | actual = self.sinp2.get_value() 53 | assert np.allclose(expected, actual) 54 | 55 | def test_averageSGD_object(self): 56 | averagesgd = gd.AverageSGD() 57 | averagesgd.make_rule(self.sinp1) 58 | averagesgd() 59 | expected = self.inp1 60 | actual = self.sinp1.get_value() 61 | assert np.allclose(expected, 
actual) 62 | 63 | def test_averageSGD_list(self): 64 | averagesgd = gd.AverageSGD() 65 | averagesgd.make_rule([self.sinp1, self.sinp2]) 66 | averagesgd() 67 | expected = self.inp1 68 | actual = self.sinp1.get_value() 69 | assert np.allclose(expected, actual) 70 | expected = self.inp2 71 | actual = self.sinp2.get_value() 72 | assert np.allclose(expected, actual) 73 | 74 | def test_EASGD(self): 75 | lp = np.array([3, 4], dtype=config.floatX) 76 | if self.rank % 2 != 0: 77 | lp = -lp 78 | slp = theano.shared(lp) 79 | cp = np.array([0, 0], dtype=config.floatX) 80 | scp = theano.shared(cp) 81 | alpha = 0.5 82 | 83 | easgd = gd.EASGD() 84 | easgd.make_rule(slp, scp, alpha) 85 | easgd() 86 | 87 | if self.total_nw % 2 == 0: 88 | expectedcp = cp 89 | actualcp = scp.get_value() 90 | assert np.allclose(expectedcp, actualcp), (expectedcp, actualcp) 91 | expectedlp = lp / 2 92 | actuallp = slp.get_value() 93 | assert np.allclose(expectedlp, actuallp), (expectedlp, actuallp) 94 | else: 95 | expectedcp = lp / 2 96 | actualcp = scp.get_value() 97 | assert np.allclose(expectedcp, actualcp), (expectedcp, actualcp) 98 | expectedlp = lp / 2 99 | actuallp = slp.get_value() 100 | assert np.allclose(expectedlp, actuallp), (expectedlp, actuallp) 101 | 102 | def test_Downpour(self): 103 | lp = np.random.random((2,)).astype(config.floatX) 104 | slp = theano.shared(lp) 105 | gp = np.array([0, 1], dtype=config.floatX) 106 | sgp = theano.shared(gp) 107 | lau = (self.rank + 1) * np.array([1, 1], dtype=config.floatX) 108 | slau = theano.shared(lau) 109 | 110 | downpour = gd.Downpour() 111 | downpour.make_rule(slp, slau, sgp) 112 | downpour() 113 | 114 | expected = np.array([0, 0], dtype=config.floatX) 115 | actual = slau.get_value() 116 | assert np.allclose(expected, actual), (expected, actual) 117 | expected = sum(np.arange(self.total_nw + 1)) * np.array([1, 1], dtype=config.floatX) 118 | expected += np.array([0, 1], dtype=config.floatX) 119 | actual = sgp.get_value() 120 | assert np.allclose(expected, actual), (expected, actual) 121 | actual = slp.get_value() 122 | assert np.allclose(expected, actual), (expected, actual) 123 | 124 | @classmethod 125 | def tearDownClass(cls): 126 | cls.worker.close() 127 | 128 | if __name__ == '__main__': 129 | suite = unittest.TestLoader().loadTestsFromTestCase(TestGlobalDynamicsWorker) 130 | res = unittest.TextTestRunner(verbosity=1).run(suite) 131 | if len(res.failures) != 0 or len(res.errors) != 0: 132 | sys.exit(1) 133 | -------------------------------------------------------------------------------- /example/lstm/lstm_controller.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function 2 | import os 3 | import sys 4 | import time 5 | 6 | import numpy 7 | 8 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) 9 | from platoon.channel import Controller 10 | 11 | 12 | class LSTMController(Controller): 13 | """ 14 | This multi-process controller implements patience-based early-stopping SGD 15 | """ 16 | 17 | def __init__(self, max_mb, patience, valid_freq, default_args): 18 | """ 19 | Initialize the LSTMController 20 | 21 | Parameters 22 | ---------- 23 | max_mb : int 24 | Max number of minibatches to train on. 25 | patience: : int 26 | Training stops when this many minibatches have been trained on 27 | without any reported improvement. 28 | valid_freq : int 29 | Number of minibatches to train on between every monitoring step. 
30 | default_args : dict 31 | Arguments of default class Controller 32 | """ 33 | 34 | super(LSTMController, self).__init__(**default_args) 35 | self.patience = patience 36 | self.max_mb = int(max_mb) 37 | 38 | self.valid_freq = valid_freq 39 | self.uidx = 0 40 | self.eidx = 0 41 | self.history_errs = [] 42 | self.bad_counter = 0 43 | 44 | self.valid = False 45 | self.start_time = None 46 | self._should_stop = False 47 | 48 | def handle_control(self, req, worker_id, req_info): 49 | """ 50 | Handles a control_request received from a worker 51 | 52 | Parameters 53 | ---------- 54 | req : str or dict 55 | Control request received from a worker. 56 | The control request can be one of the following 57 | 1) "next" : request by a worker to be informed of its next action 58 | to perform. The answers from the server can be 'train' (the 59 | worker should keep training on its training data), 'valid' (the 60 | worker should perform monitoring on its validation set and test 61 | set) or 'stop' (the worker should stop training). 62 | 2) dict of format {"done":N} : used by a worker to inform the 63 | server that is has performed N more training iterations and 64 | synced its parameters. The server will respond 'stop' if the 65 | maximum number of training minibatches has been reached. 66 | 3) dict of format {"valid_err":x, "test_err":x2} : used by a worker 67 | to inform the server that it has performed a monitoring step 68 | and obtained the included errors on the monitoring datasets. 69 | The server will respond "best" if this is the best reported 70 | validation error so far, otherwise it will respond 'stop' if 71 | the patience has been exceeded. 72 | """ 73 | control_response = "" 74 | 75 | if req == 'next': 76 | if not self._should_stop: 77 | if self.start_time is None: 78 | self.start_time = time.time() 79 | 80 | if self.valid: 81 | self.valid = False 82 | control_response = 'valid' 83 | else: 84 | control_response = 'train' 85 | else: 86 | control_response = 'stop' 87 | elif req == 'done': 88 | self.uidx += req_info['train_len'] 89 | 90 | if numpy.mod(self.uidx, self.valid_freq) == 0: 91 | self.valid = True 92 | elif req == 'pred_errors': 93 | valid_err = req_info['valid_err'] 94 | test_err = req_info['test_err'] 95 | self.history_errs.append([valid_err, test_err]) 96 | harr = numpy.array(self.history_errs)[:, 0] 97 | 98 | if valid_err <= harr.min(): 99 | self.bad_counter = 0 100 | control_response = 'best' 101 | print("Best error valid:", valid_err, "test:", test_err) 102 | elif (len(self.history_errs) > self.patience and valid_err >= harr[:-self.patience].min()): 103 | self.bad_counter += 1 104 | 105 | if self.uidx > self.max_mb or self.bad_counter > self.patience: 106 | if not self._should_stop: 107 | print("Training time {:.4f}s".format(time.time() - self.start_time)) 108 | print("Number of samples:", self.uidx) 109 | self._should_stop = True 110 | 111 | return control_response 112 | 113 | 114 | def lstm_control(saveFreq=1110, saveto=None): 115 | parser = Controller.default_parser() 116 | parser.add_argument('--max-mb', default=((5000 * 1998) / 10), type=int, 117 | required=False, help='Maximum mini-batches to train upon in total.') 118 | parser.add_argument('--patience', default=10, type=int, 119 | required=False, help='Maximum patience when failing to get better validation results.') 120 | parser.add_argument('--valid-freq', default=370, type=int, 121 | required=False, help='How often in mini-batches prediction function should get validated.') 122 | args = parser.parse_args() 123 | 124 | l 
= LSTMController(max_mb=args.max_mb, 125 | patience=args.patience, 126 | valid_freq=args.valid_freq, 127 | default_args=Controller.default_arguments(args)) 128 | 129 | print("Controller is ready") 130 | return l.serve() 131 | 132 | if __name__ == '__main__': 133 | rcode = lstm_control() 134 | if rcode != 0: 135 | sys.exit(rcode) 136 | -------------------------------------------------------------------------------- /example/lstm/imdb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import gzip 4 | 5 | from six.moves import cPickle 6 | 7 | import numpy 8 | import theano 9 | 10 | 11 | def prepare_data(seqs, labels, maxlen=None): 12 | """Create the matrices from the datasets. 13 | 14 | This pad each sequence to the same lenght: the lenght of the 15 | longuest sequence or maxlen. 16 | 17 | if maxlen is set, we will cut all sequence to this maximum 18 | lenght. 19 | 20 | This swap the axis! 21 | """ 22 | # x: a list of sentences 23 | lengths = [len(s) for s in seqs] 24 | 25 | if maxlen is not None: 26 | new_seqs = [] 27 | new_labels = [] 28 | new_lengths = [] 29 | for l, s, y in zip(lengths, seqs, labels): 30 | if l < maxlen: 31 | new_seqs.append(s) 32 | new_labels.append(y) 33 | new_lengths.append(l) 34 | lengths = new_lengths 35 | labels = new_labels 36 | seqs = new_seqs 37 | 38 | if len(lengths) < 1: 39 | return None, None, None 40 | 41 | n_samples = len(seqs) 42 | maxlen = numpy.max(lengths) 43 | 44 | x = numpy.zeros((maxlen, n_samples)).astype('int64') 45 | x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX) 46 | for idx, s in enumerate(seqs): 47 | x[:lengths[idx], idx] = s 48 | x_mask[:lengths[idx], idx] = 1. 49 | 50 | return x, x_mask, labels 51 | 52 | 53 | def get_dataset_file(dataset, default_dataset, origin): 54 | '''Look for it as if it was a full path, if not, try local file, 55 | if not try in the data directory. 56 | 57 | Download dataset if it is not present 58 | 59 | ''' 60 | data_dir, data_file = os.path.split(dataset) 61 | if data_dir == "" and not os.path.isfile(dataset): 62 | # Check if dataset is in the data directory. 63 | new_path = os.path.join( 64 | os.path.split(__file__)[0], 65 | "..", 66 | "data", 67 | dataset 68 | ) 69 | if os.path.isfile(new_path) or data_file == default_dataset: 70 | dataset = new_path 71 | 72 | if (not os.path.isfile(dataset)) and data_file == default_dataset: 73 | from six.moves import urllib 74 | print('Downloading data from %s' % origin) 75 | urllib.request.urlretrieve(origin, dataset) 76 | return dataset 77 | 78 | 79 | def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None, 80 | sort_by_len=True): 81 | '''Loads the dataset 82 | 83 | :type path: String 84 | :param path: The path to the dataset (here IMDB) 85 | :type n_words: int 86 | :param n_words: The number of word to keep in the vocabulary. 87 | All extra words are set to unknow (1). 88 | :type valid_portion: float 89 | :param valid_portion: The proportion of the full train set used for 90 | the validation set. 91 | :type maxlen: None or positive int 92 | :param maxlen: the max sequence length we use in the train/valid set. 93 | :type sort_by_len: bool 94 | :name sort_by_len: Sort by the sequence lenght for the train, 95 | valid and test set. This allow faster execution as it cause 96 | less padding per minibatch. Another mechanism must be used to 97 | shuffle the train set at each epoch. 
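    Illustrative call pattern (argument values are placeholders, not taken
    from the original tutorial):

        train, valid, test = load_data(n_words=10000, maxlen=100)
        x, x_mask, y = prepare_data(train[0][:16], train[1][:16])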
98 | 99 | ''' 100 | 101 | ############# 102 | # LOAD DATA # 103 | ############# 104 | 105 | # Load the dataset 106 | path = get_dataset_file( 107 | path, "imdb.pkl", 108 | "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl") 109 | 110 | if path.endswith(".gz"): 111 | f = gzip.open(path, 'rb') 112 | else: 113 | f = open(path, 'rb') 114 | 115 | train_set = cPickle.load(f) 116 | test_set = cPickle.load(f) 117 | f.close() 118 | if maxlen: 119 | new_train_set_x = [] 120 | new_train_set_y = [] 121 | for x, y in zip(train_set[0], train_set[1]): 122 | if len(x) < maxlen: 123 | new_train_set_x.append(x) 124 | new_train_set_y.append(y) 125 | train_set = (new_train_set_x, new_train_set_y) 126 | del new_train_set_x, new_train_set_y 127 | 128 | # split training set into validation set 129 | train_set_x, train_set_y = train_set 130 | n_samples = len(train_set_x) 131 | sidx = numpy.random.permutation(n_samples) 132 | n_train = int(numpy.round(n_samples * (1. - valid_portion))) 133 | valid_set_x = [train_set_x[s] for s in sidx[n_train:]] 134 | valid_set_y = [train_set_y[s] for s in sidx[n_train:]] 135 | train_set_x = [train_set_x[s] for s in sidx[:n_train]] 136 | train_set_y = [train_set_y[s] for s in sidx[:n_train]] 137 | 138 | train_set = (train_set_x, train_set_y) 139 | valid_set = (valid_set_x, valid_set_y) 140 | 141 | def remove_unk(x): 142 | return [[1 if w >= n_words else w for w in sen] for sen in x] 143 | 144 | test_set_x, test_set_y = test_set 145 | valid_set_x, valid_set_y = valid_set 146 | train_set_x, train_set_y = train_set 147 | 148 | train_set_x = remove_unk(train_set_x) 149 | valid_set_x = remove_unk(valid_set_x) 150 | test_set_x = remove_unk(test_set_x) 151 | 152 | def len_argsort(seq): 153 | return sorted(range(len(seq)), key=lambda x: len(seq[x])) 154 | 155 | if sort_by_len: 156 | sorted_index = len_argsort(test_set_x) 157 | test_set_x = [test_set_x[i] for i in sorted_index] 158 | test_set_y = [test_set_y[i] for i in sorted_index] 159 | 160 | sorted_index = len_argsort(valid_set_x) 161 | valid_set_x = [valid_set_x[i] for i in sorted_index] 162 | valid_set_y = [valid_set_y[i] for i in sorted_index] 163 | 164 | sorted_index = len_argsort(train_set_x) 165 | train_set_x = [train_set_x[i] for i in sorted_index] 166 | train_set_y = [train_set_y[i] for i in sorted_index] 167 | 168 | train = (train_set_x, train_set_y) 169 | valid = (valid_set_x, valid_set_y) 170 | test = (test_set_x, test_set_y) 171 | 172 | return train, valid, test 173 | -------------------------------------------------------------------------------- /example/synchronous_lstm/imdb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import gzip 4 | 5 | from six.moves import cPickle 6 | 7 | import numpy 8 | import theano 9 | 10 | 11 | def prepare_data(seqs, labels, maxlen=None): 12 | """Create the matrices from the datasets. 13 | 14 | This pad each sequence to the same lenght: the lenght of the 15 | longuest sequence or maxlen. 16 | 17 | if maxlen is set, we will cut all sequence to this maximum 18 | lenght. 19 | 20 | This swap the axis! 
21 | """ 22 | # x: a list of sentences 23 | lengths = [len(s) for s in seqs] 24 | 25 | if maxlen is not None: 26 | new_seqs = [] 27 | new_labels = [] 28 | new_lengths = [] 29 | for l, s, y in zip(lengths, seqs, labels): 30 | if l < maxlen: 31 | new_seqs.append(s) 32 | new_labels.append(y) 33 | new_lengths.append(l) 34 | lengths = new_lengths 35 | labels = new_labels 36 | seqs = new_seqs 37 | 38 | if len(lengths) < 1: 39 | return None, None, None 40 | 41 | n_samples = len(seqs) 42 | maxlen = numpy.max(lengths) 43 | 44 | x = numpy.zeros((maxlen, n_samples)).astype('int64') 45 | x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX) 46 | for idx, s in enumerate(seqs): 47 | x[:lengths[idx], idx] = s 48 | x_mask[:lengths[idx], idx] = 1. 49 | 50 | return x, x_mask, labels 51 | 52 | 53 | def get_dataset_file(dataset, default_dataset, origin): 54 | '''Look for it as if it was a full path, if not, try local file, 55 | if not try in the data directory. 56 | 57 | Download dataset if it is not present 58 | 59 | ''' 60 | data_dir, data_file = os.path.split(dataset) 61 | if data_dir == "" and not os.path.isfile(dataset): 62 | # Check if dataset is in the data directory. 63 | new_path = os.path.join( 64 | os.path.split(__file__)[0], 65 | "..", 66 | "data", 67 | dataset 68 | ) 69 | if os.path.isfile(new_path) or data_file == default_dataset: 70 | dataset = new_path 71 | 72 | if (not os.path.isfile(dataset)) and data_file == default_dataset: 73 | from six.moves import urllib 74 | print('Downloading data from %s' % origin) 75 | urllib.request.urlretrieve(origin, dataset) 76 | return dataset 77 | 78 | 79 | def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None, 80 | sort_by_len=True): 81 | '''Loads the dataset 82 | 83 | :type path: String 84 | :param path: The path to the dataset (here IMDB) 85 | :type n_words: int 86 | :param n_words: The number of word to keep in the vocabulary. 87 | All extra words are set to unknow (1). 88 | :type valid_portion: float 89 | :param valid_portion: The proportion of the full train set used for 90 | the validation set. 91 | :type maxlen: None or positive int 92 | :param maxlen: the max sequence length we use in the train/valid set. 93 | :type sort_by_len: bool 94 | :name sort_by_len: Sort by the sequence lenght for the train, 95 | valid and test set. This allow faster execution as it cause 96 | less padding per minibatch. Another mechanism must be used to 97 | shuffle the train set at each epoch. 98 | 99 | ''' 100 | 101 | ############# 102 | # LOAD DATA # 103 | ############# 104 | 105 | # Load the dataset 106 | path = get_dataset_file( 107 | path, "imdb.pkl", 108 | "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl") 109 | 110 | if path.endswith(".gz"): 111 | f = gzip.open(path, 'rb') 112 | else: 113 | f = open(path, 'rb') 114 | 115 | train_set = cPickle.load(f) 116 | test_set = cPickle.load(f) 117 | f.close() 118 | if maxlen: 119 | new_train_set_x = [] 120 | new_train_set_y = [] 121 | for x, y in zip(train_set[0], train_set[1]): 122 | if len(x) < maxlen: 123 | new_train_set_x.append(x) 124 | new_train_set_y.append(y) 125 | train_set = (new_train_set_x, new_train_set_y) 126 | del new_train_set_x, new_train_set_y 127 | 128 | # split training set into validation set 129 | train_set_x, train_set_y = train_set 130 | n_samples = len(train_set_x) 131 | sidx = numpy.random.permutation(n_samples) 132 | n_train = int(numpy.round(n_samples * (1. 
- valid_portion))) 133 | valid_set_x = [train_set_x[s] for s in sidx[n_train:]] 134 | valid_set_y = [train_set_y[s] for s in sidx[n_train:]] 135 | train_set_x = [train_set_x[s] for s in sidx[:n_train]] 136 | train_set_y = [train_set_y[s] for s in sidx[:n_train]] 137 | 138 | train_set = (train_set_x, train_set_y) 139 | valid_set = (valid_set_x, valid_set_y) 140 | 141 | def remove_unk(x): 142 | return [[1 if w >= n_words else w for w in sen] for sen in x] 143 | 144 | test_set_x, test_set_y = test_set 145 | valid_set_x, valid_set_y = valid_set 146 | train_set_x, train_set_y = train_set 147 | 148 | train_set_x = remove_unk(train_set_x) 149 | valid_set_x = remove_unk(valid_set_x) 150 | test_set_x = remove_unk(test_set_x) 151 | 152 | def len_argsort(seq): 153 | return sorted(range(len(seq)), key=lambda x: len(seq[x])) 154 | 155 | if sort_by_len: 156 | sorted_index = len_argsort(test_set_x) 157 | test_set_x = [test_set_x[i] for i in sorted_index] 158 | test_set_y = [test_set_y[i] for i in sorted_index] 159 | 160 | sorted_index = len_argsort(valid_set_x) 161 | valid_set_x = [valid_set_x[i] for i in sorted_index] 162 | valid_set_y = [valid_set_y[i] for i in sorted_index] 163 | 164 | sorted_index = len_argsort(train_set_x) 165 | train_set_x = [train_set_x[i] for i in sorted_index] 166 | train_set_y = [train_set_y[i] for i in sorted_index] 167 | 168 | train = (train_set_x, train_set_y) 169 | valid = (valid_set_x, valid_set_y) 170 | test = (test_set_x, test_set_y) 171 | 172 | return train, valid, test 173 | -------------------------------------------------------------------------------- /platoon/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | :mod:`util` -- Common utility functions for Platoon's classes 4 | ============================================================= 5 | 6 | .. module:: util 7 | :platform: Unix 8 | :synopsis: Contains PlatoonException classes and various helpers. 9 | 10 | """ 11 | from __future__ import print_function 12 | import os 13 | import sys 14 | import subprocess 15 | import cffi 16 | 17 | import numpy as np 18 | try: 19 | from mpi4py import MPI 20 | except ImportError: 21 | MPI = None 22 | 23 | 24 | class PlatoonException(Exception): 25 | """Exception used for abnormal behaviour related to Platoon. 26 | 27 | Useful for logging and managing error. 28 | 29 | """ 30 | def __init__(self, severity, descr, from_exc=None): 31 | self.severity = severity 32 | self.descr = descr 33 | self.from_exc = from_exc 34 | 35 | def __str__(self): 36 | d = str(self.severity) + "! " + str(self.descr) 37 | if self.from_exc is not None: 38 | d += "\nReason: " + str(self.from_exc) 39 | return d 40 | 41 | 42 | class PlatoonError(PlatoonException): 43 | """ 44 | Exception used for errors related to Platoon. 45 | """ 46 | def __init__(self, descr, from_exc=None): 47 | super(PlatoonError, self).__init__("ERROR", descr, from_exc) 48 | 49 | 50 | class PlatoonWarning(PlatoonException): 51 | """ 52 | Exception used for warnings related to Platoon. 53 | """ 54 | def __init__(self, descr, from_exc=None): 55 | super(PlatoonWarning, self).__init__("WARNING", descr, from_exc) 56 | 57 | 58 | def mmap(length=0, prot=0x3, flags=0x1, fd=0, offset=0): 59 | """ 60 | Map file descriptor or shared memory buffer to virtual address space of this 61 | process and create an object with Python buffer interface for that address. 
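    The default `prot`/`flags` values correspond to PROT_READ|PROT_WRITE and
    MAP_SHARED on Linux. An illustrative call, mapping a POSIX shared-memory
    file descriptor `shm_fd` of `size` bytes (both names are hypothetical):

        buf = mmap(length=size, fd=shm_fd)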
62 | """ 63 | _ffi = cffi.FFI() 64 | _ffi.cdef("void *mmap(void *, size_t, int, int, int, size_t);") 65 | _lib = _ffi.dlopen(None) 66 | 67 | addr = _ffi.NULL 68 | 69 | m = _lib.mmap(addr, length, prot, flags, fd, offset) 70 | if m == _ffi.cast('void *', -1): 71 | raise OSError(_ffi.errno, "for mmap") 72 | return _ffi.buffer(m, length) 73 | 74 | 75 | def launch_process(logs_folder, experiment_name, args, device, 76 | process_type="worker"): 77 | """ 78 | Helper function for a Platoon subprocess. 79 | """ 80 | print("## Starting {0} on {1} ...".format(process_type, device), end=' ') 81 | 82 | log_file = os.path.join(logs_folder, "{0}_{1}.{{}}".format(process_type, device)) 83 | with open(log_file.format("out"), 'w') as stdout_file: 84 | with open(log_file.format("err"), 'w') as stderr_file: 85 | env = dict(os.environ) 86 | env['THEANO_FLAGS'] = '{},device={}'.format(env.get('THEANO_FLAGS', ''), device) 87 | if experiment_name == "platoon" and process_type == "controller": 88 | executable = ["-m", "platoon.channel.controller"] 89 | else: 90 | executable = ["{0}_{1}.py".format(experiment_name, process_type)] 91 | command = [sys.executable, "-u"] + executable 92 | if args: 93 | command += args 94 | process = subprocess.Popen(command, bufsize=0, stdout=stdout_file, stderr=stderr_file, env=env) 95 | 96 | print("Done") 97 | return process 98 | 99 | if MPI: 100 | GA_TO_MPI_OP = { 101 | '+': MPI.SUM, 102 | "sum": MPI.SUM, 103 | "add": MPI.SUM, 104 | '*': MPI.PROD, 105 | "prod": MPI.PROD, 106 | "product": MPI.PROD, 107 | "mul": MPI.PROD, 108 | "max": MPI.MAX, 109 | "maximum": MPI.MAX, 110 | "min": MPI.MIN, 111 | "minimum": MPI.MIN, 112 | } 113 | 114 | NP_TO_MPI_TYPE = { 115 | np.dtype('bool'): MPI.C_BOOL, 116 | np.dtype('int8'): MPI.INT8_T, 117 | np.dtype('uint8'): MPI.UINT8_T, 118 | np.dtype('int16'): MPI.INT16_T, 119 | np.dtype('uint16'): MPI.UINT16_T, 120 | np.dtype('int32'): MPI.INT32_T, 121 | np.dtype('uint32'): MPI.UINT32_T, 122 | np.dtype('int64'): MPI.INT64_T, 123 | np.dtype('uint64'): MPI.UINT64_T, 124 | np.dtype('float32'): MPI.FLOAT, 125 | np.dtype('float64'): MPI.DOUBLE, 126 | np.dtype('complex64'): MPI.C_FLOAT_COMPLEX, 127 | np.dtype('complex128'): MPI.C_DOUBLE_COMPLEX, 128 | # TODO How to handle half types in MPI? 129 | # np.dtype('float16'): MPI.HALF, 130 | } 131 | 132 | 133 | def op_to_mpi(op): 134 | """ 135 | Converts pygpu collective reduce operation types to MPI reduce operation 136 | types. 137 | """ 138 | if MPI is None: 139 | raise AttributeError("mpi4py is not imported") 140 | res = GA_TO_MPI_OP.get(op.lower()) 141 | if res is not None: 142 | return res 143 | raise ValueError("Invalid reduce operation: {}".format(str(op))) 144 | 145 | 146 | def dtype_to_mpi(dtype): 147 | """ 148 | Converts numpy datatypes to MPI datatypes. 149 | """ 150 | if MPI is None: 151 | raise AttributeError("mpi4py is not imported") 152 | res = NP_TO_MPI_TYPE.get(np.dtype(dtype)) 153 | if res is not None: 154 | return res 155 | raise TypeError("Conversion from dtype {} is not known".format(dtype)) 156 | 157 | 158 | class SingletonType(type): 159 | """ 160 | Metaclass that implements the singleton pattern for a Python class. 
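    Illustrative behaviour (the class name is hypothetical; ``six.with_metaclass``
    is used here only for 2/3 compatibility):

        class Foo(six.with_metaclass(SingletonType, object)):
            pass

        a = Foo()
        b = Foo(1, 2)   # prints a PlatoonWarning: already initialized
        assert a is b   # both names refer to the same instance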
161 | """ 162 | def __init__(cls, name, bases, dict): 163 | super(SingletonType, cls).__init__(name, bases, dict) 164 | cls.instance = None 165 | 166 | def __call__(cls, *args, **kwds): 167 | if cls.instance is None: 168 | cls.args = args 169 | cls.kwds = kwds 170 | cls.instance = super(SingletonType, cls).__call__(*args, **kwds) 171 | else: 172 | if args or kwds: 173 | print(PlatoonWarning("Worker instance has already been initialized." 174 | "\nArgs: {0}, Kwds: {1}".format(args, kwds)), 175 | file=sys.stderr) 176 | return cls.instance 177 | -------------------------------------------------------------------------------- /platoon/tests/unit/test_controller.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import six 3 | import unittest 4 | 5 | from ...channel import controller 6 | 7 | if six.PY3: 8 | buffer_ = memoryview 9 | else: 10 | buffer_ = buffer # noqa 11 | 12 | 13 | class TestController(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls): 16 | cls.local_size = 3 17 | cls.devices = ["cuda0", "cuda1", "cuda2"] 18 | cls.control = controller.Controller(5567, devices=cls.devices) 19 | 20 | @classmethod 21 | def tearDownClass(cls): 22 | cls.control._close() 23 | 24 | def test_is_worker_first(self): 25 | first = self.control._is_worker_first(self.control._am_i_first_count) 26 | assert first 27 | first = self.control._is_worker_first(self.control._am_i_first_count) 28 | assert not first 29 | first = self.control._is_worker_first(self.control._am_i_first_count) 30 | assert not first 31 | first = self.control._is_worker_first(self.control._am_i_first_count) 32 | assert first 33 | first = self.control._is_worker_first(self.control._am_i_first_count) 34 | assert not first 35 | first = self.control._is_worker_first(self.control._am_i_first_count) 36 | assert not first 37 | 38 | def test_get_platoon_info(self): 39 | req_info = {} 40 | 41 | req_info['local_id'] = '1' 42 | req_info['device'] = 'cuda0' 43 | res = self.control._get_platoon_info(req_info) 44 | assert set(res.keys()) == set(['local_id', 'local_size', 'local_rank', 'multinode', 'global_size', 'global_rank']) 45 | assert res['local_id'] == "platoon-1" 46 | assert res['local_size'] == self.local_size 47 | assert res['local_rank'] == 0 48 | assert not res['multinode'] 49 | assert res['global_size'] == self.local_size 50 | 51 | req_info['local_id'] = '2' 52 | req_info['device'] = 'cuda1' 53 | res = self.control._get_platoon_info(req_info) 54 | assert set(res.keys()) == set(['local_id', 'local_size', 'local_rank', 'multinode', 'global_size', 'global_rank']) 55 | assert res['local_id'] == "platoon-1" 56 | assert res['local_size'] == self.local_size 57 | assert res['local_rank'] == 1 58 | assert not res['multinode'] 59 | assert res['global_size'] == self.local_size 60 | 61 | req_info['local_id'] = '3' 62 | req_info['device'] = 'cuda2' 63 | res = self.control._get_platoon_info(req_info) 64 | assert set(res.keys()) == set(['local_id', 'local_size', 'local_rank', 'multinode', 'global_size', 'global_rank']) 65 | assert res['local_id'] == "platoon-1" 66 | assert res['local_size'] == self.local_size 67 | assert res['local_rank'] == 2 68 | assert not res['multinode'] 69 | assert res['global_size'] == self.local_size 70 | 71 | req_info['local_id'] = 'asdfasfda' 72 | req_info['device'] = 'cuda1' 73 | res = self.control._get_platoon_info(req_info) 74 | assert set(res.keys()) == set(['local_id', 'local_size', 'local_rank', 'multinode', 'global_size', 
'global_rank']) 75 | assert res['local_id'] == "platoon-asdfasfda" 76 | assert res['local_size'] == self.local_size 77 | assert res['local_rank'] == 1 78 | assert not res['multinode'] 79 | assert res['global_size'] == self.local_size 80 | 81 | def test_init_new_shmem(self): 82 | self.control._job_uid = "yo" 83 | req_info = {'size': 64} 84 | 85 | res = self.control._init_new_shmem(req_info) 86 | assert res == "platoon-yo_0_buffer" 87 | assert len(self.control.shared_buffers) == 1 88 | assert len(self.control._shmrefs) == 1 89 | assert self.control._last_shmem_name == "platoon-yo_0_buffer" 90 | a = self.control.shared_buffers[res] 91 | try: 92 | buffer_(a) 93 | except TypeError: 94 | self.fail("self.control.shared_buffers[{}] does not provide buffer interface.".format(0)) 95 | assert len(a) == 64 96 | 97 | res = self.control._init_new_shmem(req_info) 98 | assert res == "platoon-yo_0_buffer" 99 | assert len(self.control.shared_buffers) == 1 100 | assert len(self.control._shmrefs) == 1 101 | assert self.control._last_shmem_name == "platoon-yo_0_buffer" 102 | b = self.control.shared_buffers[res] 103 | try: 104 | buffer_(b) 105 | except TypeError: 106 | self.fail("self.control.shared_buffers[{}] does not provide buffer interface.".format(0)) 107 | assert len(b) == 64 108 | assert b == a 109 | 110 | res = self.control._init_new_shmem(req_info) 111 | assert res == "platoon-yo_0_buffer" 112 | assert len(self.control.shared_buffers) == 1 113 | assert len(self.control._shmrefs) == 1 114 | assert self.control._last_shmem_name == "platoon-yo_0_buffer" 115 | c = self.control.shared_buffers[res] 116 | try: 117 | buffer_(c) 118 | except TypeError: 119 | self.fail("self.control.shared_buffers[{}] does not provide buffer interface.".format(0)) 120 | assert len(c) == 64 121 | assert c == a 122 | 123 | req_info = {'size': 512} 124 | res = self.control._init_new_shmem(req_info) 125 | assert res == "platoon-yo_1_buffer" 126 | assert len(self.control.shared_buffers) == 2 127 | assert len(self.control._shmrefs) == 2 128 | assert self.control._last_shmem_name == "platoon-yo_1_buffer" 129 | e = self.control.shared_buffers[res] 130 | try: 131 | buffer_(e) 132 | except TypeError: 133 | self.fail("self.control.shared_buffers[{}] does not provide buffer interface.".format(1)) 134 | assert len(e) == 512 135 | assert e != c 136 | 137 | res = self.control._init_new_shmem(req_info) 138 | assert res == "platoon-yo_1_buffer" 139 | assert len(self.control.shared_buffers) == 2 140 | assert len(self.control._shmrefs) == 2 141 | assert self.control._last_shmem_name == "platoon-yo_1_buffer" 142 | f = self.control.shared_buffers[res] 143 | try: 144 | buffer_(f) 145 | except TypeError: 146 | self.fail("self.control.shared_buffers[{}] does not provide buffer interface.".format(1)) 147 | assert len(f) == 512 148 | assert f != c 149 | assert f == e 150 | 151 | res = self.control._init_new_shmem(req_info) 152 | assert res == "platoon-yo_1_buffer" 153 | assert len(self.control.shared_buffers) == 2 154 | assert len(self.control._shmrefs) == 2 155 | assert self.control._last_shmem_name == "platoon-yo_1_buffer" 156 | g = self.control.shared_buffers[res] 157 | try: 158 | buffer_(g) 159 | except TypeError: 160 | self.fail("self.control.shared_buffers[{}] does not provide buffer interface.".format(1)) 161 | assert len(g) == 512 162 | assert g != c 163 | assert g == e 164 | -------------------------------------------------------------------------------- /example/synchronous_lstm/lstm_controller.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function 2 | import os 3 | import sys 4 | import time 5 | 6 | import numpy 7 | 8 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) 9 | from platoon.channel import Controller 10 | 11 | 12 | class LSTMController(Controller): 13 | """ 14 | This multi-process controller implements patience-based early-stopping SGD 15 | """ 16 | 17 | def __init__(self, seed, patience, default_args): 18 | """ 19 | Initialize the LSTMController 20 | 21 | Parameters 22 | ---------- 23 | max_mb : int 24 | Max number of minibatches to train on. 25 | patience: : int 26 | Training stops when this many minibatches have been trained on 27 | without any reported improvement. 28 | valid_freq : int 29 | Number of minibatches to train on between every monitoring step. 30 | default_args : dict 31 | Arguments of default class Controller 32 | """ 33 | super(LSTMController, self).__init__(**default_args) 34 | self.nb_worker = len(self._devices) 35 | # map ids to members of range(nb_worker) 36 | self.worker_ids_dict = dict(zip(self._workers, [i for i in range(len(self._workers))])) 37 | 38 | self.patience = patience 39 | self.seed = seed 40 | 41 | self.valid_history_errs = [[None for i in range(self.nb_worker)]] 42 | self.test_history_errs = [[None for i in range(self.nb_worker)]] 43 | self.bad_counter = 0 44 | self._epoch = 0 45 | self.best_dict = dict(best__epoch=-1, best_valid=numpy.inf) 46 | 47 | 48 | def handle_control(self, req, worker_id, req_info): 49 | """ 50 | Handles a control_request received from a worker 51 | 52 | Parameters 53 | ---------- 54 | req : str or dict 55 | Control request received from a worker. 56 | The control request can be one of the following 57 | 1) "next" : request by a worker to be informed of its next action 58 | to perform. The answers from the server can be 'train' (the 59 | worker should keep training on its training data), 'valid' (the 60 | worker should perform monitoring on its validation set and test 61 | set) or 'stop' (the worker should stop training). 62 | 2) dict of format {"done":N} : used by a worker to inform the 63 | server that is has performed N more training iterations and 64 | synced its parameters. The server will respond 'stop' if the 65 | maximum number of training minibatches has been reached. 66 | 3) dict of format {"valid_err":x, "test_err":x2} : used by a worker 67 | to inform the server that it has performed a monitoring step 68 | and obtained the included errors on the monitoring datasets. 69 | The server will respond "best" if this is the best reported 70 | validation error so far, otherwise it will respond 'stop' if 71 | the patience has been exceeded. 
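        Note: the description above mirrors the asynchronous LSTM example. As
        implemented below, this synchronous controller actually serves:
        1) 'pred_errors' with {'valid_err': x, 'test_err': x2, 'epoch': e};
           once every worker has reported for the current epoch, the response
           may be 'best'.
        2) 'splits' with {'train_len': n, 'valid_len': m, 'test_len': k}; the
           response is a dict of per-worker [start, end) index ranges.
        3) 'seed'; the response is the common random seed.
        Whenever the patience is exceeded, the response becomes 'stop'.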
72 | """ 73 | control_response = "" 74 | worker_id = self.worker_ids_dict[worker_id] 75 | 76 | if req == 'pred_errors': 77 | if self.valid_history_errs[self._epoch][worker_id] is not None: 78 | # if a worker tries to add a valid error where there is no None 79 | # it means it tries to index after or before current _epoch 80 | raise RuntimeError('Worker got out of synch!') 81 | self.valid_history_errs[self._epoch][worker_id] = req_info['valid_err'] 82 | self.test_history_errs[self._epoch][worker_id] = req_info['test_err'] 83 | 84 | if not any([i is None for i in self.valid_history_errs[self._epoch]]): 85 | print('Epoch %d is done'%req_info['epoch']) 86 | valid_err = sum(self.valid_history_errs[self._epoch]) / float(self.nb_worker) 87 | 88 | if valid_err <= self.best_dict['best_valid']: 89 | self.best_dict['best_epoch'] = self._epoch 90 | self.best_dict['best_valid'] = valid_err 91 | self.bad_counter = 0 92 | control_response = 'best' 93 | print("Best error valid:", valid_err) 94 | else: 95 | self.bad_counter += 1 96 | self.valid_history_errs += [[None for i in range(self.nb_worker)]] 97 | self.test_history_errs += [[None for i in range(self.nb_worker)]] 98 | self._epoch += 1 99 | 100 | elif req == 'splits': 101 | # the controller never loads the dataset but the worker doesn't 102 | # know how many workers there are 103 | train_len = req_info['train_len'] // self.nb_worker 104 | valid_len = req_info['valid_len'] // self.nb_worker 105 | test_len = req_info['test_len'] // self.nb_worker 106 | splits = dict(train_splits=[train_len * worker_id, train_len * (worker_id + 1)], 107 | valid_splits=[valid_len * worker_id, valid_len * (worker_id + 1)], 108 | test_splits=[test_len * worker_id, test_len * (worker_id + 1)]) 109 | control_response = splits 110 | 111 | # kind of when the training start but not really 112 | self.start_time = time.time() 113 | 114 | elif req == 'seed': 115 | control_response = self.seed 116 | 117 | if self.bad_counter > self.patience: 118 | print("Early stopping!") 119 | end_time = time.time() - self.start_time 120 | # should terminate with best printing and best dumping of params 121 | # and then close everything 122 | print("Best error valid:", self.best_dict['best_valid']) 123 | test_err = sum(self.test_history_errs[self.best_dict['best_epoch']]) / \ 124 | float(self.nb_worker) 125 | print("Best error test:", test_err) 126 | print( ("Training took %.1fs" % (end_time)), file=sys.stderr) 127 | control_response = 'stop' 128 | 129 | return control_response 130 | 131 | 132 | def lstm_control(saveFreq=1110, saveto=None): 133 | parser = Controller.default_parser() 134 | parser.add_argument('--seed', default=1234, type=int, 135 | required=False, help='Maximum mini-batches to train upon in total.') 136 | parser.add_argument('--patience', default=10, type=int, required=False, 137 | help='Maximum patience when failing to get better validation results.') 138 | args = parser.parse_args() 139 | 140 | l = LSTMController(seed=args.seed, 141 | patience=args.patience, 142 | default_args=Controller.default_arguments(args)) 143 | 144 | print("Controller is ready") 145 | return l.serve() 146 | 147 | if __name__ == '__main__': 148 | rcode = lstm_control() 149 | if rcode != 0: 150 | sys.exit(rcode) 151 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # platoon 2 | Experimental multi-GPU mini-framework for Theano 3 | 4 | It supports **data-parallelism** inside one 
compute node, not 5 | model-parallelism. For model-parallelism, see the [Theano multiple GPUs 6 | tutorial](http://deeplearning.net/software/theano/tutorial/using_multi_gpu.html). 7 | 8 | In Platoon, there are two main components: workers and controllers. 9 | Workers do the bulk of the work (training, monitoring, ...). Controllers 10 | interact with multiple workers to coordinate their work, collect the results 11 | and decide how to act on them. To use Platoon, you will need to write code which 12 | uses a worker. You can also extend the functionality of a worker or a controller by 13 | implementing your own. Platoon provides helper classes to 14 | facilitate this. 15 | 16 | This framework is under development. Its interface is not polished and it is 17 | likely to undergo changes in the future. 18 | 19 | The framework provides two separate worker interfaces that allow the user to implement 20 | multiple data-parallel algorithms: *param_sync* and *all_reduce*. The default interface 21 | is *param_sync*. Installing the optional dependencies listed in the features table below 22 | will make the *all_reduce* interface available as well. 23 | 24 | Interface | sync type | multi-node | Theano Ops | extra dependencies 25 | ----------|-----------|-----------------------------|------------|-------------------- 26 | param_sync| sync/async| no | no | no 27 | all_reduce| sync only | yes (if [mpi4py](https://github.com/mpi4py/mpi4py) is installed)| yes | [NCCL](https://github.com/NVIDIA/nccl), [pygpu](https://github.com/Theano/libgpuarray), [Theano](https://github.com/Theano/Theano) 28 | 29 | There are currently two algorithms for distributed gradient descent implemented with 30 | the *param_sync* interface and three with the *all_reduce* interface. 31 | 32 | * **param_sync**: [EASGD](http://arxiv.org/abs/1412.6651) and ASGD. 33 | * **all_reduce**: Synchronous sum/average SGD, EASGD and a synchronous variant of [Downpour](http://research.google.com/archive/large_deep_networks_nips2012.html). 34 | 35 | There are working examples in the `example` directory. 36 | 37 | The steps below describe what needs to be done to use Platoon for 38 | data-parallelism. The LSTM example in the `example` folder was implemented 39 | following these steps and should be referred to for guidance. 40 | 41 | 42 | ## Install 43 | You can simply install it using pip: 44 | `pip install git+https://github.com/mila-udem/platoon` 45 | 46 | 47 | If you would like to use the examples or help develop Platoon, first clone the repo: 48 | 49 | `git clone https://github.com/mila-udem/platoon` 50 | 51 | Then install what you just cloned: 52 | 53 | `pip install -e <path-to-cloned-repo>` 54 | 55 | 56 | ## Usage 57 | The simplest way to launch a multi-GPU experiment is to first implement a controller and a worker as described below and then launch them using `platoon-launcher`. A controller file is not required if you only want 58 | to use the existing controller functionality. 59 | 60 | The launcher assumes that the two files are named `<experiment_name>_controller.py` and `<experiment_name>_worker.py`. 61 | 62 | To launch the experiment, you only need to specify the experiment name and the GPUs you want to use: 63 | 64 | `platoon-launcher <experiment_name> -D gpu0 gpu1` 65 | 66 | You can also omit the `-D` argument and let the launcher find all available CUDA GPUs to use 67 | in the single-node experiment: 68 | 69 | `platoon-launcher <experiment_name>` 70 | 71 | For more configuration options, see `platoon-launcher -h`.
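Before the step-by-step instructions, here is a minimal, illustrative controller/worker pair. It is only a sketch of the call pattern used by the bundled LSTM example, not a drop-in script: `build_model()` and `train_some_minibatches()` are placeholders for your own code, the 'next'/'train'/'stop' protocol is deliberately simplified, the worker-side request helper is assumed here to be named `send_req`, and the exact arguments of `init_shared_params()`/`sync_params()` should be checked against the LSTM worker.

```python
# <experiment_name>_controller.py -- sketch only
import sys
from platoon.channel import Controller


class MyController(Controller):
    """Replies 'train' a fixed number of times, then 'stop'."""

    def __init__(self, max_steps, default_args):
        super(MyController, self).__init__(**default_args)
        self.max_steps = max_steps
        self.steps = 0

    def handle_control(self, req, worker_id, req_info):
        # `req` is whatever a worker sent; reply with its next action.
        if req == 'next':
            self.steps += 1
            return 'train' if self.steps <= self.max_steps else 'stop'
        return ""


if __name__ == '__main__':
    parser = Controller.default_parser()
    parser.add_argument('--max-steps', type=int, default=1000)
    args = parser.parse_args()
    controller = MyController(args.max_steps,
                              default_args=Controller.default_arguments(args))
    sys.exit(controller.serve())
```

The matching worker repeatedly asks the controller what to do, trains, and keeps its parameters in sync:

```python
# <experiment_name>_worker.py -- sketch only
from platoon.channel import Worker

worker = Worker(control_port=5567)    # same control port as the controller
params = build_model()                # placeholder: your shared parameters
worker.init_shared_params(params)     # schematic; see the LSTM worker for
                                      # the exact arguments (e.g. a sync rule)

while True:
    step = worker.send_req('next')    # assumed name of the request helper
    if step == 'stop':
        break
    train_some_minibatches()          # placeholder: your training code
    # ... report progress to the controller following its protocol ...
    worker.sync_params()              # synchronize with the central parameters
worker.close()
```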
72 | 73 | 74 | ### Implementing a controller 75 | These steps describe how to implement the Python script that will launch 76 | your controller. In the included LSTM example, both of these steps are done 77 | in the file `lstm_controller.py`. 78 | 79 | 1) Define which commands your controller can receive and how it responds to 80 | them. Commands starting with "platoon-" are reserved by Platoon. 81 | 82 | This is done by creating a new class that inherits from `channel.Controller` 83 | and having it override the method `handle_control()`, which will be called 84 | whenever your controller receives a request from a worker. 85 | 86 | 2) Instantiate and launch your custom controller. 87 | 88 | Create a script that will instantiate your custom controller. Once this is 89 | done, define the port on which the controller should listen by calling the 90 | function `init_control`. Finally, call your controller's `serve` method, which 91 | makes it ready to receive requests from workers. 92 | 93 | ### Implementing the workers 94 | These steps describe how to start with a script that performs stand-alone 95 | training of a machine learning model and adapt it to serve as a worker in 96 | Platoon. 97 | 98 | 1) Add a new parameter to the script which will be used during execution to 99 | know whether the worker is the first one to be launched and should create the 100 | central parameters or not. 101 | 102 | 2) Before entering the main loop, the script must create an instance of the 103 | class `channel.Worker`, providing it with the same port number as used to 104 | initialize the controller. It is not necessary to sub-class Worker; you can 105 | instantiate it directly. This object provides the necessary methods to 106 | handle communication with the controller. 107 | 108 | 3) After the model has been built and the parameters initialized, 109 | initialize the central parameters by calling the Worker's 110 | `init_shared_params()` method. Every worker should call this method. 111 | 112 | 4) In the main loop, instead of deciding when to train and when to monitor 113 | performance, the worker should send control requests to the controller to know 114 | what action it should take, according to the communication protocol 115 | established in the controller's `handle_control()` method. 116 | 117 | 5) In the main loop, whenever the worker has performed `N` (a hyper-parameter) 118 | iterations of training, it should synchronize its parameters with the central 119 | parameters using its Worker's `sync_params()` method. 120 | 121 | 122 | ### Real usage considerations 123 | The optimal (as in most efficient for learning) hyper-parameter values 124 | depend on the number of workers. At a minimum, consider tuning the 125 | learning rate and the alpha parameter of EASGD. 126 | 127 | How to choose the alpha hyper-parameter isn't clear. An alpha of 0.5 128 | for the LSTM example with 2 workers seems to give good training 129 | efficiency for this model/dataset/hyper-parameter combination. 130 | 131 | Using alpha = 1/N (with N being the number of workers) might be a 132 | reasonable guideline, but the experiments performed with Platoon are 133 | insufficient to conclude anything. 134 | 135 | In the EASGD paper it is shown that in some cases a larger number of 136 | workers can result in a better test error. 137 | 138 | ## Examples 139 | For the *param sync* interface, see the `example/lstm/` [folder](https://github.com/mila-udem/platoon/tree/master/example/lstm).
140 | 141 | For *all reduce* interface, see `example/synchronous_lstm/` [folder](https://github.com/mila-udem/platoon/tree/master/example/synchronous_lstm). 142 | 143 | -------------------------------------------------------------------------------- /platoon/tests/functional/test_ops_worker.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function, division 2 | import os 3 | import sys 4 | 5 | import unittest 6 | 7 | import theano 8 | from theano import config 9 | import numpy as np 10 | 11 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..')) 12 | from platoon import Worker 13 | from platoon import ops 14 | 15 | 16 | class TestOpsWorker(unittest.TestCase): 17 | @classmethod 18 | def setUpClass(cls): 19 | try: 20 | cls.total_nw = int(os.environ['PLATOON_TEST_WORKERS_NUM']) 21 | cls.worker = Worker(control_port=5567) 22 | except Exception as exc: 23 | print(exc, file=sys.stderr) 24 | raise exc 25 | 26 | def setUp(self): 27 | super(TestOpsWorker, self).setUp() 28 | SEED = 567 29 | np.random.seed(SEED) 30 | self.inp = 30 * np.random.random((100, 400, 200)).astype(config.floatX) 31 | self.sinp = theano.shared(self.inp) 32 | self.out = np.empty_like(self.inp) 33 | self.sout = theano.shared(self.out) 34 | 35 | def test_all_reduce_sum(self): 36 | res = ops.AllReduceSum(self.sinp) 37 | f = theano.function([], [], updates=[(self.sout, res)], 38 | profile=True) 39 | expected = self.total_nw * self.inp 40 | f() 41 | actual = self.sout.get_value() 42 | assert np.allclose(expected, actual) 43 | 44 | # This is faster, because it runs inplace! 45 | res = ops.AllReduceSum(self.sinp, self.sout) 46 | f = theano.function([], [], updates=[(self.sout, res)], 47 | accept_inplace=True, profile=True) 48 | expected = self.total_nw * self.inp 49 | f() 50 | actual = self.sout.get_value() 51 | assert np.allclose(expected, actual) 52 | 53 | x = theano.tensor.scalar(dtype=config.floatX) 54 | res = ops.AllReduceSum(self.sinp, self.sout) 55 | f = theano.function([x], [], updates=[(self.sout, res / x)], 56 | accept_inplace=True, profile=True) 57 | expected = self.total_nw * self.inp / 2 58 | f(2) 59 | actual = self.sout.get_value() 60 | assert np.allclose(expected, actual) 61 | expected = self.total_nw * self.inp / 3.14159 62 | f(3.14159) 63 | actual = self.sout.get_value() 64 | assert np.allclose(expected, actual) 65 | 66 | x = theano.tensor.scalar(dtype=config.floatX) 67 | self.sinp *= x 68 | res = ops.AllReduceSum(self.sinp, self.sout) 69 | f = theano.function([x], [], updates=[(self.sout, res)], 70 | accept_inplace=True, profile=True) 71 | expected = self.total_nw * self.inp * 2 72 | f(2) 73 | actual = self.sout.get_value() 74 | assert np.allclose(expected, actual) 75 | expected = self.total_nw * self.inp * 3.14159 76 | f(3.14159) 77 | actual = self.sout.get_value() 78 | assert np.allclose(expected, actual) 79 | 80 | def test_all_reduce_sum_inplace(self): 81 | res = ops.AllReduceSum(self.sinp, inplace=True) 82 | f = theano.function([], [], updates=[(self.sinp, res)], 83 | accept_inplace=True, profile=True) 84 | expected = self.total_nw * self.inp 85 | f() 86 | actual = self.sinp.get_value() 87 | assert np.allclose(expected, actual) 88 | 89 | def test_all_reduce_prod(self): 90 | res = ops.AllReduceProd(self.sinp) 91 | f = theano.function([], [], updates=[(self.sout, res)], 92 | profile=True) 93 | expected = self.inp ** self.total_nw 94 | f() 95 | actual = self.sout.get_value() 96 | assert np.allclose(expected, 
actual) 97 | 98 | # This is faster, because it runs inplace! 99 | res = ops.AllReduceProd(self.sinp, self.sout) 100 | f = theano.function([], [], updates=[(self.sout, res)], 101 | accept_inplace=True, profile=True) 102 | expected = self.inp ** self.total_nw 103 | f() 104 | actual = self.sout.get_value() 105 | assert np.allclose(expected, actual) 106 | 107 | def test_all_reduce_prod_inplace(self): 108 | res = ops.AllReduceProd(self.sinp, inplace=True) 109 | f = theano.function([], [], updates=[(self.sinp, res)], 110 | accept_inplace=True, profile=True) 111 | expected = self.inp ** self.total_nw 112 | f() 113 | actual = self.sinp.get_value() 114 | assert np.allclose(expected, actual) 115 | 116 | def test_all_reduce_maximum(self): 117 | res = ops.AllReduceMax(self.sinp) 118 | f = theano.function([], [], updates=[(self.sout, res)], 119 | profile=True) 120 | expected = self.inp 121 | f() 122 | actual = self.sout.get_value() 123 | assert np.allclose(expected, actual) 124 | 125 | # This is faster, because it runs inplace! 126 | res = ops.AllReduceMax(self.sinp, self.sout) 127 | f = theano.function([], [], updates=[(self.sout, res)], 128 | accept_inplace=True, profile=True) 129 | expected = self.inp 130 | f() 131 | actual = self.sout.get_value() 132 | assert np.allclose(expected, actual) 133 | 134 | def test_all_reduce_max_inplace(self): 135 | res = ops.AllReduceMax(self.sinp, inplace=True) 136 | f = theano.function([], [], updates=[(self.sinp, res)], 137 | accept_inplace=True, profile=True) 138 | expected = self.inp 139 | f() 140 | actual = self.sinp.get_value() 141 | assert np.allclose(expected, actual) 142 | 143 | def test_all_reduce_minimum(self): 144 | res = ops.AllReduceMin(self.sinp) 145 | f = theano.function([], [], updates=[(self.sout, res)], 146 | profile=True) 147 | expected = self.inp 148 | f() 149 | actual = self.sout.get_value() 150 | assert np.allclose(expected, actual) 151 | 152 | # This is faster, because it runs inplace! 153 | res = ops.AllReduceMin(self.sinp, self.sout) 154 | f = theano.function([], [], updates=[(self.sout, res)], 155 | accept_inplace=True, profile=True) 156 | expected = self.inp 157 | f() 158 | actual = self.sout.get_value() 159 | assert np.allclose(expected, actual) 160 | 161 | def test_all_reduce_min_inplace(self): 162 | res = ops.AllReduceMin(self.sinp, inplace=True) 163 | f = theano.function([], [], updates=[(self.sinp, res)], 164 | accept_inplace=True, profile=True) 165 | expected = self.inp 166 | f() 167 | actual = self.sinp.get_value() 168 | assert np.allclose(expected, actual) 169 | 170 | def test_on_diferent_types(self): 171 | tmp = np.empty_like(self.inp, dtype='int32') 172 | stmp = theano.shared(tmp) 173 | self.assertRaises(TypeError, ops.AllReduceSum, self.sinp, stmp) 174 | 175 | @classmethod 176 | def tearDownClass(cls): 177 | cls.worker.close() 178 | 179 | if __name__ == '__main__': 180 | suite = unittest.TestLoader().loadTestsFromTestCase(TestOpsWorker) 181 | res = unittest.TextTestRunner(verbosity=1).run(suite) 182 | if len(res.failures) != 0 or len(res.errors) != 0: 183 | sys.exit(1) 184 | -------------------------------------------------------------------------------- /platoon/ops.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | :mod:`ops` -- Theano Ops for Worker interface 4 | ============================================= 5 | 6 | .. module:: ops 7 | :platform: Unix 8 | :synopsis: Contains AllReduce Theano Op and builder function for each 9 | reduce operation type. 
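Example usage, adapted from the functional tests (a ``platoon.Worker``
must already exist in the calling process):

    res = AllReduceSum(sinp, sout)   # sinp, sout: GPU shared variables
    fn = theano.function([], [], updates=[(sout, res)], accept_inplace=True)
    fn()                             # sout now holds the sum over all workers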
10 | 11 | """ 12 | from __future__ import absolute_import, print_function 13 | import sys 14 | 15 | from six.moves import xrange 16 | 17 | try: 18 | import theano 19 | from theano.gradient import grad_not_implemented 20 | from theano.gpuarray.basic_ops import as_gpuarray_variable 21 | except ImportError as exc: 22 | print("ERROR! On {}:".format(__name__), exc, file=sys.stderr) 23 | theano = None 24 | 25 | from .channel.worker import Worker 26 | 27 | 28 | if theano: 29 | class AllReduce(theano.Op): 30 | """Wrapper of :class:`channel.worker.Worker`. 31 | 32 | For full documentation, see builder functions: 33 | * :func:`AllReduceSum` 34 | * :func:`AllReduceProd` 35 | * :func:`AllReduceMax` 36 | * :func:`AllReduceMin` 37 | 38 | :param scalar_op: Representation of collective reduce operation type. 39 | :type scalar_op: {str, :ref:`theano.scalar.add`, :ref:`theano.scalar.mul`, 40 | :ref:`theano.scalar.maximum`, :ref:`theano.scalar.minimum`} 41 | 42 | .. seealso:: module :mod:`channel.worker` 43 | 44 | .. versionadded:: 0.6.0 45 | 46 | """ 47 | __props__ = ("scalar_op", ) 48 | 49 | def __init__(self, scalar_op, inplace=False, worker=None): 50 | if worker is not None: 51 | if isinstance(worker, Worker): 52 | self.worker = worker 53 | else: 54 | raise TypeError("Argument `worker` is not of platoon.Worker type.") 55 | else: 56 | try: 57 | self.worker = Worker() # Get singleton instance 58 | except TypeError: 59 | raise AttributeError("Worker instance has not been created yet.") 60 | # This is because I have not found a way to use half-types through MPI 61 | self._f16_ok = not self.worker._multinode 62 | self.scalar_op = scalar_op 63 | self.inplace = inplace 64 | 65 | def __str__(self): 66 | if self.inplace: 67 | return "AllReduce{%s,inplace}" % (str(self.scalar_op).capitalize()) 68 | else: 69 | return "AllReduce{%s,no_inplace}" % (str(self.scalar_op).capitalize()) 70 | 71 | def make_node(self, src, dest=None): 72 | if dest is None: 73 | inputs = [src] 74 | if self.inplace: 75 | self.inplace_pattern = {0: 0} 76 | else: 77 | self.inplace_pattern = {} 78 | else: 79 | inputs = [src, dest] 80 | self.inplace = True 81 | self.inplace_pattern = {0: 1} 82 | self.destroy_map = dict((o, [i]) for o, i in self.inplace_pattern.items()) 83 | inputs = [as_gpuarray_variable(i, self.worker.ctx_name) for i in inputs] 84 | if dest is not None: 85 | if not inputs[0].type == inputs[1].type: 86 | raise TypeError("`src` and `dest` must have the same Type:", 87 | (inputs[0].type, inputs[1].type)) 88 | out_type = inputs[0].type.clone() 89 | return theano.Apply(self, inputs, [out_type()]) 90 | 91 | def infer_shapes(self, node, shapes): 92 | return [shapes[0]] 93 | 94 | def perform(self, node, inputs, outputs): 95 | out = outputs[0] 96 | src = inputs[0] 97 | if len(node.inputs) == 2: # If inplace op 98 | dest = inputs[1] 99 | self.worker.all_reduce(src, str(self.scalar_op), dest) 100 | out[0] = dest 101 | elif self.inplace: 102 | self.worker.all_reduce(src, str(self.scalar_op), src) 103 | out[0] = src 104 | else: 105 | out[0] = self.worker.all_reduce(src, str(self.scalar_op)) 106 | 107 | def grad(self, inputs, ograds): 108 | return [grad_not_implemented(self, i, inputs[i]) for i in xrange(len(inputs))] 109 | 110 | def AllReduceSum(src, dest=None, inplace=False, worker=None): 111 | """ 112 | Element-wise sum of `src` GPU tensor across all 113 | Platoon worker processes. 114 | 115 | Parameters 116 | ---------- 117 | src : GPU tensor (array-like) 118 | Input array. 
119 | dest : GPU tensor (array-like), optional 120 | Output array. If None (default) is given, then an GPU array-like 121 | will be returned with result, which has the same shape and datatype 122 | as `src`. 123 | inplace : bool, optional 124 | If True, then operation will happen inplace and the result will be 125 | written in array `src`. 126 | worker : :class:`channel.worker.Worker`, optional 127 | Platoon Worker instance unique to a single process which will be used 128 | to execute the operation. If None (default) is given, the singleton 129 | instance will be used. 130 | 131 | Returns 132 | ------- 133 | result : GPU tensor (array-like) 134 | Result array will be `dest` if it was specified in the arguments, 135 | `src` if `inplace` is True, else a new variable which points to 136 | operation's result. 137 | 138 | Notes 139 | ----- 140 | * If `dest` is given, then the Op is inplace in Theano sense. 141 | * If a `worker` is not given, then a Worker instance must have been 142 | already instantiated. 143 | 144 | Raises 145 | ------ 146 | TypeError 147 | If `worker` specified is not of type :class:`channel.worker.Worker` 148 | or if `src` and `dest` are not of the same Theano Type. 149 | AttributeError 150 | If singleton Worker has not been instantiated yet. 151 | 152 | .. versionadded:: 0.6.0 153 | 154 | """ 155 | return AllReduce(theano.scalar.add, inplace, worker)(src, dest) 156 | 157 | def AllReduceProd(src, dest=None, inplace=False, worker=None): 158 | """ 159 | Element-wise multiplication of `src` GPU tensor across all 160 | Platoon worker processes. 161 | 162 | .. seealso:: 163 | Function :func:`AllReduceSum` 164 | For documentation on parameters, return variables, notes and 165 | raises. 166 | 167 | .. versionadded:: 0.6.0 168 | 169 | """ 170 | return AllReduce(theano.scalar.mul, inplace, worker)(src, dest) 171 | 172 | def AllReduceMax(src, dest=None, inplace=False, worker=None): 173 | """ 174 | Find element-wise maximum of `src` GPU tensor across all 175 | Platoon worker processes. 176 | 177 | .. seealso:: 178 | Function :func:`AllReduceSum` 179 | For documentation on parameters, return variables, notes and 180 | raises. 181 | 182 | .. versionadded:: 0.6.0 183 | 184 | """ 185 | return AllReduce(theano.scalar.maximum, inplace, worker)(src, dest) 186 | 187 | def AllReduceMin(src, dest=None, inplace=False, worker=None): 188 | """ 189 | Find element-wise minimum of `src` GPU tensor across all 190 | Platoon worker processes. 191 | 192 | .. seealso:: 193 | Function :func:`AllReduceSum` 194 | For documentation on parameters, return variables, notes and 195 | raises. 196 | 197 | .. versionadded:: 0.6.0 198 | 199 | """ 200 | return AllReduce(theano.scalar.minimum, inplace, worker)(src, dest) 201 | else: 202 | AllReduce = None 203 | AllReduceSum = None 204 | AllReduceProd = None 205 | AllReduceMax = None 206 | AllReduceMin = None 207 | -------------------------------------------------------------------------------- /scripts/platoon-launcher: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | platoon-launcher 5 | 6 | This file serves as an executable for launching a training procedure with 7 | Platoon. Depending on the given arguments or configuration, the training will 8 | start in a single machine or multiple hosts. Execute `platoon-launcher -h` to 9 | see instructions or read the docs. 
10 | 11 | Exit Codes 12 | ---------- 13 | 0: Success 14 | 1: A worker or controller has exited with non-success status 15 | 2: False arguments 16 | 3: Subprocess or OS errors 17 | 4: Other error 18 | 19 | """ 20 | 21 | from __future__ import print_function 22 | import os 23 | import sys 24 | import subprocess 25 | import signal 26 | import time 27 | import shlex 28 | import argparse 29 | import textwrap 30 | 31 | from platoon.util import launch_process 32 | from platoon import configparser 33 | 34 | 35 | def parse_arguments(): 36 | parser = argparse.ArgumentParser( 37 | formatter_class=argparse.RawDescriptionHelpFormatter, 38 | description=textwrap.dedent('''\ 39 | ################################################################################ 40 | # Launcher for Platoon multi-GPU/node training framework # 41 | ################################################################################ 42 | Platoon will train your Theano models using multiple GPUs even if they do not 43 | reside in the same host. 44 | 45 | In order to use it, a worker file needs to be provided. A worker file defines 46 | the training process of a single set of model parameters in a parallel and 47 | distributed manner. Optionally and in case you want to extend the distributed 48 | computation capabilities of the training process, you are encouraged to provide 49 | a controller file which extends the default one (`platoon.controller` module) in 50 | this framework. 51 | 52 | Platoon is configured through the command-line arguments of this launcher and in 53 | case of their absence (or if it needed) through environmental variables or 54 | Platoon configuration files. Please read `platoonrc.conf` in package's root 55 | directory to learn about every way that Platoon can be configured. 56 | 57 | If single-node is explicitly specified through command-line arguments, the 58 | specified devices will be used in the GPU communicator world in the order they 59 | are parsed. The same thing applies also for lists of devices found in Platoon 60 | environmentals or configuration files. 61 | 62 | e.g. usage: platoon-launcher lstm -D cuda0 cuda3 (explicit config) 63 | platoon-launcher lstm (config with envs/files - may be multi-node) 64 | 65 | If multi-node is explicitly specified through command-line arguments, extra 66 | configuration through appropriate environmentals per host or files needs to be 67 | done in order to describe which devices will be used in each host. Host names 68 | are given the same way they are given in MPI's `mpirun`. 69 | 70 | e.g. usage: platoon-launcher lstm -H lisa0 lisa1 71 | (gpus on lisa0 and gpus on lisa1) 72 | 73 | NOTIFICATION: This launcher is used to set up the new worker interface (the old 74 | is still usable - but not in multi-node configs). The new worker interface 75 | supports only CUDA devices currently. NVIDIA's "NCCL" collectives library and 76 | "pygpu" are required for multi-GPU, while "mpi4py" is required in addition for 77 | multi-node.''')) 78 | parser.add_argument('experiment_name', help='The name of your experiment. The launcher will expect to find the files _worker.py and optionally _controller.py.') 79 | single_or_multi = parser.add_mutually_exclusive_group(required=False) 80 | single_or_multi.add_argument('-D', '--devices', nargs='+', type=str, metavar='devname', 81 | required=False, help='List of Theano device names (e.g. gpu0 or cuda1). Each device will be assigned to a separate worker. 
If this option is specified, experiment will be run in a single node.') 82 | single_or_multi.add_argument('-H', '--hosts', nargs='+', type=str, metavar='hostname', 83 | required=False, help='List of host names to participate in multi-node training. Each host will be assigned to a separate controller. If this option is specified, experiment will be run in multiple nodes.') 84 | parser.add_argument('-c', '--controller-args', required=False, help='The arguments that will be passed to your controllers. (Ex: -c="--sync_rule EASGD")') 85 | parser.add_argument('-w', '--worker-args', required=False, help='The arguments that will be passed to your workers. (Ex: -w="learning_rate=0.1")') 86 | 87 | return parser.parse_args() 88 | 89 | if __name__ == '__main__': 90 | args = parse_arguments() 91 | 92 | logs_folder = os.path.join("PLATOON_LOGS", args.experiment_name, time.strftime("%Y-%m-%d_%H-%M-%S")) 93 | os.makedirs(logs_folder) 94 | 95 | print("### Launching experiment: {}".format(args.experiment_name)) 96 | 97 | # check for worker executable, else fail 98 | if not os.path.isfile("./{}_worker.py".format(args.experiment_name)): 99 | print("\nERROR! Cannot find worker executable: {}_worker.py".format(args.experiment_name)) 100 | sys.exit(2) 101 | # check for custom controller executable, else use default 102 | if os.path.isfile("./{}_controller.py".format(args.experiment_name)): 103 | controller_type = args.experiment_name 104 | else: 105 | controller_type = "platoon" 106 | 107 | # If not specified in launcher, check for other configuration types 108 | if args.hosts is None: 109 | try: 110 | hosts = configparser.fetch_hosts() 111 | except KeyError: 112 | hosts = None 113 | else: 114 | hosts = args.hosts 115 | 116 | # Check if we run on multi-node 117 | if hosts and len(hosts) > 1: 118 | print("### Starting multi-node/gpu training on: {} ...".format(' '.join(hosts)), end=' ') 119 | log_file = os.path.join(logs_folder, "multi-node-controllers.{}") 120 | env = dict(os.environ) 121 | theano_flags = "THEANO_FLAGS={0},device={1}".format(env.pop('THEANO_FLAGS', ''), "cpu") 122 | command = ["mpirun"] 123 | command += ["--output-filename", log_file.format("log")] 124 | command += ["-mca", "mpi_warn_on_fork", "0"] 125 | command += ["-np", str(len(hosts))] 126 | command += ["-H", ','.join(hosts)] 127 | command += ["--map-by", "ppr:1:node"] 128 | command += shlex.split("-x " + " -x ".join(env.keys()) + " -x " + theano_flags) 129 | if controller_type == "platoon": 130 | executable = ["-m", "platoon.channel.controller"] 131 | else: 132 | executable = ["{}_controller.py".format(controller_type)] 133 | command += [sys.executable, "-u"] + executable 134 | command += [args.experiment_name, logs_folder, "--multi"] 135 | if args.controller_args: 136 | command += shlex.split(args.controller_args) 137 | if args.worker_args: 138 | command += ["-w", args.worker_args] 139 | try: 140 | p = subprocess.Popen(command) 141 | except OSError as exc: 142 | print("\nERROR! OS error in Popen: {}".format(exc)) 143 | sys.exit(3) 144 | except Exception as exc: 145 | print("\nERROR! Other in Popen: {}".format(exc)) 146 | sys.exit(4) 147 | print("Done") 148 | experiment_type = "Multi-node Controllers" 149 | else: 150 | print("### Starting single-node multi-gpu training") 151 | if hosts: 152 | import socket 153 | hostname = socket.gethostname() 154 | if hosts[0] != hostname: 155 | print("\nERROR! 
A single host '{0}' was specified which is not " 156 | "the same as the current host '{1}'.\nThis is not currently " 157 | "supported.".format(hosts[0], hostname)) 158 | sys.exit(2) 159 | controller_args = [args.experiment_name, logs_folder, '--single'] 160 | if args.devices: 161 | controller_args += ['-D'] 162 | controller_args += args.devices 163 | if args.controller_args: 164 | controller_args += shlex.split(args.controller_args) 165 | if args.worker_args: 166 | controller_args += ["-w={}".format(args.worker_args)] 167 | try: 168 | p = launch_process(logs_folder, controller_type, controller_args, "cpu", "controller") 169 | except OSError as exc: 170 | print("\nERROR! OS error in Popen: {}".format(exc)) 171 | sys.exit(3) 172 | except Exception as exc: 173 | print("\nERROR! Other while launching process: {}".format(exc)) 174 | sys.exit(4) 175 | experiment_type = "Single-node Controller" 176 | 177 | print("\n### Logs folder ###\n{}".format(logs_folder)) 178 | print("\n### Waiting on experiment to finish ...") 179 | try: 180 | try: 181 | pid, status = os.waitpid(p.pid, 0) 182 | except OSError as exc: 183 | print("\nERROR! OS error: {}".format(exc)) 184 | sys.exit(3) 185 | if pid != p.pid: 186 | print("\nWARNING! Received status for unknown process {}".format(pid)) 187 | sys.exit(3) 188 | if os.WIFEXITED(status): 189 | rcode = os.WEXITSTATUS(status) 190 | print("## {0} terminated with return code: {1}.".format(experiment_type, rcode)) 191 | if rcode != 0: 192 | print("\nERROR! An error has occured.\nSee logs for more info.") 193 | sys.exit(1) 194 | else: 195 | print("\nSUCCESS! Training with Platoon has finished.") 196 | else: 197 | print("\nWARNING! {} changed status but has not exited.".format(experiment_type)) 198 | raise RuntimeError("ERROR! Unexpected controller status change.") 199 | except (RuntimeError, KeyboardInterrupt) as exc: 200 | print(exc) 201 | print("Killing controller processes...") 202 | try: 203 | os.kill(p.pid, signal.SIGTERM) 204 | pid, status = os.waitpid(p.pid, 0) 205 | except OSError as exc: 206 | print("\nERROR! OS error: {}".format(exc)) 207 | sys.exit(3) 208 | -------------------------------------------------------------------------------- /platoon/training/global_dynamics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | :mod:`training.global_dynamics` -- Collection of global SGD strategies 4 | ====================================================================== 5 | 6 | .. module:: global_dynamics 7 | :platform: Unix 8 | :synopsis: Contains :class:`GlobalDynamics` base class for synchronous 9 | global gradient descents and implementation of various techniques 10 | using Platoon's :class:`channel.worker.Worker`'s 11 | :meth:`channel.worker.Worker.all_reduce` interface. 12 | 13 | Implementations 14 | --------------- 15 | * *:class:`SGD`* : Synchronous variant of Stochastic Gradient Descent for many 16 | descending particles. 17 | * *:class:`EASGD`* : Elastic Averaging Stochastic Gradient Descent (synchronous) 18 | * *:class:`Downpour`* : A synchronous variant of Downpour 19 | 20 | """ 21 | from __future__ import absolute_import, division 22 | 23 | from ..channel.worker import Worker 24 | from ..ops import AllReduceSum 25 | 26 | 27 | class GlobalDynamics(object): 28 | """Abstract class which declares the methods and properties that need to 29 | be implemented by a synchronous global dynamics rule. 
30 | 31 | Parameters 32 | ---------- 33 | worker : :class:`channel.Worker`, optional 34 | A reference to Worker's instance 35 | 36 | .. versionadded:: 0.6.0 37 | 38 | """ 39 | def __init__(self, worker=None): 40 | self._worker = None 41 | if worker is not None: 42 | self.worker = worker 43 | self._fn = None 44 | 45 | def __call__(self): 46 | if self._fn is None: 47 | raise NotImplementedError("Functionality has not been specified.\n" 48 | "Please use {} method to setup GlobalDynamics" 49 | "for a set of Variables\nor supply your own" 50 | "using {} method.".format( 51 | repr(self.make_rule), repr(self.fn))) 52 | self._fn() 53 | 54 | @property 55 | def worker(self): 56 | """Worker class instance used for global operations""" 57 | if self._worker is None: 58 | try: 59 | self._worker = Worker() # Draw singleton instance 60 | except TypeError: 61 | raise AttributeError("Worker instance has not been created yet.") 62 | return self._worker 63 | 64 | @worker.setter 65 | def worker(self, inst): 66 | if not isinstance(inst, Worker): 67 | raise TypeError("Argument `inst` is not of platoon.Worker type.") 68 | self._worker = inst 69 | 70 | def register_fn(self, fun): 71 | """Internal function implementing global dynamics. Does not accept 72 | parameters. Global optimization must be done through shared variables. 73 | 74 | The responsibility for supplying a valid internal function falls to the 75 | user. It must be able to be called like this: ``fun()``. Also in order 76 | to serve its purpose, it needs to have multi-GPU or even multi-node 77 | functionality. As a result, a :class:`channel.Worker` or other interface 78 | need to be used. 79 | 80 | :param fun: Implements global dynamics by using information 81 | from many workers. 82 | :type fun: callable 83 | 84 | """ 85 | if not hasattr(fun, '__call__'): 86 | raise TypeError("Supplied object is not a callable.") 87 | self._fn = fun 88 | 89 | def make_rule(self, *args): 90 | """ 91 | Create :class:`GlobalDynamics` optimization function for 92 | local data in `args`. 93 | 94 | Implementation in a child class must return a callable object which 95 | expects no arguments. User must be careful to create a function which 96 | uses shared objects in order to update local model parameters, such as 97 | Theano Shared Variables. 98 | 99 | Notes 100 | ----- 101 | For better performance, try to batch together in the same 102 | :ref:`theano.compile.SharedVariable` as many model parameter arrays as 103 | possible. This reduces the number of calls and utilizes the most out of 104 | the underlying algorithms. One way to do this is to create one c 105 | contiguous array that contains every set (matrix) of model parameters 106 | along the first dimension. Then in order to use each set separately, 107 | create as many view arrays as the number of sets of model parameters, 108 | i.e. the length of the first dimension. Use the whole array as an input 109 | to the :meth:`make_rule` function! 110 | 111 | """ 112 | raise NotImplementedError(self.make_rule.__doc__) 113 | 114 | 115 | class _GlobalDynamicsNoSet(GlobalDynamics): 116 | def register_fn(self, fun): 117 | raise AttributeError("Cannot set internal function. Use {} method.".format( 118 | repr(self.make_rule))) 119 | 120 | 121 | class SGD(_GlobalDynamicsNoSet): 122 | """Synchronous Stochastic Gradient Descent: 123 | 124 | It sums or averages model parameter updates found separately (and 125 | concurrently) by workers which are training on (different) random 126 | mini-batches of a dataset. 
127 | 128 | Parameters 129 | ---------- 130 | average : bool, optional 131 | If True, it will normalize the summation of model param updates across 132 | all workers with the number of workers participating in optimization. 133 | worker : :class:`channel.Worker` 134 | See :class:`GlobalDynamics`. 135 | 136 | .. versionadded:: 0.6.0 137 | 138 | """ 139 | def __init__(self, average=False, worker=None): 140 | self.average = average 141 | super(SGD, self).__init__(worker) 142 | 143 | def make_rule(self, local_updates): 144 | """Makes global synchronous SGD rule for the parameters in `local_updates`. 145 | 146 | Parameters 147 | ---------- 148 | local_updates : {:ref:`theano.compile.SharedVariable`, 149 | list of :ref:`theano.compile.SharedVariable`} 150 | These variables represent the updates found 151 | by local optimization dynamics on the model's parameters. 152 | 153 | .. seealso:: Notes on :meth:`GlobalDynamics.make_rule` 154 | 155 | """ 156 | import theano 157 | if isinstance(local_updates, theano.compile.SharedVariable): 158 | local_updates = [local_updates] 159 | global_updates = [] 160 | for update in local_updates: 161 | gup = AllReduceSum(update, inplace=True) 162 | if self.average: 163 | gup /= self.worker.global_size 164 | global_updates.append(gup) 165 | self._fn = theano.function([], [], 166 | updates=list(zip(local_updates, global_updates)), 167 | accept_inplace=True) 168 | 169 | 170 | def SumSGD(worker=None): 171 | """Synchronous Stochastic Gradient Descent: summing version 172 | 173 | .. seealso:: Class :class:`SGD` 174 | .. versionadded:: 0.6.0 175 | 176 | """ 177 | return SGD(average=False, worker=worker) 178 | 179 | 180 | def AverageSGD(worker=None): 181 | """Synchronous Stochastic Gradient Descent: averaging version 182 | 183 | .. seealso:: Class :class:`SGD` 184 | .. versionadded:: 0.6.0 185 | 186 | """ 187 | return SGD(average=True, worker=worker) 188 | 189 | 190 | class EASGD(_GlobalDynamicsNoSet): 191 | """Synchronous variant of Elastic Averaging Stochastic Gradient Descent 192 | 193 | This algorithm is described in more details in the following paper: 194 | http://arxiv.org/abs/1412.6651 195 | 196 | .. seealso:: Class :class:`GlobalDynamics` for parameters 197 | .. versionadded:: 0.6.0 198 | 199 | """ 200 | def make_rule(self, local_particle, central_particle, alpha): 201 | """Make EASGD rule. 202 | 203 | According to this rule, every N iterations, a worker synchronizes his 204 | parameters with the master parameters. This is done by moving each set of 205 | parameters toward the other by an amount proportional to the difference 206 | between the individual params (this proportion is parameterized by `alpha`). 207 | 208 | Parameters 209 | ---------- 210 | local_particle : {:ref:`theano.compile.SharedVariable`, 211 | list of :ref:`theano.compile.SharedVariable`} 212 | A particle's position in parameter space doing local SGD. 213 | central_particle : {:ref:`theano.compile.SharedVariable`, 214 | list of :ref:`theano.compile.SharedVariable`} 215 | Central particle's position in parameter space interacting with 216 | local particles. 217 | alpha: scalar 218 | "Elastic" force's coefficient 219 | 220 | .. note:: 221 | If `alpha` == 0 is used, there is no synchronization of the 222 | parameters meaning that each worker is independently training using SGD. 223 | 224 | .. 
seealso:: Notes on :meth:`GlobalDynamics.make_rule` 225 | 226 | """ 227 | import theano 228 | if isinstance(local_particle, theano.compile.SharedVariable): 229 | local_particle = [local_particle] 230 | if isinstance(central_particle, theano.compile.SharedVariable): 231 | central_particle = [central_particle] 232 | self.alpha = alpha 233 | 234 | new_local = [] 235 | new_central = [] 236 | for local_position, central_position in zip(local_particle, central_particle): 237 | distance = local_position - central_position 238 | elastic_force = alpha * distance 239 | # Note: not equivalent to physical force as `elastic_force`:=Δx/Δt 240 | # and not Δp/Δt 241 | local_new_position = local_position - elastic_force 242 | total_elastic_force = AllReduceSum(elastic_force, inplace=True) 243 | central_new_position = central_position + total_elastic_force 244 | 245 | new_local.append(local_new_position) 246 | new_central.append(central_new_position) 247 | 248 | updates = list(zip(local_particle, new_local)) + \ 249 | list(zip(central_particle, new_central)) 250 | self._fn = theano.function([], [], updates=updates, accept_inplace=True) 251 | 252 | 253 | class Downpour(_GlobalDynamicsNoSet): 254 | """Synchronous variant of Downpour distributed optimization technique 255 | 256 | This algorithm is described in details in the following paper: 257 | http://research.google.com/archive/large_deep_networks_nips2012.html 258 | 259 | Parameters 260 | ---------- 261 | average : bool, optional 262 | If True, it will average the sum of locally accumulated parameter updates 263 | in every global update. 264 | worker : :class:`channel.Worker`, optional 265 | See :class:`GlobalDynamics`. 266 | 267 | .. versionadded:: 0.6.0 268 | 269 | """ 270 | def __init__(self, average=False, worker=None): 271 | self.average = average 272 | super(Downpour, self).__init__(worker) 273 | 274 | def make_rule(self, local_particle, local_acc_updates, global_particle): 275 | """Make Downpour rule. 276 | 277 | All particles along with the global particle start from the same 278 | position. According to this rule, each local particle executes descent 279 | normally but their parameter updates are accumulated (e.g. by moving 280 | average) to a variable. Every N iterations, the local accumulated 281 | updates are added together and applied to the global particle. Each 282 | local particle restarts from global particle's position. 283 | 284 | Parameters 285 | ---------- 286 | local_particle : {:ref:`theano.compile.SharedVariable`, 287 | list of :ref:`theano.compile.SharedVariable`} 288 | A particle's position in parameter space doing local SGD. 289 | local_acc_updates : {:ref:`theano.compile.SharedVariable`, 290 | list of :ref:`theano.compile.SharedVariable`} 291 | Shared variable accumulating local parameter updates. 292 | global_particle : {:ref:`theano.compile.SharedVariable`, 293 | list of :ref:`theano.compile.SharedVariable`} 294 | A particle whose position is updated only by the Downpour process and 295 | resets position of local particles. 296 | 297 | .. 
seealso:: Notes on :meth:`GlobalDynamics.make_rule` 298 | 299 | """ 300 | import theano 301 | from theano.tensor import basic 302 | if isinstance(local_particle, theano.compile.SharedVariable): 303 | local_particle = [local_particle] 304 | if isinstance(local_acc_updates, theano.compile.SharedVariable): 305 | local_acc_updates = [local_acc_updates] 306 | if isinstance(global_particle, theano.compile.SharedVariable): 307 | global_particle = [global_particle] 308 | 309 | new_global = [] 310 | new_local = [] 311 | new_acc_updates = [] 312 | for lp, lau, gp in zip(local_particle, local_acc_updates, global_particle): 313 | global_acc_updates = AllReduceSum(lau, inplace=True) 314 | if self.average: 315 | global_acc_updates /= self.worker.global_size 316 | new_global.append(gp + global_acc_updates) 317 | new_local.append(new_global[-1]) 318 | new_acc_updates.append(basic.zeros_like(lau)) 319 | 320 | updates = list(zip(local_particle, new_local)) + \ 321 | list(zip(local_acc_updates, new_acc_updates)) + \ 322 | list(zip(global_particle, new_global)) 323 | 324 | self._fn = theano.function([], [], updates=updates, accept_inplace=True) 325 | -------------------------------------------------------------------------------- /doc/platoon/control_request.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/synchronous_lstm/lstm_worker.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a tweet sentiment analyzer 3 | ''' 4 | from __future__ import absolute_import, print_function 5 | from collections import OrderedDict 6 | import sys 7 | import argparse 8 | 9 | import six 10 | from six import iteritems 11 | from six.moves import range 12 | 13 | import numpy 14 | import theano 15 | from theano import config 16 | import theano.tensor as tensor 17 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 18 | 19 | import os 20 | 21 | sys.path.append(os.path.dirname(__file__)) 22 | import imdb 23 | 24 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) 25 | from platoon.channel import Worker 26 | from platoon.training.global_dynamics import AverageSGD 27 | 28 | worker = None 29 | datasets = {'imdb': (imdb.load_data, imdb.prepare_data)} 30 | 31 | 32 | def numpy_floatX(data): 33 | return numpy.asarray(data, dtype=config.floatX) 34 | 35 | 36 | def get_minibatches_idx(n, minibatch_size, shuffle=False): 37 | """ 38 | Used to shuffle the dataset at each iteration. 39 | """ 40 | 41 | idx_list = numpy.arange(n, dtype="int32") 42 | 43 | if shuffle: 44 | numpy.random.shuffle(idx_list) 45 | 46 | minibatches = [] 47 | minibatch_start = 0 48 | for i in range(n // minibatch_size): 49 | minibatches.append(idx_list[minibatch_start: 50 | minibatch_start + minibatch_size]) 51 | minibatch_start += minibatch_size 52 | 53 | if (minibatch_start != n): 54 | # Make a minibatch out of what is left 55 | minibatches.append(idx_list[minibatch_start:]) 56 | 57 | return zip(range(len(minibatches)), minibatches) 58 | 59 | 60 | def get_dataset(name): 61 | return datasets[name][0], datasets[name][1] 62 | 63 | 64 | def zipp(params, tparams): 65 | """ 66 | When we reload the model. Needed for the GPU stuff. 67 | """ 68 | for kk, vv in iteritems(params): 69 | tparams[kk].set_value(vv) 70 | 71 | 72 | def unzip(zipped): 73 | """ 74 | When we pickle the model. Needed for the GPU stuff. 
75 | """ 76 | new_params = OrderedDict() 77 | for kk, vv in iteritems(zipped): 78 | new_params[kk] = vv.get_value() 79 | return new_params 80 | 81 | 82 | def dropout_layer(state_before, use_noise, trng): 83 | proj = tensor.switch(use_noise, 84 | (state_before * 85 | trng.binomial(state_before.shape, 86 | p=0.5, n=1, 87 | dtype=state_before.dtype)), 88 | state_before * 0.5) 89 | return proj 90 | 91 | 92 | def _p(pp, name): 93 | return '%s_%s' % (pp, name) 94 | 95 | 96 | def init_params(options): 97 | """ 98 | Global (not LSTM) parameter. For the embeding and the classifier. 99 | """ 100 | params = OrderedDict() 101 | # embedding 102 | randn = numpy.random.rand(options['n_words'], 103 | options['dim_proj']) 104 | params['Wemb'] = (0.01 * randn).astype(config.floatX) 105 | params = get_layer(options['encoder'])[0](options, 106 | params, 107 | prefix=options['encoder']) 108 | # classifier 109 | params['U'] = 0.01 * numpy.random.randn(options['dim_proj'], 110 | options['ydim']).astype(config.floatX) 111 | params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX) 112 | 113 | return params 114 | 115 | 116 | def load_params(path, params): 117 | pp = numpy.load(path) 118 | for kk, vv in iteritems(params): 119 | if kk not in pp: 120 | raise Warning('%s is not in the archive' % kk) 121 | params[kk] = pp[kk] 122 | 123 | return params 124 | 125 | 126 | def init_tparams(params): 127 | tparams = OrderedDict() 128 | for kk, pp in iteritems(params): 129 | tparams[kk] = theano.shared(params[kk], name=kk) 130 | return tparams 131 | 132 | 133 | def get_layer(name): 134 | fns = layers[name] 135 | return fns 136 | 137 | 138 | def ortho_weight(ndim): 139 | W = numpy.random.randn(ndim, ndim) 140 | u, s, v = numpy.linalg.svd(W) 141 | return u.astype(config.floatX) 142 | 143 | 144 | def param_init_lstm(options, params, prefix='lstm'): 145 | """ 146 | Init the LSTM parameter: 147 | 148 | :see: init_params 149 | """ 150 | W = numpy.concatenate([ortho_weight(options['dim_proj']), 151 | ortho_weight(options['dim_proj']), 152 | ortho_weight(options['dim_proj']), 153 | ortho_weight(options['dim_proj'])], axis=1) 154 | params[_p(prefix, 'W')] = W 155 | U = numpy.concatenate([ortho_weight(options['dim_proj']), 156 | ortho_weight(options['dim_proj']), 157 | ortho_weight(options['dim_proj']), 158 | ortho_weight(options['dim_proj'])], axis=1) 159 | params[_p(prefix, 'U')] = U 160 | b = numpy.zeros((4 * options['dim_proj'],)) 161 | params[_p(prefix, 'b')] = b.astype(config.floatX) 162 | 163 | return params 164 | 165 | 166 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): 167 | nsteps = state_below.shape[0] 168 | if state_below.ndim == 3: 169 | n_samples = state_below.shape[1] 170 | else: 171 | n_samples = 1 172 | 173 | assert mask is not None 174 | 175 | def _slice(_x, n, dim): 176 | if _x.ndim == 3: 177 | return _x[:, :, n * dim:(n + 1) * dim] 178 | return _x[:, n * dim:(n + 1) * dim] 179 | 180 | def _step(m_, x_, h_, c_): 181 | preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) 182 | preact += x_ 183 | 184 | i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) 185 | f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) 186 | o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj'])) 187 | c = tensor.tanh(_slice(preact, 3, options['dim_proj'])) 188 | 189 | c = f * c_ + i * c 190 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 191 | 192 | h = o * tensor.tanh(c) 193 | h = m_[:, None] * h + (1. 
- m_)[:, None] * h_ 194 | 195 | return h, c 196 | 197 | state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 198 | tparams[_p(prefix, 'b')]) 199 | 200 | dim_proj = options['dim_proj'] 201 | rval, updates = theano.scan(_step, 202 | sequences=[mask, state_below], 203 | outputs_info=[tensor.alloc(numpy_floatX(0.), 204 | n_samples, 205 | dim_proj), 206 | tensor.alloc(numpy_floatX(0.), 207 | n_samples, 208 | dim_proj)], 209 | name=_p(prefix, '_layers'), 210 | n_steps=nsteps) 211 | return rval[0] 212 | 213 | 214 | # ff: Feed Forward (normal neural net), only useful to put after lstm 215 | # before the classifier. 216 | layers = {'lstm': (param_init_lstm, lstm_layer)} 217 | 218 | 219 | def sgd(lr, tparams, grads, x, mask, y, cost): 220 | """ Stochastic Gradient Descent 221 | 222 | :note: A more complicated version of sgd then needed. This is 223 | done like that for adadelta and rmsprop. 224 | 225 | """ 226 | # New set of shared variable that will contain the gradient 227 | # for a mini-batch. 228 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 229 | for k, p in iteritems(tparams)] 230 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 231 | 232 | # Function that computes gradients for a mini-batch, but do not 233 | # updates the weights. 234 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 235 | name='sgd_f_grad_shared') 236 | 237 | pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] 238 | 239 | # Function that updates the weights from the previously computed 240 | # gradient. 241 | f_update = theano.function([lr], [], updates=pup, 242 | name='sgd_f_update') 243 | 244 | return f_grad_shared, f_update 245 | 246 | 247 | def adadelta(lr, tparams, grads, x, mask, y, cost): 248 | """ 249 | An adaptive learning rate optimizer 250 | 251 | Parameters 252 | ---------- 253 | lr : Theano SharedVariable 254 | Initial learning rate 255 | tpramas: Theano SharedVariable 256 | Model parameters 257 | grads: Theano variable 258 | Gradients of cost w.r.t to parameres 259 | x: Theano variable 260 | Model inputs 261 | mask: Theano variable 262 | Sequence mask 263 | y: Theano variable 264 | Targets 265 | cost: Theano variable 266 | Objective fucntion to minimize 267 | 268 | Notes 269 | ----- 270 | For more information, see [ADADELTA]_. 271 | 272 | .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning 273 | Rate Method*, arXiv:1212.5701. 
274 | """ 275 | 276 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 277 | name='%s_grad' % k) 278 | for k, p in iteritems(tparams)] 279 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), 280 | name='%s_rup2' % k) 281 | for k, p in iteritems(tparams)] 282 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 283 | name='%s_rgrad2' % k) 284 | for k, p in iteritems(tparams)] 285 | 286 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 287 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 288 | for rg2, g in zip(running_grads2, grads)] 289 | 290 | f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up, 291 | name='adadelta_f_grad_shared') 292 | 293 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 294 | for zg, ru2, rg2 in zip(zipped_grads, 295 | running_up2, 296 | running_grads2)] 297 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 298 | for ru2, ud in zip(running_up2, updir)] 299 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 300 | 301 | f_update = theano.function([lr], [], updates=ru2up + param_up, 302 | on_unused_input='ignore', 303 | name='adadelta_f_update') 304 | 305 | return f_grad_shared, f_update 306 | 307 | 308 | def rmsprop(lr, tparams, grads, x, mask, y, cost): 309 | """ 310 | A variant of SGD that scales the step size by running average of the 311 | recent step norms. 312 | 313 | Parameters 314 | ---------- 315 | lr : Theano SharedVariable 316 | Initial learning rate 317 | tpramas: Theano SharedVariable 318 | Model parameters 319 | grads: Theano variable 320 | Gradients of cost w.r.t to parameres 321 | x: Theano variable 322 | Model inputs 323 | mask: Theano variable 324 | Sequence mask 325 | y: Theano variable 326 | Targets 327 | cost: Theano variable 328 | Objective fucntion to minimize 329 | 330 | Notes 331 | ----- 332 | For more information, see [Hint2014]_. 333 | 334 | .. 
[Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*, 335 | lecture 6a, 336 | http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf 337 | """ 338 | 339 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 340 | name='%s_grad' % k) 341 | for k, p in iteritems(tparams)] 342 | running_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 343 | name='%s_rgrad' % k) 344 | for k, p in iteritems(tparams)] 345 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 346 | name='%s_rgrad2' % k) 347 | for k, p in iteritems(tparams)] 348 | 349 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 350 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 351 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 352 | for rg2, g in zip(running_grads2, grads)] 353 | 354 | f_grad_shared = theano.function([x, mask, y], cost, 355 | updates=zgup + rgup + rg2up, 356 | name='rmsprop_f_grad_shared') 357 | 358 | updir = [theano.shared(p.get_value() * numpy_floatX(0.), 359 | name='%s_updir' % k) 360 | for k, p in iteritems(tparams)] 361 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 362 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 363 | running_grads2)] 364 | param_up = [(p, p + udn[1]) 365 | for p, udn in zip(tparams.values(), updir_new)] 366 | f_update = theano.function([lr], [], updates=updir_new + param_up, 367 | on_unused_input='ignore', 368 | name='rmsprop_f_update') 369 | 370 | return f_grad_shared, f_update 371 | 372 | 373 | def build_model(tparams, options, seed=1234): 374 | print(seed) 375 | trng = RandomStreams(seed) 376 | 377 | # Used for dropout. 378 | use_noise = theano.shared(numpy_floatX(0.)) 379 | 380 | x = tensor.matrix('x', dtype='int64') 381 | mask = tensor.matrix('mask', dtype=config.floatX) 382 | y = tensor.vector('y', dtype='int64') 383 | 384 | n_timesteps = x.shape[0] 385 | n_samples = x.shape[1] 386 | 387 | emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, 388 | n_samples, 389 | options['dim_proj']]) 390 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 391 | prefix=options['encoder'], 392 | mask=mask) 393 | if options['encoder'] == 'lstm': 394 | proj = (proj * mask[:, :, None]).sum(axis=0) 395 | proj = proj / mask.sum(axis=0)[:, None] 396 | if options['use_dropout']: 397 | proj = dropout_layer(proj, use_noise, trng) 398 | 399 | pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b']) 400 | 401 | f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob') 402 | f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred') 403 | 404 | off = 1e-8 405 | if pred.dtype == 'float16': 406 | off = 1e-6 407 | 408 | cost = -tensor.log(pred[tensor.arange(n_samples), y] + off).mean() 409 | 410 | return use_noise, x, mask, y, f_pred_prob, f_pred, cost 411 | 412 | 413 | def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False): 414 | """ If you want to use a trained model, this is useful to compute 415 | the probabilities of new examples. 
416 | """ 417 | n_samples = len(data[0]) 418 | probs = numpy.zeros((n_samples, 2)).astype(config.floatX) 419 | 420 | n_done = 0 421 | 422 | for _, valid_index in iterator: 423 | x, mask, y = prepare_data([data[0][t] for t in valid_index], 424 | numpy.array(data[1])[valid_index], 425 | maxlen=None) 426 | pred_probs = f_pred_prob(x, mask) 427 | probs[valid_index, :] = pred_probs 428 | 429 | n_done += len(valid_index) 430 | if verbose: 431 | print('%d/%d samples classified' % (n_done, n_samples)) 432 | 433 | return probs 434 | 435 | 436 | def pred_error(f_pred, prepare_data, data, iterator, verbose=False): 437 | """ 438 | Just compute the error 439 | f_pred: Theano fct computing the prediction 440 | prepare_data: usual prepare_data for that dataset. 441 | """ 442 | valid_err = 0 443 | for _, valid_index in iterator: 444 | x, mask, y = prepare_data([data[0][t] for t in valid_index], 445 | numpy.array(data[1])[valid_index], 446 | maxlen=None) 447 | preds = f_pred(x, mask) 448 | targets = numpy.array(data[1])[valid_index] 449 | valid_err += (preds == targets).sum() 450 | valid_err = 1. - numpy_floatX(valid_err) / len(data[0]) 451 | 452 | return valid_err 453 | 454 | 455 | def train_lstm( 456 | dim_proj=1024, # word embeding dimension and LSTM number of hidden units. 457 | 458 | decay_c=0., # Weight decay for the classifier applied to the U weights. 459 | lrate=0.0001, # Learning rate for sgd (not used for adadelta and rmsprop) 460 | n_words=10000, # Vocabulary size 461 | optimizer=adadelta, # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate). 462 | encoder='lstm', # TODO: can be removed must be lstm. 463 | saveto='lstm_model.npz', # The best model will be saved there 464 | maxlen=100, # Sequence longer then this get ignored 465 | batch_size=50, # The batch size during training. 466 | #batch_size=100, # This size for a single gpu 467 | valid_batch_size=60, # The batch size used for validation/test set. 468 | #valid_batch_size=120, # This size for a single gpu 469 | validFreq=3, # epoch frequency 470 | dataset='imdb', 471 | 472 | # Parameter for extra option 473 | noise_std=0., 474 | use_dropout=False, # if False slightly faster, but worst test error 475 | # This frequently need a bigger model. 476 | reload_model=None, # Path to a saved model we want to start from. 477 | test_size=-1, # If >0, we keep only this number of test example. 478 | ): 479 | 480 | # Each worker needs the same seed in order to draw the same parameters. 481 | # This will also make them shuffle the batches the same way, but splits are 482 | # different so doesnt matter 483 | seed = worker.send_req('seed') 484 | numpy.random.seed(seed) 485 | 486 | # Model options 487 | model_options = locals().copy() 488 | print("model options", model_options) 489 | 490 | load_data, prepare_data = get_dataset('imdb') 491 | 492 | print('Loading data') 493 | train, valid, test = load_data(n_words=n_words, valid_portion=0.05, 494 | maxlen=maxlen) 495 | if test_size > 0: 496 | # The test set is sorted by size, but we want to keep random 497 | # size example. So we must select a random selection of the 498 | # examples. 499 | idx = numpy.arange(len(test[0])) 500 | numpy.random.shuffle(idx) 501 | idx = idx[:test_size] 502 | test = ([test[0][n] for n in idx], [test[1][n] for n in idx]) 503 | 504 | ydim = numpy.max(train[1]) + 1 505 | 506 | model_options['ydim'] = ydim 507 | 508 | print('Building model') 509 | # This create the initial parameters as numpy ndarrays. 
510 | # Dict name (string) -> numpy ndarray 511 | params = init_params(model_options) 512 | 513 | if reload_model: 514 | load_params('lstm_model.npz', params) 515 | 516 | # This creates Theano Shared Variable from the parameters. 517 | # Dict name (string) -> Theano Tensor Shared Variable 518 | # params and tparams have different copy of the weights. 519 | tparams = init_tparams(params) 520 | 521 | list_tparams = list(tparams.values()) 522 | print("Using all_reduce worker's interface!") 523 | asgd = AverageSGD(worker) 524 | asgd.make_rule(list_tparams) 525 | print("Params init done") 526 | 527 | # use_noise is for dropout 528 | # here we could use a different seed? 529 | (use_noise, x, mask, 530 | y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options, 531 | #seed=seed + worker.global_rank) 532 | seed=seed) 533 | 534 | if decay_c > 0.: 535 | decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c') 536 | weight_decay = 0. 537 | weight_decay += (tparams['U'] ** 2).sum() 538 | weight_decay *= decay_c 539 | cost += weight_decay 540 | 541 | f_cost = theano.function([x, mask, y], cost, name='f_cost') 542 | 543 | grads = tensor.grad(cost, wrt=list_tparams) 544 | f_grad = theano.function([x, mask, y], grads, name='f_grad') 545 | 546 | lr = tensor.scalar(name='lr') 547 | f_grad_shared, f_update = optimizer(lr, tparams, grads, 548 | x, mask, y, cost) 549 | 550 | print('Optimization') 551 | splits = worker.send_req('splits', {'train_len': len(train[0]), 552 | 'valid_len': len(valid[0]), 553 | 'test_len' : len(test[0])}) 554 | 555 | train = numpy.asarray(train) 556 | valid = numpy.asarray(valid) 557 | test = numpy.asarray(test) 558 | train = train[:, splits['train_splits'][0]:splits['train_splits'][1]] 559 | valid = valid[:, splits['valid_splits'][0]:splits['valid_splits'][1]] 560 | test = test[:, splits['test_splits'][0]:splits['test_splits'][1]] 561 | train = train.tolist() 562 | valid = valid.tolist() 563 | test = test.tolist() 564 | 565 | kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size) 566 | kf_test = get_minibatches_idx(len(test[0]), valid_batch_size) 567 | 568 | def train_iter(): 569 | while True: 570 | kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True) 571 | for _, train_index in kf: 572 | y = [train[1][t] for t in train_index] 573 | x = [train[0][t] for t in train_index] 574 | x, mask, y = prepare_data(x, y) 575 | yield x, mask, y 576 | 577 | train_it = train_iter() 578 | nb_train = len(train[0]) // batch_size 579 | 580 | epoch = 0 581 | while True: 582 | use_noise.set_value(numpy_floatX(1.)) 583 | for i in range(nb_train): 584 | x, mask, y = next(train_it) 585 | cost = f_grad_shared(x, mask, y) 586 | f_update(lrate) 587 | asgd() 588 | 589 | print('Train cost:', cost) 590 | 591 | if numpy.mod(epoch, validFreq) == 0: 592 | # do validation 593 | # trick : each worker can do their valid without talking to the controller 594 | # even if they finish before another worker, they will wait in the next 595 | # epoch at the calling of all_reduce when they need to sync again 596 | use_noise.set_value(numpy_floatX(0.)) 597 | valid_err = pred_error(f_pred, prepare_data, valid, kf_valid) 598 | test_err = pred_error(f_pred, prepare_data, test, kf_test) 599 | 600 | # they do need to send the result to the controller 601 | res = worker.send_req('pred_errors', dict(test_err=float(test_err), 602 | valid_err=float(valid_err), epoch=epoch)) 603 | 604 | if res == 'best': 605 | # should save the param at best 606 | pass 607 | 608 | if res == 'stop': 609 | break 610 | 
epoch += 1 611 | 612 | # Release all shared resources. 613 | worker.close() 614 | 615 | 616 | if __name__ == '__main__': 617 | # See function train for all possible parameter and there definition. 618 | parser = Worker.default_parser() 619 | args = parser.parse_args() 620 | 621 | worker = Worker(**Worker.default_arguments(args)) 622 | train_lstm(test_size=500) 623 | -------------------------------------------------------------------------------- /example/lstm/lstm_worker.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a tweet sentiment analyzer 3 | ''' 4 | from __future__ import absolute_import, print_function 5 | from collections import OrderedDict 6 | import sys 7 | import argparse 8 | 9 | import six 10 | from six import iteritems 11 | from six.moves import range 12 | 13 | import numpy 14 | import theano 15 | from theano import config 16 | import theano.tensor as tensor 17 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 18 | 19 | import os 20 | 21 | sys.path.append(os.path.dirname(__file__)) 22 | import imdb 23 | 24 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) 25 | from platoon.channel import Worker 26 | from platoon.param_sync import EASGD 27 | 28 | worker = None 29 | datasets = {'imdb': (imdb.load_data, imdb.prepare_data)} 30 | 31 | 32 | def numpy_floatX(data): 33 | return numpy.asarray(data, dtype=config.floatX) 34 | 35 | 36 | def get_minibatches_idx(n, minibatch_size, shuffle=False): 37 | """ 38 | Used to shuffle the dataset at each iteration. 39 | """ 40 | 41 | idx_list = numpy.arange(n, dtype="int32") 42 | 43 | if shuffle: 44 | numpy.random.shuffle(idx_list) 45 | 46 | minibatches = [] 47 | minibatch_start = 0 48 | for i in range(n // minibatch_size): 49 | minibatches.append(idx_list[minibatch_start: 50 | minibatch_start + minibatch_size]) 51 | minibatch_start += minibatch_size 52 | 53 | if (minibatch_start != n): 54 | # Make a minibatch out of what is left 55 | minibatches.append(idx_list[minibatch_start:]) 56 | 57 | return zip(range(len(minibatches)), minibatches) 58 | 59 | 60 | def get_dataset(name): 61 | return datasets[name][0], datasets[name][1] 62 | 63 | 64 | def zipp(params, tparams): 65 | """ 66 | When we reload the model. Needed for the GPU stuff. 67 | """ 68 | for kk, vv in iteritems(params): 69 | tparams[kk].set_value(vv) 70 | 71 | 72 | def unzip(zipped): 73 | """ 74 | When we pickle the model. Needed for the GPU stuff. 75 | """ 76 | new_params = OrderedDict() 77 | for kk, vv in iteritems(zipped): 78 | new_params[kk] = vv.get_value() 79 | return new_params 80 | 81 | 82 | def dropout_layer(state_before, use_noise, trng): 83 | proj = tensor.switch(use_noise, 84 | (state_before * 85 | trng.binomial(state_before.shape, 86 | p=0.5, n=1, 87 | dtype=state_before.dtype)), 88 | state_before * 0.5) 89 | return proj 90 | 91 | 92 | def _p(pp, name): 93 | return '%s_%s' % (pp, name) 94 | 95 | 96 | def init_params(options): 97 | """ 98 | Global (not LSTM) parameter. For the embeding and the classifier. 
99 | """ 100 | params = OrderedDict() 101 | # embedding 102 | randn = numpy.random.rand(options['n_words'], 103 | options['dim_proj']) 104 | params['Wemb'] = (0.01 * randn).astype(config.floatX) 105 | params = get_layer(options['encoder'])[0](options, 106 | params, 107 | prefix=options['encoder']) 108 | # classifier 109 | params['U'] = 0.01 * numpy.random.randn(options['dim_proj'], 110 | options['ydim']).astype(config.floatX) 111 | params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX) 112 | 113 | return params 114 | 115 | 116 | def load_params(path, params): 117 | pp = numpy.load(path) 118 | for kk, vv in iteritems(params): 119 | if kk not in pp: 120 | raise Warning('%s is not in the archive' % kk) 121 | params[kk] = pp[kk] 122 | 123 | return params 124 | 125 | 126 | def init_tparams(params): 127 | tparams = OrderedDict() 128 | for kk, pp in iteritems(params): 129 | tparams[kk] = theano.shared(params[kk], name=kk) 130 | return tparams 131 | 132 | 133 | def get_layer(name): 134 | fns = layers[name] 135 | return fns 136 | 137 | 138 | def ortho_weight(ndim): 139 | W = numpy.random.randn(ndim, ndim) 140 | u, s, v = numpy.linalg.svd(W) 141 | return u.astype(config.floatX) 142 | 143 | 144 | def param_init_lstm(options, params, prefix='lstm'): 145 | """ 146 | Init the LSTM parameter: 147 | 148 | :see: init_params 149 | """ 150 | W = numpy.concatenate([ortho_weight(options['dim_proj']), 151 | ortho_weight(options['dim_proj']), 152 | ortho_weight(options['dim_proj']), 153 | ortho_weight(options['dim_proj'])], axis=1) 154 | params[_p(prefix, 'W')] = W 155 | U = numpy.concatenate([ortho_weight(options['dim_proj']), 156 | ortho_weight(options['dim_proj']), 157 | ortho_weight(options['dim_proj']), 158 | ortho_weight(options['dim_proj'])], axis=1) 159 | params[_p(prefix, 'U')] = U 160 | b = numpy.zeros((4 * options['dim_proj'],)) 161 | params[_p(prefix, 'b')] = b.astype(config.floatX) 162 | 163 | return params 164 | 165 | 166 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): 167 | nsteps = state_below.shape[0] 168 | if state_below.ndim == 3: 169 | n_samples = state_below.shape[1] 170 | else: 171 | n_samples = 1 172 | 173 | assert mask is not None 174 | 175 | def _slice(_x, n, dim): 176 | if _x.ndim == 3: 177 | return _x[:, :, n * dim:(n + 1) * dim] 178 | return _x[:, n * dim:(n + 1) * dim] 179 | 180 | def _step(m_, x_, h_, c_): 181 | preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) 182 | preact += x_ 183 | 184 | i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) 185 | f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) 186 | o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj'])) 187 | c = tensor.tanh(_slice(preact, 3, options['dim_proj'])) 188 | 189 | c = f * c_ + i * c 190 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 191 | 192 | h = o * tensor.tanh(c) 193 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 194 | 195 | return h, c 196 | 197 | state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 198 | tparams[_p(prefix, 'b')]) 199 | 200 | dim_proj = options['dim_proj'] 201 | rval, updates = theano.scan(_step, 202 | sequences=[mask, state_below], 203 | outputs_info=[tensor.alloc(numpy_floatX(0.), 204 | n_samples, 205 | dim_proj), 206 | tensor.alloc(numpy_floatX(0.), 207 | n_samples, 208 | dim_proj)], 209 | name=_p(prefix, '_layers'), 210 | n_steps=nsteps) 211 | return rval[0] 212 | 213 | 214 | # ff: Feed Forward (normal neural net), only useful to put after lstm 215 | # before the classifier. 
216 | layers = {'lstm': (param_init_lstm, lstm_layer)} 217 | 218 | 219 | def sgd(lr, tparams, grads, x, mask, y, cost): 220 | """ Stochastic Gradient Descent 221 | 222 | :note: A more complicated version of sgd then needed. This is 223 | done like that for adadelta and rmsprop. 224 | 225 | """ 226 | # New set of shared variable that will contain the gradient 227 | # for a mini-batch. 228 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 229 | for k, p in iteritems(tparams)] 230 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 231 | 232 | # Function that computes gradients for a mini-batch, but do not 233 | # updates the weights. 234 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 235 | name='sgd_f_grad_shared') 236 | 237 | pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] 238 | 239 | # Function that updates the weights from the previously computed 240 | # gradient. 241 | f_update = theano.function([lr], [], updates=pup, 242 | name='sgd_f_update') 243 | 244 | return f_grad_shared, f_update 245 | 246 | 247 | def adadelta(lr, tparams, grads, x, mask, y, cost): 248 | """ 249 | An adaptive learning rate optimizer 250 | 251 | Parameters 252 | ---------- 253 | lr : Theano SharedVariable 254 | Initial learning rate 255 | tpramas: Theano SharedVariable 256 | Model parameters 257 | grads: Theano variable 258 | Gradients of cost w.r.t to parameres 259 | x: Theano variable 260 | Model inputs 261 | mask: Theano variable 262 | Sequence mask 263 | y: Theano variable 264 | Targets 265 | cost: Theano variable 266 | Objective fucntion to minimize 267 | 268 | Notes 269 | ----- 270 | For more information, see [ADADELTA]_. 271 | 272 | .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning 273 | Rate Method*, arXiv:1212.5701. 274 | """ 275 | 276 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 277 | name='%s_grad' % k) 278 | for k, p in iteritems(tparams)] 279 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), 280 | name='%s_rup2' % k) 281 | for k, p in iteritems(tparams)] 282 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 283 | name='%s_rgrad2' % k) 284 | for k, p in iteritems(tparams)] 285 | 286 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 287 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 288 | for rg2, g in zip(running_grads2, grads)] 289 | 290 | f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up, 291 | name='adadelta_f_grad_shared') 292 | 293 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 294 | for zg, ru2, rg2 in zip(zipped_grads, 295 | running_up2, 296 | running_grads2)] 297 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 298 | for ru2, ud in zip(running_up2, updir)] 299 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 300 | 301 | f_update = theano.function([lr], [], updates=ru2up + param_up, 302 | on_unused_input='ignore', 303 | name='adadelta_f_update') 304 | 305 | return f_grad_shared, f_update 306 | 307 | 308 | def rmsprop(lr, tparams, grads, x, mask, y, cost): 309 | """ 310 | A variant of SGD that scales the step size by running average of the 311 | recent step norms. 
312 | 313 | Parameters 314 | ---------- 315 | lr : Theano SharedVariable 316 | Initial learning rate 317 | tpramas: Theano SharedVariable 318 | Model parameters 319 | grads: Theano variable 320 | Gradients of cost w.r.t to parameres 321 | x: Theano variable 322 | Model inputs 323 | mask: Theano variable 324 | Sequence mask 325 | y: Theano variable 326 | Targets 327 | cost: Theano variable 328 | Objective fucntion to minimize 329 | 330 | Notes 331 | ----- 332 | For more information, see [Hint2014]_. 333 | 334 | .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*, 335 | lecture 6a, 336 | http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf 337 | """ 338 | 339 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 340 | name='%s_grad' % k) 341 | for k, p in iteritems(tparams)] 342 | running_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 343 | name='%s_rgrad' % k) 344 | for k, p in iteritems(tparams)] 345 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 346 | name='%s_rgrad2' % k) 347 | for k, p in iteritems(tparams)] 348 | 349 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 350 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 351 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 352 | for rg2, g in zip(running_grads2, grads)] 353 | 354 | f_grad_shared = theano.function([x, mask, y], cost, 355 | updates=zgup + rgup + rg2up, 356 | name='rmsprop_f_grad_shared') 357 | 358 | updir = [theano.shared(p.get_value() * numpy_floatX(0.), 359 | name='%s_updir' % k) 360 | for k, p in iteritems(tparams)] 361 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 362 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 363 | running_grads2)] 364 | param_up = [(p, p + udn[1]) 365 | for p, udn in zip(tparams.values(), updir_new)] 366 | f_update = theano.function([lr], [], updates=updir_new + param_up, 367 | on_unused_input='ignore', 368 | name='rmsprop_f_update') 369 | 370 | return f_grad_shared, f_update 371 | 372 | 373 | def build_model(tparams, options): 374 | trng = RandomStreams(SEED) 375 | 376 | # Used for dropout. 
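    # Descriptive note (added): use_noise is a shared switch set to 1. during training so dropout_layer() samples a binomial mask, and to 0. at evaluation so activations are deterministically halved instead.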
377 | use_noise = theano.shared(numpy_floatX(0.)) 378 | 379 | x = tensor.matrix('x', dtype='int64') 380 | mask = tensor.matrix('mask', dtype=config.floatX) 381 | y = tensor.vector('y', dtype='int64') 382 | 383 | n_timesteps = x.shape[0] 384 | n_samples = x.shape[1] 385 | 386 | emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, 387 | n_samples, 388 | options['dim_proj']]) 389 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 390 | prefix=options['encoder'], 391 | mask=mask) 392 | if options['encoder'] == 'lstm': 393 | proj = (proj * mask[:, :, None]).sum(axis=0) 394 | proj = proj / mask.sum(axis=0)[:, None] 395 | if options['use_dropout']: 396 | proj = dropout_layer(proj, use_noise, trng) 397 | 398 | pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b']) 399 | 400 | f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob') 401 | f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred') 402 | 403 | off = 1e-8 404 | if pred.dtype == 'float16': 405 | off = 1e-6 406 | 407 | cost = -tensor.log(pred[tensor.arange(n_samples), y] + off).mean() 408 | 409 | return use_noise, x, mask, y, f_pred_prob, f_pred, cost 410 | 411 | 412 | def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False): 413 | """ If you want to use a trained model, this is useful to compute 414 | the probabilities of new examples. 415 | """ 416 | n_samples = len(data[0]) 417 | probs = numpy.zeros((n_samples, 2)).astype(config.floatX) 418 | 419 | n_done = 0 420 | 421 | for _, valid_index in iterator: 422 | x, mask, y = prepare_data([data[0][t] for t in valid_index], 423 | numpy.array(data[1])[valid_index], 424 | maxlen=None) 425 | pred_probs = f_pred_prob(x, mask) 426 | probs[valid_index, :] = pred_probs 427 | 428 | n_done += len(valid_index) 429 | if verbose: 430 | print('%d/%d samples classified' % (n_done, n_samples)) 431 | 432 | return probs 433 | 434 | 435 | def pred_error(f_pred, prepare_data, data, iterator, verbose=False): 436 | """ 437 | Just compute the error 438 | f_pred: Theano fct computing the prediction 439 | prepare_data: usual prepare_data for that dataset. 440 | """ 441 | valid_err = 0 442 | for _, valid_index in iterator: 443 | x, mask, y = prepare_data([data[0][t] for t in valid_index], 444 | numpy.array(data[1])[valid_index], 445 | maxlen=None) 446 | preds = f_pred(x, mask) 447 | targets = numpy.array(data[1])[valid_index] 448 | valid_err += (preds == targets).sum() 449 | valid_err = 1. - numpy_floatX(valid_err) / len(data[0]) 450 | 451 | return valid_err 452 | 453 | 454 | def train_lstm( 455 | dim_proj=1024, # word embeding dimension and LSTM number of hidden units. 456 | 457 | # This value is suggested as being good in the EASGD paper, but 458 | # you may want to tune this 459 | train_len=10, # Train for this many minibatches when requested 460 | 461 | decay_c=0., # Weight decay for the classifier applied to the U weights. 462 | lrate=0.0001, # Learning rate for sgd (not used for adadelta and rmsprop) 463 | n_words=10000, # Vocabulary size 464 | optimizer=adadelta, # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate). 465 | encoder='lstm', # TODO: can be removed must be lstm. 466 | saveto='lstm_model.npz', # The best model will be saved there 467 | maxlen=100, # Sequence longer then this get ignored 468 | batch_size=16, # The batch size during training. 469 | valid_batch_size=64, # The batch size used for validation/test set. 
470 | dataset='imdb', 471 | 472 | # Parameter for extra option 473 | noise_std=0., 474 | use_dropout=True, # if False slightly faster, but worst test error 475 | # This frequently need a bigger model. 476 | reload_model=None, # Path to a saved model we want to start from. 477 | test_size=-1, # If >0, we keep only this number of test example. 478 | valid_sync=False, 479 | param_sync_api=False 480 | ): 481 | 482 | # Model options 483 | model_options = locals().copy() 484 | print("model options", model_options) 485 | 486 | load_data, prepare_data = get_dataset('imdb') 487 | 488 | print('Loading data') 489 | train, valid, test = load_data(n_words=n_words, valid_portion=0.05, 490 | maxlen=maxlen) 491 | if test_size > 0: 492 | # The test set is sorted by size, but we want to keep random 493 | # size example. So we must select a random selection of the 494 | # examples. 495 | idx = numpy.arange(len(test[0])) 496 | numpy.random.shuffle(idx) 497 | idx = idx[:test_size] 498 | test = ([test[0][n] for n in idx], [test[1][n] for n in idx]) 499 | 500 | ydim = numpy.max(train[1]) + 1 501 | 502 | model_options['ydim'] = ydim 503 | 504 | print('Building model') 505 | # This create the initial parameters as numpy ndarrays. 506 | # Dict name (string) -> numpy ndarray 507 | params = init_params(model_options) 508 | 509 | if reload_model: 510 | load_params('lstm_model.npz', params) 511 | 512 | # This creates Theano Shared Variable from the parameters. 513 | # Dict name (string) -> Theano Tensor Shared Variable 514 | # params and tparams have different copy of the weights. 515 | tparams = init_tparams(params) 516 | 517 | list_tparams = list(tparams.values()) 518 | if param_sync_api: 519 | print("Using param_sync worker's interface!") 520 | worker.init_shared_params(list_tparams, param_sync_rule=EASGD(0.5)) 521 | else: 522 | print("Using all_reduce worker's interface!") 523 | from platoon.training import global_dynamics as gd 524 | cparams = init_tparams(params) 525 | list_cparams = list(cparams.values()) 526 | easgd = gd.EASGD(worker) 527 | easgd.make_rule(list_tparams, list_cparams, 0.5) 528 | print("Params init done") 529 | 530 | # use_noise is for dropout 531 | (use_noise, x, mask, 532 | y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options) 533 | 534 | if decay_c > 0.: 535 | decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c') 536 | weight_decay = 0. 
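        # Descriptive note (added): the L2 penalty covers only the classifier weights U; it is scaled by decay_c and added to the cost below.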
537 | weight_decay += (tparams['U'] ** 2).sum() 538 | weight_decay *= decay_c 539 | cost += weight_decay 540 | 541 | f_cost = theano.function([x, mask, y], cost, name='f_cost') 542 | 543 | grads = tensor.grad(cost, wrt=list_tparams) 544 | f_grad = theano.function([x, mask, y], grads, name='f_grad') 545 | 546 | lr = tensor.scalar(name='lr') 547 | f_grad_shared, f_update = optimizer(lr, tparams, grads, 548 | x, mask, y, cost) 549 | 550 | print('Optimization') 551 | 552 | kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size) 553 | kf_test = get_minibatches_idx(len(test[0]), valid_batch_size) 554 | 555 | def train_iter(): 556 | while True: 557 | kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True) 558 | for _, train_index in kf: 559 | y = [train[1][t] for t in train_index] 560 | x = [train[0][t] for t in train_index] 561 | x, mask, y = prepare_data(x, y) 562 | yield x, mask, y 563 | 564 | train_it = train_iter() 565 | 566 | best_p = None 567 | 568 | # Making sure that the worker start training with the most recent params 569 | if param_sync_api: 570 | worker.copy_to_local() 571 | 572 | while True: 573 | step = worker.send_req('next') 574 | print(step) 575 | 576 | if step == 'train': 577 | use_noise.set_value(numpy_floatX(1.)) 578 | for i in range(train_len): 579 | x, mask, y = next(train_it) 580 | cost = f_grad_shared(x, mask, y) 581 | f_update(lrate) 582 | print('Train cost:', cost) 583 | step = worker.send_req('done', {'train_len': train_len}) 584 | 585 | print("Syncing with global params") 586 | if param_sync_api: 587 | worker.sync_params(synchronous=True) 588 | else: 589 | easgd() 590 | 591 | """ 592 | if step.startswith('save '): 593 | _, saveto = step.split(' ', 1) 594 | print 'Saving...', 595 | # TODO fix that shit so that saving works. 596 | numpy.savez(saveto, history_errs=history_errs, **s.params) 597 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) 598 | print 'Done' 599 | """ 600 | 601 | if step == 'valid': 602 | if param_sync_api and valid_sync: 603 | worker.copy_to_local() 604 | use_noise.set_value(numpy_floatX(0.)) 605 | valid_err = pred_error(f_pred, prepare_data, valid, 606 | kf_valid) 607 | test_err = pred_error(f_pred, prepare_data, test, kf_test) 608 | res = worker.send_req('pred_errors', dict(test_err=float(test_err), 609 | valid_err=float(valid_err))) 610 | 611 | if res == 'best': 612 | best_p = unzip(tparams) 613 | 614 | print(('Valid ', valid_err, 615 | 'Test ', test_err)) 616 | if param_sync_api and valid_sync: 617 | worker.copy_to_local() 618 | 619 | if step == 'stop': 620 | break 621 | 622 | # Release all shared resources. 623 | worker.close() 624 | 625 | # FIX that shit later. 626 | """ 627 | if best_p is not None: 628 | zipp(best_p, tparams) 629 | else: 630 | best_p = unzip(tparams) 631 | 632 | use_noise.set_value(numpy_floatX(0.)) 633 | kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size) 634 | train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted) 635 | valid_err = pred_error(f_pred, prepare_data, valid, kf_valid) 636 | test_err = pred_error(f_pred, prepare_data, test, kf_test) 637 | 638 | print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err 639 | if saveto: 640 | numpy.savez(saveto, train_err=train_err, 641 | valid_err=valid_err, test_err=test_err, 642 | history_errs=history_errs, **best_p) 643 | print 'The code run for %d epochs, with %f sec/epochs' % ( 644 | (eidx + 1), (end_time - start_time) / (1. 
* (eidx + 1))) 645 | print >> sys.stderr, ('Training took %.1fs' % 646 | (end_time - start_time)) 647 | return train_err, valid_err, test_err 648 | """ 649 | 650 | if __name__ == '__main__': 651 | # See function train for all possible parameter and there definition. 652 | parser = Worker.default_parser() 653 | parser.add_argument('--valid_sync', dest='valid_sync', action='store_true', default=False) 654 | parser.add_argument('--param-sync-api', action='store_true', default=False) 655 | args = parser.parse_args() 656 | 657 | worker = Worker(**Worker.default_arguments(args)) 658 | # Set the random number generators' seeds for consistency 659 | # Each worker **MUST** be seeded with a different number, so that 660 | # they do not draw the same minibatches! 661 | SEED = 123 662 | numpy.random.seed(SEED + worker.global_rank) 663 | 664 | train_lstm(valid_sync=args.valid_sync, test_size=500, 665 | param_sync_api=args.param_sync_api) 666 | --------------------------------------------------------------------------------