├── .github ├── issue_template.md └── pull_request_template.md ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── doc ├── figure │ ├── LM-1B Benchmark.png │ ├── Resnet50 Benchmark.png │ ├── benchmark.png │ ├── exec_model.png │ ├── hybrid.png │ ├── lm1b_convergence.png │ ├── nmt_convergence.png │ └── resnet50_convergence.png ├── installation.md ├── parallax_api.md ├── quick_start.md └── trouble_shooting.md ├── parallax ├── WORKSPACE └── parallax │ ├── BUILD │ ├── __init__.py │ ├── core │ ├── BUILD │ ├── __init__.py │ └── python │ │ ├── BUILD │ │ ├── __init__.py │ │ ├── common │ │ ├── BUILD │ │ ├── __init__.py │ │ ├── config.py │ │ ├── consts.py │ │ ├── graph_transform_lib.py │ │ ├── lib.py │ │ ├── partitions.py │ │ ├── runner.py │ │ ├── session_context.py │ │ └── shard.py │ │ ├── hybrid │ │ ├── BUILD │ │ ├── __init__.py │ │ ├── between_graph_parallel.py │ │ ├── graph_transform.py │ │ ├── in_graph_parallel.py │ │ └── runner.py │ │ ├── mpi │ │ ├── BUILD │ │ ├── __init__.py │ │ ├── graph_transform.py │ │ └── runner.py │ │ ├── ps │ │ ├── BUILD │ │ ├── __init__.py │ │ ├── between_graph_parallel.py │ │ ├── graph_transform.py │ │ ├── in_graph_parallel.py │ │ └── runner.py │ │ └── tools │ │ ├── BUILD │ │ ├── __init__.py │ │ └── launch_ps.py │ ├── examples │ ├── lm1b │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── data_utils.py │ │ ├── language_model.py │ │ ├── language_model_graph.py │ │ ├── lm1b_distributed_driver.py │ │ ├── lm1b_eval.py │ │ ├── lm1b_input.py │ │ ├── parallax_config.py │ │ ├── resource_info │ │ └── testdata │ │ │ ├── test_s2.txt │ │ │ ├── test_sentences.txt │ │ │ └── test_vocab.txt │ ├── nmt │ │ ├── .gitignore │ │ ├── CONTRIBUTING.md │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── attention_model.py │ │ ├── g3doc │ │ │ └── img │ │ │ │ ├── attention_equation_0.jpg │ │ │ │ ├── attention_equation_1.jpg │ │ │ │ ├── attention_mechanism.jpg │ │ │ │ ├── attention_vis.jpg │ │ │ │ ├── encdec.jpg │ │ │ │ ├── greedy_dec.jpg │ │ │ 
│ └── seq2seq.jpg │ │ ├── gnmt_model.py │ │ ├── inference.py │ │ ├── inference_test.py │ │ ├── model.py │ │ ├── model_helper.py │ │ ├── model_test.py │ │ ├── nmt.py │ │ ├── nmt_distributed_driver.py │ │ ├── nmt_eval.py │ │ ├── nmt_test.py │ │ ├── parallax_config.py │ │ ├── resource_info │ │ ├── scripts │ │ │ ├── __init__.py │ │ │ ├── bleu.py │ │ │ ├── download_iwslt15.sh │ │ │ ├── rouge.py │ │ │ └── wmt16_en_de.sh │ │ ├── standard_hparams │ │ │ ├── iwslt15.json │ │ │ ├── wmt16.json │ │ │ ├── wmt16_gnmt_4_layer.json │ │ │ └── wmt16_gnmt_8_layer.json │ │ ├── testdata │ │ │ ├── deen_output │ │ │ ├── deen_ref_bpe │ │ │ ├── deen_ref_spm │ │ │ ├── iwslt15.tst2013.100.en │ │ │ ├── iwslt15.tst2013.100.vi │ │ │ ├── iwslt15.vocab.100.en │ │ │ ├── iwslt15.vocab.100.vi │ │ │ ├── label_ref │ │ │ ├── pred_output │ │ │ ├── test_embed.txt │ │ │ ├── test_embed_with_header.txt │ │ │ ├── test_infer_file │ │ │ ├── test_infer_vocab.src │ │ │ └── test_infer_vocab.tgt │ │ ├── train.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── common_test_utils.py │ │ │ ├── evaluation_utils.py │ │ │ ├── evaluation_utils_test.py │ │ │ ├── iterator_utils.py │ │ │ ├── iterator_utils_test.py │ │ │ ├── misc_utils.py │ │ │ ├── misc_utils_test.py │ │ │ ├── nmt_utils.py │ │ │ ├── standard_hparams_utils.py │ │ │ ├── vocab_utils.py │ │ │ └── vocab_utils_test.py │ ├── simple │ │ ├── README.md │ │ ├── resource_info │ │ └── simple_driver.py │ ├── skip_thoughts │ │ ├── LICENSE │ │ ├── README.md │ │ ├── configuration.py │ │ ├── data │ │ │ ├── preprocess_dataset.py │ │ │ └── special_words.py │ │ ├── encoder_manager.py │ │ ├── evaluate.py │ │ ├── ops │ │ │ ├── __init__.py │ │ │ ├── gru_cell.py │ │ │ └── input_ops.py │ │ ├── parallax_config.py │ │ ├── resource_info │ │ ├── skip_distributed_driver.py │ │ ├── skip_thoughts_encoder.py │ │ ├── skip_thoughts_model.py │ │ ├── track_perplexity.py │ │ ├── train.py │ │ └── vocabulary_expansion.py │ └── tf_cnn_benchmarks │ │ ├── CNNBenchmark_distributed_driver.py │ │ ├── 
CNNBenchmark_eval.py │ │ ├── LICENSE │ │ ├── README.md │ │ ├── benchmark_cnn.py │ │ ├── cnn_util.py │ │ ├── convnet_builder.py │ │ ├── datasets.py │ │ ├── models │ │ ├── __init__.py │ │ ├── alexnet_model.py │ │ ├── densenet_model.py │ │ ├── googlenet_model.py │ │ ├── inception_model.py │ │ ├── lenet_model.py │ │ ├── model.py │ │ ├── model_config.py │ │ ├── overfeat_model.py │ │ ├── resnet_model.py │ │ ├── trivial_model.py │ │ └── vgg_model.py │ │ ├── parallax_config.py │ │ ├── platforms │ │ ├── __init__.py │ │ ├── default │ │ │ ├── __init__.py │ │ │ └── util.py │ │ └── util.py │ │ ├── preprocessing.py │ │ └── resource_info │ └── util │ ├── BUILD │ ├── build_pip_package.sh │ └── setup.py └── tools ├── bazel.rc └── style_check.py /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | ### Things to Change 2 | 3 | ### Current Behavior 4 | 5 | ### Expected Behavior 6 | 7 | ### Failure Information (for bugs) 8 | 9 | #### Failure Logs 10 | 11 | #### How to Reproduce 12 | 13 | ### Related Issues 14 | 15 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Github issue: #XX 2 | 3 | **Major changes:** 4 | - 5 | 6 | **Minor changes to note:** 7 | - 8 | 9 | **Tests for the changes:** 10 | - 11 | 12 | **Other comments:** 13 | - 14 | 15 | resolves #XX 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.sh 2 | *.pyc 3 | *bazel* 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tensorflow"] 2 | path = tensorflow 3 | url = https://github.com/snuspl/tensorflow.git 4 | [submodule "horovod"] 5 | path = 
horovod 6 | url = https://github.com/horovod/horovod.git 7 | branch = v0.16.3 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Parallax 2 | **Parallax** is a tool that optimizes data parallel training by considering whether each variable in a deep learning model is sparse or dense. The sparsity-aware data parallel training improves performance of models with sparse variables that show relatively low scalability on existing frameworks while maintaining equal performance for models with only dense variables such as ResNet-50 and Inception-V3. In addition, Parallax automatically parallelizes training of a single-GPU deep learning model to minimize user efforts. If you are interested, you can find the technical details of Parallax in [our paper](https://dl.acm.org/citation.cfm?id=3303957). 3 | 4 | Parallax is currently implemented on TensorFlow. We support [TensorFlow v1.6](https://github.com/tensorflow/tensorflow/tree/r1.6) and [TensorFlow v1.11](https://github.com/tensorflow/tensorflow/tree/r1.11). In case that Parallax uses Message Passing Interface (MPI), Parallax requires *AllReduce*, *AllGather* operations implemented in [Horovod v0.11.2](https://github.com/uber/horovod/tree/v0.11.2). We plan to support multiple TensorFlow versions. 5 | 6 | * [Installation](doc/installation.md) 7 | * [Running Parallax](doc/quick_start.md) 8 | * [Parallax API](doc/parallax_api.md) 9 | 10 | ## Why Parallax? 11 | Parallax makes it easier for users to do distributed training of a deep learning model developed in a single device (e.g., GPU or CPU) while employing various optimization techniques that Parallax provides. A Parallax user simply specifies a single-device model graph, resource specification for distributed training and Parallax does the rest! 
For distributed training, Parallax supports hybrid architecture that combines two different distributed training architectures: Parameter Server (PS) and AllReduce (AR). Hybrid architecture exploits the advantages of both architectures. Moreover, Parallax will provide large sparse variable partitioning soon to maximize parallelism while maintaining low computation and communication overhead. Parallax further optimizes training with local aggregation and smart operation placement to mitigate communication overhead. 12 | 13 | PS and AR architectures are still available in Parallax; users can choose the training architecture if they want (default is hybrid for synchronous training). 14 | 15 | ### Hybrid Architecture 16 |

17 | 18 | The amount of data transfer of each PS and AR architecture changes according to whether a variable is sparse or dense. Based on this fact, Parallax pursues a hybrid architecture in which the AR architecture handles dense variables and the PS architecture handles sparse variables to minimize communication overhead. Each worker has a replica of dense variables, while separate server processes manage only sparse variables. 19 | 20 | ### Parallax Execution Model 21 | 22 |

23 | 24 | 25 | When a client initiates a deep learning job with a single-device computation graph, resource information, and optionally a flag that indicates either synchronous or asynchronous training, Parallax transforms the computation graph by analyzing its characteristics. Then, Parallax executes the transformed graph with its optimized communication layer in the distributed environment. 26 | 27 | ### Parallax Benchmark 28 | 29 | To give you an idea on how well Parallax performs, we present the following chart that shows the result of experiments done in a cluster of eight machines that are connected via Mellanox ConnectX-4 cards with 100Gbps InfiniBand. Each machine has six NVIDIA GeForce TITAN Xp GPU cards. 30 | 31 |

32 | 33 | 34 |

35 | 36 | Parallax converges correctly, as the other frameworks (TensorFlow and Horovod) do. Parallax is faster than TensorFlow and similar to Horovod for ResNet50 (dense model). In the case of LM1B (sparse model), Parallax outperforms both TensorFlow and Horovod. 37 | 38 |

39 | 40 |

41 | Parallax outperforms TensorFlow for both Resnet50 and LM1B. In addition, Parallax outperforms Horovod for LM1B. 42 | 43 | ## Troubleshooting 44 | See the [Troubleshooting](doc/trouble_shooting.md) page and submit a new [issue](https://github.com/snuspl/parallax/issues/new) or [contact us](#contact-us) if you cannot find an answer. 45 | 46 | ## Contact us 47 | To contact us, send an email to parallax-dev@googlegroups.com. 48 | 49 | ## License 50 | [Apache License 2.0](LICENSE) 51 | -------------------------------------------------------------------------------- /doc/figure/LM-1B Benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/LM-1B Benchmark.png -------------------------------------------------------------------------------- /doc/figure/Resnet50 Benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/Resnet50 Benchmark.png -------------------------------------------------------------------------------- /doc/figure/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/benchmark.png -------------------------------------------------------------------------------- /doc/figure/exec_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/exec_model.png -------------------------------------------------------------------------------- /doc/figure/hybrid.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/hybrid.png -------------------------------------------------------------------------------- /doc/figure/lm1b_convergence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/lm1b_convergence.png -------------------------------------------------------------------------------- /doc/figure/nmt_convergence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/nmt_convergence.png -------------------------------------------------------------------------------- /doc/figure/resnet50_convergence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/resnet50_convergence.png -------------------------------------------------------------------------------- /doc/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | Parallax runs under Linux with Python 2.7 and 3.6; we haven't yet tested Parallax on other platforms and 3.3+. 3 | Parallax depends on a modified version of TensorFlow 1.6/1.11 and horovod 0.11.2 in parallax repository as submodules. *Each of these frameworks needs to be built and installed from source, which is explained in further detail below*. Parallax itself also requires installing from sources, and below explains the installation process step by step. We plan to provide binary files in the near future. 
4 | 5 | First, clone the parallax repository on your linux machine: 6 | ```shell 7 | $ git clone --recurse-submodules https://github.com/snuspl/parallax.git 8 | ``` 9 | We recommend installing using Virtualenv and pip. 10 | 11 | Install Python, pip, and Virtualenv: 12 | ```shell 13 | $ sudo apt-get install python-pip python-dev python-virtualenv 14 | ``` 15 | 16 | Create a Virtualenv environment in the directory `parallax_venv`(specify whichever name you prefer), and then activate it. 17 | ```shell 18 | $ virtualenv parallax_venv 19 | $ source parallax_venv/bin/activate 20 | ``` 21 | 22 | ## Install TensorFlow 23 | TensorFlow requires [Bazel](https://docs.bazel.build/versions/master/install.html) to build a binary file. (See [TF install](https://www.tensorflow.org/install/install_sources) for more instructions on how to build TensorFlow from source.) TensorFlow can be built CPU-only but Parallax needs TensorFlow with GPU support using [CUDA Toolkit 9.0 or 10.0](https://developer.nvidia.com/cuda-zone) and [CuDNN SDK v7](https://developer.nvidia.com/cudnn). To install TensorFlow with GPU support, follow the commands below. 24 | 25 | ```shell 26 | $ cd parallax/tensorflow 27 | $ git checkout r1.11 (optional for TensorFlow v1.11) 28 | $ pip install numpy 29 | $ ./configure 30 | (Configurations related to cuda should be turned on to use GPUs) 31 | (verbs: ibverbs RDMA) 32 | (gdr: GPU Direct (only for GPUs with GDR support)) 33 | $ bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package 34 | $ bazel-bin/tensorflow/tools/pip_package/build_pip_package {target_directory} 35 | $ pip install {target_directory}/tensorflow-*.whl 36 | ``` 37 | 38 | 39 | ## Install Horovod 40 | To install horovod, [Open MPI](https://www.open-mpi.org/faq/?category=building#easy-build) and [NCCL](https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html) are required as MPI implementations. 
To install OpenMPI, `--with-cuda` flag should be in the configure line, and you can also add `--with-verbs` to use ibverbs. 41 | We tested on openmpi-3.0.0, NCCL 2.1.15(for cuda9.0) and NCCL 2.3.5(for cuda10.0). 42 | ```shell 43 | $ cd ../horovod 44 | $ python setup.py sdist 45 | $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITHOUT_PYTORCH=True HOROVOD_WITHOUT_MXNET=True pip install --no-cache-dir dist/horovod-*.tar.gz 46 | ``` 47 | 48 | ## Install Parallax 49 | Parallax also uses [Bazel](https://docs.bazel.build/versions/master/install.html) for installation. 50 | ```shell 51 | $ cd ../parallax # parallax directory 52 | $ bazel build //parallax/util:build_pip_package 53 | $ bazel-bin/parallax/util/build_pip_package {target_directory} 54 | $ pip install {target_directory}/parallax-*.whl 55 | -------------------------------------------------------------------------------- /doc/trouble_shooting.md: -------------------------------------------------------------------------------- 1 | # Trouble Shooting 2 | 3 | Because Parallax execution involves many dependent software and hardware packages, debugging can be tricky if errors occur. 4 | This page collects the troublesome situations we have experienced and the solutions. If you have a similar symptom, try following the suggestions. Also, if you have any additional trouble shooting case, please add it here. 5 | 6 | ### Device placement error 7 | Error message: 8 | 9 | `device placement error(Cannot assign a device for operation)` 10 | 11 | Parallax assumes `allow_soft_placement=True` because Parallax assigns operators on CPU/GPU devices according to their characteristics(shared or replicated) if the placement of the device is not specified. If you face a device placement error, try setting `allow_soft_placement=True` on the session configuration. 
12 | 13 | ### RDMA queue issue while running parameter server model 14 | Error message: 15 | ``` 16 | tensorflow/contrib/verbs/rdma.cc:1009] Check failed: status.ok() RecvLocalAsync was not ok. error message: Step 123330693738664103 17 | tensorflow/contrib/verbs/rdma.cc:1009] Check failed: status.ok() RecvLocalAsync was not ok. error message: Step 95609778068110326 18 | ``` 19 | There are some issues related to managing the RDMA queue in TensorFlow. Consider increasing the RDMA queue depth by adjusting `RDMA_QUEUE_DEPTH=` in `.ssh/environment` or wherever you manage environment variables. 20 | 21 | ### NCCL different version issue 22 | Error message: 23 | ``` 24 | Signal: Segmentation fault (11) 25 | Signal code: Address not mapped (1) 26 | Failing at address: 0xa0 27 | ``` 28 | This error can occur if multiple machines use different versions of NCCL. 29 | 30 | ### Hang by fetching gradients from non-chief workers while running parameter server model 31 | Error message: None 32 | 33 | There is a chief worker (worker 0) and there are non-chief workers, and Parallax assumes that only the chief worker 34 | can fetch the gradients. This means that fetching gradients from non-chief workers can block the distributed training.
35 | -------------------------------------------------------------------------------- /parallax/WORKSPACE: -------------------------------------------------------------------------------- 1 | workspace(name = "parallax") 2 | -------------------------------------------------------------------------------- /parallax/parallax/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) # Apache 2.0 2 | 3 | package( 4 | default_visibility = [ 5 | "//visibility:public", 6 | ], 7 | ) 8 | 9 | native.py_library( 10 | name = "parallax", 11 | srcs = ["__init__.py"], 12 | deps = [ 13 | "//parallax/core:core", 14 | "//parallax/core/python/common:runner", 15 | "//parallax/core/python/common:shard", 16 | ], 17 | ) -------------------------------------------------------------------------------- /parallax/parallax/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Seoul National University 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | from parallax.core.python.common.partitions import get_partitioner 17 | from parallax.core.python.common.runner import parallel_run 18 | from parallax.core.python.common import shard 19 | from parallax.core.python.common.lib import parallax_log as log 20 | 21 | from parallax.core.python.common.config import ParallaxConfig as Config 22 | from parallax.core.python.common.config import PSConfig 23 | from parallax.core.python.common.config import MPIConfig 24 | from parallax.core.python.common.config import CommunicationConfig 25 | from parallax.core.python.common.config import CheckPointConfig 26 | from parallax.core.python.common.config import ProfileConfig 27 | -------------------------------------------------------------------------------- /parallax/parallax/core/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) # Apache 2.0 2 | 3 | package( 4 | default_visibility = [ 5 | "//visibility:public", 6 | ], 7 | ) 8 | 9 | native.py_library( 10 | name = "core", 11 | srcs = ["__init__.py"], 12 | deps = [ 13 | "//parallax/core/python:python" 14 | ], 15 | ) -------------------------------------------------------------------------------- /parallax/parallax/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/__init__.py -------------------------------------------------------------------------------- /parallax/parallax/core/python/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) # Apache 2.0 2 | 3 | package( 4 | default_visibility = [ 5 | "//visibility:public", 6 | ], 7 | ) 8 | 9 | native.py_library( 10 | name = "python", 11 | srcs = ["__init__.py"], 12 | deps = [ 13 | 
"//parallax/core/python/common:common", 14 | "//parallax/core/python/mpi:mpi", 15 | "//parallax/core/python/ps:ps", 16 | "//parallax/core/python/hybrid:hybrid", 17 | "//parallax/core/python/tools:tools", 18 | ], 19 | ) 20 | -------------------------------------------------------------------------------- /parallax/parallax/core/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/__init__.py -------------------------------------------------------------------------------- /parallax/parallax/core/python/common/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) # Apache 2.0 2 | 3 | package( 4 | default_visibility = [ 5 | "//visibility:public", 6 | ], 7 | ) 8 | 9 | 10 | 11 | native.py_library( 12 | name = "lib", 13 | srcs = ["lib.py"], 14 | deps = [ 15 | "consts", 16 | ] 17 | ) 18 | 19 | native.py_library( 20 | name = "config", 21 | srcs = ["config.py"], 22 | deps = [ 23 | ] 24 | ) 25 | 26 | native.py_library( 27 | name = "graph_transform_lib", 28 | srcs = ["graph_transform_lib.py"], 29 | deps = [ 30 | "lib", 31 | ] 32 | ) 33 | 34 | native.py_library( 35 | name = "session_context", 36 | srcs = ["session_context.py"], 37 | deps = [ 38 | ] 39 | ) 40 | 41 | native.py_library( 42 | name = "runner", 43 | srcs = ["runner.py"], 44 | deps = [ 45 | "lib", 46 | "graph_transform_lib", 47 | "consts", 48 | "partitions", 49 | "//parallax/core/python/ps:runner", 50 | "//parallax/core/python/mpi:runner", 51 | "//parallax/core/python/hybrid:runner" 52 | ] 53 | ) 54 | 55 | native.py_library( 56 | name = "shard", 57 | srcs = ["shard.py"], 58 | deps = [ 59 | "graph_transform_lib", 60 | ], 61 | ) 62 | 63 | native.py_library( 64 | name = "consts", 65 | srcs = ["consts.py"], 66 | deps = [ 67 | ], 68 | ) 69 | 70 | native.py_library( 71 | name = "partitions", 72 | 
srcs = ["partitions.py"], 73 | deps = [ 74 | ], 75 | ) 76 | native.py_library( 77 | name = "common", 78 | srcs = ["__init__.py"], 79 | deps = [ 80 | "graph_transform_lib", 81 | "runner", 82 | "shard", 83 | "config", 84 | "session_context", 85 | "partitions" 86 | ], 87 | ) 88 | 89 | -------------------------------------------------------------------------------- /parallax/parallax/core/python/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/common/__init__.py -------------------------------------------------------------------------------- /parallax/parallax/core/python/common/consts.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Seoul National University 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# ==============================================================================

import os


def _current_user():
    """Return the user name used to build the per-user remote directory.

    The ``USER`` environment variable is preferred, so the result is unchanged
    wherever it is set.  ``USER`` is not guaranteed to exist (for example in
    minimal container environments); in that case fall back to
    ``getpass.getuser()`` instead of failing with ``KeyError`` while this
    module is being imported.

    Returns:
      The user name as a string.

    Raises:
      Whatever ``getpass.getuser()`` raises when no user name can be
      determined at all.
    """
    user = os.environ.get('USER')
    if user:
        return user
    # Imported lazily: only needed on the fallback path.
    import getpass
    return getpass.getuser()


# String keys/values naming the Parallax run modes and per-process settings.
# NOTE(review): presumably these are environment-variable names and values
# exchanged between the launcher and the spawned processes -- confirm against
# the runner modules.
PARALLAX_RUN_OPTION = "PARALLAX_RUN_OPTION"
PARALLAX_RUN_MASTER = "PARALLAX_RUN_MASTER"
PARALLAX_RUN_MPI = "PARALLAX_RUN_MPI"
PARALLAX_RUN_PS = "PARALLAX_RUN_PS"
PARALLAX_RUN_HYBRID = "PARALLAX_RUN_HYBRID"
PARALLAX_WORKER_ID = "PARALLAX_WORKER_ID"
PARALLAX_NUM_WORKERS = "PARALLAX_NUM_WORKERS"
PARALLAX_RESOURCE_INFO = "PARALLAX_RESOURCE_INFO"
PARALLAX_MACHINE_ID = "PARALLAX_MACHINE_ID"
PARALLAX_HOSTNAME = "PARALLAX_HOSTNAME"

# Parent of the directory that contains this file; the parameter-server launch
# script is looked up in its "tools" sub-directory.
LOCAL_CODE_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
LOCAL_LAUNCH_PS_PATH = os.path.join(LOCAL_CODE_ROOT, 'tools',
                                    'launch_ps.py')

# Per-user scratch directory under /tmp and the script paths inside it.
REMOTE_PARALLAX_ROOT = os.path.join('/tmp', 'parallax-%s' % _current_user())
REMOTE_LAUNCH_PS_PATH = os.path.join(REMOTE_PARALLAX_ROOT, 'launch_ps.py')
REMOTE_MPI_SCRIPT_PATH = os.path.join(REMOTE_PARALLAX_ROOT, 'mpi_run.sh')

NUM_ITERATIONS_FOR_TEST = 200
NUM_ITERATIONS_FOR_WARMUP = 200
# ==============================================================================

import tensorflow as tf


# Graph-collection keys under which the shard tensors and the name of the
# shard filter predicate are registered.
NUM_SHARDS = "num_shards"
SHARD_ID = "shard_id"
SHARD_FILTER_PRED = "shard_filter_predicate"
# NOTE(review): presumably the input positions of the two captured tensors on
# the filter dataset op -- they are not used in this module; confirm against
# the graph-transform code.
FILTER_DATASET_NUM_SHARDS_POS = 1
FILTER_DATASET_SHARD_ID_POS = 2


def create_num_shards_and_shard_id():
    """Create the "num_shards" and "shard_id" tensors in the default graph.

    Both tensors are int64 constants created at the root name scope and
    registered in the NUM_SHARDS and SHARD_ID graph collections.

    Returns:
      A (num_shards tensor, shard_id tensor) tuple.

    Raises:
      ValueError: if either collection already holds a tensor.
    """

    # TODO: allow num_shards and shard_id inside a library function
    default_graph = tf.get_default_graph()
    if default_graph.get_collection(NUM_SHARDS):
        raise ValueError('"num_shards" already exists.')
    if default_graph.get_collection(SHARD_ID):
        raise ValueError('"shard_id" already exists.')

    # Build the constants in the right graph and outside any name scope.
    with default_graph.as_default() as g, g.name_scope(None):
        # Placeholders for the single-device case (1 shard, shard 0);
        # parallax updates the values when the graph is transformed
        # for the distributed version.
        shard_count = tf.constant(1, dtype=tf.int64, name="num_shards")
        shard_index = tf.constant(0, dtype=tf.int64, name="shard_id")
        tf.add_to_collection(NUM_SHARDS, shard_count)
        tf.add_to_collection(SHARD_ID, shard_index)
    return shard_count, shard_index


def _get_or_create_num_shards_and_shard_id():
    """Return the registered shard tensors, creating them when absent.

    Returns:
      A (num_shards tensor, shard_id tensor) tuple taken from the default
      graph's collections, or freshly created if NUM_SHARDS is empty.
    """
    default_graph = tf.get_default_graph()
    registered = default_graph.get_collection(NUM_SHARDS)
    if not registered:
        return create_num_shards_and_shard_id()
    return registered[0], default_graph.get_collection(SHARD_ID)[0]


def shard(ds):
    """Return `ds` restricted to the elements of the current shard.

    Has the same effect as ds.shard(num_shards, index), but reads the shard
    count and index from the graph-level tensors so they can be changed
    later.  The name of the generated filter predicate is recorded in the
    SHARD_FILTER_PRED collection.
    """

    # TODO: allow dataset shard inside a function or dataset api
    # (e.g., map, parallel_interleave)
    num_shards, shard_id = _get_or_create_num_shards_and_shard_id()

    # Keep num_shards referenced before shard_id: the asserts below rely on
    # that capture order.
    def filter_fn(elem_index, _):
        return tf.equal(tf.mod(elem_index, num_shards), shard_id)

    filtered = ds._enumerate().filter(filter_fn)
    captured = filtered._predicate.captured_inputs
    assert captured[0] == num_shards
    assert captured[1] == shard_id
    tf.add_to_collection(SHARD_FILTER_PRED, filtered._predicate.name)
    # Drop the enumeration index again, leaving only the original element.
    return filtered.map(lambda _, elem: elem)
"//parallax/core/python/common:lib", 15 | "between_graph_parallel", 16 | "in_graph_parallel", 17 | ] 18 | ) 19 | 20 | native.py_library( 21 | name = "between_graph_parallel", 22 | srcs = ["between_graph_parallel.py"], 23 | deps = [ 24 | "//parallax/core/python/common:graph_transform_lib", 25 | "//parallax/core/python/common:lib", 26 | ] 27 | ) 28 | 29 | native.py_library( 30 | name = "in_graph_parallel", 31 | srcs = ["in_graph_parallel.py"], 32 | deps = [ 33 | "//parallax/core/python/common:graph_transform_lib", 34 | "//parallax/core/python/common:lib", 35 | ] 36 | ) 37 | 38 | native.py_library( 39 | name = "runner", 40 | srcs = ["runner.py"], 41 | deps = [ 42 | "graph_transform", 43 | "//parallax/core/python/common:lib", 44 | "//parallax/core/python/common:consts", 45 | ] 46 | ) 47 | 48 | native.py_library( 49 | name = "hybrid", 50 | srcs = ["__init__.py"], 51 | deps = [ 52 | "runner" 53 | ] 54 | ) 55 | -------------------------------------------------------------------------------- /parallax/parallax/core/python/hybrid/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/hybrid/__init__.py -------------------------------------------------------------------------------- /parallax/parallax/core/python/mpi/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) # Apache 2.0 2 | 3 | package( 4 | default_visibility = [ 5 | "//visibility:public", 6 | ], 7 | ) 8 | 9 | native.py_library( 10 | name = "graph_transform", 11 | srcs = ["graph_transform.py"], 12 | deps = [ 13 | "//parallax/core/python/common:graph_transform_lib", 14 | "//parallax/core/python/common:lib", 15 | ] 16 | ) 17 | 18 | native.py_library( 19 | name = "runner", 20 | srcs = ["runner.py"], 21 | deps = [ 22 | "graph_transform", 23 | "//parallax/core/python/common:lib", 24 | 
"//parallax/core/python/common:consts", 25 | "//parallax/core/python/common:session_context", 26 | ] 27 | ) 28 | 29 | native.py_library( 30 | name = "mpi", 31 | srcs = ["__init__.py"], 32 | deps = [ 33 | "runner" 34 | ] 35 | ) 36 | -------------------------------------------------------------------------------- /parallax/parallax/core/python/mpi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/mpi/__init__.py -------------------------------------------------------------------------------- /parallax/parallax/core/python/mpi/graph_transform.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Seoul National University 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
import tensorflow as tf
import horovod.tensorflow as hvd

from parallax.core.python.common.graph_transform_lib import get_all_control_consumers
from parallax.core.python.common.graph_transform_lib import update_consumers
from parallax.core.python.common.graph_transform_lib import update_control_consumers
from parallax.core.python.common.graph_transform_lib import update_shard_values_for_worker
from parallax.core.python.common.lib import *


def _add_broadcast_ops():
    """Add ops that assign rank 0's value to every global variable.

    One ``tf.assign(var, hvd.broadcast(var, 0))`` is created per global
    variable of the default graph; all of them are made control dependencies
    of a no-op named ``auto_parallel_bcast_global_vars``.
    """
    bcast_global_variables_ops = [
        tf.assign(var, hvd.broadcast(var, 0))
        for var in tf.global_variables()
    ]
    with tf.control_dependencies(bcast_global_variables_ops):
        tf.no_op(name='auto_parallel_bcast_global_vars')


def _add_aggregation_ops(gradients_info, op_to_control_consumer_ops, config):
    """Route one gradient through ``hvd.allreduce`` and rewire its consumers.

    Args:
        gradients_info: entry of the ``GRADIENTS_INFO`` collection; its
            ``_grad`` attribute is read and then replaced by the aggregated
            gradient.
        op_to_control_consumer_ops: mapping from an op to the ops that have
            a control dependency on it.
        config: object whose ``average_sparse`` attribute selects averaging
            for sparse (non-``tf.Tensor``) gradients. Dense gradients are
            always averaged.
    """
    grad_tensor = gradients_info._grad
    if isinstance(grad_tensor, tf.Tensor):
        grad = grad_tensor
        # Snapshot the consumers *before* the allreduce op is created,
        # otherwise the allreduce itself would be rewired onto its own output.
        grad_consumers = list(grad.consumers())
        agg_grad = hvd.allreduce(grad, average=True)
        update_consumers(grad_consumers, grad, agg_grad)
        update_control_consumers(op_to_control_consumer_ops[grad.op],
                                 grad.op, agg_grad.op)
    else:
        # Gradient with values/indices/dense_shape components.
        grad = grad_tensor.values
        indices = grad_tensor.indices
        dense_shape = grad_tensor.dense_shape
        grad_consumers = list(grad.consumers())
        indices_consumers = list(indices.consumers())
        agg_grad = hvd.allreduce(
            tf.IndexedSlices(grad, indices, dense_shape),
            average=config.average_sparse)
        update_consumers(grad_consumers, grad, agg_grad.values)
        update_consumers(indices_consumers, indices, agg_grad.indices)
        update_control_consumers(op_to_control_consumer_ops[grad.op],
                                 grad.op, agg_grad.values.op)
        update_control_consumers(
            op_to_control_consumer_ops[indices.op], indices.op,
            agg_grad.indices.op)
    gradients_info._grad = agg_grad


def graph_transform_mpi(single_gpu_meta_graph_def, config,
                        op_library_path=None):
    """Turn a single-GPU meta graph into a Horovod data-parallel replica.

    Args:
        single_gpu_meta_graph_def: meta graph to import into a fresh graph.
        config: forwarded to ``_add_aggregation_ops``.
        op_library_path: optional custom-op library, loaded with
            ``tf.load_op_library`` before the import.

    Returns:
        A pair ``(meta_graph_def, names)``: the exported transformed graph
        and a dict mapping every imported op name and output tensor name to
        a one-element list holding that same name.
    """
    if op_library_path is not None:
        tf.load_op_library(op_library_path)

    # NOTE(review): the extent of this `with` block is reconstructed; the
    # collection lookups below need `replica` to be the default graph.
    with tf.Graph().as_default() as replica:
        tf.train.import_meta_graph(single_gpu_meta_graph_def)

        tensor_or_op_name_to_replica_names = {}
        for op in replica.get_operations():
            tensor_or_op_name_to_replica_names[op.name] = [op.name]
            for output in op.outputs:
                tensor_or_op_name_to_replica_names[output.name] = [output.name]

        # Initialize horovod
        hvd.init()

        num_workers = hvd.size()
        worker_id = hvd.rank()
        update_shard_values_for_worker(num_workers, worker_id)

        op_to_control_consumer_ops = get_all_control_consumers(replica)
        # A set gives O(1) membership tests in the loop below; ops are
        # already used as dictionary keys in this module, so they hash.
        trainable_variable_ops = {
            var.op
            for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        }

        for gradients_info in tf.get_collection(tf.GraphKeys.GRADIENTS_INFO):
            target_tensor = gradients_info._target
            if target_tensor.op not in trainable_variable_ops:
                parallax_log.debug(
                    "Gradient for non-trainable variable %s is created, ignore"
                    % target_tensor.op.name)
                continue

            _add_aggregation_ops(gradients_info, op_to_control_consumer_ops,
                                 config)
        _add_broadcast_ops()

    return tf.train.export_meta_graph(graph=replica), \
        tensor_or_op_name_to_replica_names
"//parallax/core/python/common:graph_transform_lib", 14 | "//parallax/core/python/common:lib", 15 | "//parallax/core/python/ps:between_graph_parallel", 16 | "//parallax/core/python/ps:in_graph_parallel", 17 | ] 18 | ) 19 | 20 | native.py_library( 21 | name = "between_graph_parallel", 22 | srcs = ["between_graph_parallel.py"], 23 | deps = [ 24 | "//parallax/core/python/common:graph_transform_lib", 25 | "//parallax/core/python/common:lib", 26 | ] 27 | ) 28 | 29 | native.py_library( 30 | name = "in_graph_parallel", 31 | srcs = ["in_graph_parallel.py"], 32 | deps = [ 33 | "//parallax/core/python/common:graph_transform_lib", 34 | "//parallax/core/python/common:lib", 35 | ] 36 | ) 37 | 38 | native.py_library( 39 | name = "runner", 40 | srcs = ["runner.py"], 41 | deps = [ 42 | "graph_transform", 43 | "//parallax/core/python/common:lib", 44 | "//parallax/core/python/common:consts", 45 | "//parallax/core/python/common:graph_transform_lib", 46 | ] 47 | ) 48 | 49 | native.py_library( 50 | name = "ps", 51 | srcs = ["__init__.py"], 52 | deps = [ 53 | "runner" 54 | ] 55 | ) -------------------------------------------------------------------------------- /parallax/parallax/core/python/ps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/ps/__init__.py -------------------------------------------------------------------------------- /parallax/parallax/core/python/ps/graph_transform.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Seoul National University 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
from parallax.core.python.common.lib import *
from parallax.core.python.ps.in_graph_parallel import in_graph_auto_parallel_compute
from parallax.core.python.ps.between_graph_parallel import between_graph_auto_parallel_compute


def graph_transform_ps(single_gpu_meta_graph_def,
                       worker_id,
                       config,
                       op_library_path=None):
    """Transform a single-GPU meta graph for parameter-server training.

    The graph is first replicated inside the worker by
    ``in_graph_auto_parallel_compute`` (one replica per entry of the
    worker's ``'gpus'`` list) and then passed to
    ``between_graph_auto_parallel_compute``.

    Args:
        single_gpu_meta_graph_def: meta graph of the single-GPU model.
        worker_id: index into ``config.resource_info['worker']``.
        config: provides ``resource_info``, ``sync`` and
            ``communication_config.ps_config``.
        op_library_path: forwarded unchanged to both transformation steps.

    Returns:
        ``(ps_meta_graph_def, name_map)`` where ``name_map`` is the exported
        ``TensorOrOpNameToReplicaNames`` mapping.

    Raises:
        ValueError: if ``replicate_variables`` is set while ``config.sync``
            is false.
    """
    cluster_info = config.resource_info
    # TODO: Handle all ps configurations
    ps_config = config.communication_config.ps_config
    if ps_config.replicate_variables and not config.sync:
        raise ValueError('replicate_variables is only possible with sync')

    # Without a dedicated 'ps' job, the workers' CPUs are used as ps device.
    if 'ps' in cluster_info:
        ps_device = '/job:ps'
    else:
        ps_device = '/job:worker/cpu:0'
    cluster_spec = get_tf_clusterspec(cluster_info)
    num_gpus = len(cluster_info['worker'][worker_id]['gpus'])

    parallax_log.debug(
        "Starting graph transformation for PS for worker %d" % worker_id)

    name_map = TensorOrOpNameToReplicaNames(
        single_gpu_meta_graph_def.meta_info_def.stripped_op_list)

    multi_gpu_meta_graph_def = in_graph_auto_parallel_compute(
        single_gpu_meta_graph_def, num_gpus, config=config,
        op_library_path=op_library_path,
        tensor_or_op_name_to_replica_names=name_map)

    worker_device = '/job:worker/task:%d' % worker_id
    ps_meta_graph_def = between_graph_auto_parallel_compute(
        multi_gpu_meta_graph_def,
        worker_id=worker_id,
        ps_device=ps_device,
        worker_device=worker_device,
        merge_devices=True,
        cluster_spec=cluster_spec,
        config=config,
        op_library_path=op_library_path,
        num_replicas_per_worker=num_gpus,
        tensor_or_op_name_to_replica_names=name_map)
    parallax_log.debug(
        "Finished graph transformation for PS for worker %d" % worker_id)
    return ps_meta_graph_def, name_map.export()
import argparse
import sys
import os
import json

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string('ps_hosts', '',
                           """Comma-separated list of target hosts""")
tf.app.flags.DEFINE_string('worker_hosts', '',
                           """Comma-separated list of target hosts""")
tf.app.flags.DEFINE_string('job_name', '',
                           """Job name in cluster""")
tf.app.flags.DEFINE_integer('task_index', -1,
                            """Task index of the job""")
tf.app.flags.DEFINE_string('protocol', 'grpc',
                           """Server protocol: grpc, grpc+verbs, grpc+gdr""")


def _split_hosts(hosts):
    """Split a comma-separated host list, dropping empty entries."""
    return [host.strip() for host in hosts.split(',') if host.strip()]


def main(argv=None):
    """Start a TensorFlow server for one parameter-server task and block.

    The cluster is built from the ``--ps_hosts`` and ``--worker_hosts``
    flags; the server is created for job ``'ps'`` with ``--task_index`` and
    ``--protocol`` and then joined.

    Raises:
        ValueError: if ``--job_name`` is not ``'ps'``.
    """
    # An explicit check instead of `assert`, which is stripped under -O.
    if FLAGS.job_name != 'ps':
        raise ValueError(
            "launch_ps only starts parameter servers; got job_name=%r"
            % FLAGS.job_name)

    tf_cluster_dict = {}

    # The 'ps' job is only declared when at least one host was given.
    ps_hosts = _split_hosts(FLAGS.ps_hosts)
    if ps_hosts:
        tf_cluster_dict['ps'] = ps_hosts

    # ''.split(',') yields [''], so empty entries are filtered out instead
    # of being registered as a bogus worker address.
    tf_cluster_dict['worker'] = _split_hosts(FLAGS.worker_hosts)
    cluster = tf.train.ClusterSpec(tf_cluster_dict)

    server = tf.train.Server(cluster, job_name='ps',
                             task_index=FLAGS.task_index,
                             protocol=FLAGS.protocol)
    server.join()


if __name__ == "__main__":
    tf.app.run()
License 2 | 3 | Copyright (c) 2016 Rafal Jozefowicz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /parallax/parallax/examples/lm1b/README.md: -------------------------------------------------------------------------------- 1 | # LM-1B 2 | LM-1B implements the LSTM language model described in [LM](https://arxiv.org/abs/1602.02410). 3 | The original code comes from https://github.com/rafaljozefowicz/lm, which supports 4 | synchronous training with multiple GPUs. We change the code as single GPU code, and 5 | then apply parallax auto-parallelization for multi-GPU, multi-machine with synchronous 6 | or asynchronous training. 7 | 8 | ## Dataset 9 | * [1B Word Benchmark Dataset](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) 10 | 11 | ## To Run 12 | Set your resource information in the `resource_info` file. 
13 | 14 | Then, you can run the lm1b model with the data in `<data_dir>` in parallel by executing: 15 | ```shell 16 | $ python lm1b_distributed_driver.py --datadir <data_dir> 17 | ``` 18 | 19 | The command above runs a single LM model on multiple devices specified in `resource_info`. 20 | The command assumes that the data directory and the LM-1B codebase are distributed and reachable at the same absolute path on each of the machines. 21 | 22 | Also, there are a few more options you can choose for distributed running. 23 | 24 | | Parameter Name | Default | Description | 25 | | :------------------- |:-----------------------| :-----------| 26 | | --logdir | /tmp/lm1b | Logging directory | 27 | | --datadir | None | Data directory | 28 | | --hpconfig | "" | Overrides default hyper-parameters | 29 | | --eval_steps | 70 | Number of evaluation steps | 30 | | --resource_info_file | `./resource_info` | Name of the file containing the cluster information | 31 | | --max_steps | 1000000 | Number of iterations to run for each worker | 32 | | --log_frequency | 100 | Number of steps between two consecutive runop logs | 33 | | --sync | True | Whether to synchronize learning or not | 34 | | --ckpt_dir | None | Directory to save checkpoints | 35 | | --save_ckpt_steps | 0 | Number of steps between two consecutive checkpoints | 36 | | --save_n_ckpts_per_epoch | -1 | Number of checkpoints to save per epoch | 37 | | --run_option | None | The run option, either PS or MPI; None utilizes both | 38 | | --search_partitions | False | Whether to use Parallax's variable partitioning method or not | 39 | 40 | You can adapt the distributed run with the options above. For example, if you want to fix the communication model to MPI mode, you can add a `run_option` value as shown below. 
import codecs
import glob
import json
import random

import numpy as np


class Vocabulary(object):
    """Bidirectional token <-> id mapping with sentence and unknown markers."""

    def __init__(self):
        self._token_to_id = {}
        self._token_to_count = {}
        self._id_to_token = []
        self._num_tokens = 0
        self._s_id = None
        self._unk_id = None

    @property
    def num_tokens(self):
        return self._num_tokens

    @property
    def unk(self):
        # The literal was an empty string in the collapsed source (angle
        # brackets lost); an empty token can never be produced by
        # `str.split()`, so lookups of it always failed.
        return "<UNK>"

    @property
    def unk_id(self):
        return self._unk_id

    @property
    def s(self):
        # Sentence boundary marker; see the note on `unk`.
        return "<S>"

    @property
    def s_id(self):
        return self._s_id

    def add(self, token, count):
        """Register ``token`` under the next free id and remember its count."""
        self._token_to_id[token] = self._num_tokens
        self._token_to_count[token] = count
        self._id_to_token.append(token)
        self._num_tokens += 1

    def finalize(self):
        """Resolve the ids of the unknown and sentence-boundary tokens."""
        # Resolve the unknown id first so that a missing sentence marker
        # falls back to it instead of to a stale None.
        self._unk_id = self._token_to_id.get(self.unk)
        self._s_id = self.get_id(self.s)

    def get_id(self, token):
        """Return the id of ``token``, or ``unk_id`` if it is not known."""
        return self._token_to_id.get(token, self.unk_id)

    def get_token(self, id_):
        """Return the token stored under ``id_``."""
        return self._id_to_token[id_]

    @staticmethod
    def from_file(filename):
        """Build a vocabulary from a UTF-8 file of ``token count`` lines.

        Blank lines are skipped.
        """
        vocab = Vocabulary()
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                word, count = fields
                vocab.add(word, int(count))
        vocab.finalize()
        return vocab


class Dataset(object):
    """Streams fixed-size ``(x, y, w)`` batches from tokenized text files."""

    def __init__(self, vocab, file_pattern, deterministic=False):
        self._vocab = vocab
        self._file_pattern = file_pattern
        self._deterministic = deterministic

    def _parse_sentence(self, line):
        """Map a line to token ids wrapped in the sentence-boundary id."""
        s_id = self._vocab.s_id
        word_ids = [self._vocab.get_id(word) for word in line.strip().split()]
        return [s_id] + word_ids + [s_id]

    def _parse_file(self, file_name):
        """Yield the parsed sentences of one file (shuffled unless deterministic)."""
        print("Processing file: %s" % file_name)
        with codecs.open(file_name, "r", "utf-8") as f:
            lines = [line.strip() for line in f]
            if not self._deterministic:
                random.shuffle(lines)
            print("Finished processing!")
            for line in lines:
                yield self._parse_sentence(line)

    def _sentence_stream(self, file_stream):
        """Chain the sentences of every file produced by ``file_stream``."""
        for file_name in file_stream:
            for sentence in self._parse_file(file_name):
                yield sentence

    def _iterate(self, sentences, batch_size, num_steps):
        """Pack ``sentences`` into ``[batch_size, num_steps]`` arrays.

        Yields ``(x, y, w)``: input ids, target ids (inputs shifted by one
        token) and a 0/1 mask of the filled positions. The same three arrays
        are reused and overwritten for every batch, so callers must copy
        them if they need to keep a batch. Iteration ends once a batch would
        contain no token at all.
        """
        streams = [None] * batch_size
        x = np.zeros([batch_size, num_steps], np.int32)
        y = np.zeros([batch_size, num_steps], np.int32)
        w = np.zeros([batch_size, num_steps], np.uint8)
        while True:
            x[:] = 0
            y[:] = 0
            w[:] = 0
            for i in range(batch_size):
                tokens_filled = 0
                try:
                    while tokens_filled < num_steps:
                        # A stream with a single token left has no target
                        # for it, so a new sentence is fetched.
                        if streams[i] is None or len(streams[i]) <= 1:
                            streams[i] = next(sentences)
                        num_tokens = min(len(streams[i]) - 1,
                                         num_steps - tokens_filled)
                        end = tokens_filled + num_tokens
                        x[i, tokens_filled:end] = streams[i][:num_tokens]
                        y[i, tokens_filled:end] = streams[i][1:num_tokens + 1]
                        w[i, tokens_filled:end] = 1
                        streams[i] = streams[i][num_tokens:]
                        tokens_filled = end
                except StopIteration:
                    # Input exhausted: leave the rest of this row masked out.
                    pass
            if not np.any(w):
                return

            yield x, y, w

    def iterate_once(self, batch_size, num_steps):
        """Yield batches from a single pass over all matching files."""
        def file_stream():
            for file_name in glob.glob(self._file_pattern):
                yield file_name
        for value in self._iterate(
                self._sentence_stream(file_stream()), batch_size, num_steps):
            yield value

    def iterate_forever(self, batch_size, num_steps, num_workers, worker_id):
        """Yield batches endlessly from this worker's share of the files.

        The sorted matching files are dealt round-robin: the worker gets
        every file whose index ``i`` satisfies
        ``i % num_workers == worker_id``.

        Raises:
            ValueError: on first use, if no file is assigned to this worker.
                Previously this case looped forever without yielding.
        """
        def file_stream():
            while True:
                file_patterns = sorted(glob.glob(self._file_pattern))
                filenames_for_worker = [
                    name for i, name in enumerate(file_patterns)
                    if i % num_workers == worker_id
                ]
                if not filenames_for_worker:
                    raise ValueError(
                        "No input files for worker %d of %d match %r"
                        % (worker_id, num_workers, self._file_pattern))
                if not self._deterministic:
                    random.shuffle(filenames_for_worker)
                for filename in filenames_for_worker:
                    yield filename
        for value in self._iterate(
                self._sentence_stream(file_stream()), batch_size, num_steps):
            yield value
class LM(base.Layer):
    """LSTM language-model layer with a projected recurrent state.

    The embedding and softmax weight matrices are created with the
    partitioner returned by ``parallax.get_partitioner``; training uses a
    sampled softmax loss, evaluation a full softmax.
    """

    def __init__(self, num_steps):
        super(LM, self).__init__()
        self.num_steps = num_steps
        self.num_shards = FLAGS.num_variable_shards
        # Use keep_prob 1.0 at evaluation
        self.keep_prob = 0.9

        self.vocab_size = 793470
        self.emb_size = 512
        self.state_size = 2048
        self.projected_size = 512
        # Use num_sampled 0 (full softmax) at evaluation
        self.num_sampled = 8192

    def build(self, input_shape):
        """Create the model variables (``input_shape`` is not used)."""

        def vocab_matrix(name, num_columns):
            return tf.get_variable(
                name,
                shape=[self.vocab_size, num_columns],
                initializer=tf.uniform_unit_scaling_initializer(),
                trainable=True,
                dtype=tf.float32)

        def layer_variable(name, shape):
            return self.add_variable(name=name,
                                     shape=shape,
                                     trainable=True,
                                     dtype=tf.float32)

        partitioner = parallax.get_partitioner(self.num_shards)
        # NOTE(review): only the two vocabulary-sized matrices are placed
        # under the partitioned scope here; the extent of the `with` block
        # is reconstructed from the collapsed source - confirm.
        with tf.variable_scope(tf.get_variable_scope(),
                               partitioner=partitioner):
            self.emb = vocab_matrix('emb', self.emb_size)
            self.softmax_w = vocab_matrix('softmax_w', self.projected_size)

        self.softmax_b = layer_variable('softmax_b', [self.vocab_size])
        self.W = layer_variable(
            'W', [self.emb_size + self.projected_size, 4 * self.state_size])
        self.B = layer_variable('B', [4 * self.state_size])
        self.W_P = layer_variable(
            'W_P', [self.state_size, self.projected_size])
        self.built = True

    def call(self, x, y, w, initial_state_c, initial_state_h, training):
        """Unroll the LSTM over ``num_steps`` and compute the weighted loss.

        Returns:
            ``(loss, c, h)``: the mean of the per-position loss multiplied
            by ``w``, and the cell / projected hidden state after the last
            step.
        """
        # [bs, steps, emb_size]
        embedded = tf.nn.embedding_lookup(self.emb, x)
        if training:
            embedded = tf.nn.dropout(embedded, self.keep_prob)

        # [bs, emb_size] * steps
        step_slices = tf.split(value=embedded,
                               num_or_size_splits=self.num_steps,
                               axis=1)
        inputs = [tf.squeeze(step_slice, axis=[1])
                  for step_slice in step_slices]

        c, h = initial_state_c, initial_state_h
        for t in range(self.num_steps):
            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            cell_inputs = tf.concat([inputs[t], h], axis=1)
            lstm_matrix = tf.nn.xw_plus_b(cell_inputs, self.W, self.B)
            i, j, f, o = tf.split(lstm_matrix, 4, axis=1)

            retained = tf.sigmoid(f + 1.0) * c
            written = tf.sigmoid(i) * tf.tanh(j)
            c = retained + written
            # The hidden state is projected down by W_P before being fed back.
            h = tf.matmul(tf.sigmoid(o) * tf.tanh(c), self.W_P)

            step_output = h
            if training:
                step_output = tf.nn.dropout(step_output, self.keep_prob)
            inputs[t] = tf.identity(step_output)

        flat_outputs = tf.reshape(tf.concat(inputs, axis=1),
                                  [-1, self.projected_size])

        if training:
            targets = tf.reshape(y, [-1, 1])
            loss = tf.nn.sampled_softmax_loss(self.softmax_w,
                                              self.softmax_b,
                                              targets,
                                              flat_outputs,
                                              self.num_sampled,
                                              self.vocab_size)
        else:
            full_softmax_w = tf.reshape(tf.concat(self.softmax_w, axis=1),
                                        [-1, self.projected_size])
            full_softmax_w = full_softmax_w[:self.vocab_size, :]

            logits = tf.matmul(flat_outputs, full_softmax_w,
                               transpose_b=True) + self.softmax_b
            targets = tf.reshape(y, [-1])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=targets)

        loss = tf.reduce_mean(loss * tf.reshape(tf.to_float(w), [-1]))
        return loss, c, h
def build_model():
    """Build the single-GPU LM-1B training graph in the default graph.

    Returns:
        The ``language_model.LM`` instance, annotated with the graph
        endpoints used by the driver: ``global_step``, ``loss``,
        ``train_op``, the ``x`` / ``y`` / ``w`` placeholders and the
        ``initial_state_*`` / ``final_state_*`` tensors.
    """
    model = language_model.LM(FLAGS.num_steps)
    global_step = tf.train.get_or_create_global_step()

    # NOTE(review): the extent of this device scope is reconstructed from
    # the collapsed source - confirm against the original file.
    with tf.device('/gpu:0'):
        batch_shape = [FLAGS.batch_size, FLAGS.num_steps]
        placeholder_x = tf.placeholder(tf.int32, batch_shape)
        placeholder_y = tf.placeholder(tf.int32, batch_shape)
        placeholder_w = tf.placeholder(tf.int32, batch_shape)
        initial_state_c = tf.placeholder(
            dtype=tf.float32,
            shape=[FLAGS.batch_size, model.state_size],
            name='initial_c')
        initial_state_h = tf.placeholder(
            dtype=tf.float32,
            shape=[FLAGS.batch_size, model.projected_size],
            name='initial_h')
        loss, final_state_c, final_state_h = model(
            placeholder_x, placeholder_y, placeholder_w,
            initial_state_c, initial_state_h, training=True)
        scaled_loss = loss * FLAGS.num_steps

        # Gradients are taken for one flat list of variables and then split
        # back into the three groups, each of which is treated differently.
        emb_vars = list(model.emb)
        lstm_vars = [model.W, model.B, model.W_P]
        softmax_vars = list(model.softmax_w) + [model.softmax_b]
        all_vars = emb_vars + lstm_vars + softmax_vars
        grads = tf.gradients(scaled_loss, all_vars)

        emb_end = len(emb_vars)
        lstm_end = emb_end + len(lstm_vars)

        # Embedding gradients are rescaled by the batch size.
        emb_grads = [
            tf.IndexedSlices(grad.values * FLAGS.batch_size,
                             grad.indices,
                             grad.dense_shape)
            for grad in grads[:emb_end]
        ]

        # Only the LSTM gradients are clipped.
        lstm_grads, _ = tf.clip_by_global_norm(grads[emb_end:lstm_end],
                                               FLAGS.max_grad_norm)

        softmax_grads = grads[lstm_end:]

        clipped_grads = emb_grads + lstm_grads + softmax_grads
        grads_and_vars = list(zip(clipped_grads, all_vars))

        optimizer = tf.train.AdagradOptimizer(FLAGS.learning_rate,
                                              initial_accumulator_value=1.0)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        # The moving averages of the LSTM variables are updated after the
        # optimizer step; that update becomes the op to run.
        ema = tf.train.ExponentialMovingAverage(decay=0.999)
        with tf.control_dependencies([train_op]):
            train_op = ema.apply(lstm_vars)

    model.global_step = global_step
    model.loss = loss
    model.train_op = train_op

    model.final_state_c = final_state_c
    model.final_state_h = final_state_h

    model.initial_state_c = initial_state_c
    model.initial_state_h = initial_state_h

    model.x = placeholder_x
    model.y = placeholder_y
    model.w = placeholder_w

    return model
import tensorflow as tf
import parallax


import os
import time
import math
import json
import sys
import numpy as np

from data_utils import Vocabulary, Dataset
import language_model_graph
import parallax_config

flags = tf.app.flags
flags.DEFINE_string("logdir", "/tmp/lm1b", "Logging directory.")
flags.DEFINE_string("datadir", None, "Data directory.")
flags.DEFINE_string("hpconfig", "", "Overrides default hyper-parameters.")
flags.DEFINE_integer("eval_steps", 70, "Number of eval steps.")
flags.DEFINE_string('resource_info_file',
                    os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                 '.',
                                                 'resource_info')),
                    'Filename containing cluster information')
flags.DEFINE_integer('max_steps', 1000000,
                     """Number of iterations to run for each workers.""")
flags.DEFINE_integer('log_frequency', 100,
                     """How many steps between two runop logs.""")
flags.DEFINE_boolean('sync', True,
                     'Whether to synchronize learning or not.')
FLAGS = flags.FLAGS


def main(_):
    """Build the single-GPU LM-1B graph and train it through Parallax.

    Real data is read from ``--datadir`` unless ``--use_synthetic`` is set,
    in which case random batches are generated and no file is touched.

    Raises:
        ValueError: if real data is requested but ``--datadir`` is not set.
    """
    dataset = None
    if not FLAGS.use_synthetic:
        # Fail with a clear message instead of a TypeError from
        # os.path.join(None, ...).
        if FLAGS.datadir is None:
            raise ValueError(
                "--datadir must be set unless --use_synthetic is given")
        vocab = Vocabulary.from_file(
            os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
        dataset = Dataset(
            vocab,
            os.path.join(FLAGS.datadir,
                         "training-monolingual.tokenized.shuffled/*"))

    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        with tf.variable_scope("model"):
            model = language_model_graph.build_model()

    def run(sess, num_workers, worker_id, num_replicas_per_worker):
        """Run the training loop of this worker for ``--max_steps`` steps."""
        # One zero initial LSTM state per replica hosted by this worker.
        state_c = [
            np.zeros([FLAGS.batch_size, model.state_size], dtype=np.float32)
            for _ in range(num_replicas_per_worker)
        ]
        state_h = [
            np.zeros([FLAGS.batch_size, model.projected_size],
                     dtype=np.float32)
            for _ in range(num_replicas_per_worker)
        ]

        prev_global_step = sess.run(model.global_step)[0]
        prev_time = time.time()
        worker_batch_size = FLAGS.batch_size * num_replicas_per_worker
        batch_shape = (worker_batch_size, FLAGS.num_steps)
        data_iterator = None
        if dataset is not None:
            data_iterator = dataset.iterate_forever(
                worker_batch_size, FLAGS.num_steps, num_workers, worker_id)
        fetches = {
            'global_step': model.global_step,
            'loss': model.loss,
            'train_op': model.train_op,
            'final_state_c': model.final_state_c,
            'final_state_h': model.final_state_h
        }

        for local_step in range(FLAGS.max_steps):
            if FLAGS.use_synthetic:
                x = np.random.randint(low=0, high=model.vocab_size,
                                      size=batch_shape)
                y = np.random.randint(low=0, high=model.vocab_size,
                                      size=batch_shape)
                w = np.ones(batch_shape)
            else:
                x, y, w = next(data_iterator)
            # Every fed value is a list with one entry per replica.
            feeds = {
                model.x: np.split(x, num_replicas_per_worker),
                model.y: np.split(y, num_replicas_per_worker),
                model.w: np.split(w, num_replicas_per_worker),
                model.initial_state_c: state_c,
                model.initial_state_h: state_h,
            }
            fetched = sess.run(fetches, feeds)

            # Carry the recurrent state over to the next step.
            state_c = fetched['final_state_c']
            state_h = fetched['final_state_h']

            if local_step % FLAGS.log_frequency == 0:
                cur_time = time.time()
                elapsed_time = cur_time - prev_time
                num_words = FLAGS.batch_size * FLAGS.num_steps
                cur_global_step = fetched['global_step'][0]
                wps = (cur_global_step - prev_global_step) * num_words \
                    / elapsed_time
                prev_global_step = cur_global_step
                parallax.log.info(
                    "Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f"
                    % (cur_global_step, elapsed_time, wps,
                       fetched['loss'][0]))
                prev_time = cur_time

    sess, num_workers, worker_id, num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph,
                              FLAGS.resource_info_file,
                              sync=FLAGS.sync,
                              parallax_config=parallax_config.build_config())
    run(sess, num_workers, worker_id, num_replicas_per_worker)
class Vocabulary(object):
    """Token <-> id table with per-token counts.

    Ids are handed out in insertion order by `add`. Once every token has been
    added, `finalize` caches the ids of the sentence-boundary and
    unknown-word markers.
    """

    def __init__(self):
        self._token_to_id = {}
        self._token_to_count = {}
        self._id_to_token = []
        self._num_tokens = 0
        self._s_id = None
        self._unk_id = None

    @property
    def num_tokens(self):
        """Number of tokens added so far."""
        return self._num_tokens

    @property
    def unk(self):
        """Marker token standing in for out-of-vocabulary words."""
        # Must be a non-empty token distinct from `s`: `from_file` splits each
        # line on whitespace, so an empty marker can never be found in a
        # loaded vocabulary and `unk_id` would stay None.
        # NOTE(review): "<UNK>" is the usual LM-1B marker - confirm it matches
        # the vocabulary file in use.
        return "<UNK>"

    @property
    def unk_id(self):
        """Id of the unknown-word marker (None until `finalize` has run)."""
        return self._unk_id

    @property
    def s(self):
        """Marker token placed at both ends of every sentence."""
        # See `unk` for why this cannot be the empty string.
        # NOTE(review): "<S>" is the usual LM-1B marker - confirm it matches
        # the vocabulary file in use.
        return "<S>"

    @property
    def s_id(self):
        """Id of the sentence-boundary marker (None until `finalize` has run)."""
        return self._s_id

    def add(self, token, count):
        """Register `token` with the next free id and remember its count."""
        self._token_to_id[token] = self._num_tokens
        self._token_to_count[token] = count
        self._id_to_token.append(token)
        self._num_tokens += 1

    def finalize(self):
        """Cache the ids of the two marker tokens."""
        self._s_id = self.get_id(self.s)
        self._unk_id = self.get_id(self.unk)

    def get_id(self, token):
        """Return the id of `token`, or `unk_id` when it is not in the table."""
        return self._token_to_id.get(token, self.unk_id)

    def get_token(self, id_):
        """Return the token stored under `id_`."""
        return self._id_to_token[id_]

    @staticmethod
    def from_file(filename, num_tokens_limit=None):
        """Load a vocabulary from a UTF-8 file of "<token> <count>" lines.

        Args:
            filename: path of the vocabulary file.
            num_tokens_limit: stop reading once this many tokens are loaded;
                None reads the whole file.

        Returns:
            A finalized Vocabulary.

        Raises:
            ValueError: if a non-blank line does not consist of exactly two
                whitespace-separated fields, or the count is not an integer.
        """
        vocab = Vocabulary()
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f:
                fields = line.strip().split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                word, count = fields
                vocab.add(word, int(count))
                if (num_tokens_limit is not None
                        and vocab.num_tokens == num_tokens_limit):
                    break
        vocab.finalize()
        return vocab


class Dataset(object):
    """Streams (x, y, w) batches built from tokenized text files.

    Each line of an input file is one sentence of whitespace-separated words.
    """

    def __init__(self, vocab, filenames, deterministic=False):
        """Create a dataset.

        Args:
            vocab: finalized Vocabulary used to map words to ids.
            filenames: iterable of file paths, or a single glob pattern string.
            deterministic: when True, neither files nor lines are shuffled.
        """
        self._vocab = vocab
        self._filenames = filenames
        self._deterministic = deterministic

    def _file_list(self):
        """Return a fresh list of input paths.

        A string is treated as a glob pattern (iterating it directly would
        yield single characters). Anything else is copied, so callers' lists
        are never mutated and tuples are accepted.
        """
        if isinstance(self._filenames, str):
            # Sorted so that deterministic mode does not depend on the
            # directory listing order.
            return sorted(glob.glob(self._filenames))
        return list(self._filenames)

    def _parse_sentence(self, line):
        """Convert one text line to ids, wrapped in sentence-boundary ids."""
        s_id = self._vocab.s_id
        return [s_id] + [self._vocab.get_id(word) for word in line.strip().split()] + [s_id]

    def _parse_file(self, file_name):
        """Yield the id sequence of every line in `file_name`.

        The whole file is read into memory first so that its lines can be
        shuffled when the dataset is not deterministic.
        """
        print("Processing file: %s" % file_name)
        with codecs.open(file_name, "r", "utf-8") as f:
            lines = [line.strip() for line in f]
            if not self._deterministic:
                random.shuffle(lines)
            print("Finished processing!")
            for line in lines:
                yield self._parse_sentence(line)

    def _sentence_stream(self, file_stream):
        """Chain the sentences of every file produced by `file_stream`."""
        for file_name in file_stream:
            for sentence in self._parse_file(file_name):
                yield sentence

    def _iterate(self, sentences, batch_size, num_steps):
        """Pack a sentence stream into fixed-size batches.

        Each of the `batch_size` rows consumes sentences back to back; a
        sentence that does not fit is continued in the same row of the next
        batch. For every filled position, `y` holds the token following the
        one in `x`, and `w` is 1 (0 where the row ran out of data).

        Yields:
            (x, y, w) arrays of shape [batch_size, num_steps]. The same three
            buffers are reused for every batch, so copy them if a batch must
            outlive the next iteration step.
        """
        streams = [None] * batch_size
        x = np.zeros([batch_size, num_steps], np.int32)
        y = np.zeros([batch_size, num_steps], np.int32)
        w = np.zeros([batch_size, num_steps], np.uint8)
        while True:
            x[:] = 0
            y[:] = 0
            w[:] = 0
            for i in range(batch_size):
                tokens_filled = 0
                try:
                    while tokens_filled < num_steps:
                        # A remainder of length <= 1 has no (input, target)
                        # pair left, so fetch the next sentence.
                        if streams[i] is None or len(streams[i]) <= 1:
                            streams[i] = next(sentences)
                        num_tokens = min(len(streams[i]) - 1, num_steps - tokens_filled)
                        end = tokens_filled + num_tokens
                        x[i, tokens_filled:end] = streams[i][:num_tokens]
                        y[i, tokens_filled:end] = streams[i][1:num_tokens + 1]
                        w[i, tokens_filled:end] = 1
                        # Keep the last consumed token: it is the first input
                        # of the continuation.
                        streams[i] = streams[i][num_tokens:]
                        tokens_filled += num_tokens
                except StopIteration:
                    # Input exhausted: leave the rest of this row zero-weighted.
                    pass
            if not np.any(w):
                # Nothing at all was filled, so the stream is finished.
                return

            yield x, y, w

    def iterate_once(self, batch_size, num_steps):
        """Yield batches for a single pass over the input files, in order."""
        sentences = self._sentence_stream(iter(self._file_list()))
        for value in self._iterate(sentences, batch_size, num_steps):
            yield value

    def iterate_forever(self, batch_size, num_steps):
        """Yield batches endlessly, cycling over the input files.

        Unless the dataset is deterministic, the file order is reshuffled
        before every pass.

        Raises:
            ValueError: on first iteration, if there are no input files
                (cycling over nothing would otherwise spin forever).
        """
        file_names = self._file_list()
        if not file_names:
            raise ValueError("Dataset has no input files: %r" % (self._filenames,))

        def file_stream():
            while True:
                if not self._deterministic:
                    # Shuffles the private copy, never the caller's sequence.
                    random.shuffle(file_names)
                for file_name in file_names:
                    yield file_name

        for value in self._iterate(self._sentence_stream(file_stream()), batch_size, num_steps):
            yield value
def build_config():
    """Assemble the `parallax.Config` for the LM-1B example from FLAGS.

    Returns:
        A `parallax.Config` carrying the checkpoint, communication (PS + MPI)
        and profiling settings taken from the command-line flags.

    Raises:
        ValueError: if `--profile_steps` or `--profile_range` contains a
            field that is not an integer.
        IndexError: if `--profile_range` has fewer than two comma-separated
            fields.
    """

    def get_profile_steps():
        # Unset or empty flag: no explicit list of steps.
        if not FLAGS.profile_steps:
            return None
        # The stripped value is written back to the flag, as before.
        FLAGS.profile_steps = FLAGS.profile_steps.strip()
        return [int(step) for step in FLAGS.profile_steps.split(',')]

    def get_profile_range():
        # Unset or empty flag: no profiling range.
        if not FLAGS.profile_range:
            return None
        FLAGS.profile_range = FLAGS.profile_range.strip()
        bounds = FLAGS.profile_range.split(',')
        # Only the first two fields are used: "start,end".
        return (int(bounds[0]), int(bounds[1]))

    checkpointing = parallax.CheckPointConfig(
        ckpt_dir=FLAGS.ckpt_dir,
        save_ckpt_steps=FLAGS.save_ckpt_steps)
    parameter_server = parallax.PSConfig(
        replicate_variables=FLAGS.replicate_variables,
        protocol=FLAGS.protocol,
        local_aggregation=FLAGS.local_aggregation,
        boundary_among_servers=FLAGS.boundary_among_servers,
        boundary_between_workers_and_servers=FLAGS.boundary_between_workers_and_servers)
    mpi = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options)
    profiling = parallax.ProfileConfig(
        profile_dir=FLAGS.profile_dir,
        profile_steps=get_profile_steps(),
        profile_range=get_profile_range())

    config = parallax.Config()
    config.run_option = FLAGS.run_option
    config.average_sparse = False
    config.communication_config = parallax.CommunicationConfig(parameter_server, mpi)
    config.ckpt_config = checkpointing
    config.profile_config = profiling
    config.redirect_path = FLAGS.redirect_path
    config.export_graph_path = FLAGS.export_graph_path
    config.search_partitions = FLAGS.search_partitions

    return config
-------------------------------------------------------------------------------- 1 | bazel-bin 2 | bazel-genfiles 3 | bazel-out 4 | bazel-testlogs 5 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Want to contribute? Great! First, read this page (including the small print at the end). 2 | 3 | ### Before you contribute 4 | 5 | Before we can use your code, you must sign the 6 | [Google Individual Contributor License Agreement] 7 | (https://cla.developers.google.com/about/google-individual) 8 | (CLA), which you can do online. The CLA is necessary mainly because you own the 9 | copyright to your changes, even after your contribution becomes part of our 10 | codebase, so we need your permission to use and distribute your code. We also 11 | need to be sure of various other things—for instance that you'll tell us if you 12 | know that your code infringes on other people's patents. You don't have to sign 13 | the CLA until after you've submitted your code for review and a member has 14 | approved it, but you must do it before we can put your code into our codebase. 15 | Before you start working on a larger contribution, you should get in touch with 16 | us first through the issue tracker with your idea so that we can help out and 17 | possibly guide you. Coordinating up front makes it much easier to avoid 18 | frustration later on. 19 | 20 | ### Code reviews 21 | 22 | All submissions, including submissions by project members, require review. We 23 | use Github pull requests for this purpose. 24 | 25 | ### The small print 26 | 27 | Contributions made by corporations are covered by a different agreement than 28 | the one above, the 29 | [Software Grant and Corporate Contributor License Agreement] 30 | (https://cla.developers.google.com/about/google-corporate). 
31 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/README.md: -------------------------------------------------------------------------------- 1 | # Neural Machine Translation (seq2seq) 2 | 3 | Neural Machine Translation (NMT) mimics the human translation process. For a more detailed description of the program itself, please check out [https://github.com/tensorflow/nmt](https://github.com/tensorflow/nmt) where this program comes from. 4 | 5 | ## Dataset 6 | 7 | We can use the following publicly available datasets: 8 | 9 | 1. *Small-scale*: English-Vietnamese parallel corpus of TED talks (133K sentence 10 | pairs) provided by 11 | the 12 | [IWSLT Evaluation Campaign](https://sites.google.com/site/iwsltevaluation2015/). 13 | 1. *Large-scale*: German-English parallel corpus (4.5M sentence pairs) provided 14 | by the [WMT Evaluation Campaign](http://www.statmt.org/wmt16/translation-task.html). 15 | 16 | ## To Run 17 | 18 | Set your resource information in the `resource_info` file. 19 | 20 | The command below runs a single GNMT WMT German-English model on multiple devices specified in `resource_info`. The command assumes that the data directory and the NMT codebase are distributed and reachable at the same absolute path on each of the machines. 21 | 22 | 23 | ``` 24 | $ python nmt_distributed_driver.py \ 25 | --src=de --tgt=en \ 26 | --hparams_path=nmt/standard_hparams/wmt16_gnmt_4_layer.json \ 27 | --out_dir=/tmp/deen_gnmt \ 28 | --vocab_prefix=/tmp/wmt16/vocab.bpe.32000 \ 29 | --train_prefix=/tmp/wmt16/train.tok.clean.bpe.32000 \ 30 | --dev_prefix=/tmp/wmt16/newstest2013.tok.bpe.32000 \ 31 | --test_prefix=/tmp/wmt16/newstest2015.tok.bpe.32000 32 | ``` 33 | 34 | For more options of the nmt model command, please check out [https://github.com/tensorflow/nmt](https://github.com/tensorflow/nmt) again. 35 | 36 | In addition, there are a few more options you can choose for distributed runs. 
37 | 38 | | Parameter Name | Default | Description | 39 | | :------------------- |:-----------------------| :-----------| 40 | | --resource_info_file | `./resource_info` | Name of the file containing cluster information | 41 | | --max_steps | 1000000 | Number of iterations to run on each worker | 42 | | --steps_per_stats | 100 | Number of steps between two consecutive run-op logs | 43 | | --sync | True | Whether to synchronize learning or not | 44 | | --ckpt_dir | None | Directory to save checkpoints | 45 | | --save_ckpt_steps | None | Number of steps between two consecutive checkpoints | 46 | | --run_option | HYBRID | The run option: PS, MPI or HYBRID | 47 | | --epoch_size | 0 | Total number of data instances | 48 | | --search_partitions | False | Whether to use Parallax's variable partitioning method or not | 49 | 50 | You can adjust the distributed run with the options above. For example, you can run the GNMT WMT German-English model in MPI mode by adding the `--run_option` flag to the command, as shown below: 51 | 52 | ``` 53 | $ python nmt_distributed_driver.py \ 54 | --src=de --tgt=en \ 55 | --hparams_path=${PWD}/nmt/standard_hparams/wmt16_gnmt_4_layer.json \ 56 | --out_dir=/tmp/deen_gnmt \ 57 | --vocab_prefix=/tmp/wmt16/vocab.bpe.32000 \ 58 | --train_prefix=/tmp/wmt16/train.tok.clean.bpe.32000 \ 59 | --dev_prefix=/tmp/wmt16/newstest2013.tok.bpe.32000 \ 60 | --test_prefix=/tmp/wmt16/newstest2015.tok.bpe.32000 \ 61 | --run_option=MPI 62 | ``` 63 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/__init__.py -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/g3doc/img/attention_equation_0.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_equation_0.jpg -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/g3doc/img/attention_equation_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_equation_1.jpg -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/g3doc/img/attention_mechanism.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_mechanism.jpg -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/g3doc/img/attention_vis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_vis.jpg -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/g3doc/img/encdec.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/encdec.jpg -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/g3doc/img/greedy_dec.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/greedy_dec.jpg -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/g3doc/img/seq2seq.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/seq2seq.jpg -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/nmt_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Tests for nmt.py, train.py and inference.py.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import os 23 | 24 | import tensorflow as tf 25 | 26 | from . import inference 27 | from . import nmt 28 | from . 
def _update_flags(flags, test_name):
    """Override `flags` in place for a short run on the bundled test data.

    Args:
        flags: parsed argument namespace to modify.
        test_name: name of the sub-directory (under the TensorFlow test temp
            dir) used as the output directory.
    """
    testdata_file = ("nmt/testdata/"
                     "iwslt15.tst2013.100")
    overrides = [
        ("num_train_steps", 100),
        ("steps_per_stats", 5),
        ("src", "en"),
        ("tgt", "vi"),
        ("train_prefix", testdata_file),
        ("vocab_prefix", ("nmt/testdata/"
                          "iwslt15.vocab.100")),
        ("dev_prefix", testdata_file),
        ("test_prefix", testdata_file),
        ("out_dir", os.path.join(tf.test.get_temp_dir(), test_name)),
    ]
    for attr, value in overrides:
        setattr(flags, attr, value)


class NMTTest(tf.test.TestCase):
    """Smoke tests that run training and inference through `nmt.run_main`."""

    def _parse_test_flags(self, test_name):
        """Parse the nmt arguments and apply the basic test overrides."""
        nmt_parser = argparse.ArgumentParser()
        nmt.add_arguments(nmt_parser)
        parsed_flags, _ = nmt_parser.parse_known_args()
        _update_flags(parsed_flags, test_name)
        return parsed_flags

    def _run_training(self, parsed_flags):
        """Build hparams from `parsed_flags` and run the training entry point."""
        default_hparams = nmt.create_hparams(parsed_flags)
        nmt.run_main(parsed_flags, default_hparams, train.train, None)

    def testTrain(self):
        """Test the training loop is functional with basic hparams."""
        self._run_training(self._parse_test_flags("nmt_train_test"))

    def testTrainWithAvgCkpts(self):
        """Test the training loop is functional with `avg_ckpts` enabled."""
        parsed_flags = self._parse_test_flags("nmt_train_test_avg_ckpts")
        parsed_flags.avg_ckpts = True
        self._run_training(parsed_flags)

    def testInference(self):
        """Test inference is functional with basic hparams."""
        parsed_flags = self._parse_test_flags("nmt_train_infer")

        # Train one step so we have a checkpoint.
        parsed_flags.num_train_steps = 1
        self._run_training(parsed_flags)

        # Update flags for inference.
        parsed_flags.inference_input_file = ("nmt/testdata/"
                                             "iwslt15.tst2013.100.en")
        parsed_flags.inference_output_file = os.path.join(parsed_flags.out_dir, "output")
        parsed_flags.inference_ref_file = ("nmt/testdata/"
                                           "iwslt15.tst2013.100.vi")

        default_hparams = nmt.create_hparams(parsed_flags)
        nmt.run_main(parsed_flags, default_hparams, None, inference.inference)


if __name__ == "__main__":
    tf.test.main()
def calculate_ckpt_steps():
    """Return the number of steps between two consecutive checkpoints.

    When `--save_n_ckpts_per_epoch` is positive the interval is derived from
    the epoch size so that that many checkpoints are written per epoch;
    otherwise `--save_ckpt_steps` is returned unchanged.

    Returns:
        The checkpoint interval in steps (may be None when
        `--save_ckpt_steps` is unset).
    """
    if FLAGS.save_n_ckpts_per_epoch <= 0:
        return FLAGS.save_ckpt_steps

    # Imported here because this module only imports tensorflow and parallax
    # at file level; without these the branch fails with NameError.
    import json
    import math

    # NOTE(review): this expects a JSON file with a 'worker' list whose items
    # have a 'gpus' list; the sample `resource_info` shipped with the example
    # is a plain "host:gpu,gpu" line - confirm which format is intended.
    with open(FLAGS.resource_info_file) as resource_info:
        num_workers = sum(len(w['gpus']) for w in json.load(resource_info)['worker'])
    num_words_per_iter = FLAGS.batch_size * FLAGS.num_steps * num_workers
    # NOTE(review): `language_model_graph` is not imported by this module (it
    # looks carried over from the lm1b example), so this line still raises
    # NameError until the NMT epoch size is wired in - TODO confirm the
    # intended source (the README lists an `--epoch_size` flag).
    # float() keeps the division exact on Python 2; int() makes the result an
    # integer step count on both Python 2 and 3.
    num_iters_per_epoch = int(math.ceil(
        float(language_model_graph._NUM_WORDS['train'])
        / num_words_per_iter / FLAGS.save_n_ckpts_per_epoch))
    if FLAGS.sync:
        save_ckpt_steps = num_iters_per_epoch
    else:
        save_ckpt_steps = num_iters_per_epoch * num_workers
    parallax.log.info('Save checkpoint for every %d iters' % save_ckpt_steps)

    return save_ckpt_steps


def build_config():
    """Assemble the `parallax.Config` for the NMT example from FLAGS.

    Returns:
        A `parallax.Config` carrying the checkpoint, communication (PS + MPI)
        and profiling settings taken from the command-line flags.

    Raises:
        ValueError: if `--profile_steps` contains a field that is not an
            integer.
    """

    def get_profile_steps():
        # Unset or empty flag: profile no steps.
        if not FLAGS.profile_steps:
            return []
        FLAGS.profile_steps = FLAGS.profile_steps.strip()
        return [int(step) for step in FLAGS.profile_steps.split(',')]

    ckpt_config = parallax.CheckPointConfig(ckpt_dir=FLAGS.ckpt_dir,
                                            save_ckpt_steps=calculate_ckpt_steps())
    ps_config = parallax.PSConfig(
        replicate_variables=FLAGS.replicate_variables,
        protocol=FLAGS.protocol,
        local_aggregation=FLAGS.local_aggregation,
        boundary_among_servers=FLAGS.boundary_among_servers,
        boundary_between_workers_and_servers=FLAGS.boundary_between_workers_and_servers)
    mpi_config = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options)

    parallax_config = parallax.Config()
    parallax_config.run_option = FLAGS.run_option
    parallax_config.average_sparse = False
    parallax_config.communication_config = parallax.CommunicationConfig(ps_config, mpi_config)
    parallax_config.ckpt_config = ckpt_config
    parallax_config.profile_config = parallax.ProfileConfig(
        profile_dir=FLAGS.profile_dir,
        profile_steps=get_profile_steps())
    parallax_config.redirect_path = FLAGS.redirect_path
    parallax_config.export_graph_path = FLAGS.export_graph_path
    parallax_config.search_partitions = FLAGS.search_partitions

    return parallax_config
-------------------------------------------------------------------------------- 1 | 123.456.78.90:1,2 2 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/scripts/__init__.py -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/scripts/bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Python implementation of BLEU and smooth-BLEU. 17 | 18 | This module provides a Python implementation of BLEU and smooth-BLEU. 19 | Smooth BLEU is computed following the method outlined in the paper: 20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic 21 | evaluation metrics for machine translation. COLING 2004. 22 | """ 23 | 24 | import collections 25 | import math 26 | 27 | 28 | def _get_ngrams(segment, max_order): 29 | """Extracts all n-grams upto a given maximum order from an input segment. 
30 | 31 | Args: 32 | segment: text segment from which n-grams will be extracted. 33 | max_order: maximum length in tokens of the n-grams returned by this 34 | methods. 35 | 36 | Returns: 37 | The Counter containing all n-grams upto max_order in segment 38 | with a count of how many times each n-gram occurred. 39 | """ 40 | ngram_counts = collections.Counter() 41 | for order in range(1, max_order + 1): 42 | for i in range(0, len(segment) - order + 1): 43 | ngram = tuple(segment[i:i+order]) 44 | ngram_counts[ngram] += 1 45 | return ngram_counts 46 | 47 | 48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4, 49 | smooth=False): 50 | """Computes BLEU score of translated segments against one or more references. 51 | 52 | Args: 53 | reference_corpus: list of lists of references for each translation. Each 54 | reference should be tokenized into a list of tokens. 55 | translation_corpus: list of translations to score. Each translation 56 | should be tokenized into a list of tokens. 57 | max_order: Maximum n-gram order to use when computing BLEU score. 58 | smooth: Whether or not to apply Lin et al. 2004 smoothing. 59 | 60 | Returns: 61 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram 62 | precisions and brevity penalty. 
63 | """ 64 | matches_by_order = [0] * max_order 65 | possible_matches_by_order = [0] * max_order 66 | reference_length = 0 67 | translation_length = 0 68 | for (references, translation) in zip(reference_corpus, 69 | translation_corpus): 70 | reference_length += min(len(r) for r in references) 71 | translation_length += len(translation) 72 | 73 | merged_ref_ngram_counts = collections.Counter() 74 | for reference in references: 75 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order) 76 | translation_ngram_counts = _get_ngrams(translation, max_order) 77 | overlap = translation_ngram_counts & merged_ref_ngram_counts 78 | for ngram in overlap: 79 | matches_by_order[len(ngram)-1] += overlap[ngram] 80 | for order in range(1, max_order+1): 81 | possible_matches = len(translation) - order + 1 82 | if possible_matches > 0: 83 | possible_matches_by_order[order-1] += possible_matches 84 | 85 | precisions = [0] * max_order 86 | for i in range(0, max_order): 87 | if smooth: 88 | precisions[i] = ((matches_by_order[i] + 1.) / 89 | (possible_matches_by_order[i] + 1.)) 90 | else: 91 | if possible_matches_by_order[i] > 0: 92 | precisions[i] = (float(matches_by_order[i]) / 93 | possible_matches_by_order[i]) 94 | else: 95 | precisions[i] = 0.0 96 | 97 | if min(precisions) > 0: 98 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 99 | geo_mean = math.exp(p_log_sum) 100 | else: 101 | geo_mean = 0 102 | 103 | ratio = float(translation_length) / reference_length 104 | 105 | if ratio > 1.0: 106 | bp = 1. 107 | else: 108 | bp = math.exp(1 - 1. 
#!/bin/sh
# Download the small-scale IWSLT15 Vietnamese-to-English translation data for
# NMT model training.
#
# Usage:
#   ./download_iwslt15.sh path-to-output-dir
#
# If the output directory is not specified, "./iwslt15" is used as the default
# output directory.
#
# Exit status: 0 when every file was downloaded, 1 when at least one download
# failed (the remaining files are still attempted).
OUT_DIR="${1:-iwslt15}"
SITE_PREFIX="https://nlp.stanford.edu/projects/nmt/data"
STATUS=0

# Quote the path so an output directory containing spaces or glob characters
# is created as one directory instead of being word-split by the shell.
mkdir -v -p "$OUT_DIR"

# fetch NAME: download one dataset file into $OUT_DIR under the same name.
# --fail makes curl return non-zero on an HTTP error instead of silently
# saving the server's error page as if it were dataset content.
fetch() {
  curl --fail -o "$OUT_DIR/$1" "$SITE_PREFIX/iwslt15.en-vi/$1" || STATUS=1
}

# Download the iwslt15 small dataset from the Stanford website.
echo "Download training dataset train.en and train.vi."
fetch train.en
fetch train.vi

echo "Download dev dataset tst2012.en and tst2012.vi."
fetch tst2012.en
fetch tst2012.vi

echo "Download test dataset tst2013.en and tst2013.vi."
fetch tst2013.en
fetch tst2013.vi

echo "Download vocab file vocab.en and vocab.vi."
fetch vocab.en
fetch vocab.vi

exit "$STATUS"
"sgd", 21 | "residual": false, 22 | "share_vocab": false, 23 | "subword_option": "bpe", 24 | "sos": "", 25 | "src_max_len": 50, 26 | "src_max_len_infer": null, 27 | "steps_per_external_eval": null, 28 | "steps_per_stats": 100, 29 | "tgt_max_len": 50, 30 | "tgt_max_len_infer": null, 31 | "time_major": true, 32 | "unit_type": "lstm", 33 | "beam_width": 10 34 | } 35 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/standard_hparams/wmt16_gnmt_4_layer.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention": "normed_bahdanau", 3 | "attention_architecture": "gnmt_v2", 4 | "batch_size": 128, 5 | "colocate_gradients_with_ops": true, 6 | "dropout": 0.2, 7 | "encoder_type": "gnmt", 8 | "eos": "", 9 | "forget_bias": 1.0, 10 | "infer_batch_size": 32, 11 | "init_weight": 0.1, 12 | "learning_rate": 1.0, 13 | "max_gradient_norm": 5.0, 14 | "metrics": ["bleu"], 15 | "num_buckets": 5, 16 | "num_layers": 4, 17 | "num_train_steps": 340000, 18 | "decay_scheme": "luong10", 19 | "num_units": 1024, 20 | "optimizer": "sgd", 21 | "residual": true, 22 | "share_vocab": false, 23 | "subword_option": "bpe", 24 | "sos": "", 25 | "src_max_len": 50, 26 | "src_max_len_infer": null, 27 | "steps_per_external_eval": null, 28 | "steps_per_stats": 100, 29 | "tgt_max_len": 50, 30 | "tgt_max_len_infer": null, 31 | "time_major": true, 32 | "unit_type": "lstm", 33 | "beam_width": 10, 34 | "length_penalty_weight": 1.0 35 | } 36 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/standard_hparams/wmt16_gnmt_8_layer.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention": "normed_bahdanau", 3 | "attention_architecture": "gnmt_v2", 4 | "batch_size": 128, 5 | "colocate_gradients_with_ops": true, 6 | "dropout": 0.2, 7 | "encoder_type": "gnmt", 8 | "eos": "", 9 | "forget_bias": 1.0, 
10 | "infer_batch_size": 32, 11 | "init_weight": 0.1, 12 | "learning_rate": 1.0, 13 | "max_gradient_norm": 5.0, 14 | "metrics": ["bleu"], 15 | "num_buckets": 5, 16 | "num_layers": 8, 17 | "num_train_steps": 340000, 18 | "decay_scheme": "luong10", 19 | "num_units": 1024, 20 | "optimizer": "sgd", 21 | "residual": true, 22 | "share_vocab": false, 23 | "subword_option": "bpe", 24 | "sos": "", 25 | "src_max_len": 50, 26 | "src_max_len_infer": null, 27 | "steps_per_external_eval": null, 28 | "steps_per_stats": 50, 29 | "tgt_max_len": 50, 30 | "tgt_max_len_infer": null, 31 | "time_major": true, 32 | "unit_type": "lstm", 33 | "beam_width": 10, 34 | "length_penalty_weight": 1.0 35 | } 36 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/testdata/iwslt15.vocab.100.en: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Rachel 5 | : 6 | The 7 | science 8 | behind 9 | a 10 | climate 11 | headline 12 | In 13 | 4 14 | minutes 15 | , 16 | atmospheric 17 | chemist 18 | provides 19 | glimpse 20 | of 21 | the 22 | massive 23 | scientific 24 | effort 25 | bold 26 | headlines 27 | on 28 | change 29 | with 30 | her 31 | team 32 | -- 33 | one 34 | thousands 35 | who 36 | contributed 37 | taking 38 | risky 39 | flight 40 | over 41 | rainforest 42 | in 43 | pursuit 44 | data 45 | key 46 | molecule 47 | . 
48 | I 49 | 'd 50 | like 51 | to 52 | talk 53 | you 54 | today 55 | about 56 | scale 57 | that 58 | goes 59 | into 60 | making 61 | see 62 | paper 63 | look 64 | this 65 | when 66 | they 67 | have 68 | do 69 | and 70 | air 71 | quality 72 | or 73 | smog 74 | They 75 | are 76 | both 77 | two 78 | branches 79 | same 80 | field 81 | Recently 82 | looked 83 | Panel 84 | Climate 85 | Change 86 | IPCC 87 | put 88 | out 89 | their 90 | report 91 | state 92 | understanding 93 | system 94 | That 95 | was 96 | written 97 | by 98 | scientists 99 | from 100 | 40 101 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/testdata/iwslt15.vocab.100.vi: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Khoa 5 | học 6 | đằng 7 | sau 8 | một 9 | tiêu 10 | đề 11 | về 12 | khí 13 | hậu 14 | Trong 15 | 4 16 | phút 17 | , 18 | chuyên 19 | gia 20 | hoá 21 | quyển 22 | Rachel 23 | giới 24 | thiệu 25 | sơ 26 | lược 27 | những 28 | nỗ 29 | lực 30 | khoa 31 | miệt 32 | mài 33 | táo 34 | bạo 35 | biến 36 | đổi 37 | cùng 38 | với 39 | đoàn 40 | nghiên 41 | cứu 42 | của 43 | mình 44 | -- 45 | hàng 46 | ngàn 47 | người 48 | đã 49 | cống 50 | hiến 51 | cho 52 | dự 53 | án 54 | này 55 | chuyến 56 | bay 57 | mạo 58 | hiểm 59 | qua 60 | rừng 61 | già 62 | để 63 | tìm 64 | kiếm 65 | thông 66 | tin 67 | phân 68 | tử 69 | then 70 | chốt 71 | . 
72 | Tôi 73 | muốn 74 | các 75 | bạn 76 | biết 77 | sự 78 | to 79 | lớn 80 | góp 81 | phần 82 | làm 83 | nên 84 | dòng 85 | tít 86 | thường 87 | thấy 88 | trên 89 | báo 90 | Có 91 | trông 92 | như 93 | thế 94 | khi 95 | bàn 96 | và 97 | nói 98 | chất 99 | lượng 100 | không 101 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/testdata/label_ref: -------------------------------------------------------------------------------- 1 | positive 2 | positive 3 | positive 4 | negative 5 | negative -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/testdata/pred_output: -------------------------------------------------------------------------------- 1 | positive 2 | positive 3 | negative 4 | negative 5 | positive -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/testdata/test_embed.txt: -------------------------------------------------------------------------------- 1 | some_word 1.0 2.0 3.0 4.0 2 | some_other_word 4.0 3.0 2.0 1.0 3 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/testdata/test_embed_with_header.txt: -------------------------------------------------------------------------------- 1 | 2 4 2 | some_word 1.0 2.0 3.0 4.0 3 | some_other_word 4.0 3.0 2.0 1.0 4 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/testdata/test_infer_file: -------------------------------------------------------------------------------- 1 | A Republic@@ an strategy to counter the re-@@ election of Obama 2 | Republic@@ an leaders justified their policy by the need to combat electoral fraud . 3 | However , the Brenn@@ an Centre considers this a my@@ th , stating that electoral fraud is rar@@ er in the United States than the number of people killed by ligh@@ tn@@ ing . 
4 | Indeed , Republic@@ an lawyers identified only 300 cases of electoral fraud in the United States in a decade . 5 | One thing is certain : these new provisions will have a negative impact on vot@@ er tur@@ n-@@ out . -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/testdata/test_infer_vocab.src: -------------------------------------------------------------------------------- 1 | unk 2 | eos 3 | sos 4 | test1 5 | test2 6 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/testdata/test_infer_vocab.tgt: -------------------------------------------------------------------------------- 1 | unk 2 | eos 3 | test1 4 | test2 5 | test3 6 | test4 7 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/utils/__init__.py -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/utils/common_test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
def create_test_hparams(unit_type="lstm",
                        encoder_type="uni",
                        num_layers=4,
                        attention="",
                        attention_architecture=None,
                        use_residual=False,
                        inference_indices=None,
                        num_translations_per_input=1,
                        beam_width=0,
                        init_op="uniform"):
  """Create training and inference test hparams.

  Starts from `standard_hparams_utils.create_standard_hparams()` and then
  assigns the small, fixed test values listed below, in the same order the
  attributes are grouped in the table.

  Args:
    unit_type: stored as `unit_type`.
    encoder_type: stored as `encoder_type`.
    num_layers: stored as both `num_encoder_layers` and `num_decoder_layers`.
    attention: stored as `attention`.
    attention_architecture: stored as `attention_architecture`.
    use_residual: stored as `residual`; also selects `num_residual_layers`
      (2 when true, 0 otherwise).
    inference_indices: stored as `inference_indices`.
    num_translations_per_input: stored as `num_translations_per_input`.
    beam_width: stored as `beam_width`.
    init_op: stored as `init_op`.

  Returns:
    The standard hparams object with the test values applied.
  """
  # TODO(rzhao): Put num_residual_layers computation logic into
  # `model_utils.py`, so we can also test it here.
  residual_layer_count = 2 if use_residual else 0

  hparams = standard_hparams_utils.create_standard_hparams()

  test_values = (
      # Networks
      ("num_units", 5),
      ("num_encoder_layers", num_layers),
      ("num_decoder_layers", num_layers),
      ("dropout", 0.5),
      ("unit_type", unit_type),
      ("encoder_type", encoder_type),
      ("residual", use_residual),
      ("num_residual_layers", residual_layer_count),
      # Attention mechanisms
      ("attention", attention),
      ("attention_architecture", attention_architecture),
      # Train
      ("init_op", init_op),
      ("num_train_steps", 1),
      ("decay_scheme", ""),
      # Infer
      ("tgt_max_len_infer", 100),
      ("beam_width", beam_width),
      ("num_translations_per_input", num_translations_per_input),
      # Misc
      ("forget_bias", 0.0),
      ("random_seed", 3),
      # Vocab
      ("src_vocab_size", 5),
      ("tgt_vocab_size", 5),
      ("eos", "eos"),
      ("sos", "sos"),
      ("src_vocab_file", ""),
      ("tgt_vocab_file", ""),
      ("src_embed_file", ""),
      ("tgt_embed_file", ""),
      # For inference.py test
      ("subword_option", "bpe"),
      ("src", "src"),
      ("tgt", "tgt"),
      ("src_max_len", 400),
      ("tgt_eos_id", 0),
      ("inference_indices", inference_indices),
  )
  for attr_name, attr_value in test_values:
    # Plain attribute assignment, exactly as `hparams.<name> = value`.
    setattr(hparams, attr_name, attr_value)
  return hparams
"a", "b", "c"]) 102 | tgt_vocab_table = lookup_ops.index_table_from_tensor(tgt_vocab_mapping) 103 | if mode == tf.contrib.learn.ModeKeys.INFER: 104 | reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_tensor( 105 | tgt_vocab_mapping) 106 | 107 | src_dataset = tf.data.Dataset.from_tensor_slices( 108 | tf.constant(["a a b b c", "a b b"])) 109 | 110 | if mode != tf.contrib.learn.ModeKeys.INFER: 111 | tgt_dataset = tf.data.Dataset.from_tensor_slices( 112 | tf.constant(["a b c b c", "a b c b"])) 113 | return ( 114 | iterator_utils.get_iterator( 115 | src_dataset=src_dataset, 116 | tgt_dataset=tgt_dataset, 117 | src_vocab_table=src_vocab_table, 118 | tgt_vocab_table=tgt_vocab_table, 119 | batch_size=hparams.batch_size, 120 | sos=hparams.sos, 121 | eos=hparams.eos, 122 | random_seed=hparams.random_seed, 123 | num_buckets=hparams.num_buckets), 124 | src_vocab_table, 125 | tgt_vocab_table) 126 | else: 127 | return ( 128 | iterator_utils.get_infer_iterator( 129 | src_dataset=src_dataset, 130 | src_vocab_table=src_vocab_table, 131 | eos=hparams.eos, 132 | batch_size=hparams.batch_size), 133 | src_vocab_table, 134 | tgt_vocab_table, 135 | reverse_tgt_vocab_table) 136 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/utils/evaluation_utils_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class EvaluationUtilsTest(tf.test.TestCase):
  """Checks evaluation_utils.evaluate against fixed reference scores."""

  def testEvaluate(self):
    """BLEU and ROUGE must match the expected values for bpe and spm refs."""
    hypothesis_path = "nmt/testdata/deen_output"
    expected = {"bleu": 22.5855084573, "rouge": 50.8429782599}

    # (reference file, subword option, order in which scores are asserted)
    cases = (
        ("nmt/testdata/deen_ref_bpe", "bpe", ("bleu", "rouge")),
        ("nmt/testdata/deen_ref_spm", "spm", ("rouge", "bleu")),
    )
    for reference_path, subword, assert_order in cases:
      # Compute both scores first (bleu, then rouge), then compare them.
      actual = {}
      for metric in ("bleu", "rouge"):
        actual[metric] = evaluation_utils.evaluate(
            reference_path, hypothesis_path, metric, subword)
      for metric in assert_order:
        self.assertAlmostEqual(expected[metric], actual[metric])

  def testAccuracy(self):
    """Sentence-level accuracy of the label fixtures is 60%."""
    score = evaluation_utils.evaluate(
        "nmt/testdata/label_ref", "nmt/testdata/pred_output", "accuracy")
    self.assertAlmostEqual(60.00, score)

  def testWordAccuracy(self):
    """Word-level accuracy of the label fixtures is 60%."""
    score = evaluation_utils.evaluate(
        "nmt/testdata/label_ref", "nmt/testdata/pred_output", "word_accuracy")
    self.assertAlmostEqual(60.00, score)
def decode_and_evaluate(name,
                        model,
                        sess,
                        trans_file,
                        ref_file,
                        metrics,
                        subword_option,
                        beam_width,
                        tgt_eos,
                        num_translations_per_input=1,
                        decode=True):
  """Decode a test set and compute a score according to the evaluation task.

  Args:
    name: label used only in the printed score line.
    model: object whose `decode(sess)` returns a pair; the first element is
      the batch of decoded outputs.
    sess: session handed to `model.decode`.
    trans_file: path the translations are written to (UTF-8) and that is
      passed to `evaluation_utils.evaluate` for scoring.
    ref_file: reference file path; when falsy no metric is computed.
    metrics: iterable of metric names passed to `evaluation_utils.evaluate`.
    subword_option: forwarded to `get_translation` and
      `evaluation_utils.evaluate`.
    beam_width: when 0, a leading axis is added to the decoded outputs; it
      also caps `num_translations_per_input`.
    tgt_eos: end-of-sentence token forwarded to `get_translation`.
    num_translations_per_input: translations written per sentence; while
      decoding it is clamped to at most `beam_width` and at least 1.
    decode: when False, decoding is skipped and only the evaluation step runs.

  Returns:
    A dict mapping each metric name to its score; empty when `ref_file` is
    falsy or `trans_file` does not exist.
  """
  # Decode
  if decode:
    utils.print_out(" decoding to output %s." % trans_file)

    start_time = time.time()
    num_sentences = 0
    with codecs.getwriter("utf-8")(
        tf.gfile.GFile(trans_file, mode="wb")) as trans_f:
      trans_f.write("")  # Write empty string to ensure file is created.

      # Never ask for more translations than the beam holds, and always
      # write at least one per sentence (beam_width may be 0).
      num_translations_per_input = max(
          min(num_translations_per_input, beam_width), 1)
      # Keep decoding batches until the model signals the end of the input
      # by raising OutOfRangeError.
      while True:
        try:
          nmt_outputs, _ = model.decode(sess)
          if beam_width == 0:
            # Add a leading axis so the indexing below (first axis = beam_id,
            # second axis = sentence) works for both cases.
            nmt_outputs = np.expand_dims(nmt_outputs, 0)

          batch_size = nmt_outputs.shape[1]
          num_sentences += batch_size

          for sent_id in range(batch_size):
            for beam_id in range(num_translations_per_input):
              translation = get_translation(
                  nmt_outputs[beam_id],
                  sent_id,
                  tgt_eos=tgt_eos,
                  subword_option=subword_option)
              # `translation` is bytes here; decode because the writer
              # wraps the file in a UTF-8 encoder.
              trans_f.write((translation + b"\n").decode("utf-8"))
        except tf.errors.OutOfRangeError:
          utils.print_time(
              " done, num sentences %d, num translations per input %d" %
              (num_sentences, num_translations_per_input), start_time)
          break

  # Evaluation
  evaluation_scores = {}
  if ref_file and tf.gfile.Exists(trans_file):
    for metric in metrics:
      score = evaluation_utils.evaluate(
          ref_file,
          trans_file,
          metric,
          subword_option=subword_option)
      evaluation_scores[metric] = score
      utils.print_out(" %s %s: %.1f" % (metric, name, score))

  return evaluation_scores
99 | if tgt_eos and tgt_eos in output: 100 | output = output[:output.index(tgt_eos)] 101 | 102 | if subword_option == "bpe": # BPE 103 | translation = utils.format_bpe_text(output) 104 | elif subword_option == "spm": # SPM 105 | translation = utils.format_spm_text(output) 106 | else: 107 | translation = utils.format_text(output) 108 | 109 | return translation 110 | -------------------------------------------------------------------------------- /parallax/parallax/examples/nmt/utils/standard_hparams_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def create_standard_hparams():
  """Return a `tf.contrib.training.HParams` holding the standard defaults.

  The defaults are declared in named groups and merged into one keyword
  dictionary that is passed to the HParams constructor.
  """
  data = dict(
      src="",
      tgt="",
      train_prefix="",
      dev_prefix="",
      test_prefix="",
      vocab_prefix="",
      embed_prefix="",
      out_dir="",
  )
  networks = dict(
      num_units=512,
      num_layers=2,
      num_encoder_layers=2,
      num_decoder_layers=2,
      dropout=0.2,
      unit_type="lstm",
      encoder_type="bi",
      residual=False,
      time_major=True,
      num_embeddings_partitions=0,
  )
  attention_mechanisms = dict(
      attention="scaled_luong",
      attention_architecture="standard",
      output_attention=True,
      pass_hidden_state=True,
  )
  train = dict(
      optimizer="sgd",
      batch_size=128,
      init_op="uniform",
      init_weight=0.1,
      max_gradient_norm=5.0,
      learning_rate=1.0,
      warmup_steps=0,
      warmup_scheme="t2t",
      decay_scheme="luong234",
      colocate_gradients_with_ops=True,
      num_train_steps=12000,
  )
  data_constraints = dict(
      num_buckets=5,
      max_train=0,
      src_max_len=50,
      tgt_max_len=50,
      src_max_len_infer=0,
      tgt_max_len_infer=0,
  )
  # NOTE(review): sos/eos are empty strings here; confirm this is intended
  # rather than sentence-marker tokens that were lost.
  data_format = dict(
      sos="",
      eos="",
      subword_option="",
      check_special_token=True,
  )
  misc = dict(
      forget_bias=1.0,
      num_gpus=1,
      epoch_step=0,  # record where we were within an epoch.
      steps_per_stats=100,
      steps_per_external_eval=0,
      share_vocab=False,
      metrics=["bleu"],
      log_device_placement=False,
      random_seed=None,
      # only enable beam search during inference when beam_width > 0.
      beam_width=0,
      length_penalty_weight=0.0,
      override_loaded_hparams=True,
      num_keep_ckpts=5,
      avg_ckpts=False,
  )
  inference = dict(
      inference_indices=None,
      infer_batch_size=32,
      sampling_temperature=0.0,
      num_translations_per_input=1,
  )

  settings = {}
  for group in (data, networks, attention_mechanisms, train,
                data_constraints, data_format, misc, inference):
    settings.update(group)
  return tf.contrib.training.HParams(**settings)
# Special vocabulary tokens. check_vocab() expects a vocab file to start with
# these three entries, in this order, and prepends them when they are missing,
# so each one must be non-empty and distinct from the other two. Empty values
# would make check_vocab() prepend three identical blank entries.
UNK = "<unk>"  # unknown / out-of-vocabulary word
SOS = "<s>"  # start-of-sentence marker
EOS = "</s>"  # end-of-sentence marker
UNK_ID = 0  # used as `default_value` by the vocab lookup tables
def _parse_embed_header(tokens):
  """Return the embedding size if `tokens` is a "<count> <size>" header, else None."""
  if len(tokens) != 2:
    return None
  try:
    count, size = int(tokens[0]), int(tokens[1])
  except ValueError:
    # e.g. ["word", "0.5"]: a one-dimensional embedding row, not a header.
    return None
  if count < 0 or size <= 0:
    return None
  return size


def load_embed_txt(embed_file):
  """Load embed_file into a python dictionary.

  Note: the embed_file should be a Glove formated txt file. Assuming
  embed_size=5, for example:

  the -0.071549 0.093459 0.023738 -0.090339 0.056123
  to 0.57346 0.5417 -0.23477 -0.3624 0.4037
  and 0.20327 0.47348 0.050877 0.002103 0.060547

  A first line made of exactly two integers ("<vocab_size> <embed_size>",
  e.g. "2 4", as in testdata/test_embed_with_header.txt) is treated as a
  header: it is not stored as a word, and its second number becomes the
  embedding size every following row is checked against.

  Args:
    embed_file: file path to the embedding file.
  Returns:
    a dictionary that maps word to vector, and the size of embedding dimensions.
  Raises:
    AssertionError: if a row's vector length differs from the embedding size.
    ValueError: if a vector component cannot be converted to float.
  """
  emb_dict = dict()
  emb_size = None
  with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, 'rb')) as f:
    for line_index, line in enumerate(f):
      tokens = line.strip().split(" ")
      if line_index == 0:
        header_size = _parse_embed_header(tokens)
        if header_size is not None:
          # Without this, "2 4" would be stored as word "2" with a
          # one-dimensional vector and every real row would fail the check.
          emb_size = header_size
          continue
      word = tokens[0]
      vec = list(map(float, tokens[1:]))
      emb_dict[word] = vec
      if emb_size:
        assert emb_size == len(vec), "All embedding size should be same."
      else:
        emb_size = len(vec)
  return emb_dict, emb_size
class VocabUtilsTest(tf.test.TestCase):
  """Exercises vocab_utils.check_vocab on a vocab file without special tokens."""

  def testCheckVocab(self):
    """check_vocab must write a new file with UNK, SOS and EOS prepended."""
    words = ["a", "b", "c"]

    # Write the plain word list, one word per line, into its own directory.
    source_dir = os.path.join(tf.test.get_temp_dir(), "vocab_dir")
    os.makedirs(source_dir)
    source_path = os.path.join(source_dir, "vocab_file")
    utf8_writer = codecs.getwriter("utf-8")
    with utf8_writer(tf.gfile.GFile(source_path, "wb")) as handle:
      for entry in words:
        handle.write("%s\n" % entry)

    # Run check_vocab with a separate output directory.
    target_dir = os.path.join(tf.test.get_temp_dir(), "out_dir")
    os.makedirs(target_dir)
    reported_size, written_path = vocab_utils.check_vocab(
        source_path, target_dir)

    # The three special tokens are added in front of the original words and
    # the result is saved under the same base name inside the output dir.
    self.assertEqual(len(words) + 3, reported_size)
    self.assertEqual(os.path.join(target_dir, "vocab_file"), written_path)
    reloaded_words, _ = vocab_utils.load_vocab(written_path)
    special_tokens = [vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS]
    self.assertEqual(special_tokens + words, reloaded_words)
/parallax/parallax/examples/simple/README.md: -------------------------------------------------------------------------------- 1 | # Simple Example 2 | This is a basic distributed training example with Parallax. 3 | 4 | ## To Run 5 | Set your resource information in the `resource_info` file. 6 | 7 | Then execute: 8 | ```shell 9 | $ python simple_driver.py 10 | ``` 11 | 12 | The command assumes that the simple example codebase is distributed to, and reachable at the same absolute path on, each of the machines. 13 | -------------------------------------------------------------------------------- /parallax/parallax/examples/simple/resource_info: -------------------------------------------------------------------------------- 1 | 123.456.78.90:1,2,4,5 2 | -------------------------------------------------------------------------------- /parallax/parallax/examples/simple/simple_driver.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Seoul National University 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# ==============================================================================

import os
import numpy as np
import tensorflow as tf
import argparse

import parallax

parser = argparse.ArgumentParser()
parser.add_argument('-lr', "--learning_rate", type=float, default=0.01,
                    help='Learning rate')

args = parser.parse_args()

# Training inputs: 18 samples with two features each.
train_x = np.array([
    [-0.880728357, -0.706550564],
    [-0.179175969, 0.052373456],
    [0.460992645, 0.328267666],
    [-0.378916048, 0.86581809],
    [-0.064562793, -0.755948805],
    [-0.585833517, -0.46743004],
    [-0.151177544, -0.582325109],
    [-0.720116833, 0.834904979],
    [-0.518939078, -0.670627318],
    [-0.035878422, 0.750102543],
    [-0.673400627, -0.919498322],
    [-0.731202767, -0.159733489],
    [-0.463404605, 0.697764632],
    [0.706744043, 0.458026442],
    [0.819940015, -0.867168658],
    [-0.056113501, -0.602024627],
    [0.213450484, -0.20133007],
    [-0.358544296, -0.40380244]
])

# Training targets: one value per sample, in the same order as train_x.
train_y = np.array([
    [2.306799664],
    [1.825970013],
    [1.901374447],
    [0.909895597],
    [2.723102683],
    [2.145410027],
    [2.498034199],
    [0.844066487],
    [2.401599333],
    [1.274285598],
    [2.542184193],
    [1.81653423],
    [1.06511757],
    [1.891457798],
    [3.317388286],
    [2.579920223],
    [2.301286159],
    [2.197386858],
])

num_samples = train_x.shape[0]


def main(_):
    """Build a single-device linear-regression graph and train it via Parallax.

    The graph consumes one (x, y) sample; `parallax.parallel_run` receives
    that graph together with the path of the `resource_info` file that sits
    next to this script and returns the session used for training.
    """
    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        global_step = tf.train.get_or_create_global_step()
        # Explicit 1-tuples: `(2)` is just the integer 2, which only reads
        # like a tuple.
        x = tf.placeholder(tf.float32, shape=(2,))
        y = tf.placeholder(tf.float32, shape=(1,))

        w = tf.get_variable(name='w', shape=(2, 1))
        b = tf.get_variable(name='b', shape=(1,))

        # pred = x . w + b for a single sample; loss is half the squared error.
        pred = tf.nn.bias_add(tf.matmul(tf.expand_dims(x, axis=0), w), b)
        loss = tf.reduce_sum(tf.pow(pred - tf.expand_dims(y, axis=0), 2)) / 2

        optimizer = tf.train.GradientDescentOptimizer(args.learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)

    def run(sess, num_workers, worker_id, num_replicas_per_worker):
        """Run 1000 training steps, feeding one sample per replica per step.

        `num_workers` and `worker_id` are accepted but not used here.
        """
        # The fetch set never changes, so build it once outside the loop.
        fetches = {
            'global_step': global_step,
            'loss': loss,
            'train_op': train_op
        }
        cursor = 0
        for i in range(1000):
            # One sample index per replica, walking the data set cyclically.
            indices = [(cursor + j) % num_samples
                       for j in range(num_replicas_per_worker)]
            feed_dict = {
                x: [train_x[k] for k in indices],
                y: [train_y[k] for k in indices],
            }
            cursor += num_replicas_per_worker

            results = sess.run(fetches, feed_dict=feed_dict)

            if i % 5 == 0:
                # NOTE(review): fetched values are indexed with [0] -
                # presumably one entry per replica; confirm against the
                # Parallax session API.
                print("global step: %d, loss: %f"
                      % (results['global_step'][0], results['loss'][0]))

    resource_info = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'resource_info')
    sess, num_workers, worker_id, num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph, resource_info)
    run(sess, num_workers, worker_id, num_replicas_per_worker)


if __name__ == '__main__':
    tf.app.run()

# --------------------------------------------------------------------------------
# /parallax/parallax/examples/skip_thoughts/README.md:
-------------------------------------------------------------------------------- 1 | # Skip-Thought Vectors 2 | This example implements the model described in [Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf). 3 | The original code comes from [here](https://github.com/tensorflow/models/tree/master/research/skip_thoughts). 4 | We made only minimal changes to the original code: the import paths and the BUILD file. 5 | We added the `skip_distributed_driver.py` file and modified the `ops/input_ops.py` file (for data sharding) to run the example on Parallax. 6 | 7 | ## Dataset 8 | * Follow the instructions shown in [Prepare the Training Data](https://github.com/tensorflow/models/tree/master/research/skip_thoughts). 9 | 10 | ## To Run 11 | Set your resource information in the `resource_info` file. 12 | 13 | Then execute: 14 | ```shell 15 | $ python skip_distributed_driver.py --input_file_pattern ${DATA_DIR}/data/train-?????-of-00100 16 | ``` 17 | The command above runs a single Skip-Thought Vectors model on the multiple devices specified in `resource_info`. 18 | The command assumes that the data directory and the Skip-Thought Vectors codebase are distributed to, and reachable at the same absolute path on, each of the machines. 19 | 20 | A few more options are available for distributed runs. 
21 | 22 | | Parameter Name | Default | Description | 23 | | :------------------- |:-----------------------| :-----------| 24 | | --data_path | None | Where the training/test data is stored | 25 | | --input_file_pattern | "" | File pattern of training data | 26 | | --batch_size | 128 | Batch size | 27 | | --resource_info_file | `./resource_info` | File containing the cluster (resource) information | 28 | | --max_steps | 1000000 | Number of iterations to run for each worker | 29 | | --log_frequency | 100 | Number of steps between two consecutive log outputs | 30 | | --sync | True | Whether to synchronize learning or not | 31 | | --ckpt_dir | None | Directory to save checkpoints | 32 | | --save_ckpt_steps | None | Number of steps between two consecutive checkpoints | 33 | | --run_option | HYBRID | Communication mode: PS, MPI or HYBRID | 34 | 35 | 36 | You can customize the distributed run with the options above. For example, to fix the communication mode to MPI, pass the `--run_option` value as shown below. 37 | 38 | ```shell 39 | $ python skip_distributed_driver.py --input_file_pattern ${DATA_DIR}/data/train-?????-of-00100 --run_option MPI 40 | ``` 41 | -------------------------------------------------------------------------------- /parallax/parallax/examples/skip_thoughts/configuration.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Default configuration for model architecture and training.

Original source : https://github.com/tensorflow/models/tree/master/skip_thoughts

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


class _HParams(object):
  """Wrapper for configuration parameters."""
  pass


def model_config(input_file_pattern=None,
                 input_queue_capacity=640000,
                 num_input_reader_threads=1,
                 shuffle_input_data=True,
                 uniform_init_scale=0.1,
                 vocab_size=20000,
                 batch_size=128,
                 word_embedding_dim=620,
                 bidirectional_encoder=False,
                 encoder_dim=2400):
  """Creates a model configuration object.

  Args:
    input_file_pattern: File pattern of sharded TFRecord files containing
      tf.Example protobufs.
    input_queue_capacity: Number of examples to keep in the input queue.
    num_input_reader_threads: Number of threads for prefetching input
      tf.Examples.
    shuffle_input_data: Whether to shuffle the input data.
    uniform_init_scale: Scale of random uniform initializer.
    vocab_size: Number of unique words in the vocab.
    batch_size: Batch size (training and evaluation only).
    word_embedding_dim: Word embedding dimension.
    bidirectional_encoder: Whether to use a bidirectional or unidirectional
      encoder RNN.
    encoder_dim: Number of output dimensions of the sentence encoder.

  Returns:
    An object containing model configuration parameters.
  """
  # Each argument is stored unchanged as an attribute of the same name.
  config = _HParams()
  config.input_file_pattern = input_file_pattern
  config.input_queue_capacity = input_queue_capacity
  config.num_input_reader_threads = num_input_reader_threads
  config.shuffle_input_data = shuffle_input_data
  config.uniform_init_scale = uniform_init_scale
  config.vocab_size = vocab_size
  config.batch_size = batch_size
  config.word_embedding_dim = word_embedding_dim
  config.bidirectional_encoder = bidirectional_encoder
  config.encoder_dim = encoder_dim
  return config


def training_config(learning_rate=0.0008,
                    learning_rate_decay_factor=0.5,
                    learning_rate_decay_steps=400000,
                    number_of_steps=500000,
                    clip_gradient_norm=5.0,
                    save_model_secs=600,
                    save_summaries_secs=600):
  """Creates a training configuration object.

  Args:
    learning_rate: Initial learning rate.
    learning_rate_decay_factor: If > 0, the learning rate decay factor.
    learning_rate_decay_steps: The number of steps before the learning rate
      decays by learning_rate_decay_factor.
    number_of_steps: The total number of training steps to run. Passing None
      will cause the training script to run indefinitely.
    clip_gradient_norm: If not None, then clip gradients to this value.
    save_model_secs: How often (in seconds) to save model checkpoints.
    save_summaries_secs: How often (in seconds) to save model summaries.

  Returns:
    An object containing training configuration parameters.

  Raises:
    ValueError: If learning_rate_decay_factor is set and
      learning_rate_decay_steps is unset.
  """
  # A truthy decay factor is meaningless without a step count to apply it at.
  if learning_rate_decay_factor and not learning_rate_decay_steps:
    raise ValueError(
        "learning_rate_decay_factor requires learning_rate_decay_steps.")

  config = _HParams()
  config.learning_rate = learning_rate
  config.learning_rate_decay_factor = learning_rate_decay_factor
  config.learning_rate_decay_steps = learning_rate_decay_steps
  config.number_of_steps = number_of_steps
  config.clip_gradient_norm = clip_gradient_norm
  config.save_model_secs = save_model_secs
  config.save_summaries_secs = save_summaries_secs
  return config

# --------------------------------------------------------------------------------
# /parallax/parallax/examples/skip_thoughts/data/special_words.py:
# --------------------------------------------------------------------------------
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Special word constants.

NOTE: The ids of the EOS and UNK constants should not be modified. It is assumed
that these always occupy the first two ids.
"""

# End of sentence.
# NOTE(review): the token text is empty in this listing; it looks as if
# angle-bracket markup was stripped when the listing was generated. Confirm
# the real token string against the repository before relying on this value.
EOS = ""
EOS_ID = 0

# Unknown.
26 | UNK = "" 27 | UNK_ID = 1 28 | -------------------------------------------------------------------------------- /parallax/parallax/examples/skip_thoughts/encoder_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Manager class for loading and encoding with multiple skip-thoughts models. 16 | 17 | If multiple models are loaded at once then the encode() function returns the 18 | concatenation of the outputs of each model. 
19 | 20 | Example usage: 21 | manager = EncoderManager() 22 | manager.load_model(model_config_1, vocabulary_file_1, embedding_matrix_file_1, 23 | checkpoint_path_1) 24 | manager.load_model(model_config_2, vocabulary_file_2, embedding_matrix_file_2, 25 | checkpoint_path_2) 26 | encodings = manager.encode(data) 27 | 28 | Original source : https://github.com/tensorflow/models/tree/master/skip_thoughts 29 | 30 | """ 31 | 32 | from __future__ import absolute_import 33 | from __future__ import division 34 | from __future__ import print_function 35 | 36 | import collections 37 | 38 | import numpy as np 39 | import tensorflow as tf 40 | 41 | import skip_thoughts_encoder 42 | 43 | 44 | class EncoderManager(object): 45 | """Manager class for loading and encoding with skip-thoughts models.""" 46 | 47 | def __init__(self): 48 | self.encoders = [] 49 | self.sessions = [] 50 | 51 | def load_model(self, model_config, vocabulary_file, embedding_matrix_file, 52 | checkpoint_path): 53 | """Loads a skip-thoughts model. 54 | 55 | Args: 56 | model_config: Object containing parameters for building the model. 57 | vocabulary_file: Path to vocabulary file containing a list of newline- 58 | separated words where the word id is the corresponding 0-based index in 59 | the file. 60 | embedding_matrix_file: Path to a serialized numpy array of shape 61 | [vocab_size, embedding_dim]. 62 | checkpoint_path: SkipThoughtsModel checkpoint file or a directory 63 | containing a checkpoint file. 64 | """ 65 | tf.logging.info("Reading vocabulary from %s", vocabulary_file) 66 | with tf.gfile.GFile(vocabulary_file, mode="r") as f: 67 | lines = list(f.readlines()) 68 | reverse_vocab = [line.decode("utf-8").strip() for line in lines] 69 | tf.logging.info("Loaded vocabulary with %d words.", len(reverse_vocab)) 70 | 71 | tf.logging.info("Loading embedding matrix from %s", 72 | embedding_matrix_file) 73 | # Note: tf.gfile.GFile doesn't work here because np.load() calls f.seek() 74 | # with 3 arguments. 
75 | with open(embedding_matrix_file, "r") as f: 76 | embedding_matrix = np.load(f) 77 | tf.logging.info("Loaded embedding matrix with shape %s", 78 | embedding_matrix.shape) 79 | 80 | word_embeddings = collections.OrderedDict( 81 | zip(reverse_vocab, embedding_matrix)) 82 | 83 | g = tf.Graph() 84 | with g.as_default(): 85 | encoder = skip_thoughts_encoder.SkipThoughtsEncoder(word_embeddings) 86 | restore_model = encoder.build_graph_from_config(model_config, 87 | checkpoint_path) 88 | 89 | sess = tf.Session(graph=g) 90 | restore_model(sess) 91 | 92 | self.encoders.append(encoder) 93 | self.sessions.append(sess) 94 | 95 | def encode(self, 96 | data, 97 | use_norm=True, 98 | verbose=False, 99 | batch_size=128, 100 | use_eos=False): 101 | """Encodes a sequence of sentences as skip-thought vectors. 102 | 103 | Args: 104 | data: A list of input strings. 105 | use_norm: If True, normalize output skip-thought vectors to unit 106 | L2 norm. 107 | verbose: Whether to log every batch. 108 | batch_size: Batch size for the RNN encoders. 109 | use_eos: If True, append the end-of-sentence word to each input 110 | sentence. 111 | 112 | Returns: 113 | thought_vectors: A list of numpy arrays corresponding to 'data'. 114 | 115 | Raises: 116 | ValueError: If called before calling load_encoder. 
117 | """ 118 | if not self.encoders: 119 | raise ValueError( 120 | "Must call load_model at least once before calling encode.") 121 | 122 | encoded = [] 123 | for encoder, sess in zip(self.encoders, self.sessions): 124 | encoded.append( 125 | np.array( 126 | encoder.encode( 127 | sess, 128 | data, 129 | use_norm=use_norm, 130 | verbose=verbose, 131 | batch_size=batch_size, 132 | use_eos=use_eos))) 133 | 134 | return np.concatenate(encoded, axis=1) 135 | 136 | def close(self): 137 | """Closes the active TensorFlow Sessions.""" 138 | for sess in self.sessions: 139 | sess.close() 140 | -------------------------------------------------------------------------------- /parallax/parallax/examples/skip_thoughts/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Script to evaluate a skip-thoughts model. 16 | 17 | This script can evaluate a model with a unidirectional encoder ("uni-skip" in 18 | the paper); or a model with a bidirectional encoder ("bi-skip"); or the 19 | combination of a model with a unidirectional encoder and a model with a 20 | bidirectional encoder ("combine-skip"). 
21 | 22 | The uni-skip model (if it exists) is specified by the flags 23 | --uni_vocab_file, --uni_embeddings_file, --uni_checkpoint_path. 24 | 25 | The bi-skip model (if it exists) is specified by the flags 26 | --bi_vocab_file, --bi_embeddings_path, --bi_checkpoint_path. 27 | 28 | The evaluation tasks have different running times. SICK may take 5-10 minutes. 29 | MSRP, TREC and CR may take 20-60 minutes. SUBJ, MPQA and MR may take 2+ hours. 30 | 31 | Original source : https://github.com/tensorflow/models/tree/master/skip_thoughts 32 | 33 | """ 34 | 35 | from __future__ import absolute_import 36 | from __future__ import division 37 | from __future__ import print_function 38 | 39 | import tensorflow as tf 40 | 41 | from skipthoughts import eval_classification 42 | from skipthoughts import eval_msrp 43 | from skipthoughts import eval_sick 44 | from skipthoughts import eval_trec 45 | from skip_thoughts import configuration 46 | from skip_thoughts import encoder_manager 47 | 48 | FLAGS = tf.flags.FLAGS 49 | 50 | tf.flags.DEFINE_string("eval_task", "CR", 51 | "Name of the evaluation task to run. 
Available tasks: " 52 | "MR, CR, SUBJ, MPQA, SICK, MSRP, TREC.") 53 | 54 | tf.flags.DEFINE_string("data_dir", None, "Directory containing training data.") 55 | 56 | tf.flags.DEFINE_string("uni_vocab_file", None, 57 | "Path to vocabulary file containing a list of newline-" 58 | "separated words where the word id is the " 59 | "corresponding 0-based index in the file.") 60 | tf.flags.DEFINE_string("bi_vocab_file", None, 61 | "Path to vocabulary file containing a list of newline-" 62 | "separated words where the word id is the " 63 | "corresponding 0-based index in the file.") 64 | 65 | tf.flags.DEFINE_string("uni_embeddings_file", None, 66 | "Path to serialized numpy array of shape " 67 | "[vocab_size, embedding_dim].") 68 | tf.flags.DEFINE_string("bi_embeddings_file", None, 69 | "Path to serialized numpy array of shape " 70 | "[vocab_size, embedding_dim].") 71 | 72 | tf.flags.DEFINE_string("uni_checkpoint_path", None, 73 | "Checkpoint file or directory containing a checkpoint " 74 | "file.") 75 | tf.flags.DEFINE_string("bi_checkpoint_path", None, 76 | "Checkpoint file or directory containing a checkpoint " 77 | "file.") 78 | 79 | tf.logging.set_verbosity(tf.logging.INFO) 80 | 81 | 82 | def main(unused_argv): 83 | if not FLAGS.data_dir: 84 | raise ValueError("--data_dir is required.") 85 | 86 | encoder = encoder_manager.EncoderManager() 87 | 88 | # Maybe load unidirectional encoder. 89 | if FLAGS.uni_checkpoint_path: 90 | print("Loading unidirectional model...") 91 | uni_config = configuration.model_config() 92 | encoder.load_model(uni_config, FLAGS.uni_vocab_file, 93 | FLAGS.uni_embeddings_file, FLAGS.uni_checkpoint_path) 94 | 95 | # Maybe load bidirectional encoder. 
96 | if FLAGS.bi_checkpoint_path: 97 | print("Loading bidirectional model...") 98 | bi_config = configuration.model_config(bidirectional_encoder=True) 99 | encoder.load_model(bi_config, FLAGS.bi_vocab_file, 100 | FLAGS.bi_embeddings_file, 101 | FLAGS.bi_checkpoint_path) 102 | 103 | if FLAGS.eval_task in ["MR", "CR", "SUBJ", "MPQA"]: 104 | eval_classification.eval_nested_kfold( 105 | encoder, FLAGS.eval_task, FLAGS.data_dir, use_nb=False) 106 | elif FLAGS.eval_task == "SICK": 107 | eval_sick.evaluate(encoder, evaltest=True, loc=FLAGS.data_dir) 108 | elif FLAGS.eval_task == "MSRP": 109 | eval_msrp.evaluate( 110 | encoder, evalcv=True, evaltest=True, use_feats=True, 111 | loc=FLAGS.data_dir) 112 | elif FLAGS.eval_task == "TREC": 113 | eval_trec.evaluate(encoder, evalcv=True, evaltest=True, 114 | loc=FLAGS.data_dir) 115 | else: 116 | raise ValueError("Unrecognized eval_task: %s" % FLAGS.eval_task) 117 | 118 | encoder.close() 119 | 120 | 121 | if __name__ == "__main__": 122 | tf.app.run() 123 | -------------------------------------------------------------------------------- /parallax/parallax/examples/skip_thoughts/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/skip_thoughts/ops/__init__.py -------------------------------------------------------------------------------- /parallax/parallax/examples/skip_thoughts/ops/gru_cell.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""GRU cell implementation for the skip-thought vectors model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import tensorflow as tf

_layer_norm = tf.contrib.layers.layer_norm


class LayerNormGRUCell(tf.contrib.rnn.RNNCell):
  """GRU cell with layer normalization.

  The layer normalization implementation is based on:

    https://arxiv.org/abs/1607.06450.

  "Layer Normalization"
  Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
  """

  def __init__(self,
               num_units,
               w_initializer,
               u_initializer,
               b_initializer,
               activation=tf.nn.tanh):
    """Initializes the cell.

    Args:
      num_units: Number of cell units.
      w_initializer: Initializer for the "W" (input) parameter matrices.
      u_initializer: Initializer for the "U" (recurrent) parameter matrices.
      b_initializer: Initializer for the "b" (bias) parameter vectors.
      activation: Cell activation function.
    """
    self._num_units = num_units
    self._w_initializer = w_initializer
    self._u_initializer = u_initializer
    self._b_initializer = b_initializer
    self._activation = activation

  @property
  def state_size(self):
    return self._num_units

  @property
  def output_size(self):
    return self._num_units

  def _w_h_initializer(self):
    """Returns an initializer for the "W_h" parameter matrix.

    See equation (23) in the paper. The "W_h" parameter matrix is the
    concatenation of two parameter submatrices. The matrix returned is
    [U_z, U_r].

    Returns:
      An initializer callable that, for shape [num_units, 2 * num_units],
      produces the concatenated Tensor described above.
    """

    def _initializer(shape, dtype=tf.float32, partition_info=None):
      num_units = self._num_units
      assert shape == [num_units, 2 * num_units]
      # Two independent draws from the "U" initializer, joined column-wise.
      u_z = self._u_initializer([num_units, num_units], dtype, partition_info)
      u_r = self._u_initializer([num_units, num_units], dtype, partition_info)
      return tf.concat([u_z, u_r], 1)

    return _initializer

  def _w_x_initializer(self, input_dim):
    """Returns an initializer for the "W_x" parameter matrix.

    See equation (23) in the paper. The "W_x" parameter matrix is the
    concatenation of two parameter submatrices. The matrix returned is
    [W_z, W_r].

    Args:
      input_dim: The dimension of the cell inputs.

    Returns:
      An initializer callable that, for shape [input_dim, 2 * num_units],
      produces the concatenated Tensor described above.
    """

    def _initializer(shape, dtype=tf.float32, partition_info=None):
      num_units = self._num_units
      assert shape == [input_dim, 2 * num_units]
      # Two independent draws from the "W" initializer, joined column-wise.
      w_z = self._w_initializer([input_dim, num_units], dtype, partition_info)
      w_r = self._w_initializer([input_dim, num_units], dtype, partition_info)
      return tf.concat([w_z, w_r], 1)

    return _initializer

  def __call__(self, inputs, state, scope=None):
    """GRU cell with layer normalization.

    Args:
      inputs: Cell input; its second dimension is read as the input size.
      state: Previous cell state.
      scope: Optional variable scope name; defaults to "gru_cell".

    Returns:
      A pair (new_h, new_h): the new state doubles as the cell output.
    """
    input_dim = inputs.get_shape().as_list()[1]
    num_units = self._num_units

    with tf.variable_scope(scope or "gru_cell"):
      with tf.variable_scope("gates"):
        w_h = tf.get_variable(
            "w_h", [num_units, 2 * num_units],
            initializer=self._w_h_initializer())
        w_x = tf.get_variable(
            "w_x", [input_dim, 2 * num_units],
            initializer=self._w_x_initializer(input_dim))
        # Both gates are computed in one matmul each and split afterwards.
        z_and_r = (_layer_norm(tf.matmul(state, w_h), scope="layer_norm/w_h") +
                   _layer_norm(tf.matmul(inputs, w_x), scope="layer_norm/w_x"))
        z, r = tf.split(tf.sigmoid(z_and_r), 2, 1)
      with tf.variable_scope("candidate"):
        w = tf.get_variable(
            "w", [input_dim, num_units], initializer=self._w_initializer)
        u = tf.get_variable(
            "u", [num_units, num_units], initializer=self._u_initializer)
        h_hat = (r * _layer_norm(tf.matmul(state, u), scope="layer_norm/u") +
                 _layer_norm(tf.matmul(inputs, w), scope="layer_norm/w"))
      new_h = (1 - z) * state + z * self._activation(h_hat)
    return new_h, new_h

# --------------------------------------------------------------------------------
# /parallax/parallax/examples/skip_thoughts/ops/input_ops.py:
# --------------------------------------------------------------------------------
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Input ops."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections


import tensorflow as tf
from parallax import shard

# A SentenceBatch is a pair of Tensors:
#  ids: Batch of input sentences represented as sequences of word ids: an int64
#    Tensor with shape [batch_size, padded_length].
#  mask: Boolean mask distinguishing real words (1) from padded words (0): an
#    int32 Tensor with shape [batch_size, padded_length].
SentenceBatch = collections.namedtuple("SentenceBatch", ("ids", "mask"))


def parse_example_batch(serialized):
  """Parses a batch of tf.Example protos.

  Args:
    serialized: A 1-D string Tensor; a batch of serialized tf.Example protos.
  Returns:
    encode: A SentenceBatch of encode sentences.
    decode_pre: A SentenceBatch of "previous" sentences to decode.
    decode_post: A SentenceBatch of "post" sentences to decode.
  """
  features = tf.parse_example(
      serialized,
      features={
          "encode": tf.VarLenFeature(dtype=tf.int64),
          "decode_pre": tf.VarLenFeature(dtype=tf.int64),
          "decode_post": tf.VarLenFeature(dtype=tf.int64),
      })

  def _sparse_to_batch(sparse):
    ids = tf.sparse_tensor_to_dense(sparse)  # Padding with zeroes.
    # The mask is 1 at every position that holds a value in the sparse input.
    mask = tf.sparse_to_dense(sparse.indices, sparse.dense_shape,
                              tf.ones_like(sparse.values, dtype=tf.int32))
    return SentenceBatch(ids=ids, mask=mask)

  output_names = ("encode", "decode_pre", "decode_post")
  return tuple(_sparse_to_batch(features[x]) for x in output_names)


def prefetch_input_data(reader,
                        file_pattern,
                        shuffle,
                        capacity,
                        num_reader_threads=1):
  """Prefetches string values from disk into an input queue.

  The matching files are sorted and split into contiguous slices, one per
  shard; only the slice selected by the shard id is read.

  Args:
    reader: Instance of tf.ReaderBase.
    file_pattern: Comma-separated list of file patterns (e.g.
        "/tmp/train_data-?????-of-00100", where '?' acts as a wildcard that
        matches any character).
    shuffle: Boolean; whether to randomly shuffle the input data.
    capacity: Queue capacity (number of records).
    num_reader_threads: Number of reader threads feeding into the queue.

  Returns:
    A Queue containing prefetched string values.
  """
  data_files = []
  for pattern in file_pattern.split(","):
    data_files.extend(tf.gfile.Glob(pattern))
  if not data_files:
    tf.logging.fatal("Found no input files matching %s", file_pattern)
  else:
    tf.logging.info("Prefetching values from %d files matching %s",
                    len(data_files), file_pattern)
  # Sort so that the file order used for slicing below is deterministic.
  data_files.sort()
  num_files = len(data_files)
  # NOTE(review): num_shards and shard_id come from parallax.shard; they are
  # used as Tensors below (tf.less / tf.cond) - confirm their dtypes there.
  num_shards, shard_id = shard.create_num_shards_and_shard_id()
  # True division (see the __future__ import) followed by an integer cast.
  shard_size = num_files / num_shards
  shard_size = tf.cast(shard_size, dtype=tf.int64)
  remainder = num_files % num_shards

  # The first `remainder` shards take one extra file each. At
  # shard_id == remainder both branches give the same offset, so comparing
  # against remainder + 1 is equivalent to comparing against remainder.
  slice_begin = tf.cond(tf.less(shard_id, remainder + 1),
                        lambda: (shard_size + 1) * shard_id,
                        lambda: shard_size * shard_id + remainder)
  slice_size = tf.cond(tf.less(shard_id, remainder), lambda: shard_size + 1,
                       lambda: shard_size)
  data_files = tf.slice(data_files, [slice_begin], [slice_size])
  filename_queue = tf.train.string_input_producer(
      data_files, shuffle=shuffle, capacity=16, name="filename_queue")

  if shuffle:
    min_after_dequeue = int(0.6 * capacity)
    values_queue = tf.RandomShuffleQueue(
        capacity=capacity,
        min_after_dequeue=min_after_dequeue,
        dtypes=[tf.string],
        shapes=[[]],
        name="random_input_queue")
  else:
    values_queue = tf.FIFOQueue(
        capacity=capacity,
        dtypes=[tf.string],
        shapes=[[]],
        name="fifo_input_queue")

  # One read-and-enqueue op per reader thread.
  enqueue_ops = []
  for _ in range(num_reader_threads):
    _, value = reader.read(filename_queue)
    enqueue_ops.append(values_queue.enqueue([value]))
  tf.train.queue_runner.add_queue_runner(
      tf.train.queue_runner.QueueRunner(values_queue, enqueue_ops))
  tf.summary.scalar("queue/%s/fraction_of_%d_full" % (values_queue.name,
                                                      capacity),
                    tf.cast(values_queue.size(), tf.float32) * (1.0 / capacity))

  return values_queue
-------------------------------------------------------------------------------- /parallax/parallax/examples/skip_thoughts/parallax_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Seoul National University 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import tensorflow as tf 17 | import parallax 18 | 19 | 20 | flags = tf.app.flags 21 | flags.DEFINE_boolean('replicate_variables', True, """replicate_variables""") 22 | flags.DEFINE_string('protocol', 'grpc', """The method for managing variables""") 23 | flags.DEFINE_string('mpirun_options', '', 'The option for mpirun') 24 | flags.DEFINE_string('run_option', 'HYBRID', 25 | 'The run option whether PS, MPI or HYBRID') 26 | flags.DEFINE_string('redirect_path', None, """redirect path to keep the log of distributed workers""") 27 | flags.DEFINE_string('ckpt_dir', None, """Directory to save checkpoints""") 28 | flags.DEFINE_integer('save_ckpt_steps', None, 29 | """Number of steps between two consecutive checkpoints""") 30 | flags.DEFINE_string('profile_dir', None, """Directory to save RunMetadata""") 31 | flags.DEFINE_string('profile_steps', None, """Comma separated porfile steps""") 32 | flags.DEFINE_boolean('local_aggregation', True, 33 | """Whether to use local aggregation or not""") 34 | 
flags.DEFINE_boolean('boundary_among_servers', True, 35 | """Whether to use operation placement among servers""") 36 | flags.DEFINE_boolean('boundary_between_workers_and_servers', True, 37 | """Whether to use operation placement between workers and servers""") 38 | flags.DEFINE_string('export_graph_path', None, """export path to keep transformed graph definintion""") 39 | FLAGS = flags.FLAGS 40 | 41 | def build_config(): 42 | 43 | ckpt_config = parallax.CheckPointConfig(ckpt_dir=FLAGS.ckpt_dir, 44 | save_ckpt_steps=FLAGS.save_ckpt_steps) 45 | ps_config = parallax.PSConfig(replicate_variables=FLAGS.replicate_variables, 46 | protocol=FLAGS.protocol, 47 | local_aggregation=FLAGS.local_aggregation, 48 | boundary_among_servers=FLAGS.boundary_among_servers, 49 | boundary_between_workers_and_servers=\ 50 | FLAGS.boundary_between_workers_and_servers) 51 | mpi_config = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options) 52 | parallax_config = parallax.Config() 53 | parallax_config.run_option = FLAGS.run_option 54 | parallax_config.average_sparse = False 55 | parallax_config.communication_config = parallax.CommunicationConfig(ps_config, mpi_config) 56 | parallax_config.ckpt_config=ckpt_config 57 | def get_profile_steps(): 58 | if not FLAGS.profile_steps: 59 | return [] 60 | FLAGS.profile_steps = FLAGS.profile_steps.strip() 61 | return [int(step) for step in FLAGS.profile_steps.split(',')] 62 | profile_config = parallax.ProfileConfig(profile_dir=FLAGS.profile_dir, 63 | profile_steps=get_profile_steps()) 64 | parallax_config.profile_config = profile_config 65 | parallax_config.redirect_path = FLAGS.redirect_path 66 | parallax_config.export_graph_path = FLAGS.export_graph_path 67 | 68 | return parallax_config 69 | -------------------------------------------------------------------------------- /parallax/parallax/examples/skip_thoughts/resource_info: -------------------------------------------------------------------------------- 1 | 123.456.78.90:0 2 | 
-------------------------------------------------------------------------------- /parallax/parallax/examples/skip_thoughts/skip_distributed_driver.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Seoul National University 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import sys 17 | import os 18 | import json 19 | import time 20 | 21 | import tensorflow as tf 22 | from tensorflow.core.protobuf import config_pb2 23 | import parallax 24 | import parallax_config 25 | 26 | import configuration 27 | import skip_thoughts_model 28 | 29 | FLAGS = tf.app.flags.FLAGS 30 | 31 | tf.app.flags.DEFINE_string('data_path', None, 32 | """Where to training/test data is stored.""") 33 | tf.app.flags.DEFINE_string('input_file_pattern', '', 34 | """File pattern of train data""") 35 | tf.app.flags.DEFINE_integer('batch_size', 128, 36 | """Batch_size""") 37 | tf.app.flags.DEFINE_string('resource_info_file', 38 | os.path.abspath( 39 | os.path.join(os.path.dirname(__file__), '.', 40 | 'resource_info')), 41 | 'Filename containing cluster information') 42 | tf.app.flags.DEFINE_integer('max_steps', 1000000, 43 | """Number of iterations to run for each workers.""") 44 | tf.app.flags.DEFINE_integer('log_frequency', 100, 45 | """How many steps between two runop logs.""") 46 | 
tf.app.flags.DEFINE_boolean('sync', True, '') 47 | 48 | def main(_): 49 | single_gpu_graph = tf.Graph() 50 | with single_gpu_graph.as_default(): 51 | model_config = configuration.model_config( 52 | input_file_pattern=FLAGS.input_file_pattern, 53 | batch_size=FLAGS.batch_size) 54 | training_config = configuration.training_config() 55 | model = skip_thoughts_model.SkipThoughtsModel(model_config, 56 | mode="train") 57 | model.build() 58 | 59 | # Setup learning rate 60 | if training_config.learning_rate_decay_factor > 0: 61 | learning_rate = tf.train.exponential_decay( 62 | learning_rate=float(training_config.learning_rate), 63 | global_step=model.global_step, 64 | decay_steps=training_config.learning_rate_decay_steps, 65 | decay_rate=training_config.learning_rate_decay_factor, 66 | staircase=False) 67 | else: 68 | learning_rate = tf.constant(training_config.learning_rate) 69 | 70 | optimizer = tf.train.AdamOptimizer(learning_rate) 71 | 72 | train_tensor = tf.contrib.slim.learning.create_train_op( 73 | total_loss=model.total_loss, 74 | optimizer=optimizer, 75 | global_step=model.global_step, 76 | clip_gradient_norm=training_config.clip_gradient_norm) 77 | 78 | def run(sess, num_workers, worker_id, num_replicas_per_worker): 79 | fetches = { 80 | 'global_step': 81 | model.global_step, 82 | 'cost': 83 | model.total_loss, 84 | 'train_op': 85 | train_tensor, 86 | } 87 | 88 | start = time.time() 89 | for i in range(FLAGS.max_steps): 90 | results = sess.run(fetches) 91 | if i % FLAGS.log_frequency == 0: 92 | end = time.time() 93 | throughput = float(FLAGS.log_frequency) / float(end - start) 94 | parallax.log.info( 95 | "global step: %d, loss: %f, throughput: %f steps/sec" 96 | % (results['global_step'][0], results['cost'][0], throughput)) 97 | start = time.time() 98 | 99 | sess, num_workers, worker_id, num_replicas_per_worker = \ 100 | parallax.parallel_run(single_gpu_graph, 101 | FLAGS.resource_info_file, 102 | sync=FLAGS.sync, 103 | 
parallax_config=parallax_config.build_config()) 104 | run(sess, num_workers, worker_id, num_replicas_per_worker) 105 | 106 | if __name__ == "__main__": 107 | tf.logging.set_verbosity(tf.logging.INFO) 108 | tf.app.run() 109 | -------------------------------------------------------------------------------- /parallax/parallax/examples/skip_thoughts/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Train the skip-thoughts model.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | from skip_thoughts import configuration 24 | from skip_thoughts import skip_thoughts_model 25 | 26 | FLAGS = tf.flags.FLAGS 27 | 28 | tf.flags.DEFINE_string("input_file_pattern", None, 29 | "File pattern of sharded TFRecord files containing " 30 | "tf.Example protos.") 31 | tf.flags.DEFINE_string("train_dir", None, 32 | "Directory for saving and loading checkpoints.") 33 | 34 | tf.logging.set_verbosity(tf.logging.INFO) 35 | 36 | 37 | def _setup_learning_rate(config, global_step): 38 | """Sets up the learning rate with optional exponential decay. 
39 | 40 | Args: 41 | config: Object containing learning rate configuration parameters. 42 | global_step: Tensor; the global step. 43 | 44 | Returns: 45 | learning_rate: Tensor; the learning rate with exponential decay. 46 | """ 47 | if config.learning_rate_decay_factor > 0: 48 | learning_rate = tf.train.exponential_decay( 49 | learning_rate=float(config.learning_rate), 50 | global_step=global_step, 51 | decay_steps=config.learning_rate_decay_steps, 52 | decay_rate=config.learning_rate_decay_factor, 53 | staircase=False) 54 | else: 55 | learning_rate = tf.constant(config.learning_rate) 56 | return learning_rate 57 | 58 | 59 | def main(unused_argv): 60 | if not FLAGS.input_file_pattern: 61 | raise ValueError("--input_file_pattern is required.") 62 | if not FLAGS.train_dir: 63 | raise ValueError("--train_dir is required.") 64 | 65 | model_config = configuration.model_config( 66 | input_file_pattern=FLAGS.input_file_pattern) 67 | training_config = configuration.training_config() 68 | 69 | tf.logging.info("Building training graph.") 70 | g = tf.Graph() 71 | with g.as_default(): 72 | model = skip_thoughts_model.SkipThoughtsModel(model_config, 73 | mode="train") 74 | model.build() 75 | 76 | learning_rate = _setup_learning_rate(training_config, model.global_step) 77 | optimizer = tf.train.AdamOptimizer(learning_rate) 78 | 79 | train_tensor = tf.contrib.slim.learning.create_train_op( 80 | total_loss=model.total_loss, 81 | optimizer=optimizer, 82 | global_step=model.global_step, 83 | clip_gradient_norm=training_config.clip_gradient_norm) 84 | 85 | saver = tf.train.Saver() 86 | 87 | tf.contrib.slim.learning.train( 88 | train_op=train_tensor, 89 | logdir=FLAGS.train_dir, 90 | graph=g, 91 | global_step=model.global_step, 92 | number_of_steps=training_config.number_of_steps, 93 | save_summaries_secs=training_config.save_summaries_secs, 94 | saver=saver, 95 | save_interval_secs=training_config.save_model_secs) 96 | 97 | 98 | if __name__ == "__main__": 99 | tf.app.run() 100 | 
-------------------------------------------------------------------------------- /parallax/parallax/examples/tf_cnn_benchmarks/CNNBenchmark_distributed_driver.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Seoul National University 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import argparse 17 | import sys 18 | import os 19 | import json 20 | import time 21 | 22 | from absl import flags 23 | import tensorflow as tf 24 | 25 | import benchmark_cnn 26 | import cnn_util 27 | import parallax_config 28 | from cnn_util import log_fn 29 | from tensorflow.core.protobuf import config_pb2 30 | 31 | import parallax 32 | 33 | benchmark_cnn.define_flags() 34 | flags.adopt_module_key_flags(benchmark_cnn) 35 | 36 | FLAGS = tf.app.flags.FLAGS 37 | 38 | tf.app.flags.DEFINE_string('resource_info_file', 39 | os.path.abspath(os.path.join( 40 | os.path.dirname(__file__), 41 | '.', 42 | 'resource_info')), 43 | 'Filename containing cluster information') 44 | tf.app.flags.DEFINE_integer('max_steps', 1000000, 45 | """Number of iterations to run for each workers.""") 46 | tf.app.flags.DEFINE_integer('log_frequency', 100, 47 | """How many steps between two runop logs.""") 48 | tf.app.flags.DEFINE_boolean('sync', True, '') 49 | 50 | def main(_): 51 | # Build benchmark_cnn model 52 | params = 
benchmark_cnn.make_params_from_flags() 53 | params, sess_config = benchmark_cnn.setup(params) 54 | bench = benchmark_cnn.BenchmarkCNN(params) 55 | 56 | # Print informaton 57 | tfversion = cnn_util.tensorflow_version_tuple() 58 | log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1])) 59 | bench.print_info() 60 | 61 | # Build single-GPU benchmark_cnn model 62 | single_gpu_graph = tf.Graph() 63 | with single_gpu_graph.as_default(): 64 | bench.build_model() 65 | 66 | config = parallax_config.build_config() 67 | config.sess_config = sess_config 68 | 69 | sess, num_workers, worker_id, num_replicas_per_worker = \ 70 | parallax.parallel_run(single_gpu_graph, 71 | FLAGS.resource_info_file, 72 | sync=FLAGS.sync, 73 | parallax_config=config) 74 | 75 | 76 | fetches = { 77 | 'global_step': bench.global_step, 78 | 'cost': bench.cost, 79 | 'train_op': bench.train_op, 80 | } 81 | 82 | start = time.time() 83 | for i in range(FLAGS.max_steps): 84 | results = sess.run(fetches) 85 | if (i + 1) % FLAGS.log_frequency == 0: 86 | end = time.time() 87 | throughput = float(FLAGS.log_frequency) / float(end - start) 88 | parallax.log.info( 89 | "global step: %d, loss: %f, throughput: %f steps/sec" 90 | % (results['global_step'][0]+1, results['cost'][0], throughput)) 91 | start = time.time() 92 | 93 | if __name__ == '__main__': 94 | tf.app.run() 95 | -------------------------------------------------------------------------------- /parallax/parallax/examples/tf_cnn_benchmarks/CNNBenchmark_eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Seoul National University 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from absl import flags 17 | import tensorflow as tf 18 | 19 | import benchmark_cnn 20 | 21 | benchmark_cnn.define_flags() 22 | flags.adopt_module_key_flags(benchmark_cnn) 23 | 24 | FLAGS = tf.app.flags.FLAGS 25 | 26 | def main(_): 27 | FLAGS.eval = True 28 | params = benchmark_cnn.make_params_from_flags() 29 | params, config = benchmark_cnn.setup(params) 30 | bench = benchmark_cnn.BenchmarkCNN(params) 31 | bench.evaluate() 32 | 33 | if __name__ == '__main__': 34 | tf.app.run() 35 | -------------------------------------------------------------------------------- /parallax/parallax/examples/tf_cnn_benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow CNN Benchmarks 2 | The original code of this example comes from [tf_cnn_benchmarks](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks). 3 | We modified this code to build a computation graph for a single-gpu environment instead of a multi-GPU and multi-machine environment(We removed the unnecessary communication-related files like `varialble_mgr.py`, `variable_mgr_util.py`). 4 | We added `CNNBenchmark_distributed_driver.py` for training and `CNNBenchmark_eval.py` for evaluation. 5 | 6 | ## Dataset 7 | * Synthetic data or imagenet data can be used. To use imagenet data follow these [instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started). 
8 | 9 | ## Training 10 | Set your resource information in the `resource_info` file. 11 | 12 | Then, execute: 13 | ```shell 14 | $ python CNNBenchmark_distributed_driver.py --model={model} --data_name={data_name} --data_dir={data_dir} 15 | ``` 16 | 17 | The command above runs a single CNN model on the multiple devices specified in `resource_info`. 18 | The command assumes that the data directory and the TensorFlow CNN benchmark codebase are available at the same absolute path on each of the machines. 19 | 20 | There are also a few more options you can choose for distributed running. 21 | 22 | | Parameter Name | Default | Description | 23 | | :------------------- |:-----------------------| :-----------| 24 | | --resource_info_file | `./resource_info` | Name of the file containing the cluster information | 25 | | --max_steps | 1000000 | Number of iterations to run for each worker | 26 | | --log_frequency | 100 | Number of steps between two consecutive run-op logs | 27 | | --sync | True | Whether to synchronize learning or not | 28 | | --ckpt_dir | None | Directory to save checkpoints | 29 | | --save_ckpt_steps | 0 | Number of steps between two consecutive checkpoints | 30 | | --run_option | None | The run option, either PS or MPI; None utilizes both | 31 | 32 | You can customize distributed running with the options above. For example, if you want to fix the communication mode to MPI, you can add the `run_option` value as shown below. 
33 | 34 | ```shell 35 | $ python CNNBenchmark_distributed_driver.py --model={model} --data_name={data_name} --data_dir={data_dir} --run_option=MPI 36 | ``` 37 | 38 | ## Evaluation 39 | Execute: 40 | ```shell 41 | $ python CNNBenchmark_eval.py --eval=True --model={model} --data_name={data_name} --data_dir={data_dir} --checkpoint_dir={checkpoint_dir} 42 | ``` 43 | -------------------------------------------------------------------------------- /parallax/parallax/examples/tf_cnn_benchmarks/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/tf_cnn_benchmarks/models/__init__.py -------------------------------------------------------------------------------- /parallax/parallax/examples/tf_cnn_benchmarks/models/alexnet_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Alexnet model configuration. 17 | 18 | References: 19 | Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton 20 | ImageNet Classification with Deep Convolutional Neural Networks 21 | Advances in Neural Information Processing Systems. 
2012 22 | """ 23 | 24 | import tensorflow as tf 25 | 26 | from models import model 27 | 28 | 29 | class AlexnetModel(model.Model): 30 | """Alexnet cnn model.""" 31 | 32 | def __init__(self): 33 | super(AlexnetModel, self).__init__('alexnet', 224 + 3, 512, 0.005) 34 | 35 | def add_inference(self, cnn): 36 | # Note: VALID requires padding the images by 3 in width and height 37 | cnn.conv(64, 11, 11, 4, 4, 'VALID') 38 | cnn.mpool(3, 3, 2, 2) 39 | cnn.conv(192, 5, 5) 40 | cnn.mpool(3, 3, 2, 2) 41 | cnn.conv(384, 3, 3) 42 | cnn.conv(384, 3, 3) 43 | cnn.conv(256, 3, 3) 44 | cnn.mpool(3, 3, 2, 2) 45 | cnn.reshape([-1, 256 * 6 * 6]) 46 | cnn.affine(4096) 47 | cnn.dropout() 48 | cnn.affine(4096) 49 | cnn.dropout() 50 | 51 | 52 | class AlexnetCifar10Model(model.Model): 53 | """Alexnet cnn model for cifar datasets. 54 | 55 | The model architecture follows the one defined in the tensorflow tutorial 56 | model. 57 | 58 | Reference model: tensorflow/models/tutorials/image/cifar10/cifar10.py 59 | Paper: http://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf 60 | """ 61 | 62 | def __init__(self): 63 | super(AlexnetCifar10Model, self).__init__('alexnet', 32, 128, 0.1) 64 | 65 | def add_inference(self, cnn): 66 | cnn.conv(64, 5, 5, 1, 1, 'SAME', stddev=5e-2) 67 | cnn.mpool(3, 3, 2, 2, mode='SAME') 68 | cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) 69 | cnn.conv(64, 5, 5, 1, 1, 'SAME', bias=0.1, stddev=5e-2) 70 | cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) 71 | cnn.mpool(3, 3, 2, 2, mode='SAME') 72 | shape = cnn.top_layer.get_shape().as_list() 73 | flat_dim = shape[1] * shape[2] * shape[3] 74 | cnn.reshape([-1, flat_dim]) 75 | cnn.affine(384, stddev=0.04, bias=0.1) 76 | cnn.affine(192, stddev=0.04, bias=0.1) 77 | 78 | def get_learning_rate(self, global_step, batch_size): 79 | num_examples_per_epoch = 50000 80 | num_epochs_per_decay = 100 81 | decay_steps = int(num_epochs_per_decay * num_examples_per_epoch / 82 | batch_size) 83 | 
decay_factor = 0.1 84 | return tf.train.exponential_decay( 85 | self.learning_rate, global_step, decay_steps, decay_factor, 86 | staircase=True) 87 | -------------------------------------------------------------------------------- /parallax/parallax/examples/tf_cnn_benchmarks/models/densenet_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Densenet model configuration. 
17 | 18 | References: 19 | "Densely Connected Convolutional Networks": https://arxiv.org/pdf/1608.06993 20 | """ 21 | import numpy as np 22 | from six.moves import xrange # pylint: disable=redefined-builtin 23 | import tensorflow as tf 24 | 25 | from models import model as model_lib 26 | 27 | 28 | class DensenetCifar10Model(model_lib.Model): 29 | """Densenet cnn network configuration.""" 30 | 31 | def __init__(self, model, layer_counts, growth_rate): 32 | self.growth_rate = growth_rate 33 | super(DensenetCifar10Model, self).__init__(model, 32, 64, 0.1, 34 | layer_counts=layer_counts) 35 | self.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True} 36 | 37 | def dense_block(self, cnn, growth_rate): 38 | input_layer = cnn.top_layer 39 | c = cnn.batch_norm(input_layer, **self.batch_norm_config) 40 | c = tf.nn.relu(c) 41 | c = cnn.conv(growth_rate, 3, 3, 1, 1, 42 | stddev=np.sqrt(2.0 / 9 / growth_rate), 43 | activation=None, input_layer=c) 44 | channel_index = 3 if cnn.channel_pos == 'channels_last' else 1 45 | cnn.top_layer = tf.concat([input_layer, c], channel_index) 46 | cnn.top_size += growth_rate 47 | 48 | def transition_layer(self, cnn): 49 | in_size = cnn.top_size 50 | cnn.batch_norm(**self.batch_norm_config) 51 | cnn.top_layer = tf.nn.relu(cnn.top_layer) 52 | cnn.conv(in_size, 1, 1, 1, 1, stddev=np.sqrt(2.0 / 9 / in_size)) 53 | cnn.apool(2, 2, 2, 2) 54 | 55 | def add_inference(self, cnn): 56 | if self.layer_counts is None: 57 | raise ValueError( 58 | 'Layer counts not specified for %s' % self.get_model()) 59 | if self.growth_rate is None: 60 | raise ValueError( 61 | 'Growth rate not specified for %s' % self.get_model()) 62 | 63 | cnn.conv(16, 3, 3, 1, 1, activation=None) 64 | # Block 1 65 | for _ in xrange(self.layer_counts[0]): 66 | self.dense_block(cnn, self.growth_rate) 67 | self.transition_layer(cnn) 68 | # Block 2 69 | for _ in xrange(self.layer_counts[1]): 70 | self.dense_block(cnn, self.growth_rate) 71 | self.transition_layer(cnn) 72 | # 
Block 3 73 | for _ in xrange(self.layer_counts[2]): 74 | self.dense_block(cnn, self.growth_rate) 75 | cnn.batch_norm(**self.batch_norm_config) 76 | cnn.top_layer = tf.nn.relu(cnn.top_layer) 77 | channel_index = 3 if cnn.channel_pos == 'channels_last' else 1 78 | cnn.top_size = cnn.top_layer.get_shape().as_list()[channel_index] 79 | cnn.spatial_mean() 80 | 81 | def get_learning_rate(self, global_step, batch_size): 82 | num_batches_per_epoch = int(50000 / batch_size) 83 | boundaries = num_batches_per_epoch * np.array([150, 225, 300], 84 | dtype=np.int64) 85 | boundaries = [x for x in boundaries] 86 | values = [0.1, 0.01, 0.001, 0.0001] 87 | return tf.train.piecewise_constant(global_step, boundaries, values) 88 | 89 | 90 | def create_densenet40_k12_model(): 91 | return DensenetCifar10Model('densenet40_k12', (12, 12, 12), 12) 92 | 93 | 94 | def create_densenet100_k12_model(): 95 | return DensenetCifar10Model('densenet100_k12', (32, 32, 32), 12) 96 | 97 | 98 | def create_densenet100_k24_model(): 99 | return DensenetCifar10Model('densenet100_k24', (32, 32, 32), 24) 100 | -------------------------------------------------------------------------------- /parallax/parallax/examples/tf_cnn_benchmarks/models/googlenet_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Googlenet model configuration. 17 | 18 | References: 19 | Szegedy, Christian, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 20 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich 21 | Going deeper with convolutions 22 | arXiv preprint arXiv:1409.4842 (2014) 23 | """ 24 | 25 | from models import model 26 | 27 | 28 | class GooglenetModel(model.Model): 29 | 30 | def __init__(self): 31 | super(GooglenetModel, self).__init__('googlenet', 224, 32, 0.005) 32 | 33 | def add_inference(self, cnn): 34 | def inception_v1(cnn, k, l, m, n, p, q): 35 | cols = [[('conv', k, 1, 1)], [('conv', l, 1, 1), ('conv', m, 3, 3)], 36 | [('conv', n, 1, 1), ('conv', p, 5, 5)], 37 | [('mpool', 3, 3, 1, 1, 'SAME'), ('conv', q, 1, 1)]] 38 | cnn.inception_module('incept_v1', cols) 39 | 40 | cnn.conv(64, 7, 7, 2, 2) 41 | cnn.mpool(3, 3, 2, 2, mode='SAME') 42 | cnn.conv(64, 1, 1) 43 | cnn.conv(192, 3, 3) 44 | cnn.mpool(3, 3, 2, 2, mode='SAME') 45 | inception_v1(cnn, 64, 96, 128, 16, 32, 32) 46 | inception_v1(cnn, 128, 128, 192, 32, 96, 64) 47 | cnn.mpool(3, 3, 2, 2, mode='SAME') 48 | inception_v1(cnn, 192, 96, 208, 16, 48, 64) 49 | inception_v1(cnn, 160, 112, 224, 24, 64, 64) 50 | inception_v1(cnn, 128, 128, 256, 24, 64, 64) 51 | inception_v1(cnn, 112, 144, 288, 32, 64, 64) 52 | inception_v1(cnn, 256, 160, 320, 32, 128, 128) 53 | cnn.mpool(3, 3, 2, 2, mode='SAME') 54 | inception_v1(cnn, 256, 160, 320, 32, 128, 128) 55 | inception_v1(cnn, 384, 192, 384, 48, 128, 128) 56 | cnn.apool(7, 7, 1, 1, mode='VALID') 57 | cnn.reshape([-1, 1024]) 58 | -------------------------------------------------------------------------------- /parallax/parallax/examples/tf_cnn_benchmarks/models/lenet_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
class Lenet5Model(model.Model):
  """Lenet5 configuration: 28-pixel images, batch 32, learning rate 0.005."""

  def __init__(self):
    super(Lenet5Model, self).__init__('lenet5', 28, 32, 0.005)

  def add_inference(self, cnn):
    """Add two conv/max-pool pairs followed by a 512-unit affine layer."""
    # Note: This matches TF's MNIST tutorial model
    for depth in (32, 64):
      cnn.conv(depth, 5, 5)
      cnn.mpool(2, 2)
    cnn.reshape([-1, 64 * 7 * 7])
    cnn.affine(512)
class Model(object):
  """Base model configuration for CNN benchmarks.

  Stores the settings passed to the constructor and exposes them through
  simple accessors. Subclasses must override `add_inference`.
  """

  def __init__(self, model, image_size, batch_size, learning_rate,
               layer_counts=None, fp16_loss_scale=128):
    self.model = model
    self.image_size = image_size
    # The constructor value is also kept as the default so that it stays
    # available after set_batch_size() changes the current batch size.
    self.batch_size = self.default_batch_size = batch_size
    self.learning_rate = learning_rate
    self.layer_counts = layer_counts
    # TODO(reedwm) Set custom loss scales for each model instead of using
    # the default of 128.
    self.fp16_loss_scale = fp16_loss_scale

  def get_model(self):
    """Return the `model` value given to the constructor."""
    return self.model

  def get_image_size(self):
    """Return the configured image size."""
    return self.image_size

  def get_batch_size(self):
    """Return the current batch size."""
    return self.batch_size

  def set_batch_size(self, batch_size):
    """Replace the current batch size; the default is left untouched."""
    self.batch_size = batch_size

  def get_default_batch_size(self):
    """Return the batch size given to the constructor."""
    return self.default_batch_size

  def get_layer_counts(self):
    """Return the configured layer counts (None unless provided)."""
    return self.layer_counts

  def get_fp16_loss_scale(self):
    """Return the configured fp16 loss scale."""
    return self.fp16_loss_scale

  def get_learning_rate(self, global_step, batch_size):
    """Return the constant learning rate; both arguments are ignored."""
    del global_step, batch_size
    return self.learning_rate

  def add_inference(self, unused_cnn):
    """Build the network; the base class always raises ValueError."""
    raise ValueError('Must be implemented in derived classes')
# Registry of model names selectable for the 'imagenet' and 'synthetic'
# datasets. Each value is a zero-argument callable (a model class or a
# factory function) that is invoked to create the model configuration.
_model_name_to_imagenet_model = {
    'vgg11': vgg_model.Vgg11Model,
    'vgg16': vgg_model.Vgg16Model,
    'vgg19': vgg_model.Vgg19Model,
    'lenet': lenet_model.Lenet5Model,
    'googlenet': googlenet_model.GooglenetModel,
    'overfeat': overfeat_model.OverfeatModel,
    'alexnet': alexnet_model.AlexnetModel,
    'trivial': trivial_model.TrivialModel,
    'inception3': inception_model.Inceptionv3Model,
    'inception4': inception_model.Inceptionv4Model,
    'resnet50': resnet_model.create_resnet50_model,
    'resnet50_v2': resnet_model.create_resnet50_v2_model,
    'resnet101': resnet_model.create_resnet101_model,
    'resnet101_v2': resnet_model.create_resnet101_v2_model,
    'resnet152': resnet_model.create_resnet152_model,
    'resnet152_v2': resnet_model.create_resnet152_v2_model,
}

# Registry of model names selectable for the 'cifar10' dataset. Names such
# as 'alexnet' and 'trivial' appear in both registries but map to different
# callables here.
_model_name_to_cifar_model = {
    'alexnet': alexnet_model.AlexnetCifar10Model,
    'resnet20': resnet_model.create_resnet20_cifar_model,
    'resnet20_v2': resnet_model.create_resnet20_v2_cifar_model,
    'resnet32': resnet_model.create_resnet32_cifar_model,
    'resnet32_v2': resnet_model.create_resnet32_v2_cifar_model,
    'resnet44': resnet_model.create_resnet44_cifar_model,
    'resnet44_v2': resnet_model.create_resnet44_v2_cifar_model,
    'resnet56': resnet_model.create_resnet56_cifar_model,
    'resnet56_v2': resnet_model.create_resnet56_v2_cifar_model,
    'resnet110': resnet_model.create_resnet110_cifar_model,
    'resnet110_v2': resnet_model.create_resnet110_v2_cifar_model,
    'trivial': trivial_model.TrivialCifar10Model,
    'densenet40_k12': densenet_model.create_densenet40_k12_model,
    'densenet100_k12': densenet_model.create_densenet100_k12_model,
    'densenet100_k24': densenet_model.create_densenet100_k24_model,
}
def _get_model_map(dataset_name):
  """Return the name->factory registry that serves `dataset_name`.

  Raises:
    ValueError: if the dataset name is not 'cifar10', 'imagenet' or
      'synthetic'.
  """
  if dataset_name == 'cifar10':
    return _model_name_to_cifar_model
  if dataset_name in ('imagenet', 'synthetic'):
    return _model_name_to_imagenet_model
  raise ValueError('Invalid dataset name: %s' % dataset_name)


def get_model_config(model_name, dataset):
  """Map model name to model network configuration.

  Args:
    model_name: key to look up in the registry for the dataset.
    dataset: object whose `name` attribute selects the registry.

  Returns:
    The object produced by calling the registered factory with no arguments.

  Raises:
    ValueError: if the dataset name or the model name is unknown.
  """
  registry = _get_model_map(dataset.name)
  if model_name in registry:
    return registry[model_name]()
  raise ValueError('Invalid model name \'%s\' for dataset \'%s\'' %
                   (model_name, dataset.name))


def register_model(model_name, dataset_name, model_func):
  """Register a new model that can be obtained with `get_model_config`.

  Raises:
    ValueError: if the dataset name is unknown or the model name is already
      registered for that dataset.
  """
  registry = _get_model_map(dataset_name)
  already_present = model_name in registry
  if already_present:
    raise ValueError('Model "%s" is already registered for dataset "%s"' %
                     (model_name, dataset_name))
  registry[model_name] = model_func
class OverfeatModel(model.Model):
  """Overfeat configuration: 231-pixel images, batch 32, learning rate 0.005."""

  def __init__(self):
    super(OverfeatModel, self).__init__('overfeat', 231, 32, 0.005)

  def add_inference(self, cnn):
    """Add the Overfeat convolution, pooling and affine layers to `cnn`."""
    # Note: VALID requires padding the images by 3 in width and height
    cnn.conv(96, 11, 11, 4, 4, mode='VALID')
    cnn.mpool(2, 2)
    cnn.conv(256, 5, 5, 1, 1, mode='VALID')
    cnn.mpool(2, 2)
    # Three 3x3 convolutions share a single trailing max-pool.
    for depth in (512, 1024, 1024):
      cnn.conv(depth, 3, 3)
    cnn.mpool(2, 2)
    cnn.reshape([-1, 1024 * 6 * 6])
    for width in (3072, 4096):
      cnn.affine(width)
class TrivialModel(model.Model):
  """Trivial model configuration."""

  def __init__(self):
    image_size = 224 + 3
    super(TrivialModel, self).__init__('trivial', image_size, 32, 0.005)

  def add_inference(self, cnn):
    """Flatten the 227x227x3 input and stack two affine layers."""
    cnn.reshape([-1, 227 * 227 * 3])
    for width in (1, 4096):
      cnn.affine(width)


class TrivialCifar10Model(model.Model):
  """Trivial cifar10 model configuration."""

  def __init__(self):
    super(TrivialCifar10Model, self).__init__('trivial', 32, 32, 0.005)

  def add_inference(self, cnn):
    """Flatten the 32x32x3 input and stack two affine layers."""
    cnn.reshape([-1, 32 * 32 * 3])
    for width in (1, 4096):
      cnn.affine(width)
def _construct_vgg(cnn, num_conv_layers):
  """Build vgg architecture from blocks.

  Args:
    cnn: builder object receiving the conv/mpool/reshape/affine/dropout calls.
    num_conv_layers: five counts, one per stage, giving how many 3x3
      convolutions that stage contains.
  """
  assert len(num_conv_layers) == 5
  # Output depth of the convolutions in each of the five stages.
  stage_depths = (64, 128, 256, 512, 512)
  for stage, depth in enumerate(stage_depths):
    for _ in xrange(num_conv_layers[stage]):
      cnn.conv(depth, 3, 3)
    # Every stage ends with a 2x2 max-pool.
    cnn.mpool(2, 2)
  cnn.reshape([-1, 512 * 7 * 7])
  # Two identical affine + dropout pairs close the network.
  for _ in xrange(2):
    cnn.affine(4096)
    cnn.dropout()


class Vgg11Model(model.Model):
  """vgg11 configuration: 224-pixel images, batch 64, learning rate 0.005."""

  def __init__(self):
    super(Vgg11Model, self).__init__('vgg11', 224, 64, 0.005)

  def add_inference(self, cnn):
    stage_counts = [1, 1, 2, 2, 2]
    _construct_vgg(cnn, stage_counts)


class Vgg16Model(model.Model):
  """vgg16 configuration: 224-pixel images, batch 64, learning rate 0.005."""

  def __init__(self):
    super(Vgg16Model, self).__init__('vgg16', 224, 64, 0.005)

  def add_inference(self, cnn):
    stage_counts = [2, 2, 3, 3, 3]
    _construct_vgg(cnn, stage_counts)


class Vgg19Model(model.Model):
  """vgg19 configuration: 224-pixel images, batch 64, learning rate 0.005."""

  def __init__(self):
    super(Vgg19Model, self).__init__('vgg19', 224, 64, 0.005)

  def add_inference(self, cnn):
    stage_counts = [2, 2, 4, 4, 4]
    _construct_vgg(cnn, stage_counts)
# Command-line flags consumed by build_config() below.
flags = tf.app.flags

# Parameter-server communication options (forwarded to parallax.PSConfig).
flags.DEFINE_boolean('replicate_variables', True, """replicate_variables""")
flags.DEFINE_string('protocol', 'grpc', """The method for managing variables""")
# MPI launch options (forwarded to parallax.MPIConfig).
tf.app.flags.DEFINE_string('mpirun_options', '',
                           'option for mpirun')
# Top-level run settings stored directly on parallax.Config.
flags.DEFINE_string('run_option', 'HYBRID',
                    'The run option whether PS, MPI or HYBRID')
flags.DEFINE_string('redirect_path', None, """redirect path to keep the log of distributed workers""")
# Checkpointing options (forwarded to parallax.CheckPointConfig).
flags.DEFINE_integer('save_ckpt_steps', None,
                     """Number of steps between two consecutive checkpoints""")
flags.DEFINE_integer('save_n_ckpts_per_epoch', -1, """Save n checkpoints per every epoch""")
flags.DEFINE_string('ckpt_dir', None, """Directory to save checkpoints""")
# Profiling options (forwarded to parallax.ProfileConfig). profile_steps and
# profile_range are comma-separated strings parsed into ints by build_config.
flags.DEFINE_string('profile_dir', None, """Directory to save RunMetadata""")
flags.DEFINE_string('profile_steps', None, """Comma separated porfile steps""")
flags.DEFINE_string('profile_range', None, """profile_start_step,profile_end_step""")
flags.DEFINE_integer('profile_worker', None, """The worker to profile""")
# Further parameter-server options (also forwarded to parallax.PSConfig).
flags.DEFINE_boolean('local_aggregation', True,
                     """Whether to use local aggregation or not""")
flags.DEFINE_boolean('boundary_among_servers', True,
                     """Whether to use operation placement among servers""")
flags.DEFINE_boolean('boundary_between_workers_and_servers', True,
                     """Whether to use operation placement between workers and servers""")
# Where the transformed graph is exported (stored on parallax.Config).
flags.DEFINE_string('export_graph_path', None, """export path to keep transformed graph definintion""")

FLAGS = flags.FLAGS
def calculate_ckpt_steps():
  """Return the number of steps between two consecutive checkpoints.

  Returns:
    The value of --save_ckpt_steps (None when the flag is not set).

  Raises:
    NotImplementedError: if --save_n_ckpts_per_epoch is positive.
  """
  if FLAGS.save_n_ckpts_per_epoch > 0:
    # The per-epoch computation that used to live here relied on `json`,
    # `math` and `language_model_graph`, none of which this module imports,
    # so the branch could only crash. Fail with an actionable message
    # instead of a NameError.
    raise NotImplementedError(
        '--save_n_ckpts_per_epoch is not supported by this example; '
        'use --save_ckpt_steps instead')
  return FLAGS.save_ckpt_steps


def build_config():
  """Assemble a parallax.Config from the command-line flags.

  Returns:
    A parallax.Config carrying the run option, communication (PS and MPI),
    checkpoint and profile configurations, the redirect path and the graph
    export path. `average_sparse` is always set to False.
  """

  def get_profile_steps():
    # Parse "1,5,9" into [1, 5, 9]; an unset or empty flag yields None. The
    # stripped string is written back to the flag, as before.
    if FLAGS.profile_steps:
      FLAGS.profile_steps = FLAGS.profile_steps.strip()
      return [int(step) for step in FLAGS.profile_steps.split(',')]
    return None

  def get_profile_range():
    # Parse "start,end" into (start, end); an unset or empty flag yields
    # None. The stripped string is written back to the flag, as before.
    if FLAGS.profile_range:
      FLAGS.profile_range = FLAGS.profile_range.strip()
      splits = FLAGS.profile_range.split(',')
      return (int(splits[0]), int(splits[1]))
    return None

  ckpt_config = parallax.CheckPointConfig(
      ckpt_dir=FLAGS.ckpt_dir,
      save_ckpt_steps=calculate_ckpt_steps())
  ps_config = parallax.PSConfig(
      replicate_variables=FLAGS.replicate_variables,
      protocol=FLAGS.protocol,
      local_aggregation=FLAGS.local_aggregation,
      boundary_among_servers=FLAGS.boundary_among_servers,
      boundary_between_workers_and_servers=(
          FLAGS.boundary_between_workers_and_servers))
  mpi_config = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options)
  profile_config = parallax.ProfileConfig(
      profile_dir=FLAGS.profile_dir,
      profile_steps=get_profile_steps(),
      profile_range=get_profile_range(),
      profile_worker=FLAGS.profile_worker)

  parallax_config = parallax.Config()
  parallax_config.run_option = FLAGS.run_option
  parallax_config.average_sparse = False
  parallax_config.communication_config = parallax.CommunicationConfig(
      ps_config, mpi_config)
  parallax_config.ckpt_config = ckpt_config
  parallax_config.profile_config = profile_config
  parallax_config.redirect_path = FLAGS.redirect_path
  parallax_config.export_graph_path = FLAGS.export_graph_path

  return parallax_config
def get_platform_params():
  """Returns a dict of platform-specific params.

  The default platform defines no platform-specific flags, so the result is
  always a new empty dict.

  Returns:
    A dict that maps from param name to ParamSpec.
  """
  return dict()


def get_cluster_manager(params, config_proto):
  """Returns the cluster manager to be used."""
  manager = cnn_util.GrpcClusterManager(params, config_proto)
  return manager


def _initialize(params, config_proto):
  # Currently, no platform initialization needs to be done.
  del params
  del config_proto


# Module-level guard: set once the first initialize() call has run.
_is_initalized = False


def initialize(params, config_proto):
  """Run platform initialization on the first call; later calls do nothing."""
  global _is_initalized
  if not _is_initalized:
    _is_initalized = True
    _initialize(params, config_proto)
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Utility code for a certain platform. 17 | 18 | This file simply imports everything from the default platform. To switch to a 19 | different platform, the import statement can be changed to point to a new 20 | platform. 21 | 22 | Creating a custom platform can be useful to, e.g., run some initialization code 23 | required by the platform or register a platform-specific model. 
24 | """ 25 | 26 | from platforms.default.util import * # pylint: disable=unused-import,wildcard-import 27 | -------------------------------------------------------------------------------- /parallax/parallax/examples/tf_cnn_benchmarks/resource_info: -------------------------------------------------------------------------------- 1 | 123.456.78.90:1,2 2 | -------------------------------------------------------------------------------- /parallax/parallax/util/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) # Apache 2.0 2 | 3 | package( 4 | default_visibility = [ 5 | "//visibility:public", 6 | ], 7 | ) 8 | 9 | sh_binary( 10 | name = "build_pip_package", 11 | srcs = ["build_pip_package.sh"], 12 | data = [ 13 | "//parallax:parallax", 14 | "//parallax/core/python/tools:tools", 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /parallax/parallax/util/build_pip_package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2017 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Script for building a pip package. 18 | # 19 | # Based on tensorflow/tools/pip_package/build_pip_package.sh. 
# Build the Parallax wheel from the Bazel runfiles tree and copy it to DEST.
#
# Usage: main [-p|-py PYTHON] [--py=PYTHON|--python=PYTHON] DEST
# Must be run from the root of the build tree (where bazel-bin lives).
function main() {
  PYTHON="python"
  POSITIONAL=()
  while [[ $# -gt 0 ]]
  do
    key="$1"
    case $key in
      -p|-py)
        # Without this check the second `shift` fails and `set -e` ends
        # the script with no explanation.
        if [[ $# -lt 2 ]]; then
          echo "Option ${key} requires an argument" >&2
          exit 1
        fi
        PYTHON="$2"
        shift
        shift
        ;;
      --py=*|--python=*)
        PYTHON="${key#*=}"
        shift
        ;;
      *)
        POSITIONAL+=("$1")
        shift
        ;;
    esac
  done
  set -- "${POSITIONAL[@]}" # restore positional parameters

  if [ $# -lt 1 ] ; then
    echo "No destination dir provided"
    exit 1
  fi

  # Check the build tree before creating the temp dir so that an early exit
  # does not leave a stray directory behind.
  if [ ! -d bazel-bin/parallax ]; then
    echo "Could not find bazel-bin. Did you run from the root of the build tree?"
    exit 1
  fi

  # Resolve DEST to an absolute path while still in the caller's directory.
  # Otherwise a relative DEST would be interpreted inside the temp dir after
  # `pushd`, and the wheel would be deleted together with it.
  mkdir -p -- "$1"
  DEST="$(cd -- "$1" && pwd)"
  TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX)

  echo $(date) : "=== Using tmpdir: ${TMPDIR}"

  cp -R \
    bazel-bin/parallax/util/build_pip_package.runfiles/parallax/parallax \
    "${TMPDIR}"

  cp parallax/util/setup.py "${TMPDIR}"

  # Before we leave the top-level directory, make sure we know how to
  # call python.
  #source tensorflow/tools/python_bin_path.sh

  pushd "${TMPDIR}"
  echo $(date) : "=== Building wheel"
  # PYTHON is left unquoted on purpose so that a value such as
  # "python3 -u" keeps working.
  ${PYTHON} setup.py bdist_wheel >/dev/null
  cp dist/* "${DEST}"
  popd
  rm -rf "${TMPDIR}"
  echo $(date) : "=== Output wheel file is in: ${DEST}"
  echo ${PYTHON}
}
import os
import sys

# pycodestyle exits with status 0 only when it reports no violations, so a
# zero status from os.system means the check passed.
status = os.system(
    "pycodestyle --statistics ../parallax/parallax/ "
    "--exclude=../parallax/parallax/examples/")
if status == 0:
    print("PASS")
else:
    # Propagate the failure (violations found, or pycodestyle missing) so
    # that shells and CI jobs can detect it; previously the script exited 0
    # in every case.
    sys.exit(1)