├── .github
├── issue_template.md
└── pull_request_template.md
├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── doc
├── figure
│ ├── LM-1B Benchmark.png
│ ├── Resnet50 Benchmark.png
│ ├── benchmark.png
│ ├── exec_model.png
│ ├── hybrid.png
│ ├── lm1b_convergence.png
│ ├── nmt_convergence.png
│ └── resnet50_convergence.png
├── installation.md
├── parallax_api.md
├── quick_start.md
└── trouble_shooting.md
├── parallax
├── WORKSPACE
└── parallax
│ ├── BUILD
│ ├── __init__.py
│ ├── core
│ ├── BUILD
│ ├── __init__.py
│ └── python
│ │ ├── BUILD
│ │ ├── __init__.py
│ │ ├── common
│ │ ├── BUILD
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── consts.py
│ │ ├── graph_transform_lib.py
│ │ ├── lib.py
│ │ ├── partitions.py
│ │ ├── runner.py
│ │ ├── session_context.py
│ │ └── shard.py
│ │ ├── hybrid
│ │ ├── BUILD
│ │ ├── __init__.py
│ │ ├── between_graph_parallel.py
│ │ ├── graph_transform.py
│ │ ├── in_graph_parallel.py
│ │ └── runner.py
│ │ ├── mpi
│ │ ├── BUILD
│ │ ├── __init__.py
│ │ ├── graph_transform.py
│ │ └── runner.py
│ │ ├── ps
│ │ ├── BUILD
│ │ ├── __init__.py
│ │ ├── between_graph_parallel.py
│ │ ├── graph_transform.py
│ │ ├── in_graph_parallel.py
│ │ └── runner.py
│ │ └── tools
│ │ ├── BUILD
│ │ ├── __init__.py
│ │ └── launch_ps.py
│ ├── examples
│ ├── lm1b
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── data_utils.py
│ │ ├── language_model.py
│ │ ├── language_model_graph.py
│ │ ├── lm1b_distributed_driver.py
│ │ ├── lm1b_eval.py
│ │ ├── lm1b_input.py
│ │ ├── parallax_config.py
│ │ ├── resource_info
│ │ └── testdata
│ │ │ ├── test_s2.txt
│ │ │ ├── test_sentences.txt
│ │ │ └── test_vocab.txt
│ ├── nmt
│ │ ├── .gitignore
│ │ ├── CONTRIBUTING.md
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── attention_model.py
│ │ ├── g3doc
│ │ │ └── img
│ │ │ │ ├── attention_equation_0.jpg
│ │ │ │ ├── attention_equation_1.jpg
│ │ │ │ ├── attention_mechanism.jpg
│ │ │ │ ├── attention_vis.jpg
│ │ │ │ ├── encdec.jpg
│ │ │ │ ├── greedy_dec.jpg
│ │ │ │ └── seq2seq.jpg
│ │ ├── gnmt_model.py
│ │ ├── inference.py
│ │ ├── inference_test.py
│ │ ├── model.py
│ │ ├── model_helper.py
│ │ ├── model_test.py
│ │ ├── nmt.py
│ │ ├── nmt_distributed_driver.py
│ │ ├── nmt_eval.py
│ │ ├── nmt_test.py
│ │ ├── parallax_config.py
│ │ ├── resource_info
│ │ ├── scripts
│ │ │ ├── __init__.py
│ │ │ ├── bleu.py
│ │ │ ├── download_iwslt15.sh
│ │ │ ├── rouge.py
│ │ │ └── wmt16_en_de.sh
│ │ ├── standard_hparams
│ │ │ ├── iwslt15.json
│ │ │ ├── wmt16.json
│ │ │ ├── wmt16_gnmt_4_layer.json
│ │ │ └── wmt16_gnmt_8_layer.json
│ │ ├── testdata
│ │ │ ├── deen_output
│ │ │ ├── deen_ref_bpe
│ │ │ ├── deen_ref_spm
│ │ │ ├── iwslt15.tst2013.100.en
│ │ │ ├── iwslt15.tst2013.100.vi
│ │ │ ├── iwslt15.vocab.100.en
│ │ │ ├── iwslt15.vocab.100.vi
│ │ │ ├── label_ref
│ │ │ ├── pred_output
│ │ │ ├── test_embed.txt
│ │ │ ├── test_embed_with_header.txt
│ │ │ ├── test_infer_file
│ │ │ ├── test_infer_vocab.src
│ │ │ └── test_infer_vocab.tgt
│ │ ├── train.py
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── common_test_utils.py
│ │ │ ├── evaluation_utils.py
│ │ │ ├── evaluation_utils_test.py
│ │ │ ├── iterator_utils.py
│ │ │ ├── iterator_utils_test.py
│ │ │ ├── misc_utils.py
│ │ │ ├── misc_utils_test.py
│ │ │ ├── nmt_utils.py
│ │ │ ├── standard_hparams_utils.py
│ │ │ ├── vocab_utils.py
│ │ │ └── vocab_utils_test.py
│ ├── simple
│ │ ├── README.md
│ │ ├── resource_info
│ │ └── simple_driver.py
│ ├── skip_thoughts
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── configuration.py
│ │ ├── data
│ │ │ ├── preprocess_dataset.py
│ │ │ └── special_words.py
│ │ ├── encoder_manager.py
│ │ ├── evaluate.py
│ │ ├── ops
│ │ │ ├── __init__.py
│ │ │ ├── gru_cell.py
│ │ │ └── input_ops.py
│ │ ├── parallax_config.py
│ │ ├── resource_info
│ │ ├── skip_distributed_driver.py
│ │ ├── skip_thoughts_encoder.py
│ │ ├── skip_thoughts_model.py
│ │ ├── track_perplexity.py
│ │ ├── train.py
│ │ └── vocabulary_expansion.py
│ └── tf_cnn_benchmarks
│ │ ├── CNNBenchmark_distributed_driver.py
│ │ ├── CNNBenchmark_eval.py
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── benchmark_cnn.py
│ │ ├── cnn_util.py
│ │ ├── convnet_builder.py
│ │ ├── datasets.py
│ │ ├── models
│ │ ├── __init__.py
│ │ ├── alexnet_model.py
│ │ ├── densenet_model.py
│ │ ├── googlenet_model.py
│ │ ├── inception_model.py
│ │ ├── lenet_model.py
│ │ ├── model.py
│ │ ├── model_config.py
│ │ ├── overfeat_model.py
│ │ ├── resnet_model.py
│ │ ├── trivial_model.py
│ │ └── vgg_model.py
│ │ ├── parallax_config.py
│ │ ├── platforms
│ │ ├── __init__.py
│ │ ├── default
│ │ │ ├── __init__.py
│ │ │ └── util.py
│ │ └── util.py
│ │ ├── preprocessing.py
│ │ └── resource_info
│ └── util
│ ├── BUILD
│ ├── build_pip_package.sh
│ └── setup.py
└── tools
├── bazel.rc
└── style_check.py
/.github/issue_template.md:
--------------------------------------------------------------------------------
1 | ### Things to Change
2 |
3 | ### Current Behavior
4 |
5 | ### Expected Behavior
6 |
7 | ### Failure Information (for bugs)
8 |
9 | #### Failure Logs
10 |
11 | #### How to Reproduce
12 |
13 | ### Related Issues
14 |
15 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | Github issue: #XX
2 |
3 | **Major changes:**
4 | -
5 |
6 | **Minor changes to note:**
7 | -
8 |
9 | **Tests for the changes:**
10 | -
11 |
12 | **Other comments:**
13 | -
14 |
15 | resolves #XX
16 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.sh
2 | *.pyc
3 | *bazel*
4 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tensorflow"]
2 | path = tensorflow
3 | url = https://github.com/snuspl/tensorflow.git
4 | [submodule "horovod"]
5 | path = horovod
6 | url = https://github.com/horovod/horovod.git
7 | branch = v0.16.3
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Parallax
2 | **Parallax** is a tool that optimizes data parallel training by considering whether each variable in a deep learning model is sparse or dense. The sparsity-aware data parallel training improves performance of models with sparse variables that show relatively low scalability on existing frameworks while maintaining equal performance for models with only dense variables such as ResNet-50 and Inception-V3. In addition, Parallax automatically parallelizes training of a single-GPU deep learning model to minimize user efforts. If you are interested, you can find the technical details of Parallax in [our paper](https://dl.acm.org/citation.cfm?id=3303957).
3 |
4 | Parallax is currently implemented on TensorFlow. We support [TensorFlow v1.6](https://github.com/tensorflow/tensorflow/tree/r1.6) and [TensorFlow v1.11](https://github.com/tensorflow/tensorflow/tree/r1.11). In case that Parallax uses Message Passing Interface (MPI), Parallax requires *AllReduce*, *AllGather* operations implemented in [Horovod v0.11.2](https://github.com/uber/horovod/tree/v0.11.2). We plan to support multiple TensorFlow versions.
5 |
6 | * [Installation](doc/installation.md)
7 | * [Running Parallax](doc/quick_start.md)
8 | * [Parallax API](doc/parallax_api.md)
9 |
10 | ## Why Parallax?
11 | Parallax makes it easier for users to do distributed training of a deep learning model developed in a single device (e.g., GPU or CPU) while employing various optimization techniques that Parallax provides. A Parallax user simply specifies a single-device model graph, resource specification for distributed training and Parallax does the rest! For distributed training, Parallax supports hybrid architecture that combines two different distributed training architectures: Parameter Server (PS) and AllReduce (AR). Hybrid architecture exploits the advantages of both architectures. Moreover, Parallax will provide large sparse variable partitioning soon to maximize parallelism while maintaining low computation and communication overhead. Parallax further optimizes training with local aggregation and smart operation placement to mitigate communication overhead.
12 |
13 | PS and AR architectures are still available in Parallax; users can choose the training architecture if they want (default is hybrid for synchronous training).
14 |
15 | ### Hybrid Architecture
16 |

17 |
18 | The amount of data transferred by the PS and AR architectures depends on whether a variable is sparse or dense. Based on this fact, Parallax pursues a hybrid architecture in which the AR architecture handles dense variables and the PS architecture handles sparse variables to minimize communication overhead. Each worker has a replica of dense variables, while separate server processes manage only sparse variables.
19 |
20 | ### Parallax Execution Model
21 |
22 | 
23 |
24 |
25 | When a client initiates a deep learning job with a single-device computation graph, resource information, and optionally a flag that indicates either synchronous or asynchronous training, Parallax transforms the computation graph by analyzing its characteristics. Then, Parallax executes the transformed graph with its optimized communication layer in the distributed environment.
26 |
27 | ### Parallax Benchmark
28 |
29 | To give you an idea on how well Parallax performs, we present the following chart that shows the result of experiments done in a cluster of eight machines that are connected via Mellanox ConnectX-4 cards with 100Gbps InfiniBand. Each machine has six NVIDIA GeForce TITAN Xp GPU cards.
30 |
31 |
32 |
33 |
34 |
35 |
36 | Parallax converges correctly, as do the other frameworks (TensorFlow and Horovod). Parallax is faster than TensorFlow and similar to Horovod for ResNet50 (dense model). In the case of LM1B (sparse model), Parallax outperforms both TensorFlow and Horovod.
37 |
38 |
39 |
40 |
41 | Parallax outperforms TensorFlow for both Resnet50 and LM1B. In addition, Parallax outperforms Horovod for LM1B.
42 |
43 | ## Troubleshooting
44 | See the [Troubleshooting](doc/trouble_shooting.md) page and submit a new [issue](https://github.com/snuspl/parallax/issues/new) or [contact us](#contact-us) if you cannot find an answer.
45 |
46 | ## Contact us
47 | To contact us, send an email to parallax-dev@googlegroups.com.
48 |
49 | ## License
50 | [Apache License 2.0](LICENSE)
51 |
--------------------------------------------------------------------------------
/doc/figure/LM-1B Benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/LM-1B Benchmark.png
--------------------------------------------------------------------------------
/doc/figure/Resnet50 Benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/Resnet50 Benchmark.png
--------------------------------------------------------------------------------
/doc/figure/benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/benchmark.png
--------------------------------------------------------------------------------
/doc/figure/exec_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/exec_model.png
--------------------------------------------------------------------------------
/doc/figure/hybrid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/hybrid.png
--------------------------------------------------------------------------------
/doc/figure/lm1b_convergence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/lm1b_convergence.png
--------------------------------------------------------------------------------
/doc/figure/nmt_convergence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/nmt_convergence.png
--------------------------------------------------------------------------------
/doc/figure/resnet50_convergence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/resnet50_convergence.png
--------------------------------------------------------------------------------
/doc/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 | Parallax runs under Linux with Python 2.7 and 3.6; we haven't yet tested Parallax on other platforms or on other Python 3 versions (3.3+).
3 | Parallax depends on a modified version of TensorFlow 1.6/1.11 and on Horovod 0.11.2, both of which are included in the Parallax repository as submodules. *Each of these frameworks needs to be built and installed from source, which is explained in further detail below*. Parallax itself must also be installed from source; the installation process is explained step by step below. We plan to provide binary files in the near future.
4 |
5 | First, clone the parallax repository on your linux machine:
6 | ```shell
7 | $ git clone --recurse-submodules https://github.com/snuspl/parallax.git
8 | ```
9 | We recommend installing using Virtualenv and pip.
10 |
11 | Install Python, pip, and Virtualenv:
12 | ```shell
13 | $ sudo apt-get install python-pip python-dev python-virtualenv
14 | ```
15 |
16 | Create a Virtualenv environment in the directory `parallax_venv`(specify whichever name you prefer), and then activate it.
17 | ```shell
18 | $ virtualenv parallax_venv
19 | $ source parallax_venv/bin/activate
20 | ```
21 |
22 | ## Install TensorFlow
23 | TensorFlow requires [Bazel](https://docs.bazel.build/versions/master/install.html) to build a binary file. (See [TF install](https://www.tensorflow.org/install/install_sources) for more instructions on how to build TensorFlow from source.) TensorFlow can be built CPU-only but Parallax needs TensorFlow with GPU support using [CUDA Toolkit 9.0 or 10.0](https://developer.nvidia.com/cuda-zone) and [CuDNN SDK v7](https://developer.nvidia.com/cudnn). To install TensorFlow with GPU support, follow the commands below.
24 |
25 | ```shell
26 | $ cd parallax/tensorflow
27 | $ git checkout r1.11 (optional for TensorFlow v1.11)
28 | $ pip install numpy
29 | $ ./configure
30 | (Configurations related to cuda should be turned on to use GPUs)
31 | (verbs: ibverbs RDMA)
32 | (gdr: GPU Direct (only for GPUs with GDR support))
33 | $ bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package
34 | $ bazel-bin/tensorflow/tools/pip_package/build_pip_package {target_directory}
35 | $ pip install {target_directory}/tensorflow-*.whl
36 | ```
37 |
38 |
39 | ## Install Horovod
40 | To install horovod, [Open MPI](https://www.open-mpi.org/faq/?category=building#easy-build) and [NCCL](https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html) are required as MPI implementations. To install OpenMPI, `--with-cuda` flag should be in the configure line, and you can also add `--with-verbs` to use ibverbs.
41 | We tested on openmpi-3.0.0, NCCL 2.1.15(for cuda9.0) and NCCL 2.3.5(for cuda10.0).
42 | ```shell
43 | $ cd ../horovod
44 | $ python setup.py sdist
45 | $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITHOUT_PYTORCH=True HOROVOD_WITHOUT_MXNET=True pip install --no-cache-dir dist/horovod-*.tar.gz
46 | ```
47 |
48 | ## Install Parallax
49 | Parallax also uses [Bazel](https://docs.bazel.build/versions/master/install.html) for installation.
50 | ```shell
51 | $ cd ../parallax # parallax directory
52 | $ bazel build //parallax/util:build_pip_package
53 | $ bazel-bin/parallax/util/build_pip_package {target_directory}
54 | $ pip install {target_directory}/parallax-*.whl
55 | ```
--------------------------------------------------------------------------------
/doc/trouble_shooting.md:
--------------------------------------------------------------------------------
1 | # Trouble Shooting
2 |
3 | Because Parallax execution involves many dependent software and hardware packages, debugging can be tricky if errors occur.
4 | This page collects the troublesome situations we have experienced, along with their solutions. If you see a similar symptom, try following the suggestions. Also, if you have any additional troubleshooting cases, please add them here.
5 |
6 | ### Device placement error
7 | Error message:
8 |
9 | `device placement error(Cannot assign a device for operation)`
10 |
11 | Parallax assumes `allow_soft_placement=True` because Parallax assigns operators on CPU/GPU devices according to their characteristics(shared or replicated) if the placement of the device is not specified. If you face a device placement error, try setting `allow_soft_placement=True` on the session configuration.
12 |
13 | ### RDMA queue issue while running parameter server model
14 | Error message:
15 | ```
16 | tensorflow/contrib/verbs/rdma.cc:1009] Check failed: status.ok() RecvLocalAsync was not ok. error message: Step 123330693738664103
17 | tensorflow/contrib/verbs/rdma.cc:1009] Check failed: status.ok() RecvLocalAsync was not ok. error message: Step 95609778068110326
18 | ```
19 | There are some issues related to managing the RDMA queue in TensorFlow. Consider increasing the RDMA queue depth by adjusting `RDMA_QUEUE_DEPTH=` in `.ssh/environment` or wherever you manage environment variables.
20 |
21 | ### NCCL different version issue
22 | Error message:
23 | ```
24 | Signal: Segmentation fault (11)
25 | Signal code: Address not mapped (1)
26 | Failing at address: 0xa0
27 | ```
28 | This error can occur if multiple machines use different versions of NCCL.
29 |
30 | ### Hang by fetching gradients from non-chief workers while running parameter server model
31 | Error message: None
32 |
33 | Workers are divided into a chief worker (worker 0) and non-chief workers, and Parallax assumes that only the chief worker
34 | can fetch the gradients. This means that fetching gradients from non-chief workers can block the distributed training.
35 |
--------------------------------------------------------------------------------
/parallax/WORKSPACE:
--------------------------------------------------------------------------------
1 | workspace(name = "parallax")
2 |
--------------------------------------------------------------------------------
/parallax/parallax/BUILD:
--------------------------------------------------------------------------------
1 | licenses(["notice"]) # Apache 2.0
2 |
3 | package(
4 | default_visibility = [
5 | "//visibility:public",
6 | ],
7 | )
8 |
9 | native.py_library(
10 | name = "parallax",
11 | srcs = ["__init__.py"],
12 | deps = [
13 | "//parallax/core:core",
14 | "//parallax/core/python/common:runner",
15 | "//parallax/core/python/common:shard",
16 | ],
17 | )
--------------------------------------------------------------------------------
/parallax/parallax/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from parallax.core.python.common.partitions import get_partitioner
17 | from parallax.core.python.common.runner import parallel_run
18 | from parallax.core.python.common import shard
19 | from parallax.core.python.common.lib import parallax_log as log
20 |
21 | from parallax.core.python.common.config import ParallaxConfig as Config
22 | from parallax.core.python.common.config import PSConfig
23 | from parallax.core.python.common.config import MPIConfig
24 | from parallax.core.python.common.config import CommunicationConfig
25 | from parallax.core.python.common.config import CheckPointConfig
26 | from parallax.core.python.common.config import ProfileConfig
27 |
--------------------------------------------------------------------------------
/parallax/parallax/core/BUILD:
--------------------------------------------------------------------------------
1 | licenses(["notice"]) # Apache 2.0
2 |
3 | package(
4 | default_visibility = [
5 | "//visibility:public",
6 | ],
7 | )
8 |
9 | native.py_library(
10 | name = "core",
11 | srcs = ["__init__.py"],
12 | deps = [
13 | "//parallax/core/python:python"
14 | ],
15 | )
--------------------------------------------------------------------------------
/parallax/parallax/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/core/python/BUILD:
--------------------------------------------------------------------------------
1 | licenses(["notice"]) # Apache 2.0
2 |
3 | package(
4 | default_visibility = [
5 | "//visibility:public",
6 | ],
7 | )
8 |
9 | native.py_library(
10 | name = "python",
11 | srcs = ["__init__.py"],
12 | deps = [
13 | "//parallax/core/python/common:common",
14 | "//parallax/core/python/mpi:mpi",
15 | "//parallax/core/python/ps:ps",
16 | "//parallax/core/python/hybrid:hybrid",
17 | "//parallax/core/python/tools:tools",
18 | ],
19 | )
20 |
--------------------------------------------------------------------------------
/parallax/parallax/core/python/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/core/python/common/BUILD:
--------------------------------------------------------------------------------
1 | licenses(["notice"]) # Apache 2.0
2 |
3 | package(
4 | default_visibility = [
5 | "//visibility:public",
6 | ],
7 | )
8 |
9 |
10 |
11 | native.py_library(
12 | name = "lib",
13 | srcs = ["lib.py"],
14 | deps = [
15 | "consts",
16 | ]
17 | )
18 |
19 | native.py_library(
20 | name = "config",
21 | srcs = ["config.py"],
22 | deps = [
23 | ]
24 | )
25 |
26 | native.py_library(
27 | name = "graph_transform_lib",
28 | srcs = ["graph_transform_lib.py"],
29 | deps = [
30 | "lib",
31 | ]
32 | )
33 |
34 | native.py_library(
35 | name = "session_context",
36 | srcs = ["session_context.py"],
37 | deps = [
38 | ]
39 | )
40 |
41 | native.py_library(
42 | name = "runner",
43 | srcs = ["runner.py"],
44 | deps = [
45 | "lib",
46 | "graph_transform_lib",
47 | "consts",
48 | "partitions",
49 | "//parallax/core/python/ps:runner",
50 | "//parallax/core/python/mpi:runner",
51 | "//parallax/core/python/hybrid:runner"
52 | ]
53 | )
54 |
55 | native.py_library(
56 | name = "shard",
57 | srcs = ["shard.py"],
58 | deps = [
59 | "graph_transform_lib",
60 | ],
61 | )
62 |
63 | native.py_library(
64 | name = "consts",
65 | srcs = ["consts.py"],
66 | deps = [
67 | ],
68 | )
69 |
70 | native.py_library(
71 | name = "partitions",
72 | srcs = ["partitions.py"],
73 | deps = [
74 | ],
75 | )
76 | native.py_library(
77 | name = "common",
78 | srcs = ["__init__.py"],
79 | deps = [
80 | "graph_transform_lib",
81 | "runner",
82 | "shard",
83 | "config",
84 | "session_context",
85 | "partitions"
86 | ],
87 | )
88 |
89 |
--------------------------------------------------------------------------------
/parallax/parallax/core/python/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/common/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/core/python/common/consts.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import os
17 |
18 | PARALLAX_RUN_OPTION = "PARALLAX_RUN_OPTION"
19 | PARALLAX_RUN_MASTER = "PARALLAX_RUN_MASTER"
20 | PARALLAX_RUN_MPI = "PARALLAX_RUN_MPI"
21 | PARALLAX_RUN_PS = "PARALLAX_RUN_PS"
22 | PARALLAX_RUN_HYBRID = "PARALLAX_RUN_HYBRID"
23 | PARALLAX_WORKER_ID = "PARALLAX_WORKER_ID"
24 | PARALLAX_NUM_WORKERS = "PARALLAX_NUM_WORKERS"
25 | PARALLAX_RESOURCE_INFO = "PARALLAX_RESOURCE_INFO"
26 | PARALLAX_MACHINE_ID = "PARALLAX_MACHINE_ID"
27 | PARALLAX_HOSTNAME = "PARALLAX_HOSTNAME"
28 |
29 | LOCAL_CODE_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
30 | LOCAL_LAUNCH_PS_PATH = os.path.join(LOCAL_CODE_ROOT, 'tools',
31 | 'launch_ps.py')
32 | # NOTE(review): os.environ['USER'] raises KeyError on import if USER is unset.
33 | REMOTE_PARALLAX_ROOT = os.path.join('/tmp', 'parallax-%s' % os.environ['USER'])
34 | REMOTE_LAUNCH_PS_PATH = os.path.join(REMOTE_PARALLAX_ROOT, 'launch_ps.py')
35 | REMOTE_MPI_SCRIPT_PATH = os.path.join(REMOTE_PARALLAX_ROOT, 'mpi_run.sh')
36 |
37 | NUM_ITERATIONS_FOR_TEST = 200
38 | NUM_ITERATIONS_FOR_WARMUP = 200
39 |
--------------------------------------------------------------------------------
/parallax/parallax/core/python/common/shard.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import tensorflow as tf
17 |
18 |
19 | NUM_SHARDS = "num_shards"
20 | SHARD_ID = "shard_id"
21 | SHARD_FILTER_PRED = "shard_filter_predicate"
22 | FILTER_DATASET_NUM_SHARDS_POS = 1
23 | FILTER_DATASET_SHARD_ID_POS = 2
24 |
25 |
26 | def create_num_shards_and_shard_id():
27 | """Creates the num shards and the shard id tensors and returns them.
28 |
29 | Returns:
30 | The num shards and the shard id tensors (int64 constants 1 and 0).
31 |
32 | Raises:
33 | ValueError: if the NUM_SHARDS or the SHARD_ID graph collection is
34 | already non-empty.
35 | """
36 |
37 | # TODO: allow num_shards and shard_id inside a library function
38 | graph = tf.get_default_graph()
39 | num_shards_tensors = graph.get_collection(NUM_SHARDS)
40 | if len(num_shards_tensors) > 0:
41 | raise ValueError('"num_shards" already exists.')
42 | shard_id_tensors = graph.get_collection(SHARD_ID)
43 | if len(shard_id_tensors) > 0:
44 | raise ValueError('"shard_id" already exists.')
45 | # Create in proper graph and base name_scope.
46 | with graph.as_default() as g, g.name_scope(None):
47 | # Initialize num_shards_tensor=1, and shard_id_tensor=0.
48 | # parallax updates the value when the graph is transformed
49 | # for distributed version.
50 | num_shards_tensor = tf.constant(1, dtype=tf.int64, name="num_shards")
51 | shard_id_tensor = tf.constant(0, dtype=tf.int64, name="shard_id")
52 | tf.add_to_collection(NUM_SHARDS, num_shards_tensor)
53 | tf.add_to_collection(SHARD_ID, shard_id_tensor)
54 | return num_shards_tensor, shard_id_tensor
55 |
56 |
57 | def _get_or_create_num_shards_and_shard_id():
58 | graph = tf.get_default_graph()
59 | num_shards_tensors = graph.get_collection(NUM_SHARDS)
60 | if len(num_shards_tensors) > 0:  # already registered: reuse first entries
61 | num_shards_tensor = num_shards_tensors[0]
62 | shard_id_tensor = \
63 | graph.get_collection(SHARD_ID)[0]
64 | else:  # NUM_SHARDS collection is empty: create both tensors
65 | num_shards_tensor, shard_id_tensor = create_num_shards_and_shard_id()
66 | return num_shards_tensor, shard_id_tensor
67 |
68 |
69 | def shard(ds):
70 | """Converts a dataset so that it is sharded; this has the same effect
71 | as ds.shard(num_shards, index).
72 | """
73 |
74 | # TODO: allow dataset shard inside a function or dataset api
75 | # (e.g., map, parallel_interleave)
76 | num_shards, shard_id = _get_or_create_num_shards_and_shard_id()
77 |
78 | def filter_fn(elem_index, _):
79 | mod_result = tf.mod(elem_index, num_shards)
80 | return tf.equal(mod_result, shard_id)
81 | # The asserts below check the predicate captured num_shards, then shard_id.
82 | f = ds._enumerate().filter(filter_fn)
83 | assert f._predicate.captured_inputs[0] == num_shards
84 | assert f._predicate.captured_inputs[1] == shard_id
85 | tf.add_to_collection(SHARD_FILTER_PRED,
86 | f._predicate.name)
87 | return f.map(lambda _, elem: elem)
88 |
--------------------------------------------------------------------------------
/parallax/parallax/core/python/hybrid/BUILD:
--------------------------------------------------------------------------------
1 | licenses(["notice"]) # Apache 2.0
2 |
3 | package(
4 | default_visibility = [
5 | "//visibility:public",
6 | ],
7 | )
8 |
9 | native.py_library(
10 | name = "graph_transform",
11 | srcs = ["graph_transform.py"],
12 | deps = [
13 | "//parallax/core/python/common:graph_transform_lib",
14 | "//parallax/core/python/common:lib",
15 | "between_graph_parallel",
16 | "in_graph_parallel",
17 | ]
18 | )
19 |
20 | native.py_library(
21 | name = "between_graph_parallel",
22 | srcs = ["between_graph_parallel.py"],
23 | deps = [
24 | "//parallax/core/python/common:graph_transform_lib",
25 | "//parallax/core/python/common:lib",
26 | ]
27 | )
28 |
29 | native.py_library(
30 | name = "in_graph_parallel",
31 | srcs = ["in_graph_parallel.py"],
32 | deps = [
33 | "//parallax/core/python/common:graph_transform_lib",
34 | "//parallax/core/python/common:lib",
35 | ]
36 | )
37 |
38 | native.py_library(
39 | name = "runner",
40 | srcs = ["runner.py"],
41 | deps = [
42 | "graph_transform",
43 | "//parallax/core/python/common:lib",
44 | "//parallax/core/python/common:consts",
45 | ]
46 | )
47 |
48 | native.py_library(
49 | name = "hybrid",
50 | srcs = ["__init__.py"],
51 | deps = [
52 | "runner"
53 | ]
54 | )
55 |
--------------------------------------------------------------------------------
/parallax/parallax/core/python/hybrid/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/hybrid/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/core/python/mpi/BUILD:
--------------------------------------------------------------------------------
# Build targets for the `mpi` package.
licenses(["notice"])  # Apache 2.0

package(default_visibility = ["//visibility:public"])

native.py_library(
    name = "graph_transform",
    srcs = ["graph_transform.py"],
    deps = [
        "//parallax/core/python/common:graph_transform_lib",
        "//parallax/core/python/common:lib",
    ],
)

native.py_library(
    name = "runner",
    srcs = ["runner.py"],
    deps = [
        ":graph_transform",
        "//parallax/core/python/common:lib",
        "//parallax/core/python/common:consts",
        "//parallax/core/python/common:session_context",
    ],
)

# Package-level target; depends only on the runner.
native.py_library(
    name = "mpi",
    srcs = ["__init__.py"],
    deps = [":runner"],
)
36 |
--------------------------------------------------------------------------------
/parallax/parallax/core/python/mpi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/mpi/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/core/python/mpi/graph_transform.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import tensorflow as tf
17 | import horovod.tensorflow as hvd
18 |
19 | from parallax.core.python.common.graph_transform_lib import get_all_control_consumers
20 | from parallax.core.python.common.graph_transform_lib import update_consumers
21 | from parallax.core.python.common.graph_transform_lib import update_control_consumers
22 | from parallax.core.python.common.graph_transform_lib import update_shard_values_for_worker
23 | from parallax.core.python.common.lib import *
24 |
25 |
def _add_broadcast_ops():
    """Add ops that overwrite every global variable with rank 0's value.

    For each variable returned by ``tf.global_variables()`` an assign op is
    created whose new value is ``hvd.broadcast(variable, 0)``. All assign ops
    are then placed behind a single no-op named
    ``auto_parallel_bcast_global_vars``, so running that one op triggers every
    broadcast. The ops are added to the current default graph; nothing is
    returned.
    """
    assign_ops = [
        tf.assign(variable, hvd.broadcast(variable, 0))
        for variable in tf.global_variables()
    ]
    with tf.control_dependencies(assign_ops):
        tf.no_op(name='auto_parallel_bcast_global_vars')
33 |
34 |
def _add_aggregation_ops(gradients_info, op_to_control_consumer_ops, config):
    """Insert a Horovod allreduce after one gradient and rewire its users.

    Args:
        gradients_info: object whose ``_grad`` attribute holds the gradient.
            A ``tf.Tensor`` is handled as a dense gradient; anything else is
            accessed through ``values`` / ``indices`` / ``dense_shape`` and
            re-wrapped as ``tf.IndexedSlices``.
        op_to_control_consumer_ops: mapping from an op to the ops that have a
            control dependency on it; looked up for the gradient-producing ops.
        config: only ``config.average_sparse`` is read here; it selects
            averaging for the sparse case (dense gradients are always
            averaged).

    Side effects:
        Data and control consumers of the local gradient are redirected to
        the aggregated gradient, and ``gradients_info._grad`` is replaced by
        the aggregated gradient.
    """
    local_grad = gradients_info._grad
    if not isinstance(local_grad, tf.Tensor):
        # Sparse gradient: values and indices are rewired separately.
        values = local_grad.values
        indices = local_grad.indices
        dense_shape = local_grad.dense_shape
        # Snapshot consumers before the allreduce exists, so the allreduce
        # op itself is not among the ops that get redirected.
        values_consumers = list(values.consumers())
        indices_consumers = list(indices.consumers())
        reduced = hvd.allreduce(
            tf.IndexedSlices(values, indices, dense_shape),
            average=config.average_sparse)
        update_consumers(values_consumers, values, reduced.values)
        update_consumers(indices_consumers, indices, reduced.indices)
        update_control_consumers(
            op_to_control_consumer_ops[values.op],
            values.op, reduced.values.op)
        update_control_consumers(
            op_to_control_consumer_ops[indices.op],
            indices.op, reduced.indices.op)
    else:
        # Dense gradient: same snapshot-then-reduce order as above.
        dense_consumers = list(local_grad.consumers())
        reduced = hvd.allreduce(local_grad, average=True)
        update_consumers(dense_consumers, local_grad, reduced)
        update_control_consumers(
            op_to_control_consumer_ops[local_grad.op],
            local_grad.op, reduced.op)
    gradients_info._grad = reduced
62 |
63 |
def graph_transform_mpi(single_gpu_meta_graph_def, config,
                        op_library_path=None):
    """Transform a single-GPU meta graph into a Horovod (MPI) replica graph.

    The meta graph is imported into a fresh graph, Horovod is initialized,
    shard values are updated for this rank, an allreduce is inserted for the
    gradient of every trainable variable, and variable-broadcast ops are
    added.

    Args:
        single_gpu_meta_graph_def: meta graph to import via
            ``tf.train.import_meta_graph``.
        config: passed through to ``_add_aggregation_ops``.
        op_library_path: optional path of a custom op library to load with
            ``tf.load_op_library`` before importing the graph.

    Returns:
        A tuple ``(meta_graph_def, name_map)``: the exported transformed meta
        graph, and a dict mapping every op name and output tensor name of the
        imported graph to a one-element list containing that same name.
    """
    if op_library_path is not None:
        tf.load_op_library(op_library_path)

    with tf.Graph().as_default() as replica:
        tf.train.import_meta_graph(single_gpu_meta_graph_def)

        # Recorded before any op is added, so it covers only imported names.
        tensor_or_op_name_to_replica_names = {}
        for op in replica.get_operations():
            tensor_or_op_name_to_replica_names[op.name] = [op.name]
            for output in op.outputs:
                tensor_or_op_name_to_replica_names[output.name] = [output.name]

        # Initialize horovod
        hvd.init()

        num_workers = hvd.size()
        worker_id = hvd.rank()
        update_shard_values_for_worker(num_workers, worker_id)

        op_to_control_consumer_ops = get_all_control_consumers(replica)
        # A set gives constant-time membership tests in the loop below
        # (ops are already used as dict keys in this module).
        trainable_variable_ops = set(
            var.op for var in tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES))

        for gradients_info in tf.get_collection(tf.GraphKeys.GRADIENTS_INFO):
            target_tensor = gradients_info._target
            if target_tensor.op not in trainable_variable_ops:
                parallax_log.debug(
                    "Gradient for non-trainable variable %s is created, ignore"
                    % target_tensor.op.name)
                continue

            _add_aggregation_ops(gradients_info, op_to_control_consumer_ops,
                                 config)
        _add_broadcast_ops()

        return tf.train.export_meta_graph(graph=replica), \
            tensor_or_op_name_to_replica_names
102 |
--------------------------------------------------------------------------------
/parallax/parallax/core/python/ps/BUILD:
--------------------------------------------------------------------------------
# Build targets for the `ps` package.
licenses(["notice"])  # Apache 2.0

package(default_visibility = ["//visibility:public"])

# Graph transformation entry point; builds on the two parallelization passes.
native.py_library(
    name = "graph_transform",
    srcs = ["graph_transform.py"],
    deps = [
        "//parallax/core/python/common:graph_transform_lib",
        "//parallax/core/python/common:lib",
        ":between_graph_parallel",
        ":in_graph_parallel",
    ],
)

native.py_library(
    name = "between_graph_parallel",
    srcs = ["between_graph_parallel.py"],
    deps = [
        "//parallax/core/python/common:graph_transform_lib",
        "//parallax/core/python/common:lib",
    ],
)

native.py_library(
    name = "in_graph_parallel",
    srcs = ["in_graph_parallel.py"],
    deps = [
        "//parallax/core/python/common:graph_transform_lib",
        "//parallax/core/python/common:lib",
    ],
)

native.py_library(
    name = "runner",
    srcs = ["runner.py"],
    deps = [
        ":graph_transform",
        "//parallax/core/python/common:lib",
        "//parallax/core/python/common:consts",
        "//parallax/core/python/common:graph_transform_lib",
    ],
)

# Package-level target; depends only on the runner.
native.py_library(
    name = "ps",
    srcs = ["__init__.py"],
    deps = [":runner"],
)
--------------------------------------------------------------------------------
/parallax/parallax/core/python/ps/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/ps/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/core/python/ps/graph_transform.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from parallax.core.python.common.lib import *
17 | from parallax.core.python.ps.in_graph_parallel import in_graph_auto_parallel_compute
18 | from parallax.core.python.ps.between_graph_parallel import between_graph_auto_parallel_compute
19 |
20 |
def graph_transform_ps(single_gpu_meta_graph_def,
                       worker_id,
                       config,
                       op_library_path=None):
    """Transform a single-GPU meta graph for parameter-server training.

    Runs the in-graph pass (one replica per GPU of this worker) followed by
    the between-graph pass (placement across the cluster).

    Args:
        single_gpu_meta_graph_def: the single-GPU meta graph to transform.
        worker_id: index into ``config.resource_info['worker']``; also used
            as the task index in the worker device string.
        config: supplies ``resource_info``, ``sync`` and
            ``communication_config.ps_config.replicate_variables``; also
            forwarded to both passes.
        op_library_path: optional custom op library path forwarded to both
            passes.

    Returns:
        A tuple of the transformed meta graph and the exported
        name-to-replica-names mapping.

    Raises:
        ValueError: if ``replicate_variables`` is set while ``sync`` is off.
    """
    resources = config.resource_info
    # TODO: Handle all ps configurations
    ps_config = config.communication_config.ps_config
    if ps_config.replicate_variables and not config.sync:
        raise ValueError('replicate_variables is only possible with sync')

    # Without dedicated ps entries, variables are placed on the worker CPU.
    if 'ps' in resources:
        ps_device = '/job:ps'
    else:
        ps_device = '/job:worker/cpu:0'
    cluster_spec = get_tf_clusterspec(resources)
    this_worker = resources['worker'][worker_id]
    gpu_count = len(this_worker['gpus'])

    parallax_log.debug(
        "Starting graph transformation for PS for worker %d" % worker_id)

    # The same mapping object is handed to both passes and exported at the end.
    name_map = TensorOrOpNameToReplicaNames(
        single_gpu_meta_graph_def.meta_info_def.stripped_op_list)

    multi_gpu_meta_graph_def = in_graph_auto_parallel_compute(
        single_gpu_meta_graph_def,
        gpu_count,
        config=config,
        op_library_path=op_library_path,
        tensor_or_op_name_to_replica_names=name_map)

    ps_meta_graph_def = between_graph_auto_parallel_compute(
        multi_gpu_meta_graph_def,
        worker_id=worker_id,
        ps_device=ps_device,
        worker_device='/job:worker/task:%d' % worker_id,
        merge_devices=True,
        cluster_spec=cluster_spec,
        config=config,
        op_library_path=op_library_path,
        num_replicas_per_worker=gpu_count,
        tensor_or_op_name_to_replica_names=name_map)

    parallax_log.debug(
        "Finished graph transformation for PS for worker %d" % worker_id)
    return ps_meta_graph_def, name_map.export()
61 |
--------------------------------------------------------------------------------
/parallax/parallax/core/python/tools/BUILD:
--------------------------------------------------------------------------------
# Build targets for the `tools` package.
licenses(["notice"])  # Apache 2.0

package(default_visibility = ["//visibility:public"])

native.py_library(
    name = "launch_ps",
    srcs = ["launch_ps.py"],
)

# Package-level target; depends only on launch_ps.
native.py_library(
    name = "tools",
    srcs = ["__init__.py"],
    deps = [":launch_ps"],
)
21 |
--------------------------------------------------------------------------------
/parallax/parallax/core/python/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/tools/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/core/python/tools/launch_ps.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import argparse
17 | import sys, os
18 | import json
19 |
20 | import tensorflow as tf
21 |
# Command-line flags describing the cluster and the task this process serves.
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string('ps_hosts', '',
                           """Comma-separated list of parameter server hosts""")
tf.app.flags.DEFINE_string('worker_hosts', '',
                           """Comma-separated list of worker hosts""")
tf.app.flags.DEFINE_string('job_name', '',
                           """Job name in cluster""")
tf.app.flags.DEFINE_integer('task_index', -1,
                            """Task index of the job""")
tf.app.flags.DEFINE_string('protocol', 'grpc',
                           """Server protocol: grpc, grpc+verbs, grpc+gdr""")
34 |
35 |
def main(argv=None):
    """Start a TensorFlow server for one 'ps' task and block on it.

    The cluster is assembled from the comma-separated ``--ps_hosts`` and
    ``--worker_hosts`` flags; the server is created for job ``ps`` with
    ``--task_index`` and ``--protocol``, then joined (this call does not
    return under normal operation).

    Args:
        argv: unused; accepted because ``tf.app.run`` passes it.
    """
    assert FLAGS.job_name == 'ps', \
        "launch_ps only serves the 'ps' job, got job_name=%r" % FLAGS.job_name

    tf_cluster_dict = {}
    # ''.split(',') yields [''], so an unset flag must not be split: it would
    # register a bogus empty host address for that job.
    if FLAGS.ps_hosts != '':
        tf_cluster_dict['ps'] = FLAGS.ps_hosts.split(',')
    if FLAGS.worker_hosts != '':
        tf_cluster_dict['worker'] = FLAGS.worker_hosts.split(',')
    cluster = tf.train.ClusterSpec(tf_cluster_dict)

    server = tf.train.Server(cluster, job_name='ps',
                             task_index=FLAGS.task_index,
                             protocol=FLAGS.protocol)
    server.join()
54 |
55 |
56 | if __name__ == "__main__":
57 | tf.app.run()
58 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Rafal Jozefowicz
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/README.md:
--------------------------------------------------------------------------------
1 | # LM-1B
2 | LM-1B implements the LSTM language model described in [LM](https://arxiv.org/abs/1602.02410).
3 | The original code comes from https://github.com/rafaljozefowicz/lm, which supports
4 | synchronous training with multiple GPUs. We converted it to single-GPU code and
5 | then applied Parallax auto-parallelization to run it on multiple GPUs and multiple
6 | machines with synchronous or asynchronous training.
7 |
8 | ## Dataset
9 | * [1B Word Benchmark Dataset](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark)
10 |
11 | ## To Run
12 | Set your resource information in the `resource_info` file.
13 |
14 | Then, you can run the LM-1B model on the data in `<data_dir>` in parallel by executing:
15 | ```shell
16 | $ python lm1b_distributed_driver.py --datadir=<data_dir>
17 | ```
18 |
19 | The command above runs a single LM model on multiple devices specified in `resource_info`.
20 | The command assumes that the data directory and the LM-1B codebase are distributed and reachable in the same absolute path in each of the machines.
21 |
22 | Also, we have a few more options you can choose for distributed running.
23 |
24 | | Parameter Name | Default | Description |
25 | | :------------------- |:-----------------------| :-----------|
26 | | --logdir | /tmp/lm1b | Logging directory |
27 | | --datadir | None | Data directory |
28 | | --hpconfig | "" | Overrides default hyper-parameters |
29 | | --eval_steps | 70 | Number of evaluation steps |
30 | | --resource_info_file | `./resource_info` | Path of the file containing the cluster information |
31 | | --max_steps | 1000000 | Number of iterations to run for each worker |
32 | | --log_frequency | 100 | How many steps between two runop logs |
33 | | --sync | True | Whether to synchronize learning or not |
34 | | --ckpt_dir | None | Directory to save checkpoints |
35 | | --save_ckpt_steps | 0 | Number of steps between two consecutive checkpoints |
36 | | --save_n_ckpts_per_epoch | -1 | Number of checkpoints to save per each epoch |
37 | | --run_option | None | The run option whether PS or MPI, None utilizes both |
38 | | --search_partitions | False | Whether to use Parallax's variable partitioning method or not |
39 |
40 | You can customize distributed runs with the options above. For example, to fix the communication mode to MPI, set the `run_option` value as shown below.
41 |
42 | ```shell
43 | $ python lm1b_distributed_driver.py --datadir=<data_dir> --run_option=MPI
44 | ```
45 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/lm1b/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/data_utils.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import glob
3 | import json
4 | import random
5 |
6 | import numpy as np
7 |
8 |
class Vocabulary(object):
    """Token <-> integer id mapping with per-token counts.

    Ids are assigned in insertion order starting at 0. ``finalize`` must be
    called after the last ``add`` so that the ids of the sentence marker and
    the unknown-word token are resolved; until then ``get_id`` returns
    ``None`` for unseen tokens.
    """

    def __init__(self):
        self._token_to_id = {}
        self._token_to_count = {}
        self._id_to_token = []
        self._num_tokens = 0
        self._s_id = None
        self._unk_id = None

    @property
    def num_tokens(self):
        """Number of tokens added so far."""
        return self._num_tokens

    @property
    def unk(self):
        """Surface form of the unknown-word token."""
        # An empty string here could never match an entry produced by
        # ``from_file`` (whitespace splitting never yields ""), which left
        # ``unk_id`` permanently None.
        return "<UNK>"

    @property
    def unk_id(self):
        """Id of the unknown-word token (None before ``finalize``)."""
        return self._unk_id

    @property
    def s(self):
        """Surface form of the sentence-boundary marker."""
        # Must differ from ``unk`` and be a token that can occur in the
        # vocabulary file; see the note on ``unk``.
        return "<S>"

    @property
    def s_id(self):
        """Id of the sentence-boundary marker (None before ``finalize``)."""
        return self._s_id

    def add(self, token, count):
        """Register ``token`` with ``count`` under the next free id."""
        self._token_to_id[token] = self._num_tokens
        self._token_to_count[token] = count
        self._id_to_token.append(token)
        self._num_tokens += 1

    def finalize(self):
        """Resolve and cache the ids of the special tokens."""
        self._s_id = self.get_id(self.s)
        self._unk_id = self.get_id(self.unk)

    def get_id(self, token):
        """Return the id of ``token``, or ``unk_id`` if it is unknown."""
        return self._token_to_id.get(token, self.unk_id)

    def get_token(self, id_):
        """Return the token stored under ``id_``."""
        return self._id_to_token[id_]

    @staticmethod
    def from_file(filename):
        """Build a finalized vocabulary from a UTF-8 text file.

        Each non-blank line must contain exactly two whitespace-separated
        fields, ``<token> <count>``. Blank lines are skipped; any other
        malformed line raises ``ValueError``.
        """
        vocab = Vocabulary()
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                word, count = fields
                vocab.add(word, int(count))
        vocab.finalize()
        return vocab
64 |
65 |
class Dataset(object):
    """Streams fixed-size (x, y, w) batches from tokenized text files.

    Args:
        vocab: object providing ``s_id`` and ``get_id(token)``.
        file_pattern: glob pattern selecting the input files.
        deterministic: when True, neither lines nor files are shuffled.
    """

    def __init__(self, vocab, file_pattern, deterministic=False):
        self._vocab = vocab
        self._file_pattern = file_pattern
        self._deterministic = deterministic

    def _parse_sentence(self, line):
        """Convert one text line to ids, wrapped in sentence markers."""
        s_id = self._vocab.s_id
        word_ids = [self._vocab.get_id(word) for word in line.strip().split()]
        return [s_id] + word_ids + [s_id]

    def _parse_file(self, file_name):
        """Yield the id sequence of every line of ``file_name``.

        The whole file is read into memory first so that its lines can be
        shuffled when the dataset is not deterministic.
        """
        print("Processing file: %s" % file_name)
        with codecs.open(file_name, "r", "utf-8") as f:
            lines = [line.strip() for line in f]
            if not self._deterministic:
                random.shuffle(lines)
            print("Finished processing!")
            for line in lines:
                yield self._parse_sentence(line)

    def _sentence_stream(self, file_stream):
        """Chain the sentences of every file produced by ``file_stream``."""
        for file_name in file_stream:
            for sentence in self._parse_file(file_name):
                yield sentence

    def _iterate(self, sentences, batch_size, num_steps):
        """Pack a sentence stream into ``[batch_size, num_steps]`` batches.

        Yields ``(x, y, w)``: input ids, target ids (inputs shifted by one
        token) and a 0/1 mask marking filled positions. Each batch row keeps
        its own partially consumed sentence across batches.

        The same three arrays are reused and overwritten for every batch, so
        a consumer that needs to keep a batch must copy it. Iteration ends
        once a batch would be entirely empty.
        """
        streams = [None] * batch_size
        x = np.zeros([batch_size, num_steps], np.int32)
        y = np.zeros([batch_size, num_steps], np.int32)
        w = np.zeros([batch_size, num_steps], np.uint8)
        while True:
            x[:] = 0
            y[:] = 0
            w[:] = 0
            for i in range(batch_size):
                tokens_filled = 0
                try:
                    while tokens_filled < num_steps:
                        # A single leftover token cannot form an
                        # (input, target) pair; fetch the next sentence.
                        if streams[i] is None or len(streams[i]) <= 1:
                            streams[i] = next(sentences)
                        num_tokens = min(len(streams[i]) - 1,
                                         num_steps - tokens_filled)
                        end = tokens_filled + num_tokens
                        x[i, tokens_filled:end] = streams[i][:num_tokens]
                        y[i, tokens_filled:end] = streams[i][1:num_tokens + 1]
                        w[i, tokens_filled:end] = 1
                        # Keep the last consumed target as the next input.
                        streams[i] = streams[i][num_tokens:]
                        tokens_filled += num_tokens
                except StopIteration:
                    # Input exhausted: the rest of this row stays masked out.
                    pass
            if not np.any(w):
                return

            yield x, y, w

    def iterate_once(self, batch_size, num_steps):
        """Yield batches for a single pass over all matching files."""
        def file_stream():
            # glob returns names in arbitrary order; sort for a stable pass.
            for file_name in sorted(glob.glob(self._file_pattern)):
                yield file_name
        for value in self._iterate(
                self._sentence_stream(file_stream()), batch_size, num_steps):
            yield value

    def iterate_forever(self, batch_size, num_steps, num_workers, worker_id):
        """Yield batches endlessly from this worker's share of the files.

        Matching files are sorted and file ``i`` is assigned to the worker
        with ``i % num_workers == worker_id``. The file list is re-globbed on
        every pass and shuffled unless the dataset is deterministic.

        Raises:
            ValueError: (on first iteration) if no file is assigned to this
                worker; without the check the generator would loop forever
                without producing anything.
        """
        def file_stream():
            while True:
                file_patterns = glob.glob(self._file_pattern)
                file_patterns.sort()
                filenames_for_worker = [
                    name for i, name in enumerate(file_patterns)
                    if i % num_workers == worker_id]
                if not filenames_for_worker:
                    raise ValueError(
                        "No input files for worker %d of %d match pattern %r"
                        % (worker_id, num_workers, self._file_pattern))
                if not self._deterministic:
                    random.shuffle(filenames_for_worker)
                for filename in filenames_for_worker:
                    yield filename
        for value in self._iterate(
                self._sentence_stream(file_stream()), batch_size, num_steps):
            yield value
149 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/language_model.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import functools
6 |
7 | import numpy as np
8 | import tensorflow as tf
9 | from tensorflow.python.framework import ops
10 | from tensorflow.python.framework import tensor_shape
11 | from tensorflow.python.layers import base
12 |
13 | import parallax
14 |
FLAGS = tf.flags.FLAGS
# Read by LM.__init__ and passed to parallax.get_partitioner in LM.build.
tf.flags.DEFINE_integer('num_variable_shards', 32, 'Number of variable shard')
17 |
class LM(base.Layer):
    """LSTM language model layer with a projected hidden state.

    Calling the layer returns ``(loss, c, h)``: the masked mean loss and the
    final LSTM cell and (projected) hidden states after ``num_steps`` steps.
    """

    def __init__(self, num_steps):
        """Store the unroll length and the fixed model hyper-parameters.

        Args:
            num_steps: number of time steps the LSTM is unrolled for.
        """
        super(LM, self).__init__()
        self.num_steps = num_steps
        self.num_shards = FLAGS.num_variable_shards
        # Use keep_prob 1.0 at evaluation
        self.keep_prob = 0.9

        self.vocab_size = 793470
        self.emb_size = 512
        self.state_size = 2048
        self.projected_size = 512
        # Use num_sampled 0 (full softmax) at evaluation
        self.num_sampled = 8192

    def build(self, input_shape):
        """Create the model variables.

        ``emb`` and ``softmax_w`` are created with ``tf.get_variable`` under a
        variable scope whose partitioner comes from
        ``parallax.get_partitioner(self.num_shards)``; the remaining
        variables are created with ``add_variable``.
        """
        partitioner = parallax.get_partitioner(self.num_shards)
        with tf.variable_scope(tf.get_variable_scope(), partitioner=partitioner):
            self.emb = tf.get_variable('emb',
                                       shape=[self.vocab_size, self.emb_size],
                                       initializer=tf.uniform_unit_scaling_initializer(),
                                       trainable=True,
                                       dtype=tf.float32)
            self.softmax_w = tf.get_variable(name='softmax_w',
                                             shape=[self.vocab_size, self.projected_size],
                                             initializer=tf.uniform_unit_scaling_initializer(),
                                             trainable=True,
                                             dtype=tf.float32)

        self.softmax_b = self.add_variable(name='softmax_b',
                                           shape=[self.vocab_size],
                                           trainable=True,
                                           dtype=tf.float32)
        # Single matrix producing all four LSTM gates from [input, h].
        self.W = self.add_variable(name='W',
                                   shape=[self.emb_size + self.projected_size, 4 * self.state_size],
                                   trainable=True,
                                   dtype=tf.float32)
        self.B = self.add_variable(name='B',
                                   shape=[4 * self.state_size],
                                   trainable=True,
                                   dtype=tf.float32)
        # Projects the LSTM output from state_size down to projected_size.
        self.W_P = self.add_variable(name='W_P',
                                     shape=[self.state_size, self.projected_size],
                                     trainable=True,
                                     dtype=tf.float32)
        self.built = True

    def call(self, x, y, w, initial_state_c, initial_state_h, training):
        """Unroll the LSTM over ``num_steps`` and compute the masked loss.

        Args:
            x: input token ids; assumes shape [batch, num_steps] - TODO
                confirm against the caller.
            y: target token ids, same layout as ``x``.
            w: per-position weights, cast to float and multiplied into the
                per-position loss.
            initial_state_c: initial LSTM cell state.
            initial_state_h: initial projected hidden state.
            training: Python bool; enables dropout and selects the sampled
                softmax loss instead of the full softmax.

        Returns:
            ``(loss, c, h)``: scalar mean of the weighted loss, and the cell
            and projected hidden states after the last step.
        """
        # [bs, steps, emb_size]
        x = tf.nn.embedding_lookup(self.emb, x)
        if training:
            x = tf.nn.dropout(x, self.keep_prob)

        # [bs, emb_size] * steps
        inputs = [tf.squeeze(v, axis=[1]) for v in tf.split(value=x, num_or_size_splits=self.num_steps, axis=1)]

        c = initial_state_c
        h = initial_state_h
        for t in range(self.num_steps):
            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            cell_inputs = tf.concat([inputs[t], h], axis=1)
            lstm_matrix = tf.nn.xw_plus_b(cell_inputs, self.W, self.B)
            i, j, f, o = tf.split(lstm_matrix, 4, axis=1)

            # The forget gate gets a constant +1.0 offset before the sigmoid.
            c = tf.sigmoid(f + 1.0) * c + tf.sigmoid(i) * tf.tanh(j)
            h = tf.sigmoid(o) * tf.tanh(c)
            # The projected h is both this step's output and the recurrent
            # input of the next step.
            h = tf.matmul(h, self.W_P)
            inputs[t] = h
            if training:
                inputs[t] = tf.nn.dropout(inputs[t], self.keep_prob)

            inputs[t] = tf.identity(inputs[t])

        # Flatten the per-step outputs into one row per (example, step).
        inputs = tf.reshape(tf.concat(inputs, axis=1), [-1, self.projected_size])

        if training:
            targets = tf.reshape(y, [-1, 1])
            loss = tf.nn.sampled_softmax_loss(self.softmax_w,
                                              self.softmax_b,
                                              targets,
                                              inputs,
                                              self.num_sampled,
                                              self.vocab_size)
        else:
            # Evaluation path: full softmax over the whole vocabulary.
            full_softmax_w = tf.reshape(tf.concat(self.softmax_w, axis=1), [-1, self.projected_size])
            full_softmax_w = full_softmax_w[:self.vocab_size, :]

            logits = tf.matmul(inputs, full_softmax_w, transpose_b=True) + self.softmax_b
            targets = tf.reshape(y, [-1])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)

        # The mean is taken over all positions, including zero-weight ones.
        loss = tf.reduce_mean(loss * tf.reshape(tf.to_float(w), [-1]))
        return loss, c, h
111 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/language_model_graph.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import numpy as np
6 | import tensorflow as tf
7 |
8 | import language_model
9 |
# Per-split word counts - presumably for the 1B Word Benchmark data; not
# referenced by the code visible in this module section.
_NUM_WORDS = {
    'train': 798945280,
    'validation': 7789987,
}

# Hyper-parameter flags read by build_model().
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_integer('batch_size', 128, 'Batch size')
tf.flags.DEFINE_integer('num_steps', 20, 'Number of steps')
tf.flags.DEFINE_float('learning_rate', 0.2, 'Learning rate')
tf.flags.DEFINE_float('max_grad_norm', 10.0, 'max_grad_norm')
tf.flags.DEFINE_integer('num_epoch', 5, 'Number of epoch')
tf.flags.DEFINE_boolean('use_synthetic', False, 'whether to use synthetic data or not')
22 |
23 |
24 | def build_model():
25 | model = language_model.LM(FLAGS.num_steps)
26 | global_step = tf.train.get_or_create_global_step()
27 |
28 | with tf.device('/gpu:0'):
29 | placeholder_x = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
30 | placeholder_y = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
31 | placeholder_w = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
32 | initial_state_c = tf.placeholder(dtype=tf.float32,
33 | shape=[FLAGS.batch_size, model.state_size],
34 | name='initial_c')
35 | initial_state_h = tf.placeholder(dtype=tf.float32,
36 | shape=[FLAGS.batch_size, model.projected_size],
37 | name='initial_h')
38 | loss, final_state_c, final_state_h = model(placeholder_x, placeholder_y, placeholder_w, initial_state_c, initial_state_h, training=True)
39 | scaled_loss = loss * FLAGS.num_steps
40 |
41 | emb_vars = list(model.emb)
42 | lstm_vars = [model.W, model.B, model.W_P]
43 | softmax_vars = list(model.softmax_w) + [model.softmax_b]
44 | all_vars = emb_vars + lstm_vars + softmax_vars
45 | grads = tf.gradients(scaled_loss, all_vars)
46 |
47 | emb_grads = grads[:len(emb_vars)]
48 | emb_grads = [tf.IndexedSlices(grad.values * FLAGS.batch_size,
49 | grad.indices,
50 | grad.dense_shape) for grad in emb_grads]
51 |
52 | lstm_grads = grads[len(emb_vars):len(emb_vars) + len(lstm_vars)]
53 | lstm_grads, _ = tf.clip_by_global_norm(lstm_grads, FLAGS.max_grad_norm)
54 |
55 | softmax_grads = grads[len(emb_vars) + len(lstm_vars):]
56 |
57 | clipped_grads = emb_grads + lstm_grads + softmax_grads
58 | grads_and_vars = list(zip(clipped_grads, all_vars))
59 |
60 | optimizer = tf.train.AdagradOptimizer(FLAGS.learning_rate, initial_accumulator_value=1.0)
61 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
62 |
63 | ema = tf.train.ExponentialMovingAverage(decay=0.999)
64 | with tf.control_dependencies([train_op]):
65 | train_op = ema.apply(lstm_vars)
66 |
67 | model.global_step = global_step
68 | model.loss = loss
69 | model.train_op = train_op
70 |
71 | model.final_state_c = final_state_c
72 | model.final_state_h = final_state_h
73 |
74 | model.initial_state_c = initial_state_c
75 | model.initial_state_h = initial_state_h
76 |
77 | model.x = placeholder_x
78 | model.y = placeholder_y
79 | model.w = placeholder_w
80 |
81 | return model
82 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/lm1b_distributed_driver.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import tensorflow as tf
17 | import parallax
18 |
19 |
20 | import os
21 | import time
22 | import math
23 | import json
24 | import sys
25 | import numpy as np
26 |
27 | from data_utils import Vocabulary, Dataset
28 | import language_model_graph
29 | import parallax_config
30 |
# Command-line flags of the distributed LM-1B driver.
flags = tf.app.flags
flags.DEFINE_string("logdir", "/tmp/lm1b", "Logging directory.")
# Help text previously duplicated the logdir description.
flags.DEFINE_string("datadir", None, "Data directory.")
flags.DEFINE_string("hpconfig", "", "Overrides default hyper-parameters.")
flags.DEFINE_integer("eval_steps", 70, "Number of eval steps.")
# Defaults to the `resource_info` file located next to this script.
flags.DEFINE_string('resource_info_file',
                    os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                 '.',
                                                 'resource_info')),
                    'Filename containing cluster information')
flags.DEFINE_integer('max_steps', 1000000,
                     """Number of iterations to run for each workers.""")
flags.DEFINE_integer('log_frequency', 100,
                     """How many steps between two runop logs.""")
flags.DEFINE_boolean('sync', True,
                     'Whether to synchronize learning or not.')
FLAGS = flags.FLAGS
47 |
48 |
def main(_):
    """Train the LM-1B language model through Parallax.

    Loads the vocabulary and the training data from ``FLAGS.datadir``, builds
    the single-GPU model graph, hands that graph to ``parallax.parallel_run``
    and finally drives the training loop on the session it returns.

    Args:
      _: Unused positional argument passed in by ``tf.app.run``.

    Raises:
      ValueError: if ``--datadir`` was not provided.
    """
    # Fail with an actionable message instead of an obscure error raised from
    # inside os.path.join(None, ...).
    if FLAGS.datadir is None:
        raise ValueError(
            "--datadir must point to the directory that contains "
            "1b_word_vocab.txt and training-monolingual.tokenized.shuffled/")

    vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
    dataset = Dataset(vocab, os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*"))

    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        with tf.variable_scope("model"):
            model = language_model_graph.build_model()

    def run(sess, num_workers, worker_id, num_replicas_per_worker):
        """Run the training loop for ``FLAGS.max_steps`` local steps.

        Every feed is a list with ``num_replicas_per_worker`` entries and the
        fetched values are indexed with ``[0]``.
        NOTE(review): presumably one entry per replica - confirm against the
        session returned by parallax.parallel_run.
        """
        # Zero-initialised recurrent state, one array per replica.
        state_c = [np.zeros([FLAGS.batch_size, model.state_size], dtype=np.float32)
                   for _ in range(num_replicas_per_worker)]
        state_h = [np.zeros([FLAGS.batch_size, model.projected_size], dtype=np.float32)
                   for _ in range(num_replicas_per_worker)]

        prev_global_step = sess.run(model.global_step)[0]
        prev_time = time.time()
        data_iterator = dataset.iterate_forever(FLAGS.batch_size * num_replicas_per_worker,
                                                FLAGS.num_steps, num_workers, worker_id)
        fetches = {
            'global_step': model.global_step,
            'loss': model.loss,
            'train_op': model.train_op,
            'final_state_c': model.final_state_c,
            'final_state_h': model.final_state_h
        }

        for local_step in range(FLAGS.max_steps):
            if FLAGS.use_synthetic:
                # Random token ids with full weights instead of real data.
                x = np.random.randint(low=0, high=model.vocab_size, size=(FLAGS.batch_size*num_replicas_per_worker, FLAGS.num_steps))
                y = np.random.randint(low=0, high=model.vocab_size, size=(FLAGS.batch_size*num_replicas_per_worker, FLAGS.num_steps))
                w = np.ones((FLAGS.batch_size*num_replicas_per_worker, FLAGS.num_steps))
            else:
                x, y, w = next(data_iterator)

            # Split the worker-level batch into one chunk per replica.
            feeds = {}
            feeds[model.x] = np.split(x, num_replicas_per_worker)
            feeds[model.y] = np.split(y, num_replicas_per_worker)
            feeds[model.w] = np.split(w, num_replicas_per_worker)
            feeds[model.initial_state_c] = state_c
            feeds[model.initial_state_h] = state_h
            fetched = sess.run(fetches, feeds)

            # Carry the recurrent state over to the next step.
            state_c = fetched['final_state_c']
            state_h = fetched['final_state_h']

            if local_step % FLAGS.log_frequency == 0:
                cur_time = time.time()
                elapsed_time = cur_time - prev_time
                num_words = FLAGS.batch_size * FLAGS.num_steps
                global_step = fetched['global_step'][0]
                wps = (global_step - prev_global_step) * num_words / elapsed_time
                prev_global_step = global_step
                parallax.log.info("Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f" % (
                    global_step, elapsed_time, wps, fetched['loss'][0]))
                prev_time = cur_time

    sess, num_workers, worker_id, num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph,
                              FLAGS.resource_info_file,
                              sync=FLAGS.sync,
                              parallax_config=parallax_config.build_config())
    run(sess, num_workers, worker_id, num_replicas_per_worker)


if __name__ == "__main__":
    tf.app.run()
117 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/lm1b_input.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import glob
3 | import json
4 | import random
5 |
6 | import numpy as np
7 | import sys
8 |
9 |
class Vocabulary(object):
    """Bidirectional token <-> id mapping with per-token counts.

    Ids are assigned in insertion order, starting at 0. After ``finalize()``
    the ids of the sentence-boundary token and of the unknown token are cached
    in ``s_id`` and ``unk_id``.
    """

    def __init__(self):
        self._token_to_id = {}
        self._token_to_count = {}
        self._id_to_token = []
        self._num_tokens = 0
        self._s_id = None
        self._unk_id = None

    @property
    def num_tokens(self):
        """Number of tokens added so far."""
        return self._num_tokens

    @property
    def unk(self):
        """Literal token that stands for out-of-vocabulary words."""
        # An empty string can never be produced by `line.strip().split()`, so
        # it could not be looked up and left unk_id as None.
        # NOTE(review): "<UNK>" is assumed to be the marker used in the
        # vocabulary file - confirm against the data.
        return "<UNK>"

    @property
    def unk_id(self):
        """Id of the unknown token (None until ``finalize()`` has run)."""
        return self._unk_id

    @property
    def s(self):
        """Literal token that marks a sentence boundary."""
        # NOTE(review): "<S>" is assumed to be the marker used in the
        # vocabulary file - confirm against the data.
        return "<S>"

    @property
    def s_id(self):
        """Id of the sentence-boundary token (None until ``finalize()``)."""
        return self._s_id

    def add(self, token, count):
        """Register ``token`` with the next free id and remember its count."""
        self._token_to_id[token] = self._num_tokens
        self._token_to_count[token] = count
        self._id_to_token.append(token)
        self._num_tokens += 1

    def finalize(self):
        """Cache the ids of the sentence-boundary and unknown tokens."""
        self._s_id = self.get_id(self.s)
        self._unk_id = self.get_id(self.unk)

    def get_id(self, token):
        """Return the id of ``token``, or ``unk_id`` when it is not known."""
        return self._token_to_id.get(token, self.unk_id)

    def get_token(self, id_):
        """Return the token stored under ``id_``."""
        return self._id_to_token[id_]

    @staticmethod
    def from_file(filename, num_tokens_limit=None):
        """Build a vocabulary from a UTF-8 file of ``<token> <count>`` lines.

        Args:
          filename: path of the vocabulary file.
          num_tokens_limit: if given, stop reading once this many tokens have
            been added.

        Returns:
          A finalized ``Vocabulary``.
        """
        vocab = Vocabulary()
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f:
                fields = line.strip().split()
                if not fields:
                    # Tolerate blank lines instead of failing on the unpack.
                    continue
                word, count = fields
                vocab.add(word, int(count))
                if num_tokens_limit is not None:
                    if vocab.num_tokens == num_tokens_limit:
                        break
        vocab.finalize()
        return vocab
68 |
69 |
class Dataset(object):
    """Turns text files into fixed-size (x, y, w) training batches.

    Each sentence is mapped to token ids and wrapped in the vocabulary's
    sentence-boundary id. Sentences are packed back to back into rows of
    ``num_steps`` tokens; ``y`` is ``x`` shifted by one token and ``w`` marks
    the positions that actually hold data.
    """

    def __init__(self, vocab, filenames, deterministic=False):
        """
        Args:
          vocab: object exposing ``s_id`` and ``get_id(token)``.
          filenames: iterable of file paths to read.
          deterministic: when True, neither files nor lines are shuffled.
        """
        self._vocab = vocab
        self._filenames = filenames
        self._deterministic = deterministic

    def _parse_sentence(self, line):
        """Convert one line of text into ``[s_id, ids..., s_id]``."""
        s_id = self._vocab.s_id
        return [s_id] + [self._vocab.get_id(word) for word in line.strip().split()] + [s_id]

    def _parse_file(self, file_name):
        """Yield the parsed sentences of one UTF-8 file.

        The whole file is read into memory first so that its lines can be
        shuffled when the dataset is not deterministic.
        """
        print("Processing file: %s" % file_name)
        with codecs.open(file_name, "r", "utf-8") as f:
            lines = [line.strip() for line in f]
            if not self._deterministic:
                random.shuffle(lines)
            print("Finished processing!")
            for line in lines:
                yield self._parse_sentence(line)

    def _sentence_stream(self, file_stream):
        """Chain the sentences of every file produced by ``file_stream``."""
        for file_name in file_stream:
            for sentence in self._parse_file(file_name):
                yield sentence

    def _iterate(self, sentences, batch_size, num_steps):
        """Pack ``sentences`` into batches of shape [batch_size, num_steps].

        The same three arrays are reused (and overwritten) for every batch,
        so callers that keep batches around must copy them.

        Yields:
          (x, y, w): int32 inputs, int32 targets and uint8 weights. Stops once
          a batch would contain no data at all.
        """
        # Per-row remainder of the sentence currently being consumed.
        streams = [None] * batch_size
        x = np.zeros([batch_size, num_steps], np.int32)
        y = np.zeros([batch_size, num_steps], np.int32)
        w = np.zeros([batch_size, num_steps], np.uint8)
        while True:
            x[:] = 0
            y[:] = 0
            w[:] = 0
            for i in range(batch_size):
                tokens_filled = 0
                try:
                    while tokens_filled < num_steps:
                        # A single leftover token has no target, so fetch a
                        # new sentence.
                        if streams[i] is None or len(streams[i]) <= 1:
                            streams[i] = next(sentences)
                        num_tokens = min(len(streams[i]) - 1, num_steps - tokens_filled)
                        x[i, tokens_filled:tokens_filled + num_tokens] = streams[i][:num_tokens]
                        y[i, tokens_filled:tokens_filled + num_tokens] = streams[i][1:num_tokens + 1]
                        w[i, tokens_filled:tokens_filled + num_tokens] = 1
                        # Keep the last consumed target as the next input.
                        streams[i] = streams[i][num_tokens:]
                        tokens_filled += num_tokens
                except StopIteration:
                    # Out of sentences: leave the rest of the row zero-padded.
                    pass
            if not np.any(w):
                return

            yield x, y, w

    def iterate_once(self, batch_size, num_steps):
        """Yield batches for a single pass over the files, in the given order."""
        def file_stream():
            for file_name in self._filenames:
                yield file_name
        for value in self._iterate(self._sentence_stream(file_stream()), batch_size, num_steps):
            yield value

    def iterate_forever(self, batch_size, num_steps):
        """Yield batches while cycling over the files indefinitely.

        The file order is reshuffled before every pass unless the dataset is
        deterministic. Yields nothing when there are no files.
        """
        # Work on a private copy so the caller's list is never reordered.
        file_names = list(self._filenames)
        if not file_names:
            # Without this guard file_stream() would spin forever without
            # ever yielding a file.
            return

        def file_stream():
            while True:
                if not self._deterministic:
                    random.shuffle(file_names)
                for file_name in file_names:
                    yield file_name
        for value in self._iterate(self._sentence_stream(file_stream()), batch_size, num_steps):
            yield value
140 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/parallax_config.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import tensorflow as tf
17 | import parallax
18 |
# Command-line flags consumed by build_config() below.
flags = tf.app.flags
flags.DEFINE_boolean('replicate_variables', True, """replicate_variables""")
flags.DEFINE_string('protocol', 'grpc', """The method for managing variables""")
flags.DEFINE_string('mpirun_options', '', 'The option for mpirun')
flags.DEFINE_string('run_option', 'HYBRID',
                    'The run option whether PS, MPI or HYBRID')
flags.DEFINE_string('redirect_path', None, """redirect path to keep the log of distributed workers""")
flags.DEFINE_string('ckpt_dir', None, """Directory to save checkpoints""")
flags.DEFINE_integer('save_ckpt_steps', None,
                     """Number of steps between two consecutive checkpoints""")
flags.DEFINE_string('profile_dir', None, """Directory to save RunMetadata""")
flags.DEFINE_string('profile_steps', None, """Comma separated profile steps""")
flags.DEFINE_string('profile_range', None, """profile_start_step,profile_end_step""")
flags.DEFINE_boolean('local_aggregation', True,
                     """Whether to use local aggregation or not""")
flags.DEFINE_boolean('boundary_among_servers', True,
                     """Whether to use operation placement among servers""")
flags.DEFINE_boolean('boundary_between_workers_and_servers', True,
                     """Whether to use operation placement between workers and servers""")
flags.DEFINE_string('export_graph_path', None, """export path to keep transformed graph definition""")
flags.DEFINE_boolean('search_partitions', False, "Whether to use variable partitioning method")
FLAGS = flags.FLAGS
41 |
def build_config():
    """Assemble a ``parallax.Config`` from the command-line flags.

    Returns:
      A ``parallax.Config`` whose run option, communication, checkpoint and
      profile settings, redirect path, graph export path and partition search
      switch are taken from FLAGS. ``average_sparse`` is always set to False.

    Raises:
      ValueError: if ``--profile_steps`` or ``--profile_range`` contain
        non-integer entries, or if ``--profile_range`` has fewer than two
        values.
    """
    ckpt_config = parallax.CheckPointConfig(ckpt_dir=FLAGS.ckpt_dir,
                                            save_ckpt_steps=FLAGS.save_ckpt_steps)
    ps_config = parallax.PSConfig(replicate_variables=FLAGS.replicate_variables,
                                  protocol=FLAGS.protocol,
                                  local_aggregation=FLAGS.local_aggregation,
                                  boundary_among_servers=FLAGS.boundary_among_servers,
                                  boundary_between_workers_and_servers=\
                                  FLAGS.boundary_between_workers_and_servers)
    mpi_config = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options)

    def split_csv(raw):
        """Split a comma separated flag value, dropping empty entries."""
        return [part.strip() for part in raw.split(',') if part.strip()]

    def get_profile_steps():
        """Parse --profile_steps into a list of ints, or None when unset."""
        if not FLAGS.profile_steps:
            return None
        # Parse a local copy; the flag value itself is left untouched.
        steps = [int(step) for step in split_csv(FLAGS.profile_steps)]
        return steps or None

    def get_profile_range():
        """Parse --profile_range into a (start, end) tuple, or None."""
        if not FLAGS.profile_range:
            return None
        bounds = split_csv(FLAGS.profile_range)
        if len(bounds) < 2:
            raise ValueError(
                "--profile_range expects 'start_step,end_step', got %r"
                % FLAGS.profile_range)
        return (int(bounds[0]), int(bounds[1]))

    profile_config = parallax.ProfileConfig(profile_dir=FLAGS.profile_dir,
                                            profile_steps=get_profile_steps(),
                                            profile_range=get_profile_range())
    parallax_config = parallax.Config()
    parallax_config.run_option = FLAGS.run_option
    parallax_config.average_sparse = False
    parallax_config.communication_config = parallax.CommunicationConfig(ps_config, mpi_config)
    parallax_config.ckpt_config = ckpt_config
    parallax_config.profile_config = profile_config
    parallax_config.redirect_path = FLAGS.redirect_path
    parallax_config.export_graph_path = FLAGS.export_graph_path
    parallax_config.search_partitions = FLAGS.search_partitions

    return parallax_config
80 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/resource_info:
--------------------------------------------------------------------------------
1 | 123.456.78.90:1,2
2 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/testdata/test_s2.txt:
--------------------------------------------------------------------------------
1 | 非婚姻所生 非婚姻所生
2 | ala ma kota
3 | test
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/.gitignore:
--------------------------------------------------------------------------------
1 | bazel-bin
2 | bazel-genfiles
3 | bazel-out
4 | bazel-testlogs
5 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Want to contribute? Great! First, read this page (including the small print at the end).
2 |
3 | ### Before you contribute
4 |
5 | Before we can use your code, you must sign the
6 | [Google Individual Contributor License Agreement](https://cla.developers.google.com/about/google-individual)
8 | (CLA), which you can do online. The CLA is necessary mainly because you own the
9 | copyright to your changes, even after your contribution becomes part of our
10 | codebase, so we need your permission to use and distribute your code. We also
11 | need to be sure of various other things—for instance that you'll tell us if you
12 | know that your code infringes on other people's patents. You don't have to sign
13 | the CLA until after you've submitted your code for review and a member has
14 | approved it, but you must do it before we can put your code into our codebase.
15 | Before you start working on a larger contribution, you should get in touch with
16 | us first through the issue tracker with your idea so that we can help out and
17 | possibly guide you. Coordinating up front makes it much easier to avoid
18 | frustration later on.
19 |
20 | ### Code reviews
21 |
22 | All submissions, including submissions by project members, require review. We
23 | use Github pull requests for this purpose.
24 |
25 | ### The small print
26 |
27 | Contributions made by corporations are covered by a different agreement than
28 | the one above, the
29 | [Software Grant and Corporate Contributor License Agreement](https://cla.developers.google.com/about/google-corporate).
31 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/README.md:
--------------------------------------------------------------------------------
1 | # Neural Machine Translation (seq2seq)
2 |
3 | Neural Machine Translation (NMT) mimics the human translation process. For a more detailed description of the program itself, please check out [https://github.com/tensorflow/nmt](https://github.com/tensorflow/nmt), where this program comes from.
4 |
5 | ## Dataset
6 |
7 | We can use the following publicly available datasets:
8 |
9 | 1. *Small-scale*: English-Vietnamese parallel corpus of TED talks (133K sentence
10 | pairs) provided by
11 | the
12 | [IWSLT Evaluation Campaign](https://sites.google.com/site/iwsltevaluation2015/).
13 | 1. *Large-scale*: German-English parallel corpus (4.5M sentence pairs) provided
14 | by the [WMT Evaluation Campaign](http://www.statmt.org/wmt16/translation-task.html).
15 |
16 | ## To Run
17 |
18 | Set your resource information in the `resource_info` file.
19 |
20 | The command below runs a single GNMT WMT German-English model on multiple devices specified in `resource_info`. The command assumes that the data directory and the NMT codebase are distributed and reachable in the same absolute path in each of the machines.
21 |
22 |
23 | ```
24 | $ python nmt_distributed_driver.py \
25 | --src=de --tgt=en \
26 | --hparams_path=nmt/standard_hparams/wmt16_gnmt_4_layer.json \
27 | --out_dir=/tmp/deen_gnmt \
28 | --vocab_prefix=/tmp/wmt16/vocab.bpe.32000 \
29 | --train_prefix=/tmp/wmt16/train.tok.clean.bpe.32000 \
30 | --dev_prefix=/tmp/wmt16/newstest2013.tok.bpe.32000 \
31 | --test_prefix=/tmp/wmt16/newstest2015.tok.bpe.32000
32 | ```
33 |
34 | For more options of nmt model command, please check out [https://github.com/tensorflow/nmt](https://github.com/tensorflow/nmt) again.
35 |
36 | Besides, we have a few more options you can choose for distributed running.
37 |
38 | | Parameter Name | Default | Description |
39 | | :------------------- |:-----------------------| :-----------|
40 | | --resource_info_file | `./resource_info` | Filename containing cluster information |
41 | | --max_steps | 1000000 | Number of iterations to run for each worker |
42 | | --steps_per_stats | 100 | How many steps between two runop logs |
43 | | --sync | True | Whether to synchronize learning or not |
44 | | --ckpt_dir | None | Directory to save checkpoints |
45 | | --save_ckpt_steps | None | Number of steps between two consecutive checkpoints |
46 | | --run_option | HYBRID | The run option: PS, MPI or HYBRID |
47 | | --epoch_size | 0 | total number of data instances |
48 | | --search_partitions | False | Whether to use Parallax's variable partitioning method or not |
49 |
50 | You can adapt the distributed run with the options above. For example, you can run the GNMT WMT German-English model in MPI mode by just adding the `--run_option` flag to the command, as shown below:
51 |
52 | ```
53 | $ python nmt_distributed_driver.py \
54 | --src=de --tgt=en \
55 | --hparams_path=${PWD}/nmt/standard_hparams/wmt16_gnmt_4_layer.json \
56 | --out_dir=/tmp/deen_gnmt \
57 | --vocab_prefix=/tmp/wmt16/vocab.bpe.32000 \
58 | --train_prefix=/tmp/wmt16/train.tok.clean.bpe.32000 \
59 | --dev_prefix=/tmp/wmt16/newstest2013.tok.bpe.32000 \
60 | --test_prefix=/tmp/wmt16/newstest2015.tok.bpe.32000 \
61 | --run_option=MPI
62 | ```
63 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/attention_equation_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_equation_0.jpg
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/attention_equation_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_equation_1.jpg
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/attention_mechanism.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_mechanism.jpg
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/attention_vis.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_vis.jpg
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/encdec.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/encdec.jpg
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/greedy_dec.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/greedy_dec.jpg
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/seq2seq.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/seq2seq.jpg
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/nmt_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Tests for nmt.py, train.py and inference.py."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import argparse
22 | import os
23 |
24 | import tensorflow as tf
25 |
26 | from . import inference
27 | from . import nmt
28 | from . import train
29 |
30 |
def _update_flags(flags, test_name):
  """Point `flags` at the bundled IWSLT15 test data for a short training run.

  Args:
    flags: parsed argument namespace; it is modified in place.
    test_name: name of the sub-directory of the TensorFlow test temp dir that
      is used as the output directory.
  """
  testdata_dir = "nmt/testdata/"
  corpus_prefix = testdata_dir + "iwslt15.tst2013.100"

  overrides = [
      ("num_train_steps", 100),
      ("steps_per_stats", 5),
      ("src", "en"),
      ("tgt", "vi"),
      ("train_prefix", corpus_prefix),
      ("vocab_prefix", testdata_dir + "iwslt15.vocab.100"),
      # The same small corpus doubles as dev and test set.
      ("dev_prefix", corpus_prefix),
      ("test_prefix", corpus_prefix),
      ("out_dir", os.path.join(tf.test.get_temp_dir(), test_name)),
  ]
  for attr_name, attr_value in overrides:
    setattr(flags, attr_name, attr_value)
46 |
47 |
class NMTTest(tf.test.TestCase):
  """Smoke tests for the NMT training and inference entry points."""

  def _make_flags(self, test_name):
    """Parse the default NMT arguments and apply the test overrides."""
    parser = argparse.ArgumentParser()
    nmt.add_arguments(parser)
    parsed_flags, _ = parser.parse_known_args()
    _update_flags(parsed_flags, test_name)
    return parsed_flags

  def testTrain(self):
    """Training runs end to end with the basic hparams."""
    run_flags = self._make_flags("nmt_train_test")
    hparams = nmt.create_hparams(run_flags)
    nmt.run_main(run_flags, hparams, train.train, None)

  def testTrainWithAvgCkpts(self):
    """Training runs end to end with `avg_ckpts` switched on."""
    run_flags = self._make_flags("nmt_train_test_avg_ckpts")
    run_flags.avg_ckpts = True
    hparams = nmt.create_hparams(run_flags)
    nmt.run_main(run_flags, hparams, train.train, None)

  def testInference(self):
    """Inference runs on a checkpoint produced by one training step."""
    run_flags = self._make_flags("nmt_train_infer")

    # A single training step is enough to leave a checkpoint behind.
    run_flags.num_train_steps = 1
    nmt.run_main(run_flags, nmt.create_hparams(run_flags), train.train, None)

    # Switch the same flags over to inference.
    run_flags.inference_input_file = ("nmt/testdata/"
                                      "iwslt15.tst2013.100.en")
    run_flags.inference_output_file = os.path.join(run_flags.out_dir, "output")
    run_flags.inference_ref_file = ("nmt/testdata/"
                                    "iwslt15.tst2013.100.vi")

    nmt.run_main(run_flags, nmt.create_hparams(run_flags), None,
                 inference.inference)


if __name__ == "__main__":
  tf.test.main()
108 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/parallax_config.py:
--------------------------------------------------------------------------------
1 |
2 | # Copyright (C) 2018 Seoul National University
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ==============================================================================
16 |
17 | import tensorflow as tf
18 | import parallax
19 |
20 |
# Command-line flags consumed by calculate_ckpt_steps() and build_config().
flags = tf.app.flags
flags.DEFINE_boolean('replicate_variables', True, """replicate_variables""")
flags.DEFINE_string('protocol', 'grpc', """The method for managing variables""")
flags.DEFINE_string('mpirun_options', '', 'option for mpirun')
flags.DEFINE_string('run_option', 'HYBRID',
                    'The run option whether PS, MPI or HYBRID')
flags.DEFINE_string('redirect_path', None, """redirect path to keep the log of distributed workers""")
flags.DEFINE_integer('save_ckpt_steps', None,
                     """Number of steps between two consecutive checkpoints""")
# A positive value makes calculate_ckpt_steps() derive the checkpoint interval
# instead of using save_ckpt_steps.
flags.DEFINE_integer('save_n_ckpts_per_epoch', -1, """Save n checkpoints per every epoch""")
flags.DEFINE_string('ckpt_dir', None, """Directory to save checkpoints""")
flags.DEFINE_string('profile_dir', None, """Directory to save RunMetadata""")
flags.DEFINE_string('profile_steps', None, """Comma separated profile steps""")
flags.DEFINE_boolean('local_aggregation', True,
                     """Whether to use local aggregation or not""")
flags.DEFINE_boolean('boundary_among_servers', True,
                     """Whether to use operation placement among servers""")
flags.DEFINE_boolean('boundary_between_workers_and_servers', True,
                     """Whether to use operation placement between workers and servers""")
flags.DEFINE_string('export_graph_path', None, """export path to keep transformed graph definition""")
flags.DEFINE_boolean('search_partitions', False, """Whether to use variable partitioning method""")
FLAGS = flags.FLAGS
43 |
def calculate_ckpt_steps():
    """Return the number of steps between two checkpoints.

    When ``--save_n_ckpts_per_epoch`` is positive, the interval is derived
    from the number of GPUs listed in the resource info file, the batch size,
    the number of steps and the size of the training set. Otherwise
    ``--save_ckpt_steps`` is returned unchanged.

    Returns:
      The checkpoint interval in steps (may be None when
      ``--save_ckpt_steps`` is unset).
    """
    if FLAGS.save_n_ckpts_per_epoch <= 0:
        return FLAGS.save_ckpt_steps

    # Imported here because this module has no file-level import for them.
    import json
    import math

    # NOTE(review): this expects a JSON resource file with a 'worker' list of
    # entries carrying a 'gpus' list, while the sample `resource_info` shipped
    # next to this module uses the "host:gpu,gpu" format - confirm.
    with open(FLAGS.resource_info_file) as resource_info:
        num_workers = sum(len(w['gpus']) for w in json.load(resource_info)['worker'])

    # NOTE(review): `language_model_graph` (and the batch_size / num_steps /
    # resource_info_file / sync flags) are not defined in this module; this
    # branch appears to come from the lm1b example - confirm where these names
    # are meant to come from before relying on it.
    num_words_per_iter = FLAGS.batch_size * FLAGS.num_steps * num_workers
    # float() keeps the division exact under Python 2 so the ceil is not
    # applied to an already truncated quotient; int() keeps the step count
    # integral.
    num_iters_per_epoch = int(math.ceil(
        float(language_model_graph._NUM_WORDS['train'])
        / num_words_per_iter / FLAGS.save_n_ckpts_per_epoch))
    save_ckpt_steps = num_iters_per_epoch if FLAGS.sync else num_iters_per_epoch * num_workers
    parallax.log.info('Save checkpoint for every %d iters' % save_ckpt_steps)

    return save_ckpt_steps
56 |
57 |
def build_config():
    """Assemble a ``parallax.Config`` from the command-line flags.

    Returns:
      A ``parallax.Config`` whose run option, communication, checkpoint and
      profile settings, redirect path, graph export path and partition search
      switch are taken from FLAGS. ``average_sparse`` is always set to False.

    Raises:
      ValueError: if ``--profile_steps`` contains a non-integer entry.
    """
    ckpt_config = parallax.CheckPointConfig(ckpt_dir=FLAGS.ckpt_dir,
                                            save_ckpt_steps=calculate_ckpt_steps())
    ps_config = parallax.PSConfig(replicate_variables=FLAGS.replicate_variables,
                                  protocol=FLAGS.protocol,
                                  local_aggregation=FLAGS.local_aggregation,
                                  boundary_among_servers=FLAGS.boundary_among_servers,
                                  boundary_between_workers_and_servers=\
                                  FLAGS.boundary_between_workers_and_servers)
    mpi_config = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options)
    parallax_config = parallax.Config()
    parallax_config.run_option = FLAGS.run_option
    parallax_config.average_sparse = False
    parallax_config.communication_config = parallax.CommunicationConfig(ps_config, mpi_config)
    parallax_config.ckpt_config = ckpt_config

    def get_profile_steps():
        """Parse --profile_steps into a list of ints ([] when unset)."""
        if not FLAGS.profile_steps:
            return []
        # Parse a local copy (the flag itself is left untouched) and ignore
        # empty entries such as the one produced by a trailing comma.
        return [int(step) for step in FLAGS.profile_steps.split(',')
                if step.strip()]

    profile_config = parallax.ProfileConfig(profile_dir=FLAGS.profile_dir,
                                            profile_steps=get_profile_steps())
    parallax_config.profile_config = profile_config
    parallax_config.redirect_path = FLAGS.redirect_path
    parallax_config.export_graph_path = FLAGS.export_graph_path
    parallax_config.search_partitions = FLAGS.search_partitions

    return parallax_config
87 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/resource_info:
--------------------------------------------------------------------------------
1 | 123.456.78.90:1,2
2 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/scripts/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/scripts/bleu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Python implementation of BLEU and smooth-BLEU.
17 |
18 | This module provides a Python implementation of BLEU and smooth-BLEU.
19 | Smooth BLEU is computed following the method outlined in the paper:
20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21 | evaluation metrics for machine translation. COLING 2004.
22 | """
23 |
24 | import collections
25 | import math
26 |
27 |
def _get_ngrams(segment, max_order):
  """Extracts all n-grams up to a given maximum order from an input segment.

  Args:
    segment: text segment (a sequence of tokens) from which n-grams will be
      extracted.
    max_order: maximum length in tokens of the n-grams returned by this
      method.

  Returns:
    A Counter mapping every n-gram (as a tuple of tokens) of length 1 through
    max_order found in segment to the number of times it occurred.
  """
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    for i in range(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i+order])
      ngram_counts[ngram] += 1
  return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of lists of references for each translation. Each
      reference should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
      should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoothing.

  Returns:
    6-tuple `(bleu, precisions, bp, ratio, translation_length,
    reference_length)` where `precisions` is the list of n-gram precisions
    (one entry per order), `bp` is the brevity penalty and `ratio` is
    `translation_length / reference_length`. When the total reference length
    is zero (for example an empty corpus), `ratio` is reported as 0.0; a zero
    ratio yields a brevity penalty, and therefore a BLEU score, of 0.0 instead
    of raising ZeroDivisionError.
  """
  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  reference_length = 0
  translation_length = 0
  for (references, translation) in zip(reference_corpus,
                                       translation_corpus):
    # The shortest reference is used as the effective reference length.
    reference_length += min(len(r) for r in references)
    translation_length += len(translation)

    # `|=` keeps, per n-gram, the maximum count seen in any single reference,
    # so the `&` below clips translation counts to that maximum.
    merged_ref_ngram_counts = collections.Counter()
    for reference in references:
      merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
    translation_ngram_counts = _get_ngrams(translation, max_order)
    overlap = translation_ngram_counts & merged_ref_ngram_counts
    for ngram in overlap:
      matches_by_order[len(ngram)-1] += overlap[ngram]
    for order in range(1, max_order+1):
      possible_matches = len(translation) - order + 1
      if possible_matches > 0:
        possible_matches_by_order[order-1] += possible_matches

  precisions = [0] * max_order
  for i in range(0, max_order):
    if smooth:
      precisions[i] = ((matches_by_order[i] + 1.) /
                       (possible_matches_by_order[i] + 1.))
    else:
      if possible_matches_by_order[i] > 0:
        precisions[i] = (float(matches_by_order[i]) /
                         possible_matches_by_order[i])
      else:
        precisions[i] = 0.0

  if min(precisions) > 0:
    p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
    geo_mean = math.exp(p_log_sum)
  else:
    geo_mean = 0.0

  # Guard against a zero total reference length (empty corpus or only empty
  # references), which previously raised ZeroDivisionError.
  if reference_length > 0:
    ratio = float(translation_length) / reference_length
  else:
    ratio = 0.0

  if ratio > 1.0:
    bp = 1.
  elif ratio > 0.0:
    bp = math.exp(1 - 1. / ratio)
  else:
    # Empty translations: exp(1 - 1/ratio) tends to 0 as ratio -> 0, and
    # evaluating it directly would divide by zero.
    bp = 0.0

  bleu = geo_mean * bp

  return (bleu, precisions, bp, ratio, translation_length, reference_length)
113 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/scripts/download_iwslt15.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Download the small-scale IWSLT15 Vietnamese to English translation data for
# NMT model training.
#
# Usage:
#   ./download_iwslt15.sh path-to-output-dir
#
# If the output directory is not specified, "./iwslt15" will be used as the
# default output directory.

# Stop at the first failing command so a failed download is not silently
# followed by the remaining ones.
set -e

OUT_DIR="${1:-iwslt15}"
SITE_PREFIX="https://nlp.stanford.edu/projects/nmt/data"

# Quoted so that output directories containing spaces work.
mkdir -v -p "$OUT_DIR"

# fetch NAME: download $SITE_PREFIX/iwslt15.en-vi/NAME to $OUT_DIR/NAME.
# -f makes curl exit non-zero on HTTP errors instead of saving the server's
# error page as if it were the dataset.
fetch() {
  curl -f -o "$OUT_DIR/$1" "$SITE_PREFIX/iwslt15.en-vi/$1"
}

# Download the iwslt15 small dataset from the Stanford website.
echo "Download training dataset train.en and train.vi."
fetch "train.en"
fetch "train.vi"

echo "Download dev dataset tst2012.en and tst2012.vi."
fetch "tst2012.en"
fetch "tst2012.vi"

echo "Download test dataset tst2013.en and tst2013.vi."
fetch "tst2013.en"
fetch "tst2013.vi"

echo "Download vocab file vocab.en and vocab.vi."
fetch "vocab.en"
fetch "vocab.vi"
31 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/standard_hparams/iwslt15.json:
--------------------------------------------------------------------------------
1 | {
2 | "attention": "scaled_luong",
3 | "attention_architecture": "standard",
4 | "batch_size": 128,
5 | "colocate_gradients_with_ops": true,
6 | "dropout": 0.2,
7 | "encoder_type": "bi",
8 | "eos": "</s>",
9 | "forget_bias": 1.0,
10 | "infer_batch_size": 32,
11 | "init_weight": 0.1,
12 | "learning_rate": 1.0,
13 | "max_gradient_norm": 5.0,
14 | "metrics": ["bleu"],
15 | "num_buckets": 5,
16 | "num_layers": 2,
17 | "num_train_steps": 12000,
18 | "decay_scheme": "luong234",
19 | "num_units": 512,
20 | "optimizer": "sgd",
21 | "residual": false,
22 | "share_vocab": false,
23 | "subword_option": "",
24 | "sos": "<s>",
25 | "src_max_len": 50,
26 | "src_max_len_infer": null,
27 | "steps_per_external_eval": null,
28 | "steps_per_stats": 100,
29 | "tgt_max_len": 50,
30 | "tgt_max_len_infer": null,
31 | "time_major": true,
32 | "unit_type": "lstm",
33 | "beam_width": 10
34 | }
35 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/standard_hparams/wmt16.json:
--------------------------------------------------------------------------------
1 | {
2 | "attention": "normed_bahdanau",
3 | "attention_architecture": "standard",
4 | "batch_size": 128,
5 | "colocate_gradients_with_ops": true,
6 | "dropout": 0.2,
7 | "encoder_type": "bi",
8 | "eos": "</s>",
9 | "forget_bias": 1.0,
10 | "infer_batch_size": 32,
11 | "init_weight": 0.1,
12 | "learning_rate": 1.0,
13 | "max_gradient_norm": 5.0,
14 | "metrics": ["bleu"],
15 | "num_buckets": 5,
16 | "num_layers": 4,
17 | "num_train_steps": 340000,
18 | "decay_scheme": "luong10",
19 | "num_units": 1024,
20 | "optimizer": "sgd",
21 | "residual": false,
22 | "share_vocab": false,
23 | "subword_option": "bpe",
24 | "sos": "<s>",
25 | "src_max_len": 50,
26 | "src_max_len_infer": null,
27 | "steps_per_external_eval": null,
28 | "steps_per_stats": 100,
29 | "tgt_max_len": 50,
30 | "tgt_max_len_infer": null,
31 | "time_major": true,
32 | "unit_type": "lstm",
33 | "beam_width": 10
34 | }
35 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/standard_hparams/wmt16_gnmt_4_layer.json:
--------------------------------------------------------------------------------
1 | {
2 | "attention": "normed_bahdanau",
3 | "attention_architecture": "gnmt_v2",
4 | "batch_size": 128,
5 | "colocate_gradients_with_ops": true,
6 | "dropout": 0.2,
7 | "encoder_type": "gnmt",
8 | "eos": "</s>",
9 | "forget_bias": 1.0,
10 | "infer_batch_size": 32,
11 | "init_weight": 0.1,
12 | "learning_rate": 1.0,
13 | "max_gradient_norm": 5.0,
14 | "metrics": ["bleu"],
15 | "num_buckets": 5,
16 | "num_layers": 4,
17 | "num_train_steps": 340000,
18 | "decay_scheme": "luong10",
19 | "num_units": 1024,
20 | "optimizer": "sgd",
21 | "residual": true,
22 | "share_vocab": false,
23 | "subword_option": "bpe",
24 | "sos": "<s>",
25 | "src_max_len": 50,
26 | "src_max_len_infer": null,
27 | "steps_per_external_eval": null,
28 | "steps_per_stats": 100,
29 | "tgt_max_len": 50,
30 | "tgt_max_len_infer": null,
31 | "time_major": true,
32 | "unit_type": "lstm",
33 | "beam_width": 10,
34 | "length_penalty_weight": 1.0
35 | }
36 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/standard_hparams/wmt16_gnmt_8_layer.json:
--------------------------------------------------------------------------------
1 | {
2 | "attention": "normed_bahdanau",
3 | "attention_architecture": "gnmt_v2",
4 | "batch_size": 128,
5 | "colocate_gradients_with_ops": true,
6 | "dropout": 0.2,
7 | "encoder_type": "gnmt",
8 | "eos": "</s>",
9 | "forget_bias": 1.0,
10 | "infer_batch_size": 32,
11 | "init_weight": 0.1,
12 | "learning_rate": 1.0,
13 | "max_gradient_norm": 5.0,
14 | "metrics": ["bleu"],
15 | "num_buckets": 5,
16 | "num_layers": 8,
17 | "num_train_steps": 340000,
18 | "decay_scheme": "luong10",
19 | "num_units": 1024,
20 | "optimizer": "sgd",
21 | "residual": true,
22 | "share_vocab": false,
23 | "subword_option": "bpe",
24 | "sos": "<s>",
25 | "src_max_len": 50,
26 | "src_max_len_infer": null,
27 | "steps_per_external_eval": null,
28 | "steps_per_stats": 50,
29 | "tgt_max_len": 50,
30 | "tgt_max_len_infer": null,
31 | "time_major": true,
32 | "unit_type": "lstm",
33 | "beam_width": 10,
34 | "length_penalty_weight": 1.0
35 | }
36 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/iwslt15.vocab.100.en:
--------------------------------------------------------------------------------
1 | <unk>
2 | <s>
3 | </s>
4 | Rachel
5 | :
6 | The
7 | science
8 | behind
9 | a
10 | climate
11 | headline
12 | In
13 | 4
14 | minutes
15 | ,
16 | atmospheric
17 | chemist
18 | provides
19 | glimpse
20 | of
21 | the
22 | massive
23 | scientific
24 | effort
25 | bold
26 | headlines
27 | on
28 | change
29 | with
30 | her
31 | team
32 | --
33 | one
34 | thousands
35 | who
36 | contributed
37 | taking
38 | risky
39 | flight
40 | over
41 | rainforest
42 | in
43 | pursuit
44 | data
45 | key
46 | molecule
47 | .
48 | I
49 | 'd
50 | like
51 | to
52 | talk
53 | you
54 | today
55 | about
56 | scale
57 | that
58 | goes
59 | into
60 | making
61 | see
62 | paper
63 | look
64 | this
65 | when
66 | they
67 | have
68 | do
69 | and
70 | air
71 | quality
72 | or
73 | smog
74 | They
75 | are
76 | both
77 | two
78 | branches
79 | same
80 | field
81 | Recently
82 | looked
83 | Panel
84 | Climate
85 | Change
86 | IPCC
87 | put
88 | out
89 | their
90 | report
91 | state
92 | understanding
93 | system
94 | That
95 | was
96 | written
97 | by
98 | scientists
99 | from
100 | 40
101 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/iwslt15.vocab.100.vi:
--------------------------------------------------------------------------------
1 | <unk>
2 | <s>
3 | </s>
4 | Khoa
5 | học
6 | đằng
7 | sau
8 | một
9 | tiêu
10 | đề
11 | về
12 | khí
13 | hậu
14 | Trong
15 | 4
16 | phút
17 | ,
18 | chuyên
19 | gia
20 | hoá
21 | quyển
22 | Rachel
23 | giới
24 | thiệu
25 | sơ
26 | lược
27 | những
28 | nỗ
29 | lực
30 | khoa
31 | miệt
32 | mài
33 | táo
34 | bạo
35 | biến
36 | đổi
37 | cùng
38 | với
39 | đoàn
40 | nghiên
41 | cứu
42 | của
43 | mình
44 | --
45 | hàng
46 | ngàn
47 | người
48 | đã
49 | cống
50 | hiến
51 | cho
52 | dự
53 | án
54 | này
55 | chuyến
56 | bay
57 | mạo
58 | hiểm
59 | qua
60 | rừng
61 | già
62 | để
63 | tìm
64 | kiếm
65 | thông
66 | tin
67 | phân
68 | tử
69 | then
70 | chốt
71 | .
72 | Tôi
73 | muốn
74 | các
75 | bạn
76 | biết
77 | sự
78 | to
79 | lớn
80 | góp
81 | phần
82 | làm
83 | nên
84 | dòng
85 | tít
86 | thường
87 | thấy
88 | trên
89 | báo
90 | Có
91 | trông
92 | như
93 | thế
94 | khi
95 | bàn
96 | và
97 | nói
98 | chất
99 | lượng
100 | không
101 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/label_ref:
--------------------------------------------------------------------------------
1 | positive
2 | positive
3 | positive
4 | negative
5 | negative
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/pred_output:
--------------------------------------------------------------------------------
1 | positive
2 | positive
3 | negative
4 | negative
5 | positive
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/test_embed.txt:
--------------------------------------------------------------------------------
1 | some_word 1.0 2.0 3.0 4.0
2 | some_other_word 4.0 3.0 2.0 1.0
3 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/test_embed_with_header.txt:
--------------------------------------------------------------------------------
1 | 2 4
2 | some_word 1.0 2.0 3.0 4.0
3 | some_other_word 4.0 3.0 2.0 1.0
4 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/test_infer_file:
--------------------------------------------------------------------------------
1 | A Republic@@ an strategy to counter the re-@@ election of Obama
2 | Republic@@ an leaders justified their policy by the need to combat electoral fraud .
3 | However , the Brenn@@ an Centre considers this a my@@ th , stating that electoral fraud is rar@@ er in the United States than the number of people killed by ligh@@ tn@@ ing .
4 | Indeed , Republic@@ an lawyers identified only 300 cases of electoral fraud in the United States in a decade .
5 | One thing is certain : these new provisions will have a negative impact on vot@@ er tur@@ n-@@ out .
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/test_infer_vocab.src:
--------------------------------------------------------------------------------
1 | unk
2 | eos
3 | sos
4 | test1
5 | test2
6 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/test_infer_vocab.tgt:
--------------------------------------------------------------------------------
1 | unk
2 | eos
3 | test1
4 | test2
5 | test3
6 | test4
7 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/utils/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/common_test_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Common utility functions for tests."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow as tf
23 |
24 | from tensorflow.python.ops import lookup_ops
25 |
26 | from ..utils import iterator_utils
27 | from ..utils import standard_hparams_utils
28 |
29 |
def create_test_hparams(unit_type="lstm",
                        encoder_type="uni",
                        num_layers=4,
                        attention="",
                        attention_architecture=None,
                        use_residual=False,
                        inference_indices=None,
                        num_translations_per_input=1,
                        beam_width=0,
                        init_op="uniform"):
  """Build the small hparams set used by the training and inference tests.

  Starts from `standard_hparams_utils.create_standard_hparams()` and
  overwrites the fields below with tiny, fixed test values; the keyword
  arguments are copied onto the hparams fields of the same meaning.

  Returns:
    The modified hparams object.
  """
  # TODO(rzhao): Put num_residual_layers computation logic into
  # `model_utils.py`, so we can also test it here.
  num_residual_layers = 2 if use_residual else 0

  hparams = standard_hparams_utils.create_standard_hparams()

  # --- Network shape.
  hparams.num_units = 5
  hparams.num_encoder_layers = num_layers
  hparams.num_decoder_layers = num_layers
  hparams.dropout = 0.5
  hparams.unit_type = unit_type
  hparams.encoder_type = encoder_type
  hparams.residual = use_residual
  hparams.num_residual_layers = num_residual_layers

  # --- Attention.
  hparams.attention = attention
  hparams.attention_architecture = attention_architecture

  # --- Training.
  hparams.init_op = init_op
  hparams.num_train_steps = 1
  hparams.decay_scheme = ""

  # --- Inference.
  hparams.tgt_max_len_infer = 100
  hparams.beam_width = beam_width
  hparams.num_translations_per_input = num_translations_per_input

  # --- Miscellaneous.
  hparams.forget_bias = 0.0
  hparams.random_seed = 3

  # --- Vocabulary.
  hparams.src_vocab_size = 5
  hparams.tgt_vocab_size = 5
  hparams.eos = "eos"
  hparams.sos = "sos"
  hparams.src_vocab_file = ""
  hparams.tgt_vocab_file = ""
  hparams.src_embed_file = ""
  hparams.tgt_embed_file = ""

  # --- Fields needed by the inference.py test.
  hparams.subword_option = "bpe"
  hparams.src = "src"
  hparams.tgt = "tgt"
  hparams.src_max_len = 400
  hparams.tgt_eos_id = 0
  hparams.inference_indices = inference_indices
  return hparams
95 |
96 |
def create_test_iterator(hparams, mode):
  """Create a tiny in-memory iterator plus its vocab tables for tests.

  Args:
    hparams: hparams providing `eos`, `sos`, `batch_size` and, outside of
      inference mode, `random_seed` and `num_buckets`.
    mode: a `tf.contrib.learn.ModeKeys` value.

  Returns:
    In INFER mode, a 4-tuple `(iterator, src_vocab_table, tgt_vocab_table,
    reverse_tgt_vocab_table)`; otherwise a 3-tuple `(iterator,
    src_vocab_table, tgt_vocab_table)`.
  """
  is_infer = (mode == tf.contrib.learn.ModeKeys.INFER)

  src_table = lookup_ops.index_table_from_tensor(
      tf.constant([hparams.eos, "a", "b", "c", "d"]))
  tgt_words = tf.constant([hparams.sos, hparams.eos, "a", "b", "c"])
  tgt_table = lookup_ops.index_table_from_tensor(tgt_words)
  if is_infer:
    # Only inference needs to map target ids back to strings.
    reverse_tgt_table = lookup_ops.index_to_string_table_from_tensor(
        tgt_words)

  src_data = tf.data.Dataset.from_tensor_slices(
      tf.constant(["a a b b c", "a b b"]))

  if is_infer:
    infer_iterator = iterator_utils.get_infer_iterator(
        src_dataset=src_data,
        src_vocab_table=src_table,
        eos=hparams.eos,
        batch_size=hparams.batch_size)
    return (infer_iterator, src_table, tgt_table, reverse_tgt_table)

  tgt_data = tf.data.Dataset.from_tensor_slices(
      tf.constant(["a b c b c", "a b c b"]))
  train_iterator = iterator_utils.get_iterator(
      src_dataset=src_data,
      tgt_dataset=tgt_data,
      src_vocab_table=src_table,
      tgt_vocab_table=tgt_table,
      batch_size=hparams.batch_size,
      sos=hparams.sos,
      eos=hparams.eos,
      random_seed=hparams.random_seed,
      num_buckets=hparams.num_buckets)
  return (train_iterator, src_table, tgt_table)
136 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/evaluation_utils_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Tests for evaluation_utils.py."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow as tf
23 |
24 | from ..utils import evaluation_utils
25 |
26 |
class EvaluationUtilsTest(tf.test.TestCase):
  """Checks `evaluation_utils.evaluate` against files under nmt/testdata."""

  def testEvaluate(self):
    """BLEU and ROUGE scores for BPE and SPM references match fixed values."""
    hyp_path = "nmt/testdata/deen_output"
    bpe_ref_path = "nmt/testdata/deen_ref_bpe"
    spm_ref_path = "nmt/testdata/deen_ref_spm"

    want_bleu = 22.5855084573
    want_rouge = 50.8429782599

    # Reference file encoded with BPE.
    got_bleu = evaluation_utils.evaluate(
        bpe_ref_path, hyp_path, "bleu", "bpe")
    got_rouge = evaluation_utils.evaluate(
        bpe_ref_path, hyp_path, "rouge", "bpe")
    self.assertAlmostEqual(want_bleu, got_bleu)
    self.assertAlmostEqual(want_rouge, got_rouge)

    # Reference file encoded with SPM; the same scores are expected.
    got_bleu = evaluation_utils.evaluate(
        spm_ref_path, hyp_path, "bleu", "spm")
    got_rouge = evaluation_utils.evaluate(
        spm_ref_path, hyp_path, "rouge", "spm")
    self.assertAlmostEqual(want_rouge, got_rouge)
    self.assertAlmostEqual(want_bleu, got_bleu)

  def testAccuracy(self):
    """The "accuracy" metric on the label fixtures is 60.00."""
    predictions = "nmt/testdata/pred_output"
    labels = "nmt/testdata/label_ref"

    want = 60.00

    got = evaluation_utils.evaluate(labels, predictions, "accuracy")
    self.assertAlmostEqual(want, got)

  def testWordAccuracy(self):
    """The "word_accuracy" metric on the label fixtures is 60.00."""
    predictions = "nmt/testdata/pred_output"
    labels = "nmt/testdata/label_ref"

    want = 60.00

    got = evaluation_utils.evaluate(labels, predictions, "word_accuracy")
    self.assertAlmostEqual(want, got)


if __name__ == "__main__":
  tf.test.main()
76 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/misc_utils_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Tests for misc_utils."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow as tf
23 |
24 | from ..utils import misc_utils
25 |
26 |
class MiscUtilsTest(tf.test.TestCase):
  """Tests for the subword text formatting helpers in misc_utils."""

  def testFormatBpeText(self):
    """BPE pieces such as "En@@ ough" are expected to be joined ("Enough")."""
    tokens = (
        b"En@@ ough to make already reluc@@ tant men hesitate to take screening"
        b" tests ."
    ).split(b" ")
    want = (
        b"Enough to make already reluctant men hesitate to take screening tests"
        b" ."
    )
    got = misc_utils.format_bpe_text(tokens)
    self.assertEqual(want, got)

  def testFormatSPMText(self):
    """SPM pieces prefixed with U+2581 are expected to decode to plain text."""
    encoded = u"\u2581This \u2581is \u2581a \u2581 te st .".encode("utf-8")
    tokens = encoded.split(b" ")
    want = "This is a test."
    got = misc_utils.format_spm_text(tokens)
    self.assertEqual(want, got)


if __name__ == "__main__":
  tf.test.main()
50 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/nmt_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Utility functions specifically for NMT."""
17 | from __future__ import print_function
18 |
19 | import codecs
20 | import time
21 | import numpy as np
22 | import tensorflow as tf
23 |
24 | from utils import evaluation_utils
25 | from utils import misc_utils as utils
26 |
27 | __all__ = ["decode_and_evaluate", "get_translation"]
28 |
29 |
def decode_and_evaluate(name,
                        model,
                        sess,
                        trans_file,
                        ref_file,
                        metrics,
                        subword_option,
                        beam_width,
                        tgt_eos,
                        num_translations_per_input=1,
                        decode=True):
  """Decode a test set and compute a score according to the evaluation task.

  Args:
    name: label included in the per-metric log line.
    model: object whose `decode(sess)` returns a pair; only the first element
      (the decoded outputs) is used here.
    sess: session passed through to `model.decode`.
    trans_file: path the translations are written to (UTF-8) and later read
      from for scoring.
    ref_file: path of the reference file; evaluation is skipped when falsy.
    metrics: iterable of metric names passed to `evaluation_utils.evaluate`.
    subword_option: forwarded to `get_translation` and
      `evaluation_utils.evaluate`.
    beam_width: when 0, a leading axis is added to the decoded outputs so they
      can be indexed by beam id like beam-search outputs.
    tgt_eos: end-of-sentence token forwarded to `get_translation`.
    num_translations_per_input: requested translations per sentence; clamped
      to at most `beam_width` and at least 1.
    decode: when False, decoding is skipped and an existing `trans_file` is
      evaluated as is.

  Returns:
    A dict mapping each metric name to its score; empty when `ref_file` is
    falsy or `trans_file` does not exist.
  """
  # Decode
  if decode:
    utils.print_out(" decoding to output %s." % trans_file)

    start_time = time.time()
    num_sentences = 0
    with codecs.getwriter("utf-8")(
        tf.gfile.GFile(trans_file, mode="wb")) as trans_f:
      trans_f.write("")  # Write empty string to ensure file is created.

      # Never emit more translations than there are beams, and always at
      # least one (covers beam_width == 0).
      num_translations_per_input = max(
          min(num_translations_per_input, beam_width), 1)
      # Keep decoding batches until the input is exhausted, which is signalled
      # by tf.errors.OutOfRangeError.
      while True:
        try:
          nmt_outputs, _ = model.decode(sess)
          if beam_width == 0:
            nmt_outputs = np.expand_dims(nmt_outputs, 0)

          # Axis 0 is indexed by beam id below; axis 1 is used as batch size.
          batch_size = nmt_outputs.shape[1]
          num_sentences += batch_size

          for sent_id in range(batch_size):
            for beam_id in range(num_translations_per_input):
              translation = get_translation(
                  nmt_outputs[beam_id],
                  sent_id,
                  tgt_eos=tgt_eos,
                  subword_option=subword_option)
              # `translation` is bytes; decode so the UTF-8 writer can
              # re-encode it.
              trans_f.write((translation + b"\n").decode("utf-8"))
        except tf.errors.OutOfRangeError:
          utils.print_time(
              " done, num sentences %d, num translations per input %d" %
              (num_sentences, num_translations_per_input), start_time)
          break

  # Evaluation
  evaluation_scores = {}
  if ref_file and tf.gfile.Exists(trans_file):
    for metric in metrics:
      score = evaluation_utils.evaluate(
          ref_file,
          trans_file,
          metric,
          subword_option=subword_option)
      evaluation_scores[metric] = score
      utils.print_out(" %s %s: %.1f" % (metric, name, score))

  return evaluation_scores
90 |
91 |
def get_translation(nmt_outputs, sent_id, tgt_eos, subword_option):
  """Pick one sentence out of batched decoder outputs and render it as text.

  Args:
    nmt_outputs: decoded outputs indexed as `nmt_outputs[sent_id, :]`.
    sent_id: index of the sentence to extract.
    tgt_eos: end-of-sentence token (text); when truthy, the output is cut at
      its first occurrence.
    subword_option: "bpe" or "spm" selects the matching formatter; any other
      value uses the plain text formatter.

  Returns:
    The formatted translation produced by the selected `utils.format_*`
    helper.
  """
  # Tokens in the decoded row are compared against the UTF-8 encoded form.
  eos_token = tgt_eos.encode("utf-8") if tgt_eos else tgt_eos

  tokens = nmt_outputs[sent_id, :].tolist()

  # Drop the end-of-sentence marker and everything after it.
  if eos_token and eos_token in tokens:
    cut = tokens.index(eos_token)
    tokens = tokens[:cut]

  if subword_option == "bpe":  # BPE
    return utils.format_bpe_text(tokens)
  if subword_option == "spm":  # SPM
    return utils.format_spm_text(tokens)
  return utils.format_text(tokens)
110 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/standard_hparams_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """standard hparams utils."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow as tf
23 |
24 |
def create_standard_hparams():
  """Return a `tf.contrib.training.HParams` holding the default NMT settings.

  The `sos` / `eos` defaults are the sentence boundary tokens "<s>" and
  "</s>". They had been reduced to empty strings, which made the two tokens
  indistinguishable from each other.
  """
  return tf.contrib.training.HParams(
      # Data
      src="",
      tgt="",
      train_prefix="",
      dev_prefix="",
      test_prefix="",
      vocab_prefix="",
      embed_prefix="",
      out_dir="",

      # Networks
      num_units=512,
      num_layers=2,
      num_encoder_layers=2,
      num_decoder_layers=2,
      dropout=0.2,
      unit_type="lstm",
      encoder_type="bi",
      residual=False,
      time_major=True,
      num_embeddings_partitions=0,

      # Attention mechanisms
      attention="scaled_luong",
      attention_architecture="standard",
      output_attention=True,
      pass_hidden_state=True,

      # Train
      optimizer="sgd",
      batch_size=128,
      init_op="uniform",
      init_weight=0.1,
      max_gradient_norm=5.0,
      learning_rate=1.0,
      warmup_steps=0,
      warmup_scheme="t2t",
      decay_scheme="luong234",
      colocate_gradients_with_ops=True,
      num_train_steps=12000,

      # Data constraints
      num_buckets=5,
      max_train=0,
      src_max_len=50,
      tgt_max_len=50,
      src_max_len_infer=0,
      tgt_max_len_infer=0,

      # Data format
      sos="<s>",
      eos="</s>",
      subword_option="",
      check_special_token=True,

      # Misc
      forget_bias=1.0,
      num_gpus=1,
      epoch_step=0,  # record where we were within an epoch.
      steps_per_stats=100,
      steps_per_external_eval=0,
      share_vocab=False,
      metrics=["bleu"],
      log_device_placement=False,
      random_seed=None,
      # only enable beam search during inference when beam_width > 0.
      beam_width=0,
      length_penalty_weight=0.0,
      override_loaded_hparams=True,
      num_keep_ckpts=5,
      avg_ckpts=False,

      # For inference
      inference_indices=None,
      infer_batch_size=32,
      sampling_temperature=0.0,
      num_translations_per_input=1,
  )
105 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/vocab_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Utility to handle vocabularies."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import codecs
23 | import os
24 | import tensorflow as tf
25 |
26 | from tensorflow.python.ops import lookup_ops
27 |
28 | from utils import misc_utils as utils
29 |
30 |
# Default special vocabulary tokens: unknown word, start of sentence and end
# of sentence. check_vocab() falls back to these when the caller passes none
# and expects them, in this order, as the first three vocabulary entries, so
# they must be distinct and non-empty.
UNK = "<unk>"
SOS = "<s>"
EOS = "</s>"
# Id returned by the vocab lookup tables for out-of-vocabulary words.
UNK_ID = 0
35 |
36 |
def load_vocab(vocab_file):
  """Read a UTF-8 vocabulary file with one entry per line.

  Args:
    vocab_file: path of the vocabulary file, opened through `tf.gfile`.

  Returns:
    A pair `(vocab, vocab_size)`: the list of whitespace-stripped lines and
    the number of lines read.
  """
  with codecs.getreader("utf-8")(tf.gfile.GFile(vocab_file, "rb")) as reader:
    entries = [line.strip() for line in reader]
  return entries, len(entries)
45 |
46 |
def check_vocab(vocab_file, out_dir, check_special_token=True, sos=None,
                eos=None, unk=None):
  """Validate a vocab file and, if needed, write a copy with special tokens.

  When `check_special_token` is true and the first three entries are not
  `[unk, sos, eos]`, those tokens are prepended and the result is written to
  `out_dir` under the same base name; that new file is then the one returned.

  Args:
    vocab_file: path of the existing vocabulary file.
    out_dir: directory that receives the rewritten vocabulary, if any.
    check_special_token: whether to verify the leading special tokens.
    sos: start-of-sentence token; falls back to `SOS` when falsy.
    eos: end-of-sentence token; falls back to `EOS` when falsy.
    unk: unknown-word token; falls back to `UNK` when falsy.

  Returns:
    A pair `(vocab_size, vocab_file)` describing the vocabulary to use.

  Raises:
    ValueError: if `vocab_file` does not exist.
  """
  if not tf.gfile.Exists(vocab_file):
    raise ValueError("vocab_file '%s' does not exist." % vocab_file)

  utils.print_out("# Vocab file %s exists" % vocab_file)
  vocab, _ = load_vocab(vocab_file)

  if check_special_token:
    # Verify if the vocab starts with unk, sos, eos
    # If not, prepend those tokens & generate a new vocab file
    unk = unk or UNK
    sos = sos or SOS
    eos = eos or EOS
    assert len(vocab) >= 3
    special_tokens = [unk, sos, eos]
    if vocab[:3] != special_tokens:
      utils.print_out("The first 3 vocab words [%s, %s, %s]"
                      " are not [%s, %s, %s]" %
                      (vocab[0], vocab[1], vocab[2], unk, sos, eos))
      vocab = special_tokens + vocab
      rewritten_path = os.path.join(out_dir, os.path.basename(vocab_file))
      with codecs.getwriter("utf-8")(
          tf.gfile.GFile(rewritten_path, "wb")) as writer:
        for entry in vocab:
          writer.write("%s\n" % entry)
      vocab_file = rewritten_path

  return len(vocab), vocab_file
77 |
78 |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Builds the source and target word-to-id lookup tables.

  Args:
    src_vocab_file: vocab file backing the source table.
    tgt_vocab_file: vocab file backing the target table; not read when
      share_vocab is true.
    share_vocab: if true, the source table object is reused as target table.

  Returns:
    A tuple (src_vocab_table, tgt_vocab_table). Both tables use UNK_ID as the
    default value.
  """

  def _table_for(path):
    return lookup_ops.index_table_from_file(path, default_value=UNK_ID)

  src_vocab_table = _table_for(src_vocab_file)
  tgt_vocab_table = (
      src_vocab_table if share_vocab else _table_for(tgt_vocab_file))
  return src_vocab_table, tgt_vocab_table
89 |
90 |
def load_embed_txt(embed_file):
  """Load embed_file into a python dictionary.

  Note: the embed_file should be a Glove formated txt file. Assuming
  embed_size=5, for example:

  the -0.071549 0.093459 0.023738 -0.090339 0.056123
  to 0.57346 0.5417 -0.23477 -0.3624 0.4037
  and 0.20327 0.47348 0.050877 0.002103 0.060547

  Blank lines are ignored.

  Args:
    embed_file: file path to the embedding file.
  Returns:
    a dictionary that maps word to vector, and the size of embedding dimensions
    (None when the file contains no entries).
  Raises:
    AssertionError: if two entries have a different number of dimensions.
  """
  emb_dict = dict()
  emb_size = None
  with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, 'rb')) as f:
    for line in f:
      line = line.strip()
      if not line:
        # A blank line would otherwise register "" with an empty vector and
        # then fail the size check below.
        continue
      tokens = line.split(" ")
      word = tokens[0]
      vec = list(map(float, tokens[1:]))
      emb_dict[word] = vec
      # Compare against None so that a size of 0 still counts as "already set".
      if emb_size is None:
        emb_size = len(vec)
      else:
        assert emb_size == len(vec), "All embedding size should be same."
  return emb_dict, emb_size
119 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/vocab_utils_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Tests for vocab_utils."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import codecs
23 | import os
24 | import tensorflow as tf
25 |
26 | from ..utils import vocab_utils
27 |
28 |
class VocabUtilsTest(tf.test.TestCase):
  """Tests for the helpers in vocab_utils."""

  def testCheckVocab(self):
    """check_vocab prepends the special tokens and writes a new vocab file."""
    tmp_root = tf.test.get_temp_dir()

    # Write a vocab file that lacks the special tokens.
    vocab_dir = os.path.join(tmp_root, "vocab_dir")
    os.makedirs(vocab_dir)
    vocab_file = os.path.join(vocab_dir, "vocab_file")
    vocab = ["a", "b", "c"]
    writer_factory = codecs.getwriter("utf-8")
    with writer_factory(tf.gfile.GFile(vocab_file, "wb")) as f:
      for word in vocab:
        f.write("%s\n" % word)

    # Run the function under test with a separate output directory.
    out_dir = os.path.join(tmp_root, "out_dir")
    os.makedirs(out_dir)
    vocab_size, new_vocab_file = vocab_utils.check_vocab(vocab_file, out_dir)

    # Expectation: the special tokens (vocab_utils.UNK, vocab_utils.SOS and
    # vocab_utils.EOS) are added in front and a new vocab file is created in
    # out_dir.
    expected_vocab = (
        [vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS] + vocab)
    self.assertEqual(len(vocab) + 3, vocab_size)
    self.assertEqual(os.path.join(out_dir, "vocab_file"), new_vocab_file)
    new_vocab, _ = vocab_utils.load_vocab(new_vocab_file)
    self.assertEqual(expected_vocab, new_vocab)


if __name__ == "__main__":
  tf.test.main()
58 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/simple/README.md:
--------------------------------------------------------------------------------
1 | # Simple Example
2 | This is a basic distributed training example with parallax.
3 |
4 | ## To Run
5 | Set your resource information in the `resource_info` file.
6 |
7 | Then execute:
8 | ```shell
9 | $ python simple_driver.py
10 | ```
11 |
12 | The command assumes that the simple example codebase is available at the same absolute path on each of the machines.
13 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/simple/resource_info:
--------------------------------------------------------------------------------
1 | 123.456.78.90:1,2,4,5
2 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/simple/simple_driver.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import os
17 | import numpy as np
18 | import tensorflow as tf
19 | import argparse
20 |
21 | import parallax
22 |
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument('-lr', "--learning_rate", type=float, default=0.01,
25 | help='Learning rate')
26 |
27 | args = parser.parse_args()
28 |
# Training inputs: one 2-dimensional sample per row.
train_x = np.array([
    [-0.880728357, -0.706550564],
    [-0.179175969, 0.052373456],
    [0.460992645, 0.328267666],
    [-0.378916048, 0.86581809],
    [-0.064562793, -0.755948805],
    [-0.585833517, -0.46743004],
    [-0.151177544, -0.582325109],
    [-0.720116833, 0.834904979],
    [-0.518939078, -0.670627318],
    [-0.035878422, 0.750102543],
    [-0.673400627, -0.919498322],
    [-0.731202767, -0.159733489],
    [-0.463404605, 0.697764632],
    [0.706744043, 0.458026442],
    [0.819940015, -0.867168658],
    [-0.056113501, -0.602024627],
    [0.213450484, -0.20133007],
    [-0.358544296, -0.40380244]
])

# Training targets: row i is the scalar label of train_x[i].
train_y = np.array([
    [2.306799664],
    [1.825970013],
    [1.901374447],
    [0.909895597],
    [2.723102683],
    [2.145410027],
    [2.498034199],
    [0.844066487],
    [2.401599333],
    [1.274285598],
    [2.542184193],
    [1.81653423],
    [1.06511757],
    [1.891457798],
    [3.317388286],
    [2.579920223],
    [2.301286159],
    [2.197386858],
])

# Number of training samples; used to wrap the sample cursor around.
num_samples = train_x.shape[0]
91 |
92 |
def main(_):
    """Builds a single-GPU linear-regression graph and trains it via parallax."""
    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        global_step = tf.train.get_or_create_global_step()
        x = tf.placeholder(tf.float32, shape=(2))
        y = tf.placeholder(tf.float32, shape=(1))

        w = tf.get_variable(name='w', shape=(2, 1))
        b = tf.get_variable(name='b', shape=(1))

        # Linear model pred = x * w + b on a batch of one, with half the
        # squared error as loss.
        x_row = tf.expand_dims(x, axis=0)
        pred = tf.nn.bias_add(tf.matmul(x_row, w), b)
        y_row = tf.expand_dims(y, axis=0)
        loss = tf.reduce_sum(tf.pow(pred - y_row, 2)) / 2

        optimizer = tf.train.GradientDescentOptimizer(args.learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)

    def run(sess, num_workers, worker_id, num_replicas_per_worker):
        """Runs 1000 training steps, feeding one sample per replica."""
        fetches = {
            'global_step': global_step,
            'loss': loss,
            'train_op': train_op
        }
        for step in range(1000):
            # Each step consumes num_replicas_per_worker consecutive samples,
            # wrapping around at the end of the data set.
            first = step * num_replicas_per_worker
            sample_ids = [(first + offset) % num_samples
                          for offset in range(num_replicas_per_worker)]
            feed_dict = {
                x: [train_x[idx] for idx in sample_ids],
                y: [train_y[idx] for idx in sample_ids],
            }

            results = sess.run(fetches, feed_dict=feed_dict)

            if step % 5 == 0:
                # Fetched values are indexed with [0] before printing.
                print("global step: %d, loss: %f"
                      % (results['global_step'][0], results['loss'][0]))

    here = os.path.dirname(os.path.abspath(__file__))
    resource_info = os.path.join(here, 'resource_info')
    sess, num_workers, worker_id, num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph, resource_info)
    run(sess, num_workers, worker_id, num_replicas_per_worker)


if __name__ == '__main__':
    tf.app.run()
140 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/README.md:
--------------------------------------------------------------------------------
1 | # Skip-Thought Vectors
2 | This example implements the model described in [Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf).
3 | The original code comes from [here](https://github.com/tensorflow/models/tree/master/research/skip_thoughts).
4 | We changed only a minimal amount of the original code: the import paths and the BUILD file.
5 | We added the `skip_distributed_driver.py` file and modified the `ops/input_ops.py` file (for data sharding) to run the example on parallax.
6 |
7 | ## Dataset
8 | * Follow the instructions shown in [Prepare the Training Data](https://github.com/tensorflow/models/tree/master/research/skip_thoughts).
9 |
10 | ## To Run
11 | Set your resource information in the `resource_info` file.
12 |
13 | Then execute:
14 | ```shell
15 | $ python skip_distributed_driver.py --input_file_pattern ${DATA_DIR}/data/train-?????-of-00100
16 | ```
17 | The command above runs a single Skip-Thought Vectors model on multiple devices specified in `resource_info`.
18 | The command assumes that the data directory and the Skip-Thought Vectors codebase are distributed and reachable in the same absolute path in each of the machines.
19 |
20 | Also, we have a few more options you can choose for distributed running.
21 |
22 | | Parameter Name | Default | Description |
23 | | :------------------- |:-----------------------| :-----------|
24 | | --data_path | None | Where the training/test data is stored |
25 | | --input_file_pattern | "" | File pattern of training data |
26 | | --batch_size | 128 | Batch size |
27 | | --resource_info_file | `./resource_info` | Name of the file that contains the cluster information |
28 | | --max_steps | 1000000 | Number of iterations to run for each worker |
29 | | --log_frequency | 100 | Number of steps between two consecutive log messages |
30 | | --sync | True | Whether to synchronize learning or not |
31 | | --ckpt_dir | None | Directory to save checkpoints |
32 | | --save_ckpt_steps | 0 | Number of steps between two consecutive checkpoints |
33 | | --run_option | None | Which communication mode to use, PS or MPI; None uses both |
34 |
35 |
36 | You can customize the distributed run with the options above. For example, if you want to fix the communication model to MPI mode, pass the `--run_option` flag as shown below.
37 |
38 | ```shell
39 | $ python skip_distributed_driver.py --input_file_pattern ${DATA_DIR}/data/train-?????-of-00100 --run_option MPI
40 | ```
41 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/configuration.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Default configuration for model architecture and training.
16 |
17 | Original source : https://github.com/tensorflow/models/tree/master/skip_thoughts
18 |
19 | """
20 |
21 | from __future__ import absolute_import
22 | from __future__ import division
23 | from __future__ import print_function
24 |
25 |
class _HParams(object):
  """Plain attribute container that holds configuration parameters.

  Instances start out empty; model_config() and training_config() attach the
  individual settings as attributes.
  """
29 |
30 |
def model_config(input_file_pattern=None,
                 input_queue_capacity=640000,
                 num_input_reader_threads=1,
                 shuffle_input_data=True,
                 uniform_init_scale=0.1,
                 vocab_size=20000,
                 batch_size=128,
                 word_embedding_dim=620,
                 bidirectional_encoder=False,
                 encoder_dim=2400):
  """Builds the object that carries the model hyperparameters.

  Every argument is stored unchanged as an attribute of the same name.

  Args:
    input_file_pattern: File pattern of sharded TFRecord files containing
      tf.Example protobufs.
    input_queue_capacity: Number of examples to keep in the input queue.
    num_input_reader_threads: Number of threads for prefetching input
      tf.Examples.
    shuffle_input_data: Whether to shuffle the input data.
    uniform_init_scale: Scale of random uniform initializer.
    vocab_size: Number of unique words in the vocab.
    batch_size: Batch size (training and evaluation only).
    word_embedding_dim: Word embedding dimension.
    bidirectional_encoder: Whether to use a bidirectional or unidirectional
      encoder RNN.
    encoder_dim: Number of output dimensions of the sentence encoder.

  Returns:
    An object containing model configuration parameters.
  """
  # (attribute name, value) pairs, in the order the attributes are assigned.
  settings = (
      ("input_file_pattern", input_file_pattern),
      ("input_queue_capacity", input_queue_capacity),
      ("num_input_reader_threads", num_input_reader_threads),
      ("shuffle_input_data", shuffle_input_data),
      ("uniform_init_scale", uniform_init_scale),
      ("vocab_size", vocab_size),
      ("batch_size", batch_size),
      ("word_embedding_dim", word_embedding_dim),
      ("bidirectional_encoder", bidirectional_encoder),
      ("encoder_dim", encoder_dim),
  )
  config = _HParams()
  for attr_name, attr_value in settings:
    setattr(config, attr_name, attr_value)
  return config
73 |
74 |
def training_config(learning_rate=0.0008,
                    learning_rate_decay_factor=0.5,
                    learning_rate_decay_steps=400000,
                    number_of_steps=500000,
                    clip_gradient_norm=5.0,
                    save_model_secs=600,
                    save_summaries_secs=600):
  """Builds the object that carries the training hyperparameters.

  Every argument is stored unchanged as an attribute of the same name.

  Args:
    learning_rate: Initial learning rate.
    learning_rate_decay_factor: If > 0, the learning rate decay factor.
    learning_rate_decay_steps: The number of steps before the learning rate
      decays by learning_rate_decay_factor.
    number_of_steps: The total number of training steps to run. Passing None
      will cause the training script to run indefinitely.
    clip_gradient_norm: If not None, then clip gradients to this value.
    save_model_secs: How often (in seconds) to save model checkpoints.
    save_summaries_secs: How often (in seconds) to save model summaries.

  Returns:
    An object containing training configuration parameters.

  Raises:
    ValueError: If learning_rate_decay_factor is set and
      learning_rate_decay_steps is unset.
  """
  decay_requested = bool(learning_rate_decay_factor)
  if decay_requested and not learning_rate_decay_steps:
    raise ValueError(
        "learning_rate_decay_factor requires learning_rate_decay_steps.")

  config = _HParams()
  for attr_name, attr_value in (
      ("learning_rate", learning_rate),
      ("learning_rate_decay_factor", learning_rate_decay_factor),
      ("learning_rate_decay_steps", learning_rate_decay_steps),
      ("number_of_steps", number_of_steps),
      ("clip_gradient_norm", clip_gradient_norm),
      ("save_model_secs", save_model_secs),
      ("save_summaries_secs", save_summaries_secs)):
    setattr(config, attr_name, attr_value)
  return config
115 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/data/special_words.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Special word constants.
16 |
17 | NOTE: The ids of the EOS and UNK constants should not be modified. It is assumed
18 | that these always occupy the first two ids.
19 | """
20 |
21 | # End of sentence.
22 | EOS = ""
23 | EOS_ID = 0
24 |
25 | # Unknown.
26 | UNK = ""
27 | UNK_ID = 1
28 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/encoder_manager.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Manager class for loading and encoding with multiple skip-thoughts models.
16 |
17 | If multiple models are loaded at once then the encode() function returns the
18 | concatenation of the outputs of each model.
19 |
20 | Example usage:
21 | manager = EncoderManager()
22 | manager.load_model(model_config_1, vocabulary_file_1, embedding_matrix_file_1,
23 | checkpoint_path_1)
24 | manager.load_model(model_config_2, vocabulary_file_2, embedding_matrix_file_2,
25 | checkpoint_path_2)
26 | encodings = manager.encode(data)
27 |
28 | Original source : https://github.com/tensorflow/models/tree/master/skip_thoughts
29 |
30 | """
31 |
32 | from __future__ import absolute_import
33 | from __future__ import division
34 | from __future__ import print_function
35 |
36 | import collections
37 |
38 | import numpy as np
39 | import tensorflow as tf
40 |
41 | import skip_thoughts_encoder
42 |
43 |
class EncoderManager(object):
  """Manager class for loading and encoding with skip-thoughts models.

  Each call to load_model() adds one encoder together with the session it was
  restored into; encode() runs every loaded encoder and concatenates their
  outputs.
  """

  def __init__(self):
    # Parallel lists: sessions[i] is the session that belongs to encoders[i].
    self.encoders = []
    self.sessions = []

  def load_model(self, model_config, vocabulary_file, embedding_matrix_file,
                 checkpoint_path):
    """Loads a skip-thoughts model.

    Args:
      model_config: Object containing parameters for building the model.
      vocabulary_file: Path to vocabulary file containing a list of newline-
        separated words where the word id is the corresponding 0-based index in
        the file.
      embedding_matrix_file: Path to a serialized numpy array of shape
        [vocab_size, embedding_dim].
      checkpoint_path: SkipThoughtsModel checkpoint file or a directory
        containing a checkpoint file.
    """
    tf.logging.info("Reading vocabulary from %s", vocabulary_file)
    # Read raw bytes and decode explicitly: in text mode the lines are already
    # str on Python 3 and have no decode() method.
    with tf.gfile.GFile(vocabulary_file, mode="rb") as f:
      lines = list(f.readlines())
    reverse_vocab = [line.decode("utf-8").strip() for line in lines]
    tf.logging.info("Loaded vocabulary with %d words.", len(reverse_vocab))

    tf.logging.info("Loading embedding matrix from %s",
                    embedding_matrix_file)
    # Note: tf.gfile.GFile doesn't work here because np.load() calls f.seek()
    # with 3 arguments.
    # The serialized array is binary data, so the file is opened in "rb" mode.
    with open(embedding_matrix_file, "rb") as f:
      embedding_matrix = np.load(f)
    tf.logging.info("Loaded embedding matrix with shape %s",
                    embedding_matrix.shape)

    # Maps each word to its embedding row, preserving vocabulary order.
    word_embeddings = collections.OrderedDict(
        zip(reverse_vocab, embedding_matrix))

    g = tf.Graph()
    with g.as_default():
      encoder = skip_thoughts_encoder.SkipThoughtsEncoder(word_embeddings)
      restore_model = encoder.build_graph_from_config(model_config,
                                                      checkpoint_path)

    sess = tf.Session(graph=g)
    restore_model(sess)

    self.encoders.append(encoder)
    self.sessions.append(sess)

  def encode(self,
             data,
             use_norm=True,
             verbose=False,
             batch_size=128,
             use_eos=False):
    """Encodes a sequence of sentences as skip-thought vectors.

    Args:
      data: A list of input strings.
      use_norm: If True, normalize output skip-thought vectors to unit
        L2 norm.
      verbose: Whether to log every batch.
      batch_size: Batch size for the RNN encoders.
      use_eos: If True, append the end-of-sentence word to each input
        sentence.

    Returns:
      thought_vectors: A numpy array with one row per element of 'data'; the
        outputs of all loaded encoders are concatenated along axis 1.

    Raises:
      ValueError: If called before calling load_model.
    """
    if not self.encoders:
      raise ValueError(
          "Must call load_model at least once before calling encode.")

    encoded = []
    for encoder, sess in zip(self.encoders, self.sessions):
      encoded.append(
          np.array(
              encoder.encode(
                  sess,
                  data,
                  use_norm=use_norm,
                  verbose=verbose,
                  batch_size=batch_size,
                  use_eos=use_eos)))

    return np.concatenate(encoded, axis=1)

  def close(self):
    """Closes the active TensorFlow Sessions."""
    for sess in self.sessions:
      sess.close()
140 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/evaluate.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Script to evaluate a skip-thoughts model.
16 |
17 | This script can evaluate a model with a unidirectional encoder ("uni-skip" in
18 | the paper); or a model with a bidirectional encoder ("bi-skip"); or the
19 | combination of a model with a unidirectional encoder and a model with a
20 | bidirectional encoder ("combine-skip").
21 |
22 | The uni-skip model (if it exists) is specified by the flags
23 | --uni_vocab_file, --uni_embeddings_file, --uni_checkpoint_path.
24 |
25 | The bi-skip model (if it exists) is specified by the flags
26 | --bi_vocab_file, --bi_embeddings_file, --bi_checkpoint_path.
27 |
28 | The evaluation tasks have different running times. SICK may take 5-10 minutes.
29 | MSRP, TREC and CR may take 20-60 minutes. SUBJ, MPQA and MR may take 2+ hours.
30 |
31 | Original source : https://github.com/tensorflow/models/tree/master/skip_thoughts
32 |
33 | """
34 |
35 | from __future__ import absolute_import
36 | from __future__ import division
37 | from __future__ import print_function
38 |
39 | import tensorflow as tf
40 |
41 | from skipthoughts import eval_classification
42 | from skipthoughts import eval_msrp
43 | from skipthoughts import eval_sick
44 | from skipthoughts import eval_trec
45 | from skip_thoughts import configuration
46 | from skip_thoughts import encoder_manager
47 |
FLAGS = tf.flags.FLAGS

# Task selection and data location.
tf.flags.DEFINE_string("eval_task", "CR",
                       "Name of the evaluation task to run. Available tasks: "
                       "MR, CR, SUBJ, MPQA, SICK, MSRP, TREC.")

tf.flags.DEFINE_string("data_dir", None, "Directory containing training data.")

# The unidirectional ("uni") and bidirectional ("bi") models take the same
# three flags; each entry is (flag suffix, help text). Flags are defined in
# the order vocab_file, embeddings_file, checkpoint_path, with "uni" before
# "bi" for every suffix.
_MODEL_FLAGS = (
    ("vocab_file",
     "Path to vocabulary file containing a list of newline-"
     "separated words where the word id is the "
     "corresponding 0-based index in the file."),
    ("embeddings_file",
     "Path to serialized numpy array of shape "
     "[vocab_size, embedding_dim]."),
    ("checkpoint_path",
     "Checkpoint file or directory containing a checkpoint "
     "file."),
)
for _suffix, _help_text in _MODEL_FLAGS:
  for _model in ("uni", "bi"):
    tf.flags.DEFINE_string("%s_%s" % (_model, _suffix), None, _help_text)

tf.logging.set_verbosity(tf.logging.INFO)
80 |
81 |
def main(unused_argv):
  """Loads the requested encoder(s) and runs the selected evaluation task.

  Args:
    unused_argv: Command-line arguments left over after flag parsing; ignored.

  Raises:
    ValueError: If --data_dir is missing or --eval_task is not recognized.
  """
  if not FLAGS.data_dir:
    raise ValueError("--data_dir is required.")

  # Reject an unknown task before any model is loaded, so that a typo does not
  # first open sessions that are then never closed.
  classification_tasks = ["MR", "CR", "SUBJ", "MPQA"]
  if FLAGS.eval_task not in classification_tasks + ["SICK", "MSRP", "TREC"]:
    raise ValueError("Unrecognized eval_task: %s" % FLAGS.eval_task)

  encoder = encoder_manager.EncoderManager()
  try:
    # Maybe load unidirectional encoder.
    if FLAGS.uni_checkpoint_path:
      print("Loading unidirectional model...")
      uni_config = configuration.model_config()
      encoder.load_model(uni_config, FLAGS.uni_vocab_file,
                         FLAGS.uni_embeddings_file, FLAGS.uni_checkpoint_path)

    # Maybe load bidirectional encoder.
    if FLAGS.bi_checkpoint_path:
      print("Loading bidirectional model...")
      bi_config = configuration.model_config(bidirectional_encoder=True)
      encoder.load_model(bi_config, FLAGS.bi_vocab_file,
                         FLAGS.bi_embeddings_file,
                         FLAGS.bi_checkpoint_path)

    if FLAGS.eval_task in classification_tasks:
      eval_classification.eval_nested_kfold(
          encoder, FLAGS.eval_task, FLAGS.data_dir, use_nb=False)
    elif FLAGS.eval_task == "SICK":
      eval_sick.evaluate(encoder, evaltest=True, loc=FLAGS.data_dir)
    elif FLAGS.eval_task == "MSRP":
      eval_msrp.evaluate(
          encoder, evalcv=True, evaltest=True, use_feats=True,
          loc=FLAGS.data_dir)
    else:  # "TREC"; all other values were rejected above.
      eval_trec.evaluate(encoder, evalcv=True, evaltest=True,
                         loc=FLAGS.data_dir)
  finally:
    # Release the sessions even when loading or evaluation fails.
    encoder.close()


if __name__ == "__main__":
  tf.app.run()
123 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/skip_thoughts/ops/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/ops/gru_cell.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """GRU cell implementation for the skip-thought vectors model."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 |
22 | import tensorflow as tf
23 |
24 | _layer_norm = tf.contrib.layers.layer_norm
25 |
26 |
class LayerNormGRUCell(tf.contrib.rnn.RNNCell):
  """A GRU recurrent cell whose pre-activations are layer-normalized.

  Layer normalization follows the paper "Layer Normalization" by
  Jimmy Lei Ba, Jamie Ryan Kiros and Geoffrey E. Hinton:

  https://arxiv.org/abs/1607.06450.
  """

  def __init__(self,
               num_units,
               w_initializer,
               u_initializer,
               b_initializer,
               activation=tf.nn.tanh):
    """Stores the hyperparameters of the cell.

    Args:
      num_units: Number of cell units.
      w_initializer: Initializer for the "W" (input) parameter matrices.
      u_initializer: Initializer for the "U" (recurrent) parameter matrices.
      b_initializer: Initializer for the "b" (bias) parameter vectors. It is
        stored on the instance; no other method of this class reads it.
      activation: Cell activation function.
    """
    self._num_units = num_units
    self._w_initializer = w_initializer
    self._u_initializer = u_initializer
    self._b_initializer = b_initializer
    self._activation = activation

  @property
  def state_size(self):
    # The recurrent state is a single vector of `num_units` entries.
    return self._num_units

  @property
  def output_size(self):
    # The output is the new state, so it has the same width.
    return self._num_units

  def _w_h_initializer(self):
    """Builds the initializer for the fused recurrent gate matrix "W_h".

    See equation (23) in the paper. "W_h" is the column-wise concatenation
    [U_z, U_r] of two matrices, each drawn from the "U" initializer.

    Returns:
      A callable `(shape, dtype, partition_info)` producing a Tensor with
      shape [num_units, 2 * num_units].
    """

    def _initializer(shape, dtype=tf.float32, partition_info=None):
      units = self._num_units
      assert shape == [units, 2 * units]
      # Two independent draws: first U_z, then U_r.
      blocks = [
          self._u_initializer([units, units], dtype, partition_info)
          for _ in range(2)
      ]
      return tf.concat(blocks, 1)

    return _initializer

  def _w_x_initializer(self, input_dim):
    """Builds the initializer for the fused input gate matrix "W_x".

    See equation (23) in the paper. "W_x" is the column-wise concatenation
    [W_z, W_r] of two matrices, each drawn from the "W" initializer.

    Args:
      input_dim: The dimension of the cell inputs.

    Returns:
      A callable `(shape, dtype, partition_info)` producing a Tensor with
      shape [input_dim, 2 * num_units].
    """

    def _initializer(shape, dtype=tf.float32, partition_info=None):
      units = self._num_units
      assert shape == [input_dim, 2 * units]
      # Two independent draws: first W_z, then W_r.
      blocks = [
          self._w_initializer([input_dim, units], dtype, partition_info)
          for _ in range(2)
      ]
      return tf.concat(blocks, 1)

    return _initializer

  def __call__(self, inputs, state, scope=None):
    """Runs one step of the layer-normalized GRU.

    Args:
      inputs: Input Tensor; its second dimension is read as the input size.
      state: Previous hidden state Tensor.
      scope: Optional variable scope; defaults to "gru_cell".

    Returns:
      A pair (new_h, new_h): the new hidden state, used both as the cell
      output and as the next state.
    """
    units = self._num_units
    input_dim = inputs.get_shape().as_list()[1]

    with tf.variable_scope(scope or "gru_cell"):
      with tf.variable_scope("gates"):
        w_h = tf.get_variable(
            "w_h", [units, 2 * units],
            initializer=self._w_h_initializer())
        w_x = tf.get_variable(
            "w_x", [input_dim, 2 * units],
            initializer=self._w_x_initializer(input_dim))
        # Recurrent and input contributions are normalized separately and
        # then summed before the sigmoid.
        gates_from_state = _layer_norm(
            tf.matmul(state, w_h), scope="layer_norm/w_h")
        gates_from_input = _layer_norm(
            tf.matmul(inputs, w_x), scope="layer_norm/w_x")
        z, r = tf.split(tf.sigmoid(gates_from_state + gates_from_input), 2, 1)

      with tf.variable_scope("candidate"):
        w = tf.get_variable(
            "w", [input_dim, units], initializer=self._w_initializer)
        u = tf.get_variable(
            "u", [units, units], initializer=self._u_initializer)
        # The reset gate scales only the (normalized) recurrent term.
        gated_recurrent = r * _layer_norm(
            tf.matmul(state, u), scope="layer_norm/u")
        h_hat = gated_recurrent + _layer_norm(
            tf.matmul(inputs, w), scope="layer_norm/w")

      # Interpolate between the old state and the activated candidate.
      carried = (1 - z) * state
      updated = z * self._activation(h_hat)
      new_h = carried + updated
    return new_h, new_h
135 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/ops/input_ops.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Input ops."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 |
23 |
24 | import tensorflow as tf
25 | from parallax import shard
26 |
27 | # A SentenceBatch is a pair of Tensors:
28 | # ids: Batch of input sentences represented as sequences of word ids: an int64
29 | # Tensor with shape [batch_size, padded_length].
30 | # mask: Boolean mask distinguishing real words (1) from padded words (0): an
31 | # int32 Tensor with shape [batch_size, padded_length].
32 | SentenceBatch = collections.namedtuple("SentenceBatch", ("ids", "mask"))
33 |
34 |
def parse_example_batch(serialized):
  """Turns a batch of serialized tf.Example protos into sentence batches.

  Args:
    serialized: A 1-D string Tensor; a batch of serialized tf.Example protos.

  Returns:
    encode: A SentenceBatch of encode sentences.
    decode_pre: A SentenceBatch of "previous" sentences to decode.
    decode_post: A SentenceBatch of "post" sentences to decode.
  """
  field_names = ("encode", "decode_pre", "decode_post")

  # Every field is a variable-length list of int64 word ids.
  parsed = tf.parse_example(
      serialized,
      features={
          name: tf.VarLenFeature(dtype=tf.int64) for name in field_names
      })

  batches = []
  for name in field_names:
    sparse = parsed[name]
    # Densifying pads the shorter sentences with zeroes.
    dense_ids = tf.sparse_tensor_to_dense(sparse)
    # The mask is 1 at every position that held a real id, 0 elsewhere.
    word_mask = tf.sparse_to_dense(
        sparse.indices, sparse.dense_shape,
        tf.ones_like(sparse.values, dtype=tf.int32))
    batches.append(SentenceBatch(ids=dense_ids, mask=word_mask))

  return tuple(batches)
61 |
62 |
def prefetch_input_data(reader,
                        file_pattern,
                        shuffle,
                        capacity,
                        num_reader_threads=1):
  """Reads this shard's input files into a queue of serialized records.

  The matching files are sorted and split across the shards reported by
  `shard.create_num_shards_and_shard_id()`: every shard gets
  `num_files / num_shards` files (truncated), and the first
  `num_files % num_shards` shards get one extra file each.

  Args:
    reader: Instance of tf.ReaderBase.
    file_pattern: Comma-separated list of file patterns (e.g.
        "/tmp/train_data-?????-of-00100", where '?' acts as a wildcard that
        matches any character).
    shuffle: Boolean; whether to randomly shuffle the input data.
    capacity: Queue capacity (number of records) of the values queue.
    num_reader_threads: Number of reader threads feeding into the queue.

  Returns:
    A Queue containing prefetched string values.
  """
  data_files = [
      path
      for pattern in file_pattern.split(",")
      for path in tf.gfile.Glob(pattern)
  ]
  if data_files:
    tf.logging.info("Prefetching values from %d files matching %s",
                    len(data_files), file_pattern)
  else:
    tf.logging.fatal("Found no input files matching %s", file_pattern)

  # Sorting gives every worker the same file order before slicing.
  data_files.sort()
  num_files = len(data_files)
  num_shards, shard_id = shard.create_num_shards_and_shard_id()
  shard_size = tf.cast(num_files / num_shards, dtype=tf.int64)
  remainder = num_files % num_shards

  def _begin_with_extra_files():
    return (shard_size + 1) * shard_id

  def _begin_after_extra_files():
    return shard_size * shard_id + remainder

  def _size_with_extra_file():
    return shard_size + 1

  def _size_without_extra_file():
    return shard_size

  starts_in_extra_region = tf.less(shard_id, remainder + 1)
  slice_begin = tf.cond(starts_in_extra_region, _begin_with_extra_files,
                        _begin_after_extra_files)
  gets_extra_file = tf.less(shard_id, remainder)
  slice_size = tf.cond(gets_extra_file, _size_with_extra_file,
                       _size_without_extra_file)
  data_files = tf.slice(data_files, [slice_begin], [slice_size])

  # The filename queue uses a fixed capacity; `capacity` sizes the values
  # queue only.
  filename_queue = tf.train.string_input_producer(
      data_files, shuffle=shuffle, capacity=16, name="filename_queue")

  if not shuffle:
    values_queue = tf.FIFOQueue(
        capacity=capacity,
        dtypes=[tf.string],
        shapes=[[]],
        name="fifo_input_queue")
  else:
    values_queue = tf.RandomShuffleQueue(
        capacity=capacity,
        min_after_dequeue=int(0.6 * capacity),
        dtypes=[tf.string],
        shapes=[[]],
        name="random_input_queue")

  # One read + enqueue pair per reader thread.
  enqueue_ops = []
  for _ in range(num_reader_threads):
    record = reader.read(filename_queue)[1]
    enqueue_ops.append(values_queue.enqueue([record]))

  runner = tf.train.queue_runner.QueueRunner(values_queue, enqueue_ops)
  tf.train.queue_runner.add_queue_runner(runner)

  summary_name = "queue/%s/fraction_of_%d_full" % (values_queue.name,
                                                   capacity)
  fill_fraction = tf.cast(values_queue.size(), tf.float32) * (1.0 / capacity)
  tf.summary.scalar(summary_name, fill_fraction)

  return values_queue
132 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/parallax_config.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import tensorflow as tf
17 | import parallax
18 |
19 |
20 | flags = tf.app.flags
21 | flags.DEFINE_boolean('replicate_variables', True, """replicate_variables""")
22 | flags.DEFINE_string('protocol', 'grpc', """The method for managing variables""")
23 | flags.DEFINE_string('mpirun_options', '', 'The option for mpirun')
24 | flags.DEFINE_string('run_option', 'HYBRID',
25 | 'The run option whether PS, MPI or HYBRID')
26 | flags.DEFINE_string('redirect_path', None, """redirect path to keep the log of distributed workers""")
27 | flags.DEFINE_string('ckpt_dir', None, """Directory to save checkpoints""")
28 | flags.DEFINE_integer('save_ckpt_steps', None,
29 | """Number of steps between two consecutive checkpoints""")
30 | flags.DEFINE_string('profile_dir', None, """Directory to save RunMetadata""")
31 | flags.DEFINE_string('profile_steps', None, """Comma separated porfile steps""")
32 | flags.DEFINE_boolean('local_aggregation', True,
33 | """Whether to use local aggregation or not""")
34 | flags.DEFINE_boolean('boundary_among_servers', True,
35 | """Whether to use operation placement among servers""")
36 | flags.DEFINE_boolean('boundary_between_workers_and_servers', True,
37 | """Whether to use operation placement between workers and servers""")
38 | flags.DEFINE_string('export_graph_path', None, """export path to keep transformed graph definintion""")
39 | FLAGS = flags.FLAGS
40 |
def build_config():
    """Builds a `parallax.Config` from the flags defined in this module.

    Returns:
        A `parallax.Config` whose run option, communication (PS and MPI),
        checkpoint, profile, redirect-path and export-graph-path settings
        are taken from the corresponding `FLAGS` values. `average_sparse`
        is always set to False.

    Raises:
        ValueError: If `--profile_steps` contains a non-integer entry.
    """

    def get_profile_steps():
        """Parses `--profile_steps` into a list of ints.

        The flag value is read but never written back, so parsing has no
        side effect on the global flags. Blank entries (for example from a
        trailing comma or a whitespace-only value) are ignored instead of
        failing inside int().
        """
        raw_steps = FLAGS.profile_steps
        if not raw_steps:
            return []
        return [int(token) for token in raw_steps.split(',')
                if token.strip()]

    ckpt_config = parallax.CheckPointConfig(
        ckpt_dir=FLAGS.ckpt_dir,
        save_ckpt_steps=FLAGS.save_ckpt_steps)
    ps_config = parallax.PSConfig(
        replicate_variables=FLAGS.replicate_variables,
        protocol=FLAGS.protocol,
        local_aggregation=FLAGS.local_aggregation,
        boundary_among_servers=FLAGS.boundary_among_servers,
        boundary_between_workers_and_servers=(
            FLAGS.boundary_between_workers_and_servers))
    mpi_config = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options)

    parallax_config = parallax.Config()
    parallax_config.run_option = FLAGS.run_option
    parallax_config.average_sparse = False
    parallax_config.communication_config = parallax.CommunicationConfig(
        ps_config, mpi_config)
    parallax_config.ckpt_config = ckpt_config

    profile_config = parallax.ProfileConfig(
        profile_dir=FLAGS.profile_dir,
        profile_steps=get_profile_steps())
    parallax_config.profile_config = profile_config
    parallax_config.redirect_path = FLAGS.redirect_path
    parallax_config.export_graph_path = FLAGS.export_graph_path

    return parallax_config
69 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/resource_info:
--------------------------------------------------------------------------------
1 | 123.456.78.90:0
2 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/skip_distributed_driver.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import sys
17 | import os
18 | import json
19 | import time
20 |
21 | import tensorflow as tf
22 | from tensorflow.core.protobuf import config_pb2
23 | import parallax
24 | import parallax_config
25 |
26 | import configuration
27 | import skip_thoughts_model
28 |
29 | FLAGS = tf.app.flags.FLAGS
30 |
31 | tf.app.flags.DEFINE_string('data_path', None,
32 | """Where to training/test data is stored.""")
33 | tf.app.flags.DEFINE_string('input_file_pattern', '',
34 | """File pattern of train data""")
35 | tf.app.flags.DEFINE_integer('batch_size', 128,
36 | """Batch_size""")
37 | tf.app.flags.DEFINE_string('resource_info_file',
38 | os.path.abspath(
39 | os.path.join(os.path.dirname(__file__), '.',
40 | 'resource_info')),
41 | 'Filename containing cluster information')
42 | tf.app.flags.DEFINE_integer('max_steps', 1000000,
43 | """Number of iterations to run for each workers.""")
44 | tf.app.flags.DEFINE_integer('log_frequency', 100,
45 | """How many steps between two runop logs.""")
46 | tf.app.flags.DEFINE_boolean('sync', True, '')
47 |
def main(_):
    """Builds the single-GPU skip-thoughts graph and trains it via Parallax.

    The training graph (model, learning rate, Adam optimizer and train op)
    is built once in its own `tf.Graph`. That graph is handed to
    `parallax.parallel_run`, which returns the session and worker
    information used by the training loop.

    Args:
        _: Unused positional arguments passed in by `tf.app.run`.
    """
    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        model_config = configuration.model_config(
            input_file_pattern=FLAGS.input_file_pattern,
            batch_size=FLAGS.batch_size)
        training_config = configuration.training_config()
        model = skip_thoughts_model.SkipThoughtsModel(model_config,
                                                      mode="train")
        model.build()

        # Setup learning rate: exponential decay only when a positive decay
        # factor is configured, otherwise a constant.
        if training_config.learning_rate_decay_factor > 0:
            learning_rate = tf.train.exponential_decay(
                learning_rate=float(training_config.learning_rate),
                global_step=model.global_step,
                decay_steps=training_config.learning_rate_decay_steps,
                decay_rate=training_config.learning_rate_decay_factor,
                staircase=False)
        else:
            learning_rate = tf.constant(training_config.learning_rate)

        optimizer = tf.train.AdamOptimizer(learning_rate)

        train_tensor = tf.contrib.slim.learning.create_train_op(
            total_loss=model.total_loss,
            optimizer=optimizer,
            global_step=model.global_step,
            clip_gradient_norm=training_config.clip_gradient_norm)

    def run(sess, num_workers, worker_id, num_replicas_per_worker):
        """Runs `FLAGS.max_steps` training steps, logging periodically."""
        fetches = {
            'global_step': model.global_step,
            'cost': model.total_loss,
            'train_op': train_tensor,
        }

        start = time.time()
        for i in range(FLAGS.max_steps):
            results = sess.run(fetches)
            # Log after every `log_frequency` completed steps. Testing
            # `i + 1` (not `i`) keeps the first report from claiming
            # `log_frequency` steps of throughput after a single step.
            if (i + 1) % FLAGS.log_frequency == 0:
                end = time.time()
                throughput = float(FLAGS.log_frequency) / float(end - start)
                # Fetched values are indexed with [0]; presumably there is
                # one entry per replica -- confirm against
                # parallax.parallel_run.
                parallax.log.info(
                    "global step: %d, loss: %f, throughput: %f steps/sec"
                    % (results['global_step'][0], results['cost'][0],
                       throughput))
                start = time.time()

    sess, num_workers, worker_id, num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph,
                              FLAGS.resource_info_file,
                              sync=FLAGS.sync,
                              parallax_config=parallax_config.build_config())
    run(sess, num_workers, worker_id, num_replicas_per_worker)
105 |
106 | if __name__ == "__main__":
107 | tf.logging.set_verbosity(tf.logging.INFO)
108 | tf.app.run()
109 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/train.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Train the skip-thoughts model."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import tensorflow as tf
22 |
23 | from skip_thoughts import configuration
24 | from skip_thoughts import skip_thoughts_model
25 |
26 | FLAGS = tf.flags.FLAGS
27 |
28 | tf.flags.DEFINE_string("input_file_pattern", None,
29 | "File pattern of sharded TFRecord files containing "
30 | "tf.Example protos.")
31 | tf.flags.DEFINE_string("train_dir", None,
32 | "Directory for saving and loading checkpoints.")
33 |
34 | tf.logging.set_verbosity(tf.logging.INFO)
35 |
36 |
def _setup_learning_rate(config, global_step):
  """Creates the learning-rate tensor, decayed exponentially if configured.

  Args:
    config: Object containing learning rate configuration parameters.
    global_step: Tensor; the global step.

  Returns:
    learning_rate: Tensor; a constant when the configured decay factor is
      not positive, otherwise the exponentially decayed learning rate.
  """
  if not config.learning_rate_decay_factor > 0:
    # Decay disabled: use the configured rate unchanged.
    return tf.constant(config.learning_rate)

  return tf.train.exponential_decay(
      learning_rate=float(config.learning_rate),
      global_step=global_step,
      decay_steps=config.learning_rate_decay_steps,
      decay_rate=config.learning_rate_decay_factor,
      staircase=False)
57 |
58 |
def main(unused_argv):
  """Builds the skip-thoughts training graph and runs the slim train loop.

  Args:
    unused_argv: Positional arguments passed in by `tf.app.run`; ignored.

  Raises:
    ValueError: If --input_file_pattern or --train_dir is not set.
  """
  required_flags = (
      (FLAGS.input_file_pattern, "--input_file_pattern is required."),
      (FLAGS.train_dir, "--train_dir is required."),
  )
  for flag_value, error_message in required_flags:
    if not flag_value:
      raise ValueError(error_message)

  model_config = configuration.model_config(
      input_file_pattern=FLAGS.input_file_pattern)
  training_config = configuration.training_config()

  tf.logging.info("Building training graph.")
  graph = tf.Graph()
  with graph.as_default():
    model = skip_thoughts_model.SkipThoughtsModel(model_config, mode="train")
    model.build()

    optimizer = tf.train.AdamOptimizer(
        _setup_learning_rate(training_config, model.global_step))

    train_tensor = tf.contrib.slim.learning.create_train_op(
        total_loss=model.total_loss,
        optimizer=optimizer,
        global_step=model.global_step,
        clip_gradient_norm=training_config.clip_gradient_norm)

    saver = tf.train.Saver()

  # Checkpoints and summaries are written under --train_dir.
  tf.contrib.slim.learning.train(
      train_op=train_tensor,
      logdir=FLAGS.train_dir,
      graph=graph,
      global_step=model.global_step,
      number_of_steps=training_config.number_of_steps,
      save_summaries_secs=training_config.save_summaries_secs,
      saver=saver,
      save_interval_secs=training_config.save_model_secs)
96 |
97 |
98 | if __name__ == "__main__":
99 | tf.app.run()
100 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/CNNBenchmark_distributed_driver.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import argparse
17 | import sys
18 | import os
19 | import json
20 | import time
21 |
22 | from absl import flags
23 | import tensorflow as tf
24 |
25 | import benchmark_cnn
26 | import cnn_util
27 | import parallax_config
28 | from cnn_util import log_fn
29 | from tensorflow.core.protobuf import config_pb2
30 |
31 | import parallax
32 |
33 | benchmark_cnn.define_flags()
34 | flags.adopt_module_key_flags(benchmark_cnn)
35 |
36 | FLAGS = tf.app.flags.FLAGS
37 |
38 | tf.app.flags.DEFINE_string('resource_info_file',
39 | os.path.abspath(os.path.join(
40 | os.path.dirname(__file__),
41 | '.',
42 | 'resource_info')),
43 | 'Filename containing cluster information')
44 | tf.app.flags.DEFINE_integer('max_steps', 1000000,
45 | """Number of iterations to run for each workers.""")
46 | tf.app.flags.DEFINE_integer('log_frequency', 100,
47 | """How many steps between two runop logs.""")
48 | tf.app.flags.DEFINE_boolean('sync', True, '')
49 |
def main(_):
    """Builds the single-GPU CNN benchmark graph and trains it via Parallax.

    Args:
        _: Unused positional arguments passed in by `tf.app.run`.
    """
    # Build the benchmark_cnn model description from the flags.
    params, sess_config = benchmark_cnn.setup(
        benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Print information about the TensorFlow version and the benchmark.
    tfversion = cnn_util.tensorflow_version_tuple()
    tf_major, tf_minor = tfversion[0], tfversion[1]
    log_fn('TensorFlow: %i.%i' % (tf_major, tf_minor))
    bench.print_info()

    # Build the single-GPU benchmark_cnn graph.
    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        bench.build_model()

    config = parallax_config.build_config()
    config.sess_config = sess_config

    (sess, num_workers, worker_id,
     num_replicas_per_worker) = parallax.parallel_run(
         single_gpu_graph,
         FLAGS.resource_info_file,
         sync=FLAGS.sync,
         parallax_config=config)

    fetches = dict(
        global_step=bench.global_step,
        cost=bench.cost,
        train_op=bench.train_op)

    window_start = time.time()
    # `step` counts completed steps, so a report is emitted after every
    # `log_frequency` steps.
    for step in range(1, FLAGS.max_steps + 1):
        results = sess.run(fetches)
        if step % FLAGS.log_frequency != 0:
            continue
        window_end = time.time()
        throughput = float(FLAGS.log_frequency) / float(
            window_end - window_start)
        parallax.log.info(
            "global step: %d, loss: %f, throughput: %f steps/sec"
            % (results['global_step'][0]+1, results['cost'][0], throughput))
        window_start = time.time()
92 |
93 | if __name__ == '__main__':
94 | tf.app.run()
95 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/CNNBenchmark_eval.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | from absl import flags
17 | import tensorflow as tf
18 |
19 | import benchmark_cnn
20 |
21 | benchmark_cnn.define_flags()
22 | flags.adopt_module_key_flags(benchmark_cnn)
23 |
24 | FLAGS = tf.app.flags.FLAGS
25 |
def main(_):
    """Evaluates the CNN benchmark model selected by the flags.

    Args:
        _: Unused positional arguments passed in by `tf.app.run`.
    """
    # Force evaluation mode regardless of what was passed on the command line.
    FLAGS.eval = True
    # The second value returned by setup() is not needed for evaluation.
    params, _unused_config = benchmark_cnn.setup(
        benchmark_cnn.make_params_from_flags())
    benchmark_cnn.BenchmarkCNN(params).evaluate()
32 |
33 | if __name__ == '__main__':
34 | tf.app.run()
35 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/README.md:
--------------------------------------------------------------------------------
1 | # TensorFlow CNN Benchmarks
2 | The original code of this example comes from [tf_cnn_benchmarks](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks).
3 | We modified this code to build a computation graph for a single-GPU environment instead of a multi-GPU and multi-machine environment (we removed the unnecessary communication-related files such as `variable_mgr.py` and `variable_mgr_util.py`).
4 | We added `CNNBenchmark_distributed_driver.py` for training and `CNNBenchmark_eval.py` for evaluation.
5 |
6 | ## Dataset
7 | * Synthetic data or ImageNet data can be used. To use ImageNet data, follow these [instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started).
8 |
9 | ## Training
10 | Set your resource information in the `resource_info` file.
11 |
12 | Then, execute:
13 | ```shell
14 | $ python CNNBenchmark_distributed_driver.py --model={model} --data_name={data_name} --data_dir={data_dir}
15 | ```
16 |
17 | The command above runs a single CNN model on multiple devices specified in `resource_info`.
18 | The command assumes that the data directory and the TensorFlow CNN benchmark codebase are distributed and reachable in the same absolute path in each of the machines.
19 |
20 | Also, we have a few more options you can choose for distributed running.
21 |
22 | | Parameter Name | Default | Description |
23 | | :------------------- |:-----------------------| :-----------|
24 | | --resource_info_file | `./resource_info` | Name of the file containing cluster information |
25 | | --max_steps | 1000000 | Number of iterations to run for each worker |
26 | | --log_frequency | 100 | Number of steps between two consecutive runop logs |
27 | | --sync | True | Whether to synchronize learning or not |
28 | | --ckpt_dir | None | Directory to save checkpoints |
29 | | --save_ckpt_steps | 0 | Number of steps between two consecutive checkpoints |
30 | | --run_option | None | The run option whether PS or MPI, None utilizes both |
31 |
32 | You can customize distributed execution with the options above. For example, to fix the communication mode to MPI, set the `run_option` flag as shown below.
33 |
34 | ```shell
35 | $ python CNNBenchmark_distributed_driver.py --model={model} --data_name={data_name} --data_dir={data_dir} --run_option=MPI
36 | ```
37 |
38 | ## Evaluation
39 | Execute:
40 | ```shell
41 | $ python CNNBenchmark_eval.py --eval=True --model={model} --data_name={data_name} --data_dir={data_dir} --checkpoint_dir={checkpoint_dir}
42 | ```
43 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/tf_cnn_benchmarks/models/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/alexnet_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Alexnet model configuration.
17 |
18 | References:
19 | Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton
20 | ImageNet Classification with Deep Convolutional Neural Networks
21 | Advances in Neural Information Processing Systems. 2012
22 | """
23 |
24 | import tensorflow as tf
25 |
26 | from models import model
27 |
28 |
class AlexnetModel(model.Model):
  """Configuration of the Alexnet CNN."""

  def __init__(self):
    # The second argument is 224 + 3: the VALID first convolution below needs
    # images padded by 3. The remaining positional arguments are forwarded
    # to model.Model unchanged (presumably batch size and learning rate --
    # confirm against model.Model).
    padded_image_size = 224 + 3
    super(AlexnetModel, self).__init__(
        'alexnet', padded_image_size, 512, 0.005)

  def add_inference(self, cnn):
    """Adds the Alexnet layers to `cnn`, in order."""
    # Note: VALID requires padding the images by 3 in width and height
    cnn.conv(64, 11, 11, 4, 4, 'VALID')
    cnn.mpool(3, 3, 2, 2)
    cnn.conv(192, 5, 5)
    cnn.mpool(3, 3, 2, 2)
    # Three consecutive 3x3 convolutions.
    for depth in (384, 384, 256):
      cnn.conv(depth, 3, 3)
    cnn.mpool(3, 3, 2, 2)
    cnn.reshape([-1, 256 * 6 * 6])
    # Two identical affine + dropout stages.
    for _ in range(2):
      cnn.affine(4096)
      cnn.dropout()
50 |
51 |
class AlexnetCifar10Model(model.Model):
  """Alexnet-style CNN configuration for the cifar datasets.

  The architecture mirrors the model from the TensorFlow tutorial.

  Reference model: tensorflow/models/tutorials/image/cifar10/cifar10.py
  Paper: http://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf
  """

  def __init__(self):
    super(AlexnetCifar10Model, self).__init__('alexnet', 32, 128, 0.1)

  def add_inference(self, cnn):
    """Adds the conv/pool/lrn stack and the two affine layers to `cnn`."""
    lrn_args = dict(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)

    cnn.conv(64, 5, 5, 1, 1, 'SAME', stddev=5e-2)
    cnn.mpool(3, 3, 2, 2, mode='SAME')
    cnn.lrn(**lrn_args)
    cnn.conv(64, 5, 5, 1, 1, 'SAME', bias=0.1, stddev=5e-2)
    cnn.lrn(**lrn_args)
    cnn.mpool(3, 3, 2, 2, mode='SAME')

    # Flatten everything except the leading (batch) dimension.
    dims = cnn.top_layer.get_shape().as_list()
    cnn.reshape([-1, dims[1] * dims[2] * dims[3]])

    for width in (384, 192):
      cnn.affine(width, stddev=0.04, bias=0.1)

  def get_learning_rate(self, global_step, batch_size):
    """Returns a staircase exponentially decayed learning rate.

    The rate is multiplied by 0.1 every 100 epochs, with an epoch taken to
    be 50000 examples.
    """
    num_examples_per_epoch = 50000
    num_epochs_per_decay = 100
    decay_factor = 0.1
    decay_steps = int(num_epochs_per_decay * num_examples_per_epoch /
                      batch_size)
    return tf.train.exponential_decay(
        self.learning_rate, global_step, decay_steps, decay_factor,
        staircase=True)
87 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/densenet_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Densenet model configuration.
17 |
18 | References:
19 | "Densely Connected Convolutional Networks": https://arxiv.org/pdf/1608.06993
20 | """
21 | import numpy as np
22 | from six.moves import xrange # pylint: disable=redefined-builtin
23 | import tensorflow as tf
24 |
25 | from models import model as model_lib
26 |
27 |
class DensenetCifar10Model(model_lib.Model):
  """DenseNet configuration built from three groups of dense blocks.

  The base model is initialised with image size 32, default batch size 64
  and base learning rate 0.1.
  """

  def __init__(self, model, layer_counts, growth_rate):
    """Stores the growth rate and forwards the rest to the base model.

    Args:
      model: model name string.
      layer_counts: indexable with the number of dense blocks in each of
        the three groups.
      growth_rate: number of channels each dense block appends.
    """
    self.growth_rate = growth_rate
    super(DensenetCifar10Model, self).__init__(
        model, 32, 64, 0.1, layer_counts=layer_counts)
    # Keyword arguments shared by every batch-norm call in this model.
    self.batch_norm_config = dict(decay=0.9, epsilon=1e-5, scale=True)

  def dense_block(self, cnn, growth_rate):
    """Appends `growth_rate` new channels to the current top layer.

    Runs batch-norm -> ReLU -> 3x3 conv on the top layer and concatenates
    the result with the block input along the channel axis.
    """
    block_input = cnn.top_layer
    normed = cnn.batch_norm(block_input, **self.batch_norm_config)
    activated = tf.nn.relu(normed)
    new_features = cnn.conv(
        growth_rate, 3, 3, 1, 1,
        stddev=np.sqrt(2.0 / 9 / growth_rate),
        activation=None,
        input_layer=activated)
    # Channel axis is 3 for 'channels_last', 1 otherwise.
    concat_axis = 1 if cnn.channel_pos != 'channels_last' else 3
    cnn.top_layer = tf.concat([block_input, new_features], concat_axis)
    cnn.top_size += growth_rate

  def transition_layer(self, cnn):
    """Batch-norm -> ReLU -> 1x1 conv (same width) -> 2x2 average pool."""
    channels = cnn.top_size
    cnn.batch_norm(**self.batch_norm_config)
    cnn.top_layer = tf.nn.relu(cnn.top_layer)
    init_stddev = np.sqrt(2.0 / 9 / channels)
    cnn.conv(channels, 1, 1, 1, 1, stddev=init_stddev)
    cnn.apool(2, 2, 2, 2)

  def add_inference(self, cnn):
    """Builds the network on `cnn`.

    Raises:
      ValueError: if layer counts or growth rate were not provided.
    """
    if self.layer_counts is None:
      raise ValueError(
          'Layer counts not specified for %s' % self.get_model())
    if self.growth_rate is None:
      raise ValueError(
          'Growth rate not specified for %s' % self.get_model())

    cnn.conv(16, 3, 3, 1, 1, activation=None)
    # Three groups of dense blocks; a transition layer follows the first
    # two groups only.
    for group in xrange(3):
      for _ in xrange(self.layer_counts[group]):
        self.dense_block(cnn, self.growth_rate)
      if group < 2:
        self.transition_layer(cnn)
    cnn.batch_norm(**self.batch_norm_config)
    cnn.top_layer = tf.nn.relu(cnn.top_layer)
    channel_axis = 3 if cnn.channel_pos == 'channels_last' else 1
    # Re-read the channel count from the tensor's static shape.
    cnn.top_size = cnn.top_layer.get_shape().as_list()[channel_axis]
    cnn.spatial_mean()

  def get_learning_rate(self, global_step, batch_size):
    """Piecewise-constant schedule with drops after epochs 150, 225, 300.

    An epoch is counted as int(50000 / batch_size) steps.
    """
    steps_per_epoch = int(50000 / batch_size)
    epoch_marks = np.array([150, 225, 300], dtype=np.int64)
    boundaries = list(steps_per_epoch * epoch_marks)
    rates = [0.1, 0.01, 0.001, 0.0001]
    return tf.train.piecewise_constant(global_step, boundaries, rates)
88 |
89 |
def create_densenet40_k12_model():
  """Builds the 'densenet40_k12' model: 12 blocks per group, growth 12."""
  return DensenetCifar10Model(
      'densenet40_k12', layer_counts=(12, 12, 12), growth_rate=12)
92 |
93 |
def create_densenet100_k12_model():
  """Builds the 'densenet100_k12' model: 32 blocks per group, growth 12."""
  return DensenetCifar10Model(
      'densenet100_k12', layer_counts=(32, 32, 32), growth_rate=12)
96 |
97 |
def create_densenet100_k24_model():
  """Builds the 'densenet100_k24' model: 32 blocks per group, growth 24."""
  return DensenetCifar10Model(
      'densenet100_k24', layer_counts=(32, 32, 32), growth_rate=24)
100 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/googlenet_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Googlenet model configuration.
17 |
18 | References:
19 | Szegedy, Christian, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
20 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich
21 | Going deeper with convolutions
22 | arXiv preprint arXiv:1409.4842 (2014)
23 | """
24 |
25 | from models import model
26 |
27 |
class GooglenetModel(model.Model):
  """Googlenet configuration: image size 224, batch size 32, LR 0.005."""

  def __init__(self):
    super(GooglenetModel, self).__init__('googlenet', 224, 32, 0.005)

  def add_inference(self, cnn):
    """Adds the stem, nine inception modules and the final pooling to cnn."""

    def inception_v1(cnn, k, l, m, n, p, q):
      # Four parallel branches: 1x1, 1x1->3x3, 1x1->5x5, pool->1x1.
      branches = [
          [('conv', k, 1, 1)],
          [('conv', l, 1, 1), ('conv', m, 3, 3)],
          [('conv', n, 1, 1), ('conv', p, 5, 5)],
          [('mpool', 3, 3, 1, 1, 'SAME'), ('conv', q, 1, 1)],
      ]
      cnn.inception_module('incept_v1', branches)

    def downsample():
      cnn.mpool(3, 3, 2, 2, mode='SAME')

    # Stem.
    cnn.conv(64, 7, 7, 2, 2)
    downsample()
    cnn.conv(64, 1, 1)
    cnn.conv(192, 3, 3)
    downsample()

    # Inception stages; a stride-2 max pool separates consecutive stages.
    stages = (
        ((64, 96, 128, 16, 32, 32),
         (128, 128, 192, 32, 96, 64)),
        ((192, 96, 208, 16, 48, 64),
         (160, 112, 224, 24, 64, 64),
         (128, 128, 256, 24, 64, 64),
         (112, 144, 288, 32, 64, 64),
         (256, 160, 320, 32, 128, 128)),
        ((256, 160, 320, 32, 128, 128),
         (384, 192, 384, 48, 128, 128)),
    )
    last_stage = len(stages) - 1
    for stage_index, stage in enumerate(stages):
      for widths in stage:
        inception_v1(cnn, *widths)
      if stage_index != last_stage:
        downsample()

    cnn.apool(7, 7, 1, 1, mode='VALID')
    cnn.reshape([-1, 1024])
58 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/lenet_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Lenet model configuration.
17 |
18 | References:
19 | LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner
20 | Gradient-based learning applied to document recognition
21 | Proceedings of the IEEE (1998)
22 | """
23 |
24 | from models import model
25 |
26 |
class Lenet5Model(model.Model):
  """Lenet5 configuration: image size 28, batch size 32, LR 0.005."""

  def __init__(self):
    super(Lenet5Model, self).__init__('lenet5', 28, 32, 0.005)

  def add_inference(self, cnn):
    """Adds two conv/pool stages, a flatten and a 512-unit affine layer."""
    # Note: This matches TF's MNIST tutorial model
    for num_filters in (32, 64):
      cnn.conv(num_filters, 5, 5)
      cnn.mpool(2, 2)
    flat_size = 64 * 7 * 7
    cnn.reshape([-1, flat_size])
    cnn.affine(512)
39 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Base model configuration for CNN benchmarks."""
16 |
17 |
class Model(object):
  """Base model configuration for CNN benchmarks.

  Holds the per-model settings and exposes them through getters.
  Subclasses supply the network by overriding `add_inference`.
  """

  def __init__(self, model, image_size, batch_size, learning_rate,
               layer_counts=None, fp16_loss_scale=128):
    """Records the configuration values.

    Args:
      model: model name.
      image_size: input image size.
      batch_size: initial batch size; also remembered as the default.
      learning_rate: base learning rate.
      layer_counts: optional per-group layer counts.
      fp16_loss_scale: loss scale value, 128 unless overridden.
    """
    self.model = model
    self.image_size = image_size
    # The default is kept separately so set_batch_size() cannot change it.
    self.default_batch_size = batch_size
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.layer_counts = layer_counts
    # TODO(reedwm) Set custom loss scales for each model instead of using
    # the default of 128.
    self.fp16_loss_scale = fp16_loss_scale

  # ---- simple accessors ----

  def get_model(self):
    """Returns the model name."""
    return self.model

  def get_image_size(self):
    """Returns the configured image size."""
    return self.image_size

  def get_batch_size(self):
    """Returns the current batch size."""
    return self.batch_size

  def set_batch_size(self, batch_size):
    """Overrides the current batch size; the default is left untouched."""
    self.batch_size = batch_size

  def get_default_batch_size(self):
    """Returns the batch size given at construction time."""
    return self.default_batch_size

  def get_layer_counts(self):
    """Returns the layer counts, or None if none were given."""
    return self.layer_counts

  def get_fp16_loss_scale(self):
    """Returns the fp16 loss scale."""
    return self.fp16_loss_scale

  # ---- hooks for subclasses ----

  def get_learning_rate(self, global_step, batch_size):
    """Returns the constant base learning rate; both arguments are unused."""
    del global_step, batch_size
    return self.learning_rate

  def add_inference(self, unused_cnn):
    """Must be overridden; the base implementation always raises ValueError."""
    raise ValueError('Must be implemented in derived classes')
66 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/model_config.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Model configurations for CNN benchmarks.
17 | """
18 |
19 | from models import alexnet_model
20 | from models import densenet_model
21 | from models import googlenet_model
22 | from models import inception_model
23 | from models import lenet_model
24 | from models import overfeat_model
25 | from models import resnet_model
26 | from models import trivial_model
27 | from models import vgg_model
28 |
# Registry used for the 'imagenet' and 'synthetic' dataset names (see
# _get_model_map). Each value is a model class or a factory function;
# get_model_config() calls it with no arguments to build the model.
_model_name_to_imagenet_model = {
    'vgg11': vgg_model.Vgg11Model,
    'vgg16': vgg_model.Vgg16Model,
    'vgg19': vgg_model.Vgg19Model,
    'lenet': lenet_model.Lenet5Model,
    'googlenet': googlenet_model.GooglenetModel,
    'overfeat': overfeat_model.OverfeatModel,
    'alexnet': alexnet_model.AlexnetModel,
    'trivial': trivial_model.TrivialModel,
    'inception3': inception_model.Inceptionv3Model,
    'inception4': inception_model.Inceptionv4Model,
    'resnet50': resnet_model.create_resnet50_model,
    'resnet50_v2': resnet_model.create_resnet50_v2_model,
    'resnet101': resnet_model.create_resnet101_model,
    'resnet101_v2': resnet_model.create_resnet101_v2_model,
    'resnet152': resnet_model.create_resnet152_model,
    'resnet152_v2': resnet_model.create_resnet152_v2_model,
}
47 |
# Registry used for the 'cifar10' dataset name (see _get_model_map). Each
# value is a model class or a factory function; get_model_config() calls
# it with no arguments to build the model.
_model_name_to_cifar_model = {
    'alexnet': alexnet_model.AlexnetCifar10Model,
    'resnet20': resnet_model.create_resnet20_cifar_model,
    'resnet20_v2': resnet_model.create_resnet20_v2_cifar_model,
    'resnet32': resnet_model.create_resnet32_cifar_model,
    'resnet32_v2': resnet_model.create_resnet32_v2_cifar_model,
    'resnet44': resnet_model.create_resnet44_cifar_model,
    'resnet44_v2': resnet_model.create_resnet44_v2_cifar_model,
    'resnet56': resnet_model.create_resnet56_cifar_model,
    'resnet56_v2': resnet_model.create_resnet56_v2_cifar_model,
    'resnet110': resnet_model.create_resnet110_cifar_model,
    'resnet110_v2': resnet_model.create_resnet110_v2_cifar_model,
    'trivial': trivial_model.TrivialCifar10Model,
    'densenet40_k12': densenet_model.create_densenet40_k12_model,
    'densenet100_k12': densenet_model.create_densenet100_k12_model,
    'densenet100_k24': densenet_model.create_densenet100_k24_model,
}
65 |
66 |
67 | def _get_model_map(dataset_name):
68 | if 'cifar10' == dataset_name:
69 | return _model_name_to_cifar_model
70 | elif dataset_name in ('imagenet', 'synthetic'):
71 | return _model_name_to_imagenet_model
72 | else:
73 | raise ValueError('Invalid dataset name: %s' % dataset_name)
74 |
75 |
def get_model_config(model_name, dataset):
  """Map model name to model network configuration.

  Args:
    model_name: key to look up in the registry for the dataset.
    dataset: object whose `name` attribute selects the registry.

  Returns:
    The result of calling the registered class/factory with no arguments.

  Raises:
    ValueError: if the dataset name or the model name is not known.
  """
  registry = _get_model_map(dataset.name)
  if model_name in registry:
    factory = registry[model_name]
    return factory()
  raise ValueError('Invalid model name \'%s\' for dataset \'%s\'' %
                   (model_name, dataset.name))
84 |
85 |
def register_model(model_name, dataset_name, model_func):
  """Register a new model that can be obtained with `get_model_config`.

  Args:
    model_name: name under which the model is registered.
    dataset_name: dataset whose registry receives the entry.
    model_func: callable invoked with no arguments to build the model.

  Raises:
    ValueError: if the dataset name is invalid or the model name is
      already present in that dataset's registry.
  """
  registry = _get_model_map(dataset_name)
  if model_name not in registry:
    registry[model_name] = model_func
    return
  raise ValueError('Model "%s" is already registered for dataset "%s"' %
                   (model_name, dataset_name))
93 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/overfeat_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Overfeat model configuration.
17 |
18 | References:
19 | OverFeat: Integrated Recognition, Localization and Detection using
20 | Convolutional Networks
21 | Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus,
22 | Yann LeCun, 2014
23 | http://arxiv.org/abs/1312.6229
24 | """
25 |
26 | from models import model
27 |
28 |
class OverfeatModel(model.Model):
  """Overfeat configuration: image size 231, batch size 32, LR 0.005."""

  def __init__(self):
    super(OverfeatModel, self).__init__('overfeat', 231, 32, 0.005)

  def add_inference(self, cnn):
    """Adds five conv layers, three max pools and two affine layers."""
    # Note: VALID requires padding the images by 3 in width and height
    cnn.conv(96, 11, 11, 4, 4, mode='VALID')
    cnn.mpool(2, 2)
    cnn.conv(256, 5, 5, 1, 1, mode='VALID')
    cnn.mpool(2, 2)
    for depth in (512, 1024, 1024):
      cnn.conv(depth, 3, 3)
    cnn.mpool(2, 2)
    flat_size = 1024 * 6 * 6
    cnn.reshape([-1, flat_size])
    for width in (3072, 4096):
      cnn.affine(width)
47 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/trivial_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Trivial model configuration."""
16 |
17 | from models import model
18 |
19 |
class TrivialModel(model.Model):
  """Trivial model configuration: flatten followed by two affine layers."""

  def __init__(self):
    image_size = 224 + 3
    super(TrivialModel, self).__init__('trivial', image_size, 32, 0.005)

  def add_inference(self, cnn):
    """Flattens to 227 * 227 * 3 values, then affine(1) and affine(4096)."""
    flat_size = 227 * 227 * 3
    cnn.reshape([-1, flat_size])
    for width in (1, 4096):
      cnn.affine(width)
30 |
31 |
class TrivialCifar10Model(model.Model):
  """Trivial cifar10 model configuration: flatten plus two affine layers."""

  def __init__(self):
    super(TrivialCifar10Model, self).__init__('trivial', 32, 32, 0.005)

  def add_inference(self, cnn):
    """Flattens to 32 * 32 * 3 values, then affine(1) and affine(4096)."""
    flat_size = 32 * 32 * 3
    cnn.reshape([-1, flat_size])
    for width in (1, 4096):
      cnn.affine(width)
42 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/vgg_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Vgg model configuration.
17 |
18 | Includes multiple models: vgg11, vgg16, vgg19, corresponding to
19 | model A, D, and E in Table 1 of [1].
20 |
21 | References:
22 | [1] Simonyan, Karen, Andrew Zisserman
23 | Very Deep Convolutional Networks for Large-Scale Image Recognition
24 | arXiv:1409.1556 (2014)
25 | """
26 |
27 | from six.moves import xrange # pylint: disable=redefined-builtin
28 | from models import model
29 |
30 |
31 | def _construct_vgg(cnn, num_conv_layers):
32 | """Build vgg architecture from blocks."""
33 | assert len(num_conv_layers) == 5
34 | for _ in xrange(num_conv_layers[0]):
35 | cnn.conv(64, 3, 3)
36 | cnn.mpool(2, 2)
37 | for _ in xrange(num_conv_layers[1]):
38 | cnn.conv(128, 3, 3)
39 | cnn.mpool(2, 2)
40 | for _ in xrange(num_conv_layers[2]):
41 | cnn.conv(256, 3, 3)
42 | cnn.mpool(2, 2)
43 | for _ in xrange(num_conv_layers[3]):
44 | cnn.conv(512, 3, 3)
45 | cnn.mpool(2, 2)
46 | for _ in xrange(num_conv_layers[4]):
47 | cnn.conv(512, 3, 3)
48 | cnn.mpool(2, 2)
49 | cnn.reshape([-1, 512 * 7 * 7])
50 | cnn.affine(4096)
51 | cnn.dropout()
52 | cnn.affine(4096)
53 | cnn.dropout()
54 |
55 |
class Vgg11Model(model.Model):
  """vgg11 configuration: image size 224, batch size 64, LR 0.005."""

  def __init__(self):
    super(Vgg11Model, self).__init__('vgg11', 224, 64, 0.005)

  def add_inference(self, cnn):
    """Builds the network with 1, 1, 2, 2, 2 conv layers per stage."""
    convs_per_stage = [1, 1, 2, 2, 2]
    _construct_vgg(cnn, convs_per_stage)
63 |
64 |
class Vgg16Model(model.Model):
  """vgg16 configuration: image size 224, batch size 64, LR 0.005."""

  def __init__(self):
    super(Vgg16Model, self).__init__('vgg16', 224, 64, 0.005)

  def add_inference(self, cnn):
    """Builds the network with 2, 2, 3, 3, 3 conv layers per stage."""
    convs_per_stage = [2, 2, 3, 3, 3]
    _construct_vgg(cnn, convs_per_stage)
72 |
73 |
class Vgg19Model(model.Model):
  """vgg19 configuration: image size 224, batch size 64, LR 0.005."""

  def __init__(self):
    super(Vgg19Model, self).__init__('vgg19', 224, 64, 0.005)

  def add_inference(self, cnn):
    """Builds the network with 2, 2, 4, 4, 4 conv layers per stage."""
    convs_per_stage = [2, 2, 4, 4, 4]
    _construct_vgg(cnn, convs_per_stage)
81 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/parallax_config.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import tensorflow as tf
17 | import parallax
18 |
19 |
# Command-line flags consumed by build_config() / calculate_ckpt_steps()
# below. All definitions go through the same `flags` alias.
flags = tf.app.flags

# Parameter-server / MPI communication settings.
flags.DEFINE_boolean('replicate_variables', True, """replicate_variables""")
flags.DEFINE_string('protocol', 'grpc', """The method for managing variables""")
flags.DEFINE_string('mpirun_options', '',
                    'option for mpirun')
flags.DEFINE_string('run_option', 'HYBRID',
                    'The run option whether PS, MPI or HYBRID')
flags.DEFINE_string('redirect_path', None,
                    """redirect path to keep the log of distributed workers""")

# Checkpointing.
flags.DEFINE_integer('save_ckpt_steps', None,
                     """Number of steps between two consecutive checkpoints""")
flags.DEFINE_integer('save_n_ckpts_per_epoch', -1,
                     """Save n checkpoints per every epoch""")
flags.DEFINE_string('ckpt_dir', None, """Directory to save checkpoints""")

# Profiling.
flags.DEFINE_string('profile_dir', None, """Directory to save RunMetadata""")
flags.DEFINE_string('profile_steps', None, """Comma separated profile steps""")
flags.DEFINE_string('profile_range', None,
                    """profile_start_step,profile_end_step""")
flags.DEFINE_integer('profile_worker', None, """The worker to profile""")

# Parameter-server optimizations.
flags.DEFINE_boolean('local_aggregation', True,
                     """Whether to use local aggregation or not""")
flags.DEFINE_boolean('boundary_among_servers', True,
                     """Whether to use operation placement among servers""")
flags.DEFINE_boolean('boundary_between_workers_and_servers', True,
                     """Whether to use operation placement between workers and servers""")

flags.DEFINE_string('export_graph_path', None,
                    """export path to keep transformed graph definition""")

FLAGS = flags.FLAGS
45 |
def calculate_ckpt_steps():
  """Returns the checkpoint interval, in steps, for CheckPointConfig.

  Only --save_ckpt_steps is honoured here. The per-epoch computation that
  used to live in this function was written for the language-model
  example: it relied on `json`, `math` and `language_model_graph`, none of
  which this module imports, so it could only end in a NameError. It now
  fails fast with an explicit message instead.

  Returns:
    The value of --save_ckpt_steps (may be None).

  Raises:
    ValueError: if --save_n_ckpts_per_epoch is set to a positive value.
  """
  if FLAGS.save_n_ckpts_per_epoch > 0:
    raise ValueError(
        '--save_n_ckpts_per_epoch is not supported by this example; '
        'use --save_ckpt_steps instead')
  return FLAGS.save_ckpt_steps
58 |
59 |
def build_config():
  """Assembles a parallax.Config from the command-line flags.

  Returns:
    A parallax.Config carrying the run option, communication (PS + MPI),
    checkpoint and profile settings, the redirect path and the graph
    export path.
  """

  def _parse_profile_steps():
    # Comma separated step list -> list of ints, or None when unset.
    if not FLAGS.profile_steps:
      return None
    FLAGS.profile_steps = FLAGS.profile_steps.strip()
    return [int(token) for token in FLAGS.profile_steps.split(',')]

  def _parse_profile_range():
    # "start,end" -> (start, end) tuple of ints, or None when unset.
    if not FLAGS.profile_range:
      return None
    FLAGS.profile_range = FLAGS.profile_range.strip()
    parts = FLAGS.profile_range.split(',')
    return (int(parts[0]), int(parts[1]))

  checkpointing = parallax.CheckPointConfig(
      ckpt_dir=FLAGS.ckpt_dir,
      save_ckpt_steps=calculate_ckpt_steps())
  param_server = parallax.PSConfig(
      replicate_variables=FLAGS.replicate_variables,
      protocol=FLAGS.protocol,
      local_aggregation=FLAGS.local_aggregation,
      boundary_among_servers=FLAGS.boundary_among_servers,
      boundary_between_workers_and_servers=(
          FLAGS.boundary_between_workers_and_servers))
  mpi = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options)
  profiling = parallax.ProfileConfig(
      profile_dir=FLAGS.profile_dir,
      profile_steps=_parse_profile_steps(),
      profile_range=_parse_profile_range(),
      profile_worker=FLAGS.profile_worker)

  config = parallax.Config()
  config.run_option = FLAGS.run_option
  config.average_sparse = False
  config.communication_config = parallax.CommunicationConfig(param_server, mpi)
  config.ckpt_config = checkpointing
  config.profile_config = profiling
  config.redirect_path = FLAGS.redirect_path
  config.export_graph_path = FLAGS.export_graph_path
  return config
99 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/platforms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/tf_cnn_benchmarks/platforms/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/platforms/default/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/tf_cnn_benchmarks/platforms/default/__init__.py
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/platforms/default/util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Utility code for the default platform."""
17 |
18 | import cnn_util
19 |
20 |
def get_platform_params():
  """Returns the platform-specific params for the default platform.

  The default platform defines no extra flags, so the mapping is empty.

  Returns:
    A new, empty dict (param name -> ParamSpec).
  """
  platform_params = dict()
  return platform_params
31 |
32 |
def get_cluster_manager(params, config_proto):
  """Returns the cluster manager to be used.

  For the default platform this is a `cnn_util.GrpcClusterManager` built
  from the two arguments.
  """
  manager = cnn_util.GrpcClusterManager(params, config_proto)
  return manager
36 |
37 |
38 | def _initialize(params, config_proto):
39 | # Currently, no platform initialization needs to be done.
40 | del params, config_proto
41 |
42 |
# Set to True by the first initialize() call so later calls do nothing.
_is_initalized = False


def initialize(params, config_proto):
  """Runs the platform initialization at most once per process.

  The guard flag is flipped before `_initialize` is invoked, so a call
  made after the first one returns immediately.
  """
  global _is_initalized
  if not _is_initalized:
    _is_initalized = True
    _initialize(params, config_proto)
52 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/platforms/util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Utility code for a certain platform.
17 |
18 | This file simply imports everything from the default platform. To switch to a
19 | different platform, the import statement can be changed to point to a new
20 | platform.
21 |
22 | Creating a custom platform can be useful to, e.g., run some initialization code
23 | required by the platform or register a platform-specific model.
24 | """
25 |
26 | from platforms.default.util import * # pylint: disable=unused-import,wildcard-import
27 |
--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/resource_info:
--------------------------------------------------------------------------------
1 | 123.456.78.90:1,2
2 |
--------------------------------------------------------------------------------
/parallax/parallax/util/BUILD:
--------------------------------------------------------------------------------
1 | licenses(["notice"]) # Apache 2.0
2 |
3 | package(
4 | default_visibility = [
5 | "//visibility:public",
6 | ],
7 | )
8 |
# Shell binary target wrapping build_pip_package.sh. The `data` targets
# are listed so that the parallax package and the tools package are part
# of this binary's runfiles; the script copies its sources from the
# build_pip_package.runfiles tree under bazel-bin.
sh_binary(
    name = "build_pip_package",
    srcs = ["build_pip_package.sh"],
    data = [
        "//parallax:parallax",
        "//parallax/core/python/tools:tools",
    ],
)
17 |
--------------------------------------------------------------------------------
/parallax/parallax/util/build_pip_package.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Copyright 2017 Google Inc. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # Script for building a pip package.
18 | #
19 | # Based on tensorflow/tools/pip_package/build_pip_package.sh.
20 | set -e
21 |
# Builds the parallax wheel and copies it to a destination directory.
#
# Usage: build_pip_package.sh [-p PY | -py PY | --py=PY | --python=PY] DEST
#   PY    python interpreter used to run setup.py (default: "python")
#   DEST  directory receiving the wheel; created if missing. A relative
#         path is resolved against the directory the script was started in.
#
# Must be run from the root of the build tree (where bazel-bin lives).
function main() {
  PYTHON="python"
  POSITIONAL=()
  while [[ $# -gt 0 ]]
  do
    key="$1"
    case $key in
      -p|-py)
        # The two-token form needs a value; without this check the second
        # `shift` fails and `set -e` kills the script with no message.
        if [[ $# -lt 2 ]]; then
          echo "Option ${key} requires an argument" >&2
          exit 1
        fi
        PYTHON="$2"
        shift
        shift
        ;;
      --py=*|--python=*)
        PYTHON="${key#*=}"
        shift
        ;;
      *)
        POSITIONAL+=("$1")
        shift
        ;;
    esac
  done
  set -- "${POSITIONAL[@]}" # restore positional parameters

  if [ $# -lt 1 ] ; then
    echo "No destination dir provided"
    exit 1
  fi

  DEST="$1"
  # Make DEST absolute now: the copy below runs after `pushd` into the
  # temporary directory, so a relative DEST would be created inside it and
  # then deleted together with it.
  case "${DEST}" in
    /*) ;;
    *) DEST="${PWD}/${DEST}" ;;
  esac

  TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX)
  # Remove the staging directory even when a later step fails under set -e.
  trap 'rm -rf "${TMPDIR}"' EXIT

  echo $(date) : "=== Using tmpdir: ${TMPDIR}"

  if [ ! -d bazel-bin/parallax ]; then
    echo "Could not find bazel-bin. Did you run from the root of the build tree?"
    exit 1
  fi

  cp -R \
    bazel-bin/parallax/util/build_pip_package.runfiles/parallax/parallax \
    "${TMPDIR}"

  cp parallax/util/setup.py "${TMPDIR}"

  # Before we leave the top-level directory, make sure we know how to
  # call python.
  #source tensorflow/tools/python_bin_path.sh

  pushd "${TMPDIR}"
  echo $(date) : "=== Building wheel"
  # PYTHON is deliberately left unquoted so a value carrying extra
  # interpreter arguments keeps working as before.
  ${PYTHON} setup.py bdist_wheel >/dev/null
  mkdir -p "${DEST}"
  cp dist/* "${DEST}"
  popd
  rm -rf "${TMPDIR}"
  echo $(date) : "=== Output wheel file is in: ${DEST}"
  echo "${PYTHON}"
}
81 |
82 | main "$@"
83 |
--------------------------------------------------------------------------------
/tools/bazel.rc:
--------------------------------------------------------------------------------
1 | import %workspace%/tensorflow/tools/bazel.rc
2 | import %workspace%/tensorflow/.tf_configure.bazelrc
3 |
4 | build --define PYTHON_LIB_PATH=$PYTHON_BINARY/../../lib/python$PYTHON_MAJOR_VERSION/site-packages
5 |
6 | build --package_path=%workspace%:%workspace%/tensorflow/
7 |
--------------------------------------------------------------------------------
/tools/style_check.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Seoul National University
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Simple Python style check script.
16 | pycodestyle checks code against style
17 | conventions in PEP8. Do not check
18 | example files.
19 | requirements: pycodestyle"""
20 | import os
21 |
def main():
    """Run pycodestyle over the parallax package, skipping the examples.

    The paths passed to pycodestyle are relative, so the command checks
    ``../parallax/parallax/`` as seen from the current working directory.

    Returns:
        0 if the pycodestyle command exited successfully (``PASS`` is
        printed), 1 otherwise (``FAIL`` is printed).
    """
    # os.system returns 0 only when the command itself succeeded.
    status = os.system(
        "pycodestyle --statistics ../parallax/parallax/ "
        "--exclude=../parallax/parallax/examples/")
    if status == 0:
        print("PASS")
        return 0
    print("FAIL")
    return 1


if __name__ == "__main__":
    # Propagate the result as the process exit status so shells and CI jobs
    # can detect style violations instead of always seeing success.
    raise SystemExit(main())
27 |
--------------------------------------------------------------------------------