├── .github
    ├── issue_template.md
    └── pull_request_template.md
├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── doc
    ├── figure
    │   ├── LM-1B Benchmark.png
    │   ├── Resnet50 Benchmark.png
    │   ├── benchmark.png
    │   ├── exec_model.png
    │   ├── hybrid.png
    │   ├── lm1b_convergence.png
    │   ├── nmt_convergence.png
    │   └── resnet50_convergence.png
    ├── installation.md
    ├── parallax_api.md
    ├── quick_start.md
    └── trouble_shooting.md
├── parallax
    ├── WORKSPACE
    └── parallax
    │   ├── BUILD
    │   ├── __init__.py
    │   ├── core
    │       ├── BUILD
    │       ├── __init__.py
    │       └── python
    │       │   ├── BUILD
    │       │   ├── __init__.py
    │       │   ├── common
    │       │       ├── BUILD
    │       │       ├── __init__.py
    │       │       ├── config.py
    │       │       ├── consts.py
    │       │       ├── graph_transform_lib.py
    │       │       ├── lib.py
    │       │       ├── partitions.py
    │       │       ├── runner.py
    │       │       ├── session_context.py
    │       │       └── shard.py
    │       │   ├── hybrid
    │       │       ├── BUILD
    │       │       ├── __init__.py
    │       │       ├── between_graph_parallel.py
    │       │       ├── graph_transform.py
    │       │       ├── in_graph_parallel.py
    │       │       └── runner.py
    │       │   ├── mpi
    │       │       ├── BUILD
    │       │       ├── __init__.py
    │       │       ├── graph_transform.py
    │       │       └── runner.py
    │       │   ├── ps
    │       │       ├── BUILD
    │       │       ├── __init__.py
    │       │       ├── between_graph_parallel.py
    │       │       ├── graph_transform.py
    │       │       ├── in_graph_parallel.py
    │       │       └── runner.py
    │       │   └── tools
    │       │       ├── BUILD
    │       │       ├── __init__.py
    │       │       └── launch_ps.py
    │   ├── examples
    │       ├── lm1b
    │       │   ├── LICENSE
    │       │   ├── README.md
    │       │   ├── __init__.py
    │       │   ├── data_utils.py
    │       │   ├── language_model.py
    │       │   ├── language_model_graph.py
    │       │   ├── lm1b_distributed_driver.py
    │       │   ├── lm1b_eval.py
    │       │   ├── lm1b_input.py
    │       │   ├── parallax_config.py
    │       │   ├── resource_info
    │       │   └── testdata
    │       │   │   ├── test_s2.txt
    │       │   │   ├── test_sentences.txt
    │       │   │   └── test_vocab.txt
    │       ├── nmt
    │       │   ├── .gitignore
    │       │   ├── CONTRIBUTING.md
    │       │   ├── LICENSE
    │       │   ├── README.md
    │       │   ├── __init__.py
    │       │   ├── attention_model.py
    │       │   ├── g3doc
    │       │   │   └── img
    │       │   │   │   ├── attention_equation_0.jpg
    │       │   │   │   ├── attention_equation_1.jpg
    │       │   │   │   ├── attention_mechanism.jpg
    │       │   │   │   ├── attention_vis.jpg
    │       │   │   │   ├── encdec.jpg
    │       │   │   │   ├── greedy_dec.jpg
    │       │   │   │   └── seq2seq.jpg
    │       │   ├── gnmt_model.py
    │       │   ├── inference.py
    │       │   ├── inference_test.py
    │       │   ├── model.py
    │       │   ├── model_helper.py
    │       │   ├── model_test.py
    │       │   ├── nmt.py
    │       │   ├── nmt_distributed_driver.py
    │       │   ├── nmt_eval.py
    │       │   ├── nmt_test.py
    │       │   ├── parallax_config.py
    │       │   ├── resource_info
    │       │   ├── scripts
    │       │   │   ├── __init__.py
    │       │   │   ├── bleu.py
    │       │   │   ├── download_iwslt15.sh
    │       │   │   ├── rouge.py
    │       │   │   └── wmt16_en_de.sh
    │       │   ├── standard_hparams
    │       │   │   ├── iwslt15.json
    │       │   │   ├── wmt16.json
    │       │   │   ├── wmt16_gnmt_4_layer.json
    │       │   │   └── wmt16_gnmt_8_layer.json
    │       │   ├── testdata
    │       │   │   ├── deen_output
    │       │   │   ├── deen_ref_bpe
    │       │   │   ├── deen_ref_spm
    │       │   │   ├── iwslt15.tst2013.100.en
    │       │   │   ├── iwslt15.tst2013.100.vi
    │       │   │   ├── iwslt15.vocab.100.en
    │       │   │   ├── iwslt15.vocab.100.vi
    │       │   │   ├── label_ref
    │       │   │   ├── pred_output
    │       │   │   ├── test_embed.txt
    │       │   │   ├── test_embed_with_header.txt
    │       │   │   ├── test_infer_file
    │       │   │   ├── test_infer_vocab.src
    │       │   │   └── test_infer_vocab.tgt
    │       │   ├── train.py
    │       │   └── utils
    │       │   │   ├── __init__.py
    │       │   │   ├── common_test_utils.py
    │       │   │   ├── evaluation_utils.py
    │       │   │   ├── evaluation_utils_test.py
    │       │   │   ├── iterator_utils.py
    │       │   │   ├── iterator_utils_test.py
    │       │   │   ├── misc_utils.py
    │       │   │   ├── misc_utils_test.py
    │       │   │   ├── nmt_utils.py
    │       │   │   ├── standard_hparams_utils.py
    │       │   │   ├── vocab_utils.py
    │       │   │   └── vocab_utils_test.py
    │       ├── simple
    │       │   ├── README.md
    │       │   ├── resource_info
    │       │   └── simple_driver.py
    │       ├── skip_thoughts
    │       │   ├── LICENSE
    │       │   ├── README.md
    │       │   ├── configuration.py
    │       │   ├── data
    │       │   │   ├── preprocess_dataset.py
    │       │   │   └── special_words.py
    │       │   ├── encoder_manager.py
    │       │   ├── evaluate.py
    │       │   ├── ops
    │       │   │   ├── __init__.py
    │       │   │   ├── gru_cell.py
    │       │   │   └── input_ops.py
    │       │   ├── parallax_config.py
    │       │   ├── resource_info
    │       │   ├── skip_distributed_driver.py
    │       │   ├── skip_thoughts_encoder.py
    │       │   ├── skip_thoughts_model.py
    │       │   ├── track_perplexity.py
    │       │   ├── train.py
    │       │   └── vocabulary_expansion.py
    │       └── tf_cnn_benchmarks
    │       │   ├── CNNBenchmark_distributed_driver.py
    │       │   ├── CNNBenchmark_eval.py
    │       │   ├── LICENSE
    │       │   ├── README.md
    │       │   ├── benchmark_cnn.py
    │       │   ├── cnn_util.py
    │       │   ├── convnet_builder.py
    │       │   ├── datasets.py
    │       │   ├── models
    │       │       ├── __init__.py
    │       │       ├── alexnet_model.py
    │       │       ├── densenet_model.py
    │       │       ├── googlenet_model.py
    │       │       ├── inception_model.py
    │       │       ├── lenet_model.py
    │       │       ├── model.py
    │       │       ├── model_config.py
    │       │       ├── overfeat_model.py
    │       │       ├── resnet_model.py
    │       │       ├── trivial_model.py
    │       │       └── vgg_model.py
    │       │   ├── parallax_config.py
    │       │   ├── platforms
    │       │       ├── __init__.py
    │       │       ├── default
    │       │       │   ├── __init__.py
    │       │       │   └── util.py
    │       │       └── util.py
    │       │   ├── preprocessing.py
    │       │   └── resource_info
    │   └── util
    │       ├── BUILD
    │       ├── build_pip_package.sh
    │       └── setup.py
└── tools
    ├── bazel.rc
    └── style_check.py


/.github/issue_template.md:
--------------------------------------------------------------------------------
 1 | ### Things to Change
 2 | 
 3 | ### Current Behavior
 4 | 
 5 | ### Expected Behavior
 6 | 
 7 | ### Failure Information (for bugs)
 8 | 
 9 | #### Failure Logs
10 | 
11 | #### How to Reproduce
12 | 
13 | ### Related Issues
14 | 
15 | 


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | Github issue: #XX
 2 | 
 3 | **Major changes:**
 4 | - 
 5 | 
 6 | **Minor changes to note:**
 7 | - 
 8 | 
 9 | **Tests for the changes:**
10 | - 
11 | 
12 | **Other comments:**
13 | - 
14 | 
15 | resolves #XX
16 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.sh
2 | *.pyc
3 | *bazel*
4 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tensorflow"]
2 | 	path = tensorflow
3 | 	url = https://github.com/snuspl/tensorflow.git
4 | [submodule "horovod"]
5 | 	path = horovod
6 | 	url = https://github.com/horovod/horovod.git
7 |         branch = v0.16.3
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Parallax
 2 | **Parallax** is a tool that optimizes data parallel training by considering whether each variable in a deep learning model is sparse or dense. The sparsity-aware data parallel training improves performance of models with sparse variables that show relatively low scalability on existing frameworks while maintaining equal performance for models with only dense variables such as ResNet-50 and Inception-V3. In addition, Parallax automatically parallelizes training of a single-GPU deep learning model to minimize user efforts. If you are interested, you can find the technical details of Parallax in [our paper](https://dl.acm.org/citation.cfm?id=3303957).
 3 | 
 4 | Parallax is currently implemented on TensorFlow. We support [TensorFlow v1.6](https://github.com/tensorflow/tensorflow/tree/r1.6) and [TensorFlow v1.11](https://github.com/tensorflow/tensorflow/tree/r1.11). When Parallax uses the Message Passing Interface (MPI), it requires the *AllReduce* and *AllGather* operations implemented in [Horovod v0.11.2](https://github.com/uber/horovod/tree/v0.11.2). We plan to support multiple TensorFlow versions.
 5 | 
 6 | * [Installation](doc/installation.md)
 7 | * [Running Parallax](doc/quick_start.md)
 8 | * [Parallax API](doc/parallax_api.md)
 9 | 
10 | ## Why Parallax?
11 | Parallax makes it easier for users to do distributed training of a deep learning model developed in a single device (e.g., GPU or CPU) while employing various optimization techniques that Parallax provides. A Parallax user simply specifies a single-device model graph and a resource specification for distributed training, and Parallax does the rest! For distributed training, Parallax supports a hybrid architecture that combines two different distributed training architectures: Parameter Server (PS) and AllReduce (AR). The hybrid architecture exploits the advantages of both architectures. Moreover, Parallax will provide large sparse variable partitioning soon to maximize parallelism while maintaining low computation and communication overhead. Parallax further optimizes training with local aggregation and smart operation placement to mitigate communication overhead.
12 | 
13 | PS and AR architectures are still available in Parallax; users can choose the training architecture if they want (default is hybrid for synchronous training).
14 | 
15 | ### Hybrid Architecture
16 | <p align=center><img src=doc/figure/hybrid.png width="400"></p>
17 | 
18 | The amount of data transferred by the PS and AR architectures changes according to whether a variable is sparse or dense. Based on this fact, Parallax pursues a hybrid architecture in which the AR architecture handles dense variables and the PS architecture handles sparse variables to minimize communication overhead. Each worker has a replica of dense variables, while separate server processes manage only sparse variables.
19 | 
20 | ### Parallax Execution Model
21 | 
22 | <p align=center><img src=doc/figure/exec_model.png></p>
23 | 
24 | 
25 | When a client initiates a deep learning job with a single-device computation graph, resource information, and optionally a flag that indicates either synchronous or asynchronous training, Parallax transforms the computation graph by analyzing its characteristics. Then, Parallax executes the transformed graph with its optimized communication layer in the distributed environment.
26 | 
27 | ### Parallax Benchmark
28 | 
29 | To give you an idea on how well Parallax performs, we present the following chart that shows the result of experiments done in a cluster of eight machines that are connected via Mellanox ConnectX-4 cards with 100Gbps InfiniBand. Each machine has six NVIDIA GeForce TITAN Xp GPU cards.
30 | 
31 | <p float="left">
32 |   <img src="doc/figure/resnet50_convergence.png" width="400" title="ResNet-50"/>
33 |   <img src="doc/figure/lm1b_convergence.png" width="400" title="LM1B"/>
34 | </p>
35 | 
36 | Parallax converges correctly, like the other frameworks (TensorFlow and Horovod). Parallax is faster than TensorFlow and similar to Horovod for ResNet50 (dense model). In the case of LM1B (sparse model), Parallax outperforms both TensorFlow and Horovod.
37 | 
38 | <p align=center>
39 |   <img src=/doc/figure/benchmark.png>
40 | </p>
41 | Parallax outperforms TensorFlow for both Resnet50 and LM1B. In addition, Parallax outperforms Horovod for LM1B.
42 | 
43 | ## Troubleshooting
44 | See the [Troubleshooting](doc/trouble_shooting.md) page and submit a new [issue](https://github.com/snuspl/parallax/issues/new) or [contact us](#contact-us) if you cannot find an answer.
45 | 
46 | ## Contact us
47 | To contact us, send an email to parallax-dev@googlegroups.com.
48 | 
49 | ## License
50 | [Apache License 2.0](LICENSE)
51 | 


--------------------------------------------------------------------------------
/doc/figure/LM-1B Benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/LM-1B Benchmark.png


--------------------------------------------------------------------------------
/doc/figure/Resnet50 Benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/Resnet50 Benchmark.png


--------------------------------------------------------------------------------
/doc/figure/benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/benchmark.png


--------------------------------------------------------------------------------
/doc/figure/exec_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/exec_model.png


--------------------------------------------------------------------------------
/doc/figure/hybrid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/hybrid.png


--------------------------------------------------------------------------------
/doc/figure/lm1b_convergence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/lm1b_convergence.png


--------------------------------------------------------------------------------
/doc/figure/nmt_convergence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/nmt_convergence.png


--------------------------------------------------------------------------------
/doc/figure/resnet50_convergence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/doc/figure/resnet50_convergence.png


--------------------------------------------------------------------------------
/doc/installation.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | Parallax runs under Linux with Python 2.7 and 3.6; we haven't yet tested Parallax on other platforms or with other Python 3.3+ versions.
 3 | Parallax depends on a modified version of TensorFlow 1.6/1.11 and horovod 0.11.2, which are included in the parallax repository as submodules. *Each of these frameworks needs to be built and installed from source, which is explained in further detail below*. Parallax itself also needs to be installed from source, and the installation process is explained step by step below. We plan to provide binary files in the near future.
 4 | 
 5 | First, clone the parallax repository on your linux machine:
 6 | ```shell
 7 | $ git clone --recurse-submodules https://github.com/snuspl/parallax.git
 8 | ```
 9 | We recommend installing using Virtualenv and pip.
10 | 
11 | Install Python, pip, and Virtualenv:
12 | ```shell
13 | $ sudo apt-get install python-pip python-dev python-virtualenv
14 | ```
15 | 
16 | Create a Virtualenv environment in the directory `parallax_venv` (specify whichever name you prefer), and then activate it.
17 | ```shell
18 | $ virtualenv parallax_venv
19 | $ source parallax_venv/bin/activate
20 | ```
21 | 
22 | ## Install TensorFlow
23 | TensorFlow requires [Bazel](https://docs.bazel.build/versions/master/install.html) to build a binary file. (See [TF install](https://www.tensorflow.org/install/install_sources) for more instructions on how to build TensorFlow from source.) TensorFlow can be built CPU-only but Parallax needs TensorFlow with GPU support using [CUDA Toolkit 9.0 or 10.0](https://developer.nvidia.com/cuda-zone) and [CuDNN SDK v7](https://developer.nvidia.com/cudnn). To install TensorFlow with GPU support, follow the commands below.
24 | 
25 | ```shell
26 | $ cd parallax/tensorflow
27 | $ git checkout r1.11 (optional for TensorFlow v1.11)
28 | $ pip install numpy
29 | $ ./configure
30 |   (Configurations related to cuda should be turned on to use GPUs)
31 |   (verbs: ibverbs RDMA)
32 |   (gdr: GPU Direct (only for GPUs with GDR support))
33 | $ bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package
34 | $ bazel-bin/tensorflow/tools/pip_package/build_pip_package {target_directory}
35 | $ pip install {target_directory}/tensorflow-*.whl
36 | ```
37 | 
38 | 
39 | ## Install Horovod
40 | To install horovod, [Open MPI](https://www.open-mpi.org/faq/?category=building#easy-build) and [NCCL](https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html) are required as MPI implementations. To install OpenMPI, `--with-cuda` flag should be in the configure line, and you can also add `--with-verbs` to use ibverbs.
41 | We tested on openmpi-3.0.0, NCCL 2.1.15 (for CUDA 9.0) and NCCL 2.3.5 (for CUDA 10.0).
42 | ```shell
43 | $ cd ../horovod
44 | $ python setup.py sdist
45 | $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITHOUT_PYTORCH=True HOROVOD_WITHOUT_MXNET=True pip install --no-cache-dir dist/horovod-*.tar.gz
46 | ```
47 | 
48 | ## Install Parallax
49 | Parallax also uses [Bazel](https://docs.bazel.build/versions/master/install.html) for installation.
50 | ```shell
51 | $ cd ../parallax # parallax directory
52 | $ bazel build //parallax/util:build_pip_package
53 | $ bazel-bin/parallax/util/build_pip_package {target_directory}
54 | $ pip install {target_directory}/parallax-*.whl
55 | ```


--------------------------------------------------------------------------------
/doc/trouble_shooting.md:
--------------------------------------------------------------------------------
 1 | # Trouble Shooting
 2 | 
 3 | Because Parallax execution involves many dependent software and hardware packages, debugging can be tricky if errors occur.
 4 | This page collects the troublesome situations we have experienced and their solutions. If you have a similar symptom, try following the suggestions. Also, if you have any additional troubleshooting cases, please add them here.
 5 | 
 6 | ### Device placement error
 7 | Error message: 
 8 | 
 9 | `device placement error(Cannot assign a device for operation)`
10 | 
11 | Parallax assumes `allow_soft_placement=True` because Parallax assigns operators to CPU/GPU devices according to their characteristics (shared or replicated) if the device placement is not specified. If you face a device placement error, try setting `allow_soft_placement=True` on the session configuration.
12 | 
13 | ### RDMA queue issue while running parameter server model
14 | Error message: 
15 | ```
16 | tensorflow/contrib/verbs/rdma.cc:1009] Check failed: status.ok() RecvLocalAsync was not ok. error message: Step 123330693738664103
17 | tensorflow/contrib/verbs/rdma.cc:1009] Check failed: status.ok() RecvLocalAsync was not ok. error message: Step 95609778068110326
18 | ```
19 | There are some issues related to managing the RDMA queue in TensorFlow. Consider increasing the RDMA queue depth by setting `RDMA_QUEUE_DEPTH=<desired_queue_depth>` in `.ssh/environment` or wherever you manage environment variables.
20 | 
21 | ### NCCL different version issue
22 | Error message:
23 | ```
24 | Signal: Segmentation fault (11)
25 | Signal code: Address not mapped (1)
26 | Failing at address: 0xa0
27 | ```
28 | This error can occur if multiple machines use different versions of NCCL. 
29 | 
30 | ### Hang by fetching gradients from non-chief workers while running parameter server model
31 | Error message: None
32 | 
33 | There is a chief worker (worker 0) and non-chief workers, and Parallax assumes that only the chief worker
34 | can fetch the gradients. This means that fetching gradients from non-chief workers can block the distributed training.
35 | 


--------------------------------------------------------------------------------
/parallax/WORKSPACE:
--------------------------------------------------------------------------------
1 | workspace(name = "parallax")
2 | 


--------------------------------------------------------------------------------
/parallax/parallax/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])  # Apache 2.0
 2 | 
 3 | package(
 4 |     default_visibility = [
 5 |          "//visibility:public",
 6 |     ],
 7 | )
 8 | 
 9 | native.py_library(
10 |     name = "parallax",
11 |     srcs = ["__init__.py"],
12 |     deps = [
13 |         "//parallax/core:core",
14 |         "//parallax/core/python/common:runner",
15 |         "//parallax/core/python/common:shard",
16 |     ],
17 | )


--------------------------------------------------------------------------------
/parallax/parallax/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2018 Seoul National University
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | from parallax.core.python.common.partitions import get_partitioner
17 | from parallax.core.python.common.runner import parallel_run
18 | from parallax.core.python.common import shard
19 | from parallax.core.python.common.lib import parallax_log as log
20 | 
21 | from parallax.core.python.common.config import ParallaxConfig as Config
22 | from parallax.core.python.common.config import PSConfig
23 | from parallax.core.python.common.config import MPIConfig
24 | from parallax.core.python.common.config import CommunicationConfig
25 | from parallax.core.python.common.config import CheckPointConfig
26 | from parallax.core.python.common.config import ProfileConfig
27 | 


--------------------------------------------------------------------------------
/parallax/parallax/core/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])  # Apache 2.0
 2 | 
 3 | package(
 4 |     default_visibility = [
 5 |          "//visibility:public",
 6 |     ],
 7 | )
 8 | 
 9 | native.py_library(
10 |     name = "core",
11 |     srcs = ["__init__.py"],
12 |     deps = [
13 |         "//parallax/core/python:python"
14 |     ],
15 | )


--------------------------------------------------------------------------------
/parallax/parallax/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/core/python/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])  # Apache 2.0
 2 | 
 3 | package(
 4 |     default_visibility = [
 5 |          "//visibility:public",
 6 |     ],
 7 | )
 8 | 
 9 | native.py_library(
10 |     name = "python",
11 |     srcs = ["__init__.py"],
12 |     deps = [
13 |         "//parallax/core/python/common:common",
14 |         "//parallax/core/python/mpi:mpi",
15 |         "//parallax/core/python/ps:ps",
16 |         "//parallax/core/python/hybrid:hybrid",
17 |         "//parallax/core/python/tools:tools",
18 |     ],
19 | )
20 | 


--------------------------------------------------------------------------------
/parallax/parallax/core/python/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/core/python/common/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])  # Apache 2.0
 2 | 
 3 | package(
 4 |     default_visibility = [
 5 |          "//visibility:public",
 6 |     ],
 7 | )
 8 | 
 9 | 
10 | 
11 | native.py_library(
12 |     name = "lib",
13 |     srcs = ["lib.py"],
14 |     deps = [
15 |         "consts",
16 |     ]
17 | )
18 | 
19 | native.py_library(
20 |     name = "config",
21 |     srcs = ["config.py"],
22 |     deps = [
23 |     ]
24 | )
25 | 
26 | native.py_library(
27 |     name = "graph_transform_lib",
28 |     srcs = ["graph_transform_lib.py"],
29 |     deps = [
30 |         "lib",
31 |     ]
32 | )
33 | 
34 | native.py_library(
35 |     name = "session_context",
36 |     srcs = ["session_context.py"],
37 |     deps = [
38 |     ]
39 | )
40 | 
41 | native.py_library(
42 |     name = "runner",
43 |     srcs = ["runner.py"],
44 |     deps = [
45 |         "lib",
46 |         "graph_transform_lib",
47 |         "consts",
48 |         "partitions",
49 |         "//parallax/core/python/ps:runner",
50 |         "//parallax/core/python/mpi:runner",
51 |         "//parallax/core/python/hybrid:runner"
52 |     ]
53 | )
54 | 
55 | native.py_library(
56 |     name = "shard",
57 |     srcs = ["shard.py"],
58 |     deps = [
59 |         "graph_transform_lib",
60 |     ],
61 | )
62 | 
63 | native.py_library(
64 |     name = "consts",
65 |     srcs = ["consts.py"],
66 |     deps = [
67 |     ],
68 | )
69 | 
70 | native.py_library(
71 |     name = "partitions",
72 |     srcs = ["partitions.py"],
73 |     deps = [
74 |     ],
75 | )
76 | native.py_library(
77 |     name = "common",
78 |     srcs = ["__init__.py"],
79 |     deps = [
80 |         "graph_transform_lib",
81 |         "runner",
82 |         "shard",
83 |         "config",
84 |         "session_context",
85 |         "partitions"
86 |     ],
87 | )
88 | 
89 | 


--------------------------------------------------------------------------------
/parallax/parallax/core/python/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/common/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/core/python/common/consts.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2018 Seoul National University
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import os
17 | 
18 | PARALLAX_RUN_OPTION = "PARALLAX_RUN_OPTION"
19 | PARALLAX_RUN_MASTER = "PARALLAX_RUN_MASTER"
20 | PARALLAX_RUN_MPI = "PARALLAX_RUN_MPI"
21 | PARALLAX_RUN_PS = "PARALLAX_RUN_PS"
22 | PARALLAX_RUN_HYBRID = "PARALLAX_RUN_HYBRID"
23 | PARALLAX_WORKER_ID = "PARALLAX_WORKER_ID"
24 | PARALLAX_NUM_WORKERS = "PARALLAX_NUM_WORKERS"
25 | PARALLAX_RESOURCE_INFO = "PARALLAX_RESOURCE_INFO"
26 | PARALLAX_MACHINE_ID = "PARALLAX_MACHINE_ID"
27 | PARALLAX_HOSTNAME = "PARALLAX_HOSTNAME"
28 | 
29 | LOCAL_CODE_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
30 | LOCAL_LAUNCH_PS_PATH = os.path.join(LOCAL_CODE_ROOT, 'tools',
31 |                                     'launch_ps.py')
32 | 
33 | REMOTE_PARALLAX_ROOT = os.path.join('/tmp', 'parallax-%s' % os.environ['USER'])
34 | REMOTE_LAUNCH_PS_PATH = os.path.join(REMOTE_PARALLAX_ROOT, 'launch_ps.py')
35 | REMOTE_MPI_SCRIPT_PATH = os.path.join(REMOTE_PARALLAX_ROOT, 'mpi_run.sh')
36 | 
37 | NUM_ITERATIONS_FOR_TEST = 200
38 | NUM_ITERATIONS_FOR_WARMUP = 200
39 | 


--------------------------------------------------------------------------------
/parallax/parallax/core/python/common/shard.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2018 Seoul National University
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import tensorflow as tf
17 | 
18 | 
19 | NUM_SHARDS = "num_shards"
20 | SHARD_ID = "shard_id"
21 | SHARD_FILTER_PRED = "shard_filter_predicate"
22 | FILTER_DATASET_NUM_SHARDS_POS = 1
23 | FILTER_DATASET_SHARD_ID_POS = 2
24 | 
25 | 
def create_num_shards_and_shard_id():
    """Create the num shards and shard id tensors in the default graph.

    Both tensors are int64 constants created at the root name scope and
    registered in the NUM_SHARDS / SHARD_ID graph collections.

    Returns:
      A (num_shards_tensor, shard_id_tensor) tuple.

    Raises:
      ValueError: if the num shards tensor or the shard id tensor is already
      registered in the default graph.
    """

    # TODO: allow num_shards and shard_id inside a library function
    default_graph = tf.get_default_graph()
    if default_graph.get_collection(NUM_SHARDS):
        raise ValueError('"num_shards" already exists.')
    if default_graph.get_collection(SHARD_ID):
        raise ValueError('"shard_id" already exists.')

    # Build the constants in the proper graph, outside any active name scope.
    with default_graph.as_default() as scoped_graph, \
            scoped_graph.name_scope(None):
        # Start with num_shards=1 and shard_id=0; parallax updates the values
        # when the graph is transformed for the distributed version.
        num_shards_tensor = tf.constant(1, dtype=tf.int64, name="num_shards")
        shard_id_tensor = tf.constant(0, dtype=tf.int64, name="shard_id")

    registrations = ((NUM_SHARDS, num_shards_tensor),
                     (SHARD_ID, shard_id_tensor))
    for collection_key, tensor in registrations:
        tf.add_to_collection(collection_key, tensor)
    return num_shards_tensor, shard_id_tensor
55 | 
56 | 
def _get_or_create_num_shards_and_shard_id():
    """Return the default graph's shard tensors, creating them if absent.

    Returns:
      A (num_shards_tensor, shard_id_tensor) tuple: the first entries of the
      NUM_SHARDS / SHARD_ID collections when NUM_SHARDS is non-empty,
      otherwise freshly created tensors.
    """
    default_graph = tf.get_default_graph()
    existing_num_shards = default_graph.get_collection(NUM_SHARDS)
    if not existing_num_shards:
        return create_num_shards_and_shard_id()
    existing_shard_ids = default_graph.get_collection(SHARD_ID)
    return existing_num_shards[0], existing_shard_ids[0]
67 | 
68 | 
def shard(ds):
    """Shard a dataset with the graph-level num_shards / shard_id tensors.

    It has the same effect as ds.shard(num_shards, index): elements are
    enumerated and only those whose index satisfies
    `index % num_shards == shard_id` are kept. The name of the filter
    predicate is recorded in the SHARD_FILTER_PRED collection.

    Args:
      ds: the dataset to shard. It must provide the private `_enumerate()`
        method used below.

    Returns:
      A dataset that yields the kept elements without the enumeration index.
    """

    # TODO: allow dataset shard inside a function or dataset api
    # (e.g., map, parallel_interleave)
    num_shards, shard_id = _get_or_create_num_shards_and_shard_id()

    def filter_fn(elem_index, _):
        # num_shards is referenced before shard_id; the asserts below rely on
        # the predicate capturing them in exactly this order.
        mod_result = tf.mod(elem_index, num_shards)
        return tf.equal(mod_result, shard_id)

    f = ds._enumerate().filter(filter_fn)
    # Sanity checks: the predicate captured num_shards first, shard_id second.
    assert f._predicate.captured_inputs[0] == num_shards
    assert f._predicate.captured_inputs[1] == shard_id
    tf.add_to_collection(SHARD_FILTER_PRED,
                         f._predicate.name)
    # Drop the enumeration index again.
    return f.map(lambda _, elem: elem)
88 | 


--------------------------------------------------------------------------------
/parallax/parallax/core/python/hybrid/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])  # Apache 2.0
 2 | 
 3 | package(
 4 |     default_visibility = [
 5 |          "//visibility:public",
 6 |     ],
 7 | )
 8 | 
 9 | native.py_library(
10 |     name = "graph_transform",
11 |     srcs = ["graph_transform.py"],
12 |     deps = [
13 |         "//parallax/core/python/common:graph_transform_lib",
14 |         "//parallax/core/python/common:lib",
15 |         "between_graph_parallel",
16 |         "in_graph_parallel",
17 |     ]
18 | )
19 | 
20 | native.py_library(
21 |     name = "between_graph_parallel",
22 |     srcs = ["between_graph_parallel.py"],
23 |     deps = [
24 |         "//parallax/core/python/common:graph_transform_lib",
25 |         "//parallax/core/python/common:lib",
26 |     ]
27 | )
28 | 
29 | native.py_library(
30 |     name = "in_graph_parallel",
31 |     srcs = ["in_graph_parallel.py"],
32 |     deps = [
33 |         "//parallax/core/python/common:graph_transform_lib",
34 |         "//parallax/core/python/common:lib",
35 |     ]
36 | )
37 | 
38 | native.py_library(
39 |     name = "runner",
40 |     srcs = ["runner.py"],
41 |     deps = [
42 |         "graph_transform",
43 |         "//parallax/core/python/common:lib",
44 |         "//parallax/core/python/common:consts",
45 |     ]
46 | )
47 | 
48 | native.py_library(
49 |     name = "hybrid",
50 |     srcs = ["__init__.py"],
51 |     deps = [
52 |         "runner"
53 |     ]
54 | )
55 | 


--------------------------------------------------------------------------------
/parallax/parallax/core/python/hybrid/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/hybrid/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/core/python/mpi/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])  # Apache 2.0
 2 | 
 3 | package(
 4 |     default_visibility = [
 5 |          "//visibility:public",
 6 |     ],
 7 | )
 8 | 
 9 | native.py_library(
10 |     name = "graph_transform",
11 |     srcs = ["graph_transform.py"],
12 |     deps = [
13 |         "//parallax/core/python/common:graph_transform_lib",
14 |         "//parallax/core/python/common:lib",
15 |     ]
16 | )
17 | 
18 | native.py_library(
19 |     name = "runner",
20 |     srcs = ["runner.py"],
21 |     deps = [
22 |         "graph_transform",
23 |         "//parallax/core/python/common:lib",
24 |         "//parallax/core/python/common:consts",
25 |         "//parallax/core/python/common:session_context",
26 |     ]
27 | )
28 | 
29 | native.py_library(
30 |     name = "mpi",
31 |     srcs = ["__init__.py"],
32 |     deps = [
33 |         "runner"
34 |     ]
35 | )
36 | 


--------------------------------------------------------------------------------
/parallax/parallax/core/python/mpi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/mpi/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/core/python/mpi/graph_transform.py:
--------------------------------------------------------------------------------
  1 | # Copyright (C) 2018 Seoul National University
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | import tensorflow as tf
 17 | import horovod.tensorflow as hvd
 18 | 
 19 | from parallax.core.python.common.graph_transform_lib import get_all_control_consumers
 20 | from parallax.core.python.common.graph_transform_lib import update_consumers
 21 | from parallax.core.python.common.graph_transform_lib import update_control_consumers
 22 | from parallax.core.python.common.graph_transform_lib import update_shard_values_for_worker
 23 | from parallax.core.python.common.lib import *
 24 | 
 25 | 
 26 | def _add_broadcast_ops():
 27 |     bcast_global_variables_ops = []
 28 |     for var in tf.global_variables():
 29 |         bcast_global_variables_ops.append(
 30 |             tf.assign(var, hvd.broadcast(var, 0)))
 31 |     with tf.control_dependencies(bcast_global_variables_ops):
 32 |         tf.no_op(name='auto_parallel_bcast_global_vars')
 33 | 
 34 | 
 35 | def _add_aggregation_ops(gradients_info, op_to_control_consumer_ops, config):
 36 |     grad_tensor = gradients_info._grad
 37 |     if isinstance(grad_tensor, tf.Tensor):
 38 |         grad = grad_tensor
 39 |         grad_consumers = [c for c in grad.consumers()]
 40 |         agg_grad = hvd.allreduce(grad,
 41 |                                  average=True)
 42 |         update_consumers(grad_consumers, grad, agg_grad)
 43 |         update_control_consumers(op_to_control_consumer_ops[grad.op],
 44 |                                  grad.op, agg_grad.op)
 45 |     else:
 46 |         grad = grad_tensor.values
 47 |         indices = grad_tensor.indices
 48 |         dense_shape = grad_tensor.dense_shape
 49 |         grad_consumers = [c for c in grad.consumers()]
 50 |         indices_consumers = [c for c in indices.consumers()]
 51 |         agg_grad = \
 52 |             hvd.allreduce(tf.IndexedSlices(grad, indices, dense_shape),
 53 |                           average=config.average_sparse)
 54 |         update_consumers(grad_consumers, grad, agg_grad.values)
 55 |         update_consumers(indices_consumers, indices, agg_grad.indices)
 56 |         update_control_consumers(op_to_control_consumer_ops[grad.op],
 57 |                                  grad.op, agg_grad.values.op)
 58 |         update_control_consumers(
 59 |             op_to_control_consumer_ops[indices.op], indices.op,
 60 |             agg_grad.indices.op)
 61 |     gradients_info._grad = agg_grad
 62 | 
 63 | 
 64 | def graph_transform_mpi(single_gpu_meta_graph_def, config,
 65 |                         op_library_path=None):
 66 |     if op_library_path is not None:
 67 |         tf.load_op_library(op_library_path)
 68 | 
 69 |     with tf.Graph().as_default() as replica:
 70 |         tf.train.import_meta_graph(single_gpu_meta_graph_def)
 71 | 
 72 |         tensor_or_op_name_to_replica_names = {}
 73 |         for op in replica.get_operations():
 74 |             tensor_or_op_name_to_replica_names[op.name] = [op.name]
 75 |             for output in op.outputs:
 76 |                 tensor_or_op_name_to_replica_names[output.name] = [output.name]
 77 | 
 78 |         # Initialize horovod
 79 |         hvd.init()
 80 | 
 81 |         num_workers = hvd.size()
 82 |         worker_id = hvd.rank()
 83 |         update_shard_values_for_worker(num_workers, worker_id)
 84 | 
 85 |         op_to_control_consumer_ops = get_all_control_consumers(replica)
 86 |         trainable_variable_ops = [var.op for var in tf.get_collection(
 87 |             tf.GraphKeys.TRAINABLE_VARIABLES)]
 88 | 
 89 |         for gradients_info in tf.get_collection(tf.GraphKeys.GRADIENTS_INFO):
 90 |             target_tensor = gradients_info._target
 91 |             if target_tensor.op not in trainable_variable_ops:
 92 |                 parallax_log.debug(
 93 |                     "Gradient for non-trainable variable %s is created, ignore"
 94 |                     % target_tensor.op.name)
 95 |                 continue
 96 | 
 97 |             _add_aggregation_ops(gradients_info, op_to_control_consumer_ops, config)
 98 |         _add_broadcast_ops()
 99 | 
100 |     return tf.train.export_meta_graph(graph=replica), \
101 |            tensor_or_op_name_to_replica_names
102 | 


--------------------------------------------------------------------------------
/parallax/parallax/core/python/ps/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])  # Apache 2.0
 2 | 
 3 | package(
 4 |     default_visibility = [
 5 |          "//visibility:public",
 6 |     ],
 7 | )
 8 | 
 9 | native.py_library(
10 |     name = "graph_transform",
11 |     srcs = ["graph_transform.py"],
12 |     deps = [
13 |         "//parallax/core/python/common:graph_transform_lib",
14 |         "//parallax/core/python/common:lib",
15 |         "//parallax/core/python/ps:between_graph_parallel",
16 |         "//parallax/core/python/ps:in_graph_parallel",
17 |     ]
18 | )
19 | 
20 | native.py_library(
21 |     name = "between_graph_parallel",
22 |     srcs = ["between_graph_parallel.py"],
23 |     deps = [
24 |         "//parallax/core/python/common:graph_transform_lib",
25 |         "//parallax/core/python/common:lib",
26 |     ]
27 | )
28 | 
29 | native.py_library(
30 |     name = "in_graph_parallel",
31 |     srcs = ["in_graph_parallel.py"],
32 |     deps = [
33 |         "//parallax/core/python/common:graph_transform_lib",
34 |         "//parallax/core/python/common:lib",
35 |     ]
36 | )
37 | 
38 | native.py_library(
39 |     name = "runner",
40 |     srcs = ["runner.py"],
41 |     deps = [
42 |         "graph_transform",
43 |         "//parallax/core/python/common:lib",
44 |         "//parallax/core/python/common:consts",
45 |         "//parallax/core/python/common:graph_transform_lib",
46 |     ]
47 | )
48 | 
49 | native.py_library(
50 |     name = "ps",
51 |     srcs = ["__init__.py"],
52 |     deps = [
53 |         "runner"
54 |     ]
55 | )


--------------------------------------------------------------------------------
/parallax/parallax/core/python/ps/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/ps/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/core/python/ps/graph_transform.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2018 Seoul National University
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | from parallax.core.python.common.lib import *
17 | from parallax.core.python.ps.in_graph_parallel import in_graph_auto_parallel_compute
18 | from parallax.core.python.ps.between_graph_parallel import between_graph_auto_parallel_compute
19 | 
20 | 
def graph_transform_ps(single_gpu_meta_graph_def,
                       worker_id,
                       config,
                       op_library_path=None):
    """Transform a single-GPU meta graph for parameter-server training.

    The graph is first replicated in-graph across the worker's GPUs and then
    transformed for between-graph execution on the cluster.

    Args:
      single_gpu_meta_graph_def: meta graph definition of the single-GPU
        model.
      worker_id: index of this worker in `config.resource_info['worker']`.
      config: provides `resource_info`, `sync` and
        `communication_config.ps_config`.
      op_library_path: optional custom op library path, forwarded to both
        transformation steps.

    Returns:
      A tuple of (transformed meta graph def, exported mapping from original
      op/tensor names to replica names).

    Raises:
      ValueError: if replicate_variables is requested without sync.
    """
    cluster_info = config.resource_info
    # TODO: Handle all ps configurations
    ps_config = config.communication_config.ps_config
    if ps_config.replicate_variables and not config.sync:
        raise ValueError('replicate_variables is only possible with sync')

    # Without dedicated ps entries, variables are placed on the worker CPU.
    if 'ps' in cluster_info:
        ps_device = '/job:ps'
    else:
        ps_device = '/job:worker/cpu:0'
    cluster_spec = get_tf_clusterspec(cluster_info)
    num_gpus = len(cluster_info['worker'][worker_id]['gpus'])

    parallax_log.debug(
        "Starting graph transformation for PS for worker %d" % worker_id)

    name_mapping = TensorOrOpNameToReplicaNames(
        single_gpu_meta_graph_def.meta_info_def.stripped_op_list)

    multi_gpu_meta_graph_def = in_graph_auto_parallel_compute(
        single_gpu_meta_graph_def, num_gpus, config=config,
        op_library_path=op_library_path,
        tensor_or_op_name_to_replica_names=name_mapping)

    ps_meta_graph_def = between_graph_auto_parallel_compute(
        multi_gpu_meta_graph_def,
        worker_id=worker_id,
        ps_device=ps_device,
        worker_device='/job:worker/task:%d' % worker_id,
        merge_devices=True,
        cluster_spec=cluster_spec,
        config=config,
        op_library_path=op_library_path,
        num_replicas_per_worker=num_gpus,
        tensor_or_op_name_to_replica_names=name_mapping)

    parallax_log.debug(
        "Finished graph transformation for PS for worker %d" % worker_id)
    return ps_meta_graph_def, name_mapping.export()
61 | 


--------------------------------------------------------------------------------
/parallax/parallax/core/python/tools/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])  # Apache 2.0
 2 | 
 3 | package(
 4 |     default_visibility = [
 5 |          "//visibility:public",
 6 |     ],
 7 | )
 8 | 
 9 | native.py_library(
10 |     name = "launch_ps",
11 |     srcs = ["launch_ps.py"]
12 | )
13 | 
14 | native.py_library(
15 |     name = "tools",
16 |     srcs = ["__init__.py"],
17 |     deps = [
18 |         "launch_ps"
19 |     ],
20 | )
21 | 


--------------------------------------------------------------------------------
/parallax/parallax/core/python/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/core/python/tools/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/core/python/tools/launch_ps.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2018 Seoul National University
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import argparse
17 | import sys, os
18 | import json
19 | 
20 | import tensorflow as tf
21 | 
# Command-line flags read by main() below.
FLAGS = tf.app.flags.FLAGS

# Host lists used to build the cluster spec; 'ps_hosts' may be left empty,
# in which case no 'ps' job is added to the cluster.
tf.app.flags.DEFINE_string('ps_hosts', '',
                           """Comma-separated list of target hosts""")
tf.app.flags.DEFINE_string('worker_hosts', '',
                           """Comma-separated list of target hosts""")
# main() only accepts 'ps' as the job name.
tf.app.flags.DEFINE_string('job_name', '',
                           """Job name in cluster""")
tf.app.flags.DEFINE_integer('task_index', -1,
                            """Task index of the job""")
tf.app.flags.DEFINE_string('protocol', 'grpc',
                           """Server protocol: grpc, grpc+verbs, grpc+gdr""")
34 | 
35 | 
def main(argv=None):
    """Start a parameter-server task and block until the server terminates.

    The cluster is built from the --ps_hosts and --worker_hosts flags; the
    server is started for job 'ps' with --task_index and --protocol.

    Args:
      argv: unused; present for compatibility with tf.app.run().

    Raises:
      ValueError: if --job_name is not 'ps'.
    """
    # Validate explicitly: an `assert` would be stripped under `python -O`
    # and silently let a non-ps job start a ps server.
    if FLAGS.job_name != 'ps':
        raise ValueError(
            "launch_ps only supports --job_name=ps, got %r" % FLAGS.job_name)

    tf_cluster_dict = {}

    # The 'ps' job is only declared when ps hosts were given.
    if FLAGS.ps_hosts != '':
        tf_cluster_dict['ps'] = FLAGS.ps_hosts.split(',')

    tf_cluster_dict['worker'] = FLAGS.worker_hosts.split(',')
    cluster = tf.train.ClusterSpec(tf_cluster_dict)

    server = tf.train.Server(cluster, job_name='ps',
                             task_index=FLAGS.task_index,
                             protocol=FLAGS.protocol)
    server.join()
54 | 
55 | 
56 | if __name__ == "__main__":
57 |     tf.app.run()
58 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Rafal Jozefowicz
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/README.md:
--------------------------------------------------------------------------------
 1 | # LM-1B 
 2 | LM-1B implements the LSTM language model described in [LM](https://arxiv.org/abs/1602.02410). 
 3 | The original code comes from https://github.com/rafaljozefowicz/lm, which supports 
 4 | synchronous training with multiple GPUs. We converted the code to single-GPU code and 
 5 | then applied Parallax auto-parallelization to run it on multiple GPUs and machines with 
 6 | synchronous or asynchronous training.
 7 | 
 8 | ## Dataset
 9 | * [1B Word Benchmark Dataset](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark)
10 | 
11 | ## To Run
12 | Set your resource information in the `resource_info` file.
13 | 
14 | Then, you can run lm1b model with data in `<data_dir>` in parallel by executing: 
15 | ```shell
16 | $ python lm1b_distributed_driver.py --datadir <data_dir>
17 | ```
18 | 
19 | The command above runs a single LM model on multiple devices specified in `resource_info`.
20 | The command assumes that the data directory and the LM-1B codebase are distributed and reachable in the same absolute path in each of the machines.
21 | 
22 | The following options are also available for distributed runs.
23 | 
24 | | Parameter Name       |  Default            	| Description |
25 | | :------------------- |:-----------------------| :-----------|
26 | | --logdir			   | /tmp/lm1b				| Logging directory |
27 | | --datadir			   | None					| Data directory |
28 | | --hpconfig		   | ""						| Overrides default hyper-parameters |
29 | | --eval_steps		   | 70						| Number of evaluation steps |
30 | | --resource_info_file | `./resource_info`      | Name of the file containing the cluster information |
31 | | --max_steps          | 1000000                | Number of iterations to run for each worker |
32 | | --log_frequency      | 100                    | Number of steps between two consecutive log outputs |
33 | | --sync          	   | True  	 				| Whether to synchronize learning or not |
34 | | --ckpt_dir           | None					| Directory to save checkpoints |
35 | | --save_ckpt_steps    | 0						| Number of steps between two consecutive checkpoints |
36 | | --save_n_ckpts_per_epoch | -1					| Number of checkpoints to save per each epoch |
37 | | --run_option         | None                   | Communication mode to use, either PS or MPI; None uses both |
38 | | --search_partitions  | False                  | Whether to use Parallax's variable partitioning method or not |
39 | 
40 | You can customize a distributed run with the options above. For example, to fix the communication mode to MPI, pass the `run_option` flag as shown below.
41 | 
42 | ```shell
43 | $ python lm1b_distributed_driver.py --datadir <data_dir> --run_option=MPI
44 | ```
45 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/lm1b/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/data_utils.py:
--------------------------------------------------------------------------------
  1 | import codecs
  2 | import glob
  3 | import json
  4 | import random
  5 | 
  6 | import numpy as np
  7 | 
  8 | 
  9 | class Vocabulary(object):
 10 | 
 11 |     def __init__(self):
 12 |         self._token_to_id = {}
 13 |         self._token_to_count = {}
 14 |         self._id_to_token = []
 15 |         self._num_tokens = 0
 16 |         self._s_id = None
 17 |         self._unk_id = None
 18 | 
 19 |     @property
 20 |     def num_tokens(self):
 21 |         return self._num_tokens
 22 | 
 23 |     @property
 24 |     def unk(self):
 25 |         return "<UNK>"
 26 | 
 27 |     @property
 28 |     def unk_id(self):
 29 |         return self._unk_id
 30 | 
 31 |     @property
 32 |     def s(self):
 33 |         return "<S>"
 34 | 
 35 |     @property
 36 |     def s_id(self):
 37 |         return self._s_id
 38 | 
 39 |     def add(self, token, count):
 40 |         self._token_to_id[token] = self._num_tokens
 41 |         self._token_to_count[token] = count
 42 |         self._id_to_token.append(token)
 43 |         self._num_tokens += 1
 44 | 
 45 |     def finalize(self):
 46 |         self._s_id = self.get_id(self.s)
 47 |         self._unk_id = self.get_id(self.unk)
 48 | 
 49 |     def get_id(self, token):
 50 |         return self._token_to_id.get(token, self.unk_id)
 51 | 
 52 |     def get_token(self, id_):
 53 |         return self._id_to_token[id_]
 54 | 
 55 |     @staticmethod
 56 |     def from_file(filename):
 57 |         vocab = Vocabulary()
 58 |         with codecs.open(filename, "r", "utf-8") as f:
 59 |             for line in f:
 60 |                 word, count = line.strip().split()
 61 |                 vocab.add(word, int(count))
 62 |         vocab.finalize()
 63 |         return vocab
 64 | 
 65 | 
 66 | class Dataset(object):
 67 | 
 68 |     def __init__(self, vocab, file_pattern, deterministic=False):
 69 |         self._vocab = vocab
 70 |         self._file_pattern = file_pattern
 71 |         self._deterministic = deterministic
 72 | 
 73 |     def _parse_sentence(self, line):
 74 |         s_id = self._vocab.s_id
 75 |         return [s_id]\
 76 |                + [self._vocab.get_id(word) for word in line.strip().split()]\
 77 |                + [s_id]
 78 | 
 79 |     def _parse_file(self, file_name):
 80 |         print("Processing file: %s" % file_name)
 81 |         with codecs.open(file_name, "r", "utf-8") as f:
 82 |             lines = [line.strip() for line in f]
 83 |             if not self._deterministic:
 84 |                 random.shuffle(lines)
 85 |             print("Finished processing!")
 86 |             for line in lines:
 87 |                 yield self._parse_sentence(line)
 88 | 
 89 |     def _sentence_stream(self, file_stream):
 90 |         for file_name in file_stream:
 91 |             for sentence in self._parse_file(file_name):
 92 |                 yield sentence
 93 | 
 94 |     def _iterate(self, sentences, batch_size, num_steps):
 95 |         streams = [None] * batch_size
 96 |         x = np.zeros([batch_size, num_steps], np.int32)
 97 |         y = np.zeros([batch_size, num_steps], np.int32)
 98 |         w = np.zeros([batch_size, num_steps], np.uint8)
 99 |         while True:
100 |             x[:] = 0
101 |             y[:] = 0
102 |             w[:] = 0
103 |             for i in range(batch_size):
104 |                 tokens_filled = 0
105 |                 try:
106 |                     while tokens_filled < num_steps:
107 |                         if streams[i] is None or len(streams[i]) <= 1:
108 |                             streams[i] = next(sentences)
109 |                         num_tokens = min(len(streams[i]) - 1,
110 |                                          num_steps - tokens_filled)
111 |                         x[i, tokens_filled:tokens_filled+num_tokens] = \
112 |                             streams[i][:num_tokens]
113 |                         y[i, tokens_filled:tokens_filled + num_tokens] = \
114 |                             streams[i][1:num_tokens+1]
115 |                         w[i, tokens_filled:tokens_filled + num_tokens] = 1
116 |                         streams[i] = streams[i][num_tokens:]
117 |                         tokens_filled += num_tokens
118 |                 except StopIteration:
119 |                     pass
120 |             if not np.any(w):
121 |                 return
122 | 
123 |             yield x, y, w
124 | 
125 |     def iterate_once(self, batch_size, num_steps):
126 |         def file_stream():
127 |             for file_name in glob.glob(self._file_pattern):
128 |                 yield file_name
129 |         for value in self._iterate(
130 |                 self._sentence_stream(file_stream()), batch_size, num_steps):
131 |             yield value
132 | 
133 |     def iterate_forever(self, batch_size, num_steps, num_workers, worker_id):
134 |         def file_stream():
135 |             while True:
136 |                 file_patterns = glob.glob(self._file_pattern)
137 |                 file_patterns.sort()
138 |                 filenames_for_worker = []
139 |                 for i in range(len(file_patterns)):
140 |                     if i % num_workers == worker_id:
141 |                         filenames_for_worker.append(file_patterns[i])
142 |                 if not self._deterministic:
143 |                     random.shuffle(filenames_for_worker)
144 |                 for filename in filenames_for_worker:
145 |                     yield filename
146 |         for value in self._iterate(
147 |                 self._sentence_stream(file_stream()), batch_size, num_steps):
148 |             yield value
149 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/language_model.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import division
  3 | from __future__ import print_function
  4 | 
  5 | import functools
  6 | 
  7 | import numpy as np
  8 | import tensorflow as tf
  9 | from tensorflow.python.framework import ops
 10 | from tensorflow.python.framework import tensor_shape
 11 | from tensorflow.python.layers import base
 12 | 
 13 | import parallax
 14 | 
FLAGS = tf.flags.FLAGS
# Read in LM.__init__ as FLAGS.num_variable_shards and passed to
# parallax.get_partitioner() when LM.build() creates its partitioned variables.
tf.flags.DEFINE_integer('num_variable_shards', 32, 'Number of variable shard')
 17 | 
class LM(base.Layer):
  """LSTM language model with a projected hidden state.

  The layer owns an embedding table, a single LSTM cell (weights `W`, `B`)
  whose output is projected by `W_P`, and softmax weights. Calling the layer
  unrolls the cell for `num_steps` time steps and returns the weighted mean
  loss together with the final cell and hidden states.
  """

  def __init__(self, num_steps):
    """Store the hyper-parameters; no variables are created here.

    Args:
      num_steps: number of time steps the LSTM is unrolled for in `call`.
    """
    super(LM, self).__init__()
    self.num_steps = num_steps
    self.num_shards = FLAGS.num_variable_shards
    # Use keep_prob 1.0 at evaluation
    self.keep_prob = 0.9

    self.vocab_size = 793470
    self.emb_size = 512
    self.state_size = 2048
    self.projected_size = 512
    # Use num_sampled 0 (full softmax) at evaluation
    self.num_sampled = 8192

  def build(self, input_shape):
    """Create the model variables.

    `emb` and `softmax_w` are created with `tf.get_variable` inside a
    variable scope that carries the Parallax partitioner; the remaining
    variables are created with `add_variable` outside that scope.

    Args:
      input_shape: unused.
    """
    partitioner = parallax.get_partitioner(self.num_shards)
    with tf.variable_scope(tf.get_variable_scope(), partitioner=partitioner):
      self.emb = tf.get_variable('emb', 
                                 shape=[self.vocab_size, self.emb_size],
                                 initializer=tf.uniform_unit_scaling_initializer(),
                                 trainable=True,
                                 dtype=tf.float32)
      self.softmax_w = tf.get_variable(name='softmax_w',
                                       shape=[self.vocab_size, self.projected_size],
                                       initializer=tf.uniform_unit_scaling_initializer(),
                                       trainable=True,
                                       dtype=tf.float32)

    self.softmax_b = self.add_variable(name='softmax_b',
                                       shape=[self.vocab_size],
                                       trainable=True,
                                       dtype=tf.float32)
    # LSTM kernel: rows cover the concatenated [input, projected h] vector,
    # columns hold the four gate pre-activations.
    self.W = self.add_variable(name='W',
                               shape=[self.emb_size + self.projected_size, 4 * self.state_size],
                               trainable=True,
                               dtype=tf.float32)
    self.B = self.add_variable(name='B',
                               shape=[4 * self.state_size],
                               trainable=True,
                               dtype=tf.float32)
    # Projection from the LSTM output (state_size) down to projected_size.
    self.W_P = self.add_variable(name='W_P',
                                 shape=[self.state_size, self.projected_size],
                                 trainable=True,
                                 dtype=tf.float32)
    self.built = True

  def call(self, x, y, w, initial_state_c, initial_state_h, training):
    """Unroll the LSTM over `num_steps` and compute the weighted mean loss.

    Args:
      x: input token ids, looked up in the embedding table.
      y: target token ids.
      w: per-position loss weights; cast to float and flattened.
      initial_state_c: initial LSTM cell state.
      initial_state_h: initial (projected) hidden state.
      training: Python bool. When true, dropout is applied and the loss is a
        sampled softmax; otherwise the full softmax is evaluated.

    Returns:
      A tuple `(loss, c, h)`: the scalar loss and the cell / hidden state
      after the last time step.
    """
    # [bs, steps, emb_size]
    x = tf.nn.embedding_lookup(self.emb, x)
    if training:
      x = tf.nn.dropout(x, self.keep_prob)

    # [bs, emb_size] * steps
    inputs = [tf.squeeze(v, axis=[1]) for v in tf.split(value=x, num_or_size_splits=self.num_steps, axis=1)]

    c = initial_state_c
    h = initial_state_h
    for t in range(self.num_steps):
      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      cell_inputs = tf.concat([inputs[t], h], axis=1)
      lstm_matrix = tf.nn.xw_plus_b(cell_inputs, self.W, self.B)
      i, j, f, o = tf.split(lstm_matrix, 4, axis=1)

      # A constant 1.0 is added to the forget-gate pre-activation.
      c = tf.sigmoid(f + 1.0) * c + tf.sigmoid(i) * tf.tanh(j)
      h = tf.sigmoid(o) * tf.tanh(c)
      # Project the output; the projected h is also the recurrent input.
      h = tf.matmul(h, self.W_P)
      # The per-step input slot is reused to hold the step output.
      inputs[t] = h
      if training:
        inputs[t] = tf.nn.dropout(inputs[t], self.keep_prob)

    # NOTE(review): this statement is outside the loop, so it only touches the
    # last step (t keeps its final loop value) and needs num_steps >= 1.
    inputs[t] = tf.identity(inputs[t])

    # Concatenate the step outputs and flatten to rows of projected_size.
    inputs = tf.reshape(tf.concat(inputs, axis=1), [-1, self.projected_size])

    if training:
      targets = tf.reshape(y, [-1, 1])
      loss = tf.nn.sampled_softmax_loss(self.softmax_w,
                                        self.softmax_b,
                                        targets,
                                        inputs,
                                        self.num_sampled,
                                        self.vocab_size)
    else:
      # NOTE(review): presumably iterating softmax_w yields its partitions and
      # this concat/reshape rebuilds the full matrix - confirm the row order.
      full_softmax_w = tf.reshape(tf.concat(self.softmax_w, axis=1), [-1, self.projected_size])
      full_softmax_w = full_softmax_w[:self.vocab_size, :]

      logits = tf.matmul(inputs, full_softmax_w, transpose_b=True) + self.softmax_b
      targets = tf.reshape(y, [-1])
      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)

    # Weight each position's loss by w, then average over all positions.
    loss = tf.reduce_mean(loss * tf.reshape(tf.to_float(w), [-1]))
    return loss, c, h
111 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/language_model_graph.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from __future__ import division
 3 | from __future__ import print_function
 4 | 
 5 | import numpy as np
 6 | import tensorflow as tf
 7 | 
 8 | import language_model
 9 | 
# Word counts keyed by split name. Not referenced by build_model below.
# NOTE(review): presumably the LM1B corpus sizes - confirm before relying on them.
_NUM_WORDS = {
    'train': 798945280,
    'validation': 7789987,
}

# Flags consumed by build_model (batch_size, num_steps, learning_rate,
# max_grad_norm). num_epoch and use_synthetic are not read in this module.
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_integer('batch_size', 128, 'Batch size')
tf.flags.DEFINE_integer('num_steps', 20, 'Number of steps')
tf.flags.DEFINE_float('learning_rate', 0.2, 'Learning rate')
tf.flags.DEFINE_float('max_grad_norm', 10.0, 'max_grad_norm')
tf.flags.DEFINE_integer('num_epoch', 5, 'Number of epoch')
tf.flags.DEFINE_boolean('use_synthetic', False, 'whether to use synthetic data or not')
22 | 
23 | 
def build_model():
  """Build the single-GPU training graph and return the annotated model.

  Creates the placeholders, runs the LM layer in training mode, computes
  gradients for the embedding, LSTM and softmax variables, rescales the
  embedding gradients by the batch size, clips only the LSTM gradients by
  global norm, and applies everything with Adagrad. The returned train op
  is the moving-average update of the LSTM variables, which depends on the
  optimizer step.

  Returns:
    The `language_model.LM` instance with `global_step`, `loss`, `train_op`,
    `final_state_c/h`, `initial_state_c/h`, `x`, `y` and `w` attached.
  """
  lm = language_model.LM(FLAGS.num_steps)
  step_counter = tf.train.get_or_create_global_step()

  with tf.device('/gpu:0'):
    tokens_in = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
    targets_in = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
    weights_in = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.num_steps])
    c_in = tf.placeholder(dtype=tf.float32,
                          shape=[FLAGS.batch_size, lm.state_size],
                          name='initial_c')
    h_in = tf.placeholder(dtype=tf.float32,
                          shape=[FLAGS.batch_size, lm.projected_size],
                          name='initial_h')
    loss, c_out, h_out = lm(tokens_in, targets_in, weights_in,
                            c_in, h_in, training=True)
    scaled_loss = loss * FLAGS.num_steps

    # Variables are laid out as [embedding | lstm | softmax] so the gradient
    # list can be cut back into the same three groups by offset.
    emb_vars = list(lm.emb)
    lstm_vars = [lm.W, lm.B, lm.W_P]
    softmax_vars = list(lm.softmax_w) + [lm.softmax_b]
    variables = emb_vars + lstm_vars + softmax_vars
    grads = tf.gradients(scaled_loss, variables)

    emb_end = len(emb_vars)
    lstm_end = emb_end + len(lstm_vars)

    # Embedding gradients are sparse; their values are scaled by batch size.
    emb_grads = []
    for sparse_grad in grads[:emb_end]:
      emb_grads.append(tf.IndexedSlices(sparse_grad.values * FLAGS.batch_size,
                                        sparse_grad.indices,
                                        sparse_grad.dense_shape))

    # Only the LSTM gradients are clipped.
    lstm_grads, _ = tf.clip_by_global_norm(grads[emb_end:lstm_end],
                                           FLAGS.max_grad_norm)

    softmax_grads = grads[lstm_end:]

    grads_and_vars = list(zip(emb_grads + lstm_grads + softmax_grads, variables))

    adagrad = tf.train.AdagradOptimizer(FLAGS.learning_rate, initial_accumulator_value=1.0)
    apply_op = adagrad.apply_gradients(grads_and_vars, global_step=step_counter)

    averager = tf.train.ExponentialMovingAverage(decay=0.999)
    with tf.control_dependencies([apply_op]):
      train_op = averager.apply(lstm_vars)

  lm.global_step = step_counter
  lm.loss = loss
  lm.train_op = train_op

  lm.final_state_c = c_out
  lm.final_state_h = h_out

  lm.initial_state_c = c_in
  lm.initial_state_h = h_in

  lm.x = tokens_in
  lm.y = targets_in
  lm.w = weights_in

  return lm
82 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/lm1b_distributed_driver.py:
--------------------------------------------------------------------------------
  1 | # Copyright (C) 2018 Seoul National University
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | import tensorflow as tf
 17 | import parallax
 18 | 
 19 | 
 20 | import os
 21 | import time
 22 | import math
 23 | import json
 24 | import sys
 25 | import numpy as np
 26 | 
 27 | from data_utils import Vocabulary, Dataset
 28 | import language_model_graph
 29 | import parallax_config
 30 | 
 31 | flags = tf.app.flags
 32 | flags.DEFINE_string("logdir", "/tmp/lm1b", "Logging directory.")
 33 | flags.DEFINE_string("datadir", None, "Logging directory.")
 34 | flags.DEFINE_string("hpconfig", "", "Overrides default hyper-parameters.")
 35 | flags.DEFINE_integer("eval_steps", 70, "Number of eval steps.")
 36 | flags.DEFINE_string('resource_info_file',
 37 |                     os.path.abspath(os.path.join(os.path.dirname(__file__),
 38 |                                                  '.',
 39 |                                                  'resource_info')),
 40 |                     'Filename containing cluster information')
 41 | flags.DEFINE_integer('max_steps', 1000000,
 42 |                      """Number of iterations to run for each workers.""")
 43 | flags.DEFINE_integer('log_frequency', 100,
 44 |                      """How many steps between two runop logs.""")
 45 | flags.DEFINE_boolean('sync', True, '')
 46 | FLAGS = flags.FLAGS
 47 | 
 48 | 
 49 | def main(_):
 50 | 
 51 |     vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
 52 |     dataset = Dataset(vocab, os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*"))
 53 | 
 54 |     single_gpu_graph = tf.Graph()
 55 |     with single_gpu_graph.as_default():
 56 |         with tf.variable_scope("model"):
 57 |             model = language_model_graph.build_model()
 58 | 
 59 |     def run(sess,num_workers, worker_id, num_replicas_per_worker):
 60 |         
 61 |         state_c = []
 62 |         state_h = []
 63 | 
 64 |         if len(state_c) == 0:
 65 |             state_c.extend([np.zeros([FLAGS.batch_size, model.state_size], dtype=np.float32) for _ in range(num_replicas_per_worker)])
 66 |             state_h.extend([np.zeros([FLAGS.batch_size, model.projected_size], dtype=np.float32) for _ in range(num_replicas_per_worker)])
 67 | 
 68 |         prev_global_step = sess.run(model.global_step)[0]
 69 |         prev_time = time.time()
 70 |         data_iterator = dataset.iterate_forever(FLAGS.batch_size * num_replicas_per_worker,
 71 |                                                 FLAGS.num_steps, num_workers, worker_id)
 72 |         fetches = {
 73 |             'global_step': model.global_step,
 74 |             'loss': model.loss,
 75 |             'train_op': model.train_op,
 76 |             'final_state_c': model.final_state_c,
 77 |             'final_state_h': model.final_state_h
 78 |         }
 79 | 
 80 |         for local_step in range(FLAGS.max_steps):
 81 |             if FLAGS.use_synthetic:
 82 |               x = np.random.randint(low=0, high=model.vocab_size, size=(FLAGS.batch_size*num_replicas_per_worker, FLAGS.num_steps))
 83 |               y = np.random.randint(low=0, high=model.vocab_size, size=(FLAGS.batch_size*num_replicas_per_worker, FLAGS.num_steps))
 84 |               w = np.ones((FLAGS.batch_size*num_replicas_per_worker, FLAGS.num_steps))
 85 |             else:
 86 |               x, y, w = next(data_iterator)
 87 |             feeds = {}
 88 |             feeds[model.x] = np.split(x, num_replicas_per_worker)
 89 |             feeds[model.y] = np.split(y, num_replicas_per_worker)
 90 |             feeds[model.w] = np.split(w, num_replicas_per_worker)
 91 |             feeds[model.initial_state_c] = state_c
 92 |             feeds[model.initial_state_h] = state_h
 93 |             fetched = sess.run(fetches, feeds)
 94 | 
 95 |             state_c = fetched['final_state_c']
 96 |             state_h = fetched['final_state_h']
 97 | 
 98 |             if local_step % FLAGS.log_frequency == 0:
 99 |                 cur_time = time.time()
100 |                 elapsed_time = cur_time - prev_time
101 |                 num_words = FLAGS.batch_size * FLAGS.num_steps
102 |                 wps = (fetched['global_step'][0] - prev_global_step) * num_words / elapsed_time
103 |                 prev_global_step = fetched['global_step'][0]
104 |                 parallax.log.info("Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f" % (
105 |                     fetched['global_step'][0], cur_time - prev_time, wps, fetched['loss'][0]))
106 |                 prev_time = cur_time
107 | 
108 |     sess, num_workers, worker_id, num_replicas_per_worker = \
109 |         parallax.parallel_run(single_gpu_graph,
110 |                               FLAGS.resource_info_file,
111 |                               sync=FLAGS.sync,
112 |                               parallax_config=parallax_config.build_config())
113 |     run(sess, num_workers, worker_id, num_replicas_per_worker)
114 | 
115 | if __name__ == "__main__":
116 |     tf.app.run()
117 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/lm1b_input.py:
--------------------------------------------------------------------------------
  1 | import codecs
  2 | import glob
  3 | import json
  4 | import random
  5 | 
  6 | import numpy as np
  7 | import sys
  8 | 
  9 | 
 10 | class Vocabulary(object):
 11 | 
 12 |   def __init__(self):
 13 |     self._token_to_id = {}
 14 |     self._token_to_count = {}
 15 |     self._id_to_token = []
 16 |     self._num_tokens = 0
 17 |     self._s_id = None
 18 |     self._unk_id = None
 19 | 
 20 |   @property
 21 |   def num_tokens(self):
 22 |     return self._num_tokens
 23 | 
 24 |   @property
 25 |   def unk(self):
 26 |     return "<UNK>"
 27 | 
 28 |   @property
 29 |   def unk_id(self):
 30 |     return self._unk_id
 31 | 
 32 |   @property
 33 |   def s(self):
 34 |     return "<S>"
 35 | 
 36 |   @property
 37 |   def s_id(self):
 38 |     return self._s_id
 39 | 
 40 |   def add(self, token, count):
 41 |     self._token_to_id[token] = self._num_tokens
 42 |     self._token_to_count[token] = count
 43 |     self._id_to_token.append(token)
 44 |     self._num_tokens += 1
 45 | 
 46 |   def finalize(self):
 47 |     self._s_id = self.get_id(self.s)
 48 |     self._unk_id = self.get_id(self.unk)
 49 | 
 50 |   def get_id(self, token):
 51 |     return self._token_to_id.get(token, self.unk_id)
 52 | 
 53 |   def get_token(self, id_):
 54 |     return self._id_to_token[id_]
 55 | 
 56 |   @staticmethod
 57 |   def from_file(filename, num_tokens_limit=None):
 58 |     vocab = Vocabulary()
 59 |     with codecs.open(filename, "r", "utf-8") as f:
 60 |       for line in f:
 61 |         word, count = line.strip().split()
 62 |         vocab.add(word, int(count))
 63 |         if num_tokens_limit is not None:
 64 |           if vocab.num_tokens == num_tokens_limit:
 65 |             break
 66 |     vocab.finalize()
 67 |     return vocab
 68 | 
 69 | 
 70 | class Dataset(object):
 71 | 
 72 |   def __init__(self, vocab, filenames, deterministic=False):
 73 |     self._vocab = vocab
 74 |     self._filenames = filenames
 75 |     self._deterministic = deterministic
 76 | 
 77 |   def _parse_sentence(self, line):
 78 |     s_id = self._vocab.s_id
 79 |     return [s_id] + [self._vocab.get_id(word) for word in line.strip().split()] + [s_id]
 80 | 
 81 |   def _parse_file(self, file_name):
 82 |     print("Processing file: %s" % file_name)
 83 |     with codecs.open(file_name, "r", "utf-8") as f:
 84 |       lines = [line.strip() for line in f]
 85 |       if not self._deterministic:
 86 |         random.shuffle(lines)
 87 |       print("Finished processing!")
 88 |       for line in lines:
 89 |         yield self._parse_sentence(line)
 90 | 
 91 |   def _sentence_stream(self, file_stream):
 92 |     for file_name in file_stream:
 93 |       for sentence in self._parse_file(file_name):
 94 |         yield sentence
 95 | 
 96 |   def _iterate(self, sentences, batch_size, num_steps):
 97 |     streams = [None] * batch_size
 98 |     x = np.zeros([batch_size, num_steps], np.int32)
 99 |     y = np.zeros([batch_size, num_steps], np.int32)
100 |     w = np.zeros([batch_size, num_steps], np.uint8)
101 |     while True:
102 |       x[:] = 0
103 |       y[:] = 0
104 |       w[:] = 0
105 |       for i in range(batch_size):
106 |         tokens_filled = 0
107 |         try:
108 |           while tokens_filled < num_steps:
109 |             if streams[i] is None or len(streams[i]) <= 1:
110 |               streams[i] = next(sentences)
111 |             num_tokens = min(len(streams[i]) - 1, num_steps - tokens_filled)
112 |             x[i, tokens_filled:tokens_filled+num_tokens] = streams[i][:num_tokens]
113 |             y[i, tokens_filled:tokens_filled + num_tokens] = streams[i][1:num_tokens+1]
114 |             w[i, tokens_filled:tokens_filled + num_tokens] = 1
115 |             streams[i] = streams[i][num_tokens:]
116 |             tokens_filled += num_tokens
117 |         except StopIteration:
118 |           pass
119 |       if not np.any(w):
120 |         return
121 | 
122 |       yield x, y, w
123 | 
124 |   def iterate_once(self, batch_size, num_steps):
125 |     def file_stream():
126 |       for file_name in self._filenames:
127 |         yield file_name
128 |     for value in self._iterate(self._sentence_stream(file_stream()), batch_size, num_steps):
129 |       yield value
130 | 
131 |   def iterate_forever(self, batch_size, num_steps):
132 |     def file_stream():
133 |       while True:
134 |         if not self._deterministic:
135 |           random.shuffle(self._filenames)
136 |         for file_name in self._filenames:
137 |           yield file_name
138 |     for value in self._iterate(self._sentence_stream(file_stream()), batch_size, num_steps):
139 |       yield value
140 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/parallax_config.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2018 Seoul National University
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import tensorflow as tf
17 | import parallax
18 | 
# Command-line flags that build_config() translates into a parallax.Config.
flags = tf.app.flags
flags.DEFINE_boolean('replicate_variables', True, """replicate_variables""")
flags.DEFINE_string('protocol', 'grpc', """The method for managing variables""")
flags.DEFINE_string('mpirun_options', '', 'The option for mpirun')
flags.DEFINE_string('run_option', 'HYBRID',
                    'The run option whether PS, MPI or HYBRID')
flags.DEFINE_string('redirect_path', None, """redirect path to keep the log of distributed workers""")
flags.DEFINE_string('ckpt_dir', None, """Directory to save checkpoints""")
flags.DEFINE_integer('save_ckpt_steps', None,
                     """Number of steps between two consecutive checkpoints""")
flags.DEFINE_string('profile_dir', None, """Directory to save RunMetadata""")
# Parsed by build_config() into a list of ints.
flags.DEFINE_string('profile_steps', None, """Comma separated profile steps""")
# Parsed by build_config() into a (start, end) tuple of ints.
flags.DEFINE_string('profile_range', None, """profile_start_step,profile_end_step""")
flags.DEFINE_boolean('local_aggregation', True,
                     """Whether to use local aggregation or not""")
flags.DEFINE_boolean('boundary_among_servers', True,
                     """Whether to use operation placement among servers""")
flags.DEFINE_boolean('boundary_between_workers_and_servers', True,
                     """Whether to use operation placement between workers and servers""")
flags.DEFINE_string('export_graph_path', None, """export path to keep transformed graph definition""")
flags.DEFINE_boolean('search_partitions', False, "Whether to use variable partitioning method")
FLAGS = flags.FLAGS
41 | 
def build_config():
    """Translate the command-line flags into a `parallax.Config`.

    `--profile_steps` is parsed into a list of ints and `--profile_range`
    into a `(start, end)` tuple; both become None when the flag is unset or
    empty. As a side effect the two flag values are stored back stripped of
    surrounding whitespace.

    Returns:
        The populated `parallax.Config`. `average_sparse` is always False.
    """
    checkpointing = parallax.CheckPointConfig(
        ckpt_dir=FLAGS.ckpt_dir,
        save_ckpt_steps=FLAGS.save_ckpt_steps)
    parameter_server = parallax.PSConfig(
        replicate_variables=FLAGS.replicate_variables,
        protocol=FLAGS.protocol,
        local_aggregation=FLAGS.local_aggregation,
        boundary_among_servers=FLAGS.boundary_among_servers,
        boundary_between_workers_and_servers=(
            FLAGS.boundary_between_workers_and_servers))
    mpi = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options)

    # "3,7,11" -> [3, 7, 11]
    steps = None
    if FLAGS.profile_steps:
        FLAGS.profile_steps = FLAGS.profile_steps.strip()
        steps = [int(token) for token in FLAGS.profile_steps.split(',')]

    # "100,200" -> (100, 200)
    step_range = None
    if FLAGS.profile_range:
        FLAGS.profile_range = FLAGS.profile_range.strip()
        bounds = FLAGS.profile_range.split(',')
        step_range = (int(bounds[0]), int(bounds[1]))

    profiling = parallax.ProfileConfig(profile_dir=FLAGS.profile_dir,
                                       profile_steps=steps,
                                       profile_range=step_range)

    config = parallax.Config()
    config.run_option = FLAGS.run_option
    config.average_sparse = False
    config.communication_config = parallax.CommunicationConfig(parameter_server, mpi)
    config.ckpt_config = checkpointing
    config.profile_config = profiling
    config.redirect_path = FLAGS.redirect_path
    config.export_graph_path = FLAGS.export_graph_path
    config.search_partitions = FLAGS.search_partitions
    return config
80 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/resource_info:
--------------------------------------------------------------------------------
1 | 123.456.78.90:1,2
2 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/lm1b/testdata/test_s2.txt:
--------------------------------------------------------------------------------
1 | 非婚姻所生 非婚姻所生
2 | ala ma kota
3 | test


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/.gitignore:
--------------------------------------------------------------------------------
1 | bazel-bin
2 | bazel-genfiles
3 | bazel-out
4 | bazel-testlogs
5 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | Want to contribute? Great! First, read this page (including the small print at the end).
 2 | 
 3 | ### Before you contribute
 4 | 
 5 | Before we can use your code, you must sign the
 6 | [Google Individual Contributor License Agreement]
 7 | (https://cla.developers.google.com/about/google-individual)
 8 | (CLA), which you can do online. The CLA is necessary mainly because you own the
 9 | copyright to your changes, even after your contribution becomes part of our
10 | codebase, so we need your permission to use and distribute your code. We also
11 | need to be sure of various other things—for instance that you'll tell us if you
12 | know that your code infringes on other people's patents. You don't have to sign
13 | the CLA until after you've submitted your code for review and a member has
14 | approved it, but you must do it before we can put your code into our codebase.
15 | Before you start working on a larger contribution, you should get in touch with
16 | us first through the issue tracker with your idea so that we can help out and
17 | possibly guide you. Coordinating up front makes it much easier to avoid
18 | frustration later on.
19 | 
20 | ### Code reviews
21 | 
22 | All submissions, including submissions by project members, require review. We
23 | use Github pull requests for this purpose.
24 | 
25 | ### The small print
26 | 
27 | Contributions made by corporations are covered by a different agreement than
28 | the one above, the
29 | [Software Grant and Corporate Contributor License Agreement]
30 | (https://cla.developers.google.com/about/google-corporate).
31 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/README.md:
--------------------------------------------------------------------------------
 1 | # Neural Machine Translation (seq2seq)
 2 | 
 3 | Neural Machine Translation (NMT) mimics the human translation process. For a more detailed description of the program itself, please check out [https://github.com/tensorflow/nmt](https://github.com/tensorflow/nmt), where this program comes from.
 4 | 
 5 | ## Dataset
 6 | 
 7 | We can use the following publicly available datasets:
 8 | 
 9 | 1. *Small-scale*: English-Vietnamese parallel corpus of TED talks (133K sentence
10 |    pairs) provided by
11 |    the
12 |    [IWSLT Evaluation Campaign](https://sites.google.com/site/iwsltevaluation2015/).
13 | 1. *Large-scale*: German-English parallel corpus (4.5M sentence pairs) provided
14 |    by the [WMT Evaluation Campaign](http://www.statmt.org/wmt16/translation-task.html).
15 |    
16 | ## To Run
17 | 
18 | Set your resource information in the `resource_info` file.
19 | 
20 | The command below runs a single GNMT WMT German-English model on multiple devices specified in `resource_info`. The command assumes that the data directory and the NMT codebase are distributed and reachable in the same absolute path in each of the machines.
21 | 
22 | 
23 | ```
24 | $ python nmt_distributed_driver.py \
25 |     --src=de --tgt=en \
26 |     --hparams_path=nmt/standard_hparams/wmt16_gnmt_4_layer.json \
27 |     --out_dir=/tmp/deen_gnmt \
28 |     --vocab_prefix=/tmp/wmt16/vocab.bpe.32000 \
29 |     --train_prefix=/tmp/wmt16/train.tok.clean.bpe.32000 \
30 |     --dev_prefix=/tmp/wmt16/newstest2013.tok.bpe.32000 \
31 |     --test_prefix=/tmp/wmt16/newstest2015.tok.bpe.32000
32 | ```
33 | 
34 | For more options of nmt model command, please check out [https://github.com/tensorflow/nmt](https://github.com/tensorflow/nmt) again.
35 | 
36 | Besides, we have a few more options you can choose for distributed running.
37 | 
38 | | Parameter Name       |  Default            	| Description |
39 | | :------------------- |:-----------------------| :-----------|
40 | | --resource_info_file | `./resource_info`	    | Filename containing cluster information written |
41 | | --max_steps 		   | 1000000    		    | Number of iterations to run for each workers |
42 | | --steps_per_stats 	   | 100  		    		| How many steps between two runop log |
43 | | --sync          	   | True  	 				| Whether to synchronize learning or not |
44 | | --ckpt_dir           | None                   | Directory to save checkpoints |
45 | | --save_ckpt_steps    | 0						| Number of steps between two consecutive checkpoints |
46 | | --run_option		   | None					| The run option whether PS or MPI, None utilizes both |
47 | | --epoch_size 		   | 0						| total number of data instances |
48 | | --search_partitions   | False              | Whether to use Parallax's variable partitioning method or not |
49 | 
50 | You can adapt the distributed running with above options. For example, you can run the GNMT WMT German-English model in MPI mode by just adding `--run_option` value to the script like below:
51 | 
52 | ```
53 | $ python nmt_distributed_driver.py \
54 |     --src=de --tgt=en \
55 |     --hparams_path=${PWD}/nmt/standard_hparams/wmt16_gnmt_4_layer.json \
56 |     --out_dir=/tmp/deen_gnmt \
57 |     --vocab_prefix=/tmp/wmt16/vocab.bpe.32000 \
58 |     --train_prefix=/tmp/wmt16/train.tok.clean.bpe.32000 \
59 |     --dev_prefix=/tmp/wmt16/newstest2013.tok.bpe.32000 \
60 |     --test_prefix=/tmp/wmt16/newstest2015.tok.bpe.32000 \
61 |     --run_option=MPI
62 | ```
63 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/attention_equation_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_equation_0.jpg


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/attention_equation_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_equation_1.jpg


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/attention_mechanism.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_mechanism.jpg


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/attention_vis.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/attention_vis.jpg


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/encdec.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/encdec.jpg


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/greedy_dec.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/greedy_dec.jpg


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/g3doc/img/seq2seq.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/g3doc/img/seq2seq.jpg


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/nmt_test.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Tests for nmt.py, train.py and inference.py."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | import argparse
 22 | import os
 23 | 
 24 | import tensorflow as tf
 25 | 
 26 | from . import inference
 27 | from . import nmt
 28 | from . import train
 29 | 
 30 | 
 31 | def _update_flags(flags, test_name):
 32 |   """Update flags for basic training."""
 33 |   flags.num_train_steps = 100
 34 |   flags.steps_per_stats = 5
 35 |   flags.src = "en"
 36 |   flags.tgt = "vi"
 37 |   flags.train_prefix = ("nmt/testdata/"
 38 |                         "iwslt15.tst2013.100")
 39 |   flags.vocab_prefix = ("nmt/testdata/"
 40 |                         "iwslt15.vocab.100")
 41 |   flags.dev_prefix = ("nmt/testdata/"
 42 |                       "iwslt15.tst2013.100")
 43 |   flags.test_prefix = ("nmt/testdata/"
 44 |                        "iwslt15.tst2013.100")
 45 |   flags.out_dir = os.path.join(tf.test.get_temp_dir(), test_name)
 46 | 
 47 | 
 48 | class NMTTest(tf.test.TestCase):
 49 | 
 50 |   def testTrain(self):
 51 |     """Test the training loop is functional with basic hparams."""
 52 |     nmt_parser = argparse.ArgumentParser()
 53 |     nmt.add_arguments(nmt_parser)
 54 |     FLAGS, unparsed = nmt_parser.parse_known_args()
 55 | 
 56 |     _update_flags(FLAGS, "nmt_train_test")
 57 | 
 58 |     default_hparams = nmt.create_hparams(FLAGS)
 59 | 
 60 |     train_fn = train.train
 61 |     nmt.run_main(FLAGS, default_hparams, train_fn, None)
 62 | 
 63 | 
 64 |   def testTrainWithAvgCkpts(self):
 65 |     """Test the training loop is functional with basic hparams."""
 66 |     nmt_parser = argparse.ArgumentParser()
 67 |     nmt.add_arguments(nmt_parser)
 68 |     FLAGS, unparsed = nmt_parser.parse_known_args()
 69 | 
 70 |     _update_flags(FLAGS, "nmt_train_test_avg_ckpts")
 71 |     FLAGS.avg_ckpts = True
 72 | 
 73 |     default_hparams = nmt.create_hparams(FLAGS)
 74 | 
 75 |     train_fn = train.train
 76 |     nmt.run_main(FLAGS, default_hparams, train_fn, None)
 77 | 
 78 | 
 79 |   def testInference(self):
 80 |     """Test inference is function with basic hparams."""
 81 |     nmt_parser = argparse.ArgumentParser()
 82 |     nmt.add_arguments(nmt_parser)
 83 |     FLAGS, unparsed = nmt_parser.parse_known_args()
 84 | 
 85 |     _update_flags(FLAGS, "nmt_train_infer")
 86 | 
 87 |     # Train one step so we have a checkpoint.
 88 |     FLAGS.num_train_steps = 1
 89 |     default_hparams = nmt.create_hparams(FLAGS)
 90 |     train_fn = train.train
 91 |     nmt.run_main(FLAGS, default_hparams, train_fn, None)
 92 | 
 93 |     # Update FLAGS for inference.
 94 |     FLAGS.inference_input_file = ("nmt/testdata/"
 95 |                                   "iwslt15.tst2013.100.en")
 96 |     FLAGS.inference_output_file = os.path.join(FLAGS.out_dir, "output")
 97 |     FLAGS.inference_ref_file = ("nmt/testdata/"
 98 |                                 "iwslt15.tst2013.100.vi")
 99 | 
100 |     default_hparams = nmt.create_hparams(FLAGS)
101 | 
102 |     inference_fn = inference.inference
103 |     nmt.run_main(FLAGS, default_hparams, None, inference_fn)
104 | 
105 | 
# Hand control to the TensorFlow test runner when executed as a script.
if __name__ == "__main__":
  tf.test.main()
108 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/parallax_config.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (C) 2018 Seoul National University
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ==============================================================================
16 | 
17 | import tensorflow as tf
18 | import parallax
19 | 
20 | 
# Command-line flags that configure how Parallax runs this example.
# All definitions go through the same `flags` alias for consistency.
flags = tf.app.flags
flags.DEFINE_boolean('replicate_variables', True, """replicate_variables""")
flags.DEFINE_string('protocol', 'grpc', """The method for managing variables""")
flags.DEFINE_string('mpirun_options', '', 'option for mpirun')
flags.DEFINE_string('run_option', 'HYBRID',
                    'The run option whether PS, MPI or HYBRID')
flags.DEFINE_string('redirect_path', None,
                    """redirect path to keep the log of distributed workers""")
flags.DEFINE_integer('save_ckpt_steps', None,
                     """Number of steps between two consecutive checkpoints""")
flags.DEFINE_integer('save_n_ckpts_per_epoch', -1,
                     """Save n checkpoints per every epoch""")
flags.DEFINE_string('ckpt_dir', None, """Directory to save checkpoints""")
flags.DEFINE_string('profile_dir', None, """Directory to save RunMetadata""")
# Help text typo fixed ("porfile" -> "profile").
flags.DEFINE_string('profile_steps', None, """Comma separated profile steps""")
flags.DEFINE_boolean('local_aggregation', True,
                     """Whether to use local aggregation or not""")
flags.DEFINE_boolean('boundary_among_servers', True,
                     """Whether to use operation placement among servers""")
flags.DEFINE_boolean('boundary_between_workers_and_servers', True,
                     """Whether to use operation placement between workers and servers""")
# Help text typo fixed ("definintion" -> "definition").
flags.DEFINE_string('export_graph_path', None,
                    """export path to keep transformed graph definition""")
flags.DEFINE_boolean('search_partitions', False,
                     """Whether to use variable partitioning method""")
FLAGS = flags.FLAGS
43 | 
def calculate_ckpt_steps():
    """Return the number of steps between two consecutive checkpoints.

    Returns:
        The value of the `save_ckpt_steps` flag (`None` when it is not set).

    Raises:
        NotImplementedError: if `save_n_ckpts_per_epoch` is positive. The
            per-epoch computation that used to live here referenced `json`,
            `math` and `language_model_graph`, none of which this module
            imports, so that path could only end in a NameError (or an
            error for a missing flag). Failing explicitly tells the user
            which flag to use instead.
    """
    if FLAGS.save_n_ckpts_per_epoch > 0:
        raise NotImplementedError(
            'save_n_ckpts_per_epoch is not supported by this example; '
            'use save_ckpt_steps instead')

    return FLAGS.save_ckpt_steps
56 | 
57 | 
def build_config():
    """Assemble a `parallax.Config` from the command-line flags.

    Returns:
        The populated `parallax.Config` instance.
    """

    def parse_profile_steps():
        """Convert the comma separated `profile_steps` flag to a list of ints."""
        if not FLAGS.profile_steps:
            return []
        # The stripped value is written back to the flag before splitting.
        FLAGS.profile_steps = FLAGS.profile_steps.strip()
        return [int(token) for token in FLAGS.profile_steps.split(',')]

    checkpointing = parallax.CheckPointConfig(
        ckpt_dir=FLAGS.ckpt_dir,
        save_ckpt_steps=calculate_ckpt_steps())
    parameter_server = parallax.PSConfig(
        replicate_variables=FLAGS.replicate_variables,
        protocol=FLAGS.protocol,
        local_aggregation=FLAGS.local_aggregation,
        boundary_among_servers=FLAGS.boundary_among_servers,
        boundary_between_workers_and_servers=(
            FLAGS.boundary_between_workers_and_servers))
    mpi = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options)

    config = parallax.Config()
    config.run_option = FLAGS.run_option
    config.average_sparse = False
    config.communication_config = parallax.CommunicationConfig(
        parameter_server, mpi)
    config.ckpt_config = checkpointing
    config.profile_config = parallax.ProfileConfig(
        profile_dir=FLAGS.profile_dir,
        profile_steps=parse_profile_steps())
    config.redirect_path = FLAGS.redirect_path
    config.export_graph_path = FLAGS.export_graph_path
    config.search_partitions = FLAGS.search_partitions

    return config
87 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/resource_info:
--------------------------------------------------------------------------------
1 | 123.456.78.90:1,2
2 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/scripts/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/scripts/bleu.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | """Python implementation of BLEU and smooth-BLEU.
 17 | 
 18 | This module provides a Python implementation of BLEU and smooth-BLEU.
 19 | Smooth BLEU is computed following the method outlined in the paper:
 20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
 21 | evaluation metrics for machine translation. COLING 2004.
 22 | """
 23 | 
 24 | import collections
 25 | import math
 26 | 
 27 | 
 28 | def _get_ngrams(segment, max_order):
 29 |   """Extracts all n-grams upto a given maximum order from an input segment.
 30 | 
 31 |   Args:
 32 |     segment: text segment from which n-grams will be extracted.
 33 |     max_order: maximum length in tokens of the n-grams returned by this
 34 |         methods.
 35 | 
 36 |   Returns:
 37 |     The Counter containing all n-grams upto max_order in segment
 38 |     with a count of how many times each n-gram occurred.
 39 |   """
 40 |   ngram_counts = collections.Counter()
 41 |   for order in range(1, max_order + 1):
 42 |     for i in range(0, len(segment) - order + 1):
 43 |       ngram = tuple(segment[i:i+order])
 44 |       ngram_counts[ngram] += 1
 45 |   return ngram_counts
 46 | 
 47 | 
 48 | def compute_bleu(reference_corpus, translation_corpus, max_order=4,
 49 |                  smooth=False):
 50 |   """Computes BLEU score of translated segments against one or more references.
 51 | 
 52 |   Args:
 53 |     reference_corpus: list of lists of references for each translation. Each
 54 |         reference should be tokenized into a list of tokens.
 55 |     translation_corpus: list of translations to score. Each translation
 56 |         should be tokenized into a list of tokens.
 57 |     max_order: Maximum n-gram order to use when computing BLEU score.
 58 |     smooth: Whether or not to apply Lin et al. 2004 smoothing.
 59 | 
 60 |   Returns:
 61 |     3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
 62 |     precisions and brevity penalty.
 63 |   """
 64 |   matches_by_order = [0] * max_order
 65 |   possible_matches_by_order = [0] * max_order
 66 |   reference_length = 0
 67 |   translation_length = 0
 68 |   for (references, translation) in zip(reference_corpus,
 69 |                                        translation_corpus):
 70 |     reference_length += min(len(r) for r in references)
 71 |     translation_length += len(translation)
 72 | 
 73 |     merged_ref_ngram_counts = collections.Counter()
 74 |     for reference in references:
 75 |       merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
 76 |     translation_ngram_counts = _get_ngrams(translation, max_order)
 77 |     overlap = translation_ngram_counts & merged_ref_ngram_counts
 78 |     for ngram in overlap:
 79 |       matches_by_order[len(ngram)-1] += overlap[ngram]
 80 |     for order in range(1, max_order+1):
 81 |       possible_matches = len(translation) - order + 1
 82 |       if possible_matches > 0:
 83 |         possible_matches_by_order[order-1] += possible_matches
 84 | 
 85 |   precisions = [0] * max_order
 86 |   for i in range(0, max_order):
 87 |     if smooth:
 88 |       precisions[i] = ((matches_by_order[i] + 1.) /
 89 |                        (possible_matches_by_order[i] + 1.))
 90 |     else:
 91 |       if possible_matches_by_order[i] > 0:
 92 |         precisions[i] = (float(matches_by_order[i]) /
 93 |                          possible_matches_by_order[i])
 94 |       else:
 95 |         precisions[i] = 0.0
 96 | 
 97 |   if min(precisions) > 0:
 98 |     p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
 99 |     geo_mean = math.exp(p_log_sum)
100 |   else:
101 |     geo_mean = 0
102 | 
103 |   ratio = float(translation_length) / reference_length
104 | 
105 |   if ratio > 1.0:
106 |     bp = 1.
107 |   else:
108 |     bp = math.exp(1 - 1. / ratio)
109 | 
110 |   bleu = geo_mean * bp
111 | 
112 |   return (bleu, precisions, bp, ratio, translation_length, reference_length)
113 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/scripts/download_iwslt15.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Download small-scale IWSLT15 Vietnames to English translation data for NMT
 3 | # model training.
 4 | #
 5 | # Usage:
 6 | #   ./download_iwslt15.sh path-to-output-dir
 7 | #
 8 | # If output directory is not specified, "./iwslt15" will be used as the default
 9 | # output directory.
10 | OUT_DIR="${1:-iwslt15}"
11 | SITE_PREFIX="https://nlp.stanford.edu/projects/nmt/data"
12 | 
13 | mkdir -v -p $OUT_DIR
14 | 
15 | # Download iwslt15 small dataset from standford website.
16 | echo "Download training dataset train.en and train.vi."
17 | curl -o "$OUT_DIR/train.en" "$SITE_PREFIX/iwslt15.en-vi/train.en"
18 | curl -o "$OUT_DIR/train.vi" "$SITE_PREFIX/iwslt15.en-vi/train.vi"
19 | 
20 | echo "Download dev dataset tst2012.en and tst2012.vi."
21 | curl -o "$OUT_DIR/tst2012.en" "$SITE_PREFIX/iwslt15.en-vi/tst2012.en"
22 | curl -o "$OUT_DIR/tst2012.vi" "$SITE_PREFIX/iwslt15.en-vi/tst2012.vi"
23 | 
24 | echo "Download test dataset tst2013.en and tst2013.vi."
25 | curl -o "$OUT_DIR/tst2013.en" "$SITE_PREFIX/iwslt15.en-vi/tst2013.en"
26 | curl -o "$OUT_DIR/tst2013.vi" "$SITE_PREFIX/iwslt15.en-vi/tst2013.vi"
27 | 
28 | echo "Download vocab file vocab.en and vocab.vi."
29 | curl -o "$OUT_DIR/vocab.en" "$SITE_PREFIX/iwslt15.en-vi/vocab.en"
30 | curl -o "$OUT_DIR/vocab.vi" "$SITE_PREFIX/iwslt15.en-vi/vocab.vi"
31 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/standard_hparams/iwslt15.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention": "scaled_luong",
 3 |   "attention_architecture": "standard",
 4 |   "batch_size": 128,
 5 |   "colocate_gradients_with_ops": true,
 6 |   "dropout": 0.2,
 7 |   "encoder_type": "bi",
 8 |   "eos": "</s>",
 9 |   "forget_bias": 1.0,
10 |   "infer_batch_size": 32,
11 |   "init_weight": 0.1,
12 |   "learning_rate": 1.0,
13 |   "max_gradient_norm": 5.0,
14 |   "metrics": ["bleu"],
15 |   "num_buckets": 5,
16 |   "num_layers": 2,
17 |   "num_train_steps": 12000,
18 |   "decay_scheme": "luong234",
19 |   "num_units": 512,
20 |   "optimizer": "sgd",
21 |   "residual": false,
22 |   "share_vocab": false,
23 |   "subword_option": "",
24 |   "sos": "<s>",
25 |   "src_max_len": 50,
26 |   "src_max_len_infer": null,
27 |   "steps_per_external_eval": null,
28 |   "steps_per_stats": 100,
29 |   "tgt_max_len": 50,
30 |   "tgt_max_len_infer": null,
31 |   "time_major": true,
32 |   "unit_type": "lstm",
33 |   "beam_width": 10
34 | }
35 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/standard_hparams/wmt16.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention": "normed_bahdanau",
 3 |   "attention_architecture": "standard",
 4 |   "batch_size": 128,
 5 |   "colocate_gradients_with_ops": true,
 6 |   "dropout": 0.2,
 7 |   "encoder_type": "bi",
 8 |   "eos": "</s>",
 9 |   "forget_bias": 1.0,
10 |   "infer_batch_size": 32,
11 |   "init_weight": 0.1,
12 |   "learning_rate": 1.0,
13 |   "max_gradient_norm": 5.0,
14 |   "metrics": ["bleu"],
15 |   "num_buckets": 5,
16 |   "num_layers": 4,
17 |   "num_train_steps": 340000,
18 |   "decay_scheme": "luong10",
19 |   "num_units": 1024,
20 |   "optimizer": "sgd",
21 |   "residual": false,
22 |   "share_vocab": false,
23 |   "subword_option": "bpe",
24 |   "sos": "<s>",
25 |   "src_max_len": 50,
26 |   "src_max_len_infer": null,
27 |   "steps_per_external_eval": null,
28 |   "steps_per_stats": 100,
29 |   "tgt_max_len": 50,
30 |   "tgt_max_len_infer": null,
31 |   "time_major": true,
32 |   "unit_type": "lstm",
33 |   "beam_width": 10
34 | }
35 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/standard_hparams/wmt16_gnmt_4_layer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention": "normed_bahdanau",
 3 |   "attention_architecture": "gnmt_v2",
 4 |   "batch_size": 128,
 5 |   "colocate_gradients_with_ops": true,
 6 |   "dropout": 0.2,
 7 |   "encoder_type": "gnmt",
 8 |   "eos": "</s>",
 9 |   "forget_bias": 1.0,
10 |   "infer_batch_size": 32,
11 |   "init_weight": 0.1,
12 |   "learning_rate": 1.0,
13 |   "max_gradient_norm": 5.0,
14 |   "metrics": ["bleu"],
15 |   "num_buckets": 5,
16 |   "num_layers": 4,
17 |   "num_train_steps": 340000,
18 |   "decay_scheme": "luong10",
19 |   "num_units": 1024,
20 |   "optimizer": "sgd",
21 |   "residual": true,
22 |   "share_vocab": false,
23 |   "subword_option": "bpe",
24 |   "sos": "<s>",
25 |   "src_max_len": 50,
26 |   "src_max_len_infer": null,
27 |   "steps_per_external_eval": null,
28 |   "steps_per_stats": 100,
29 |   "tgt_max_len": 50,
30 |   "tgt_max_len_infer": null,
31 |   "time_major": true,
32 |   "unit_type": "lstm",
33 |   "beam_width": 10,
34 |   "length_penalty_weight": 1.0
35 | }
36 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/standard_hparams/wmt16_gnmt_8_layer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention": "normed_bahdanau",
 3 |   "attention_architecture": "gnmt_v2",
 4 |   "batch_size": 128,
 5 |   "colocate_gradients_with_ops": true,
 6 |   "dropout": 0.2,
 7 |   "encoder_type": "gnmt",
 8 |   "eos": "</s>",
 9 |   "forget_bias": 1.0,
10 |   "infer_batch_size": 32,
11 |   "init_weight": 0.1,
12 |   "learning_rate": 1.0,
13 |   "max_gradient_norm": 5.0,
14 |   "metrics": ["bleu"],
15 |   "num_buckets": 5,
16 |   "num_layers": 8,
17 |   "num_train_steps": 340000,
18 |   "decay_scheme": "luong10",
19 |   "num_units": 1024,
20 |   "optimizer": "sgd",
21 |   "residual": true,
22 |   "share_vocab": false,
23 |   "subword_option": "bpe",
24 |   "sos": "<s>",
25 |   "src_max_len": 50,
26 |   "src_max_len_infer": null,
27 |   "steps_per_external_eval": null,
28 |   "steps_per_stats": 50,
29 |   "tgt_max_len": 50,
30 |   "tgt_max_len_infer": null,
31 |   "time_major": true,
32 |   "unit_type": "lstm",
33 |   "beam_width": 10,
34 |   "length_penalty_weight": 1.0
35 | }
36 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/iwslt15.vocab.100.en:
--------------------------------------------------------------------------------
  1 | <unk>
  2 | <s>
  3 | </s>
  4 | Rachel
  5 | :
  6 | The
  7 | science
  8 | behind
  9 | a
 10 | climate
 11 | headline
 12 | In
 13 | 4
 14 | minutes
 15 | ,
 16 | atmospheric
 17 | chemist
 18 | provides
 19 | glimpse
 20 | of
 21 | the
 22 | massive
 23 | scientific
 24 | effort
 25 | bold
 26 | headlines
 27 | on
 28 | change
 29 | with
 30 | her
 31 | team
 32 | --
 33 | one
 34 | thousands
 35 | who
 36 | contributed
 37 | taking
 38 | risky
 39 | flight
 40 | over
 41 | rainforest
 42 | in
 43 | pursuit
 44 | data
 45 | key
 46 | molecule
 47 | .
 48 | I
 49 | &apos;d
 50 | like
 51 | to
 52 | talk
 53 | you
 54 | today
 55 | about
 56 | scale
 57 | that
 58 | goes
 59 | into
 60 | making
 61 | see
 62 | paper
 63 | look
 64 | this
 65 | when
 66 | they
 67 | have
 68 | do
 69 | and
 70 | air
 71 | quality
 72 | or
 73 | smog
 74 | They
 75 | are
 76 | both
 77 | two
 78 | branches
 79 | same
 80 | field
 81 | Recently
 82 | looked
 83 | Panel
 84 | Climate
 85 | Change
 86 | IPCC
 87 | put
 88 | out
 89 | their
 90 | report
 91 | state
 92 | understanding
 93 | system
 94 | That
 95 | was
 96 | written
 97 | by
 98 | scientists
 99 | from
100 | 40
101 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/iwslt15.vocab.100.vi:
--------------------------------------------------------------------------------
  1 | <unk>
  2 | <s>
  3 | </s>
  4 | Khoa
  5 | học
  6 | đằng
  7 | sau
  8 | một
  9 | tiêu
 10 | đề
 11 | về
 12 | khí
 13 | hậu
 14 | Trong
 15 | 4
 16 | phút
 17 | ,
 18 | chuyên
 19 | gia
 20 | hoá
 21 | quyển
 22 | Rachel
 23 | giới
 24 | thiệu
 25 | sơ
 26 | lược
 27 | những
 28 | nỗ
 29 | lực
 30 | khoa
 31 | miệt
 32 | mài
 33 | táo
 34 | bạo
 35 | biến
 36 | đổi
 37 | cùng
 38 | với
 39 | đoàn
 40 | nghiên
 41 | cứu
 42 | của
 43 | mình
 44 | --
 45 | hàng
 46 | ngàn
 47 | người
 48 | đã
 49 | cống
 50 | hiến
 51 | cho
 52 | dự
 53 | án
 54 | này
 55 | chuyến
 56 | bay
 57 | mạo
 58 | hiểm
 59 | qua
 60 | rừng
 61 | già
 62 | để
 63 | tìm
 64 | kiếm
 65 | thông
 66 | tin
 67 | phân
 68 | tử
 69 | then
 70 | chốt
 71 | .
 72 | Tôi
 73 | muốn
 74 | các
 75 | bạn
 76 | biết
 77 | sự
 78 | to
 79 | lớn
 80 | góp
 81 | phần
 82 | làm
 83 | nên
 84 | dòng
 85 | tít
 86 | thường
 87 | thấy
 88 | trên
 89 | báo
 90 | Có
 91 | trông
 92 | như
 93 | thế
 94 | khi
 95 | bàn
 96 | và
 97 | nói
 98 | chất
 99 | lượng
100 | không
101 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/label_ref:
--------------------------------------------------------------------------------
1 | positive
2 | positive
3 | positive
4 | negative
5 | negative


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/pred_output:
--------------------------------------------------------------------------------
1 | positive
2 | positive
3 | negative
4 | negative
5 | positive


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/test_embed.txt:
--------------------------------------------------------------------------------
1 | some_word 1.0 2.0 3.0 4.0
2 | some_other_word 4.0 3.0 2.0 1.0
3 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/test_embed_with_header.txt:
--------------------------------------------------------------------------------
1 | 2 4
2 | some_word 1.0 2.0 3.0 4.0
3 | some_other_word 4.0 3.0 2.0 1.0
4 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/test_infer_file:
--------------------------------------------------------------------------------
1 | A Republic@@ an strategy to counter the re-@@ election of Obama
2 | Republic@@ an leaders justified their policy by the need to combat electoral fraud .
3 | However , the Brenn@@ an Centre considers this a my@@ th , stating that electoral fraud is rar@@ er in the United States than the number of people killed by ligh@@ tn@@ ing .
4 | Indeed , Republic@@ an lawyers identified only 300 cases of electoral fraud in the United States in a decade .
5 | One thing is certain : these new provisions will have a negative impact on vot@@ er tur@@ n-@@ out .


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/test_infer_vocab.src:
--------------------------------------------------------------------------------
1 | unk
2 | eos
3 | sos
4 | test1
5 | test2
6 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/testdata/test_infer_vocab.tgt:
--------------------------------------------------------------------------------
1 | unk
2 | eos
3 | test1
4 | test2
5 | test3
6 | test4
7 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/nmt/utils/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/common_test_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | """Common utility functions for tests."""
 17 | 
 18 | from __future__ import absolute_import
 19 | from __future__ import division
 20 | from __future__ import print_function
 21 | 
 22 | import tensorflow as tf
 23 | 
 24 | from tensorflow.python.ops import lookup_ops
 25 | 
 26 | from ..utils import iterator_utils
 27 | from ..utils import standard_hparams_utils
 28 | 
 29 | 
 30 | def create_test_hparams(unit_type="lstm",
 31 |                         encoder_type="uni",
 32 |                         num_layers=4,
 33 |                         attention="",
 34 |                         attention_architecture=None,
 35 |                         use_residual=False,
 36 |                         inference_indices=None,
 37 |                         num_translations_per_input=1,
 38 |                         beam_width=0,
 39 |                         init_op="uniform"):
 40 |   """Create training and inference test hparams."""
 41 |   num_residual_layers = 0
 42 |   if use_residual:
 43 |     # TODO(rzhao): Put num_residual_layers computation logic into
 44 |     # `model_utils.py`, so we can also test it here.
 45 |     num_residual_layers = 2
 46 | 
 47 |   standard_hparams = standard_hparams_utils.create_standard_hparams()
 48 | 
 49 |   # Networks
 50 |   standard_hparams.num_units  =  5
 51 |   standard_hparams.num_encoder_layers = num_layers
 52 |   standard_hparams.num_decoder_layers = num_layers
 53 |   standard_hparams.dropout = 0.5
 54 |   standard_hparams.unit_type = unit_type
 55 |   standard_hparams.encoder_type = encoder_type
 56 |   standard_hparams.residual = use_residual
 57 |   standard_hparams.num_residual_layers = num_residual_layers
 58 | 
 59 |   # Attention mechanisms
 60 |   standard_hparams.attention = attention
 61 |   standard_hparams.attention_architecture = attention_architecture
 62 | 
 63 |   # Train
 64 |   standard_hparams.init_op = init_op
 65 |   standard_hparams.num_train_steps = 1
 66 |   standard_hparams.decay_scheme = ""
 67 | 
 68 |   # Infer
 69 |   standard_hparams.tgt_max_len_infer = 100
 70 |   standard_hparams.beam_width = beam_width
 71 |   standard_hparams.num_translations_per_input = num_translations_per_input
 72 | 
 73 |   # Misc
 74 |   standard_hparams.forget_bias = 0.0
 75 |   standard_hparams.random_seed = 3
 76 | 
 77 |   # Vocab
 78 |   standard_hparams.src_vocab_size = 5
 79 |   standard_hparams.tgt_vocab_size = 5
 80 |   standard_hparams.eos = "eos"
 81 |   standard_hparams.sos = "sos"
 82 |   standard_hparams.src_vocab_file = ""
 83 |   standard_hparams.tgt_vocab_file = ""
 84 |   standard_hparams.src_embed_file = ""
 85 |   standard_hparams.tgt_embed_file = ""
 86 | 
 87 |   # For inference.py test
 88 |   standard_hparams.subword_option = "bpe"
 89 |   standard_hparams.src = "src"
 90 |   standard_hparams.tgt = "tgt"
 91 |   standard_hparams.src_max_len = 400
 92 |   standard_hparams.tgt_eos_id = 0
 93 |   standard_hparams.inference_indices = inference_indices
 94 |   return standard_hparams
 95 | 
 96 | 
 97 | def create_test_iterator(hparams, mode):
 98 |   """Create test iterator."""
 99 |   src_vocab_table = lookup_ops.index_table_from_tensor(
100 |       tf.constant([hparams.eos, "a", "b", "c", "d"]))
101 |   tgt_vocab_mapping = tf.constant([hparams.sos, hparams.eos, "a", "b", "c"])
102 |   tgt_vocab_table = lookup_ops.index_table_from_tensor(tgt_vocab_mapping)
103 |   if mode == tf.contrib.learn.ModeKeys.INFER:
104 |     reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_tensor(
105 |         tgt_vocab_mapping)
106 | 
107 |   src_dataset = tf.data.Dataset.from_tensor_slices(
108 |       tf.constant(["a a b b c", "a b b"]))
109 | 
110 |   if mode != tf.contrib.learn.ModeKeys.INFER:
111 |     tgt_dataset = tf.data.Dataset.from_tensor_slices(
112 |         tf.constant(["a b c b c", "a b c b"]))
113 |     return (
114 |         iterator_utils.get_iterator(
115 |             src_dataset=src_dataset,
116 |             tgt_dataset=tgt_dataset,
117 |             src_vocab_table=src_vocab_table,
118 |             tgt_vocab_table=tgt_vocab_table,
119 |             batch_size=hparams.batch_size,
120 |             sos=hparams.sos,
121 |             eos=hparams.eos,
122 |             random_seed=hparams.random_seed,
123 |             num_buckets=hparams.num_buckets),
124 |         src_vocab_table,
125 |         tgt_vocab_table)
126 |   else:
127 |     return (
128 |         iterator_utils.get_infer_iterator(
129 |             src_dataset=src_dataset,
130 |             src_vocab_table=src_vocab_table,
131 |             eos=hparams.eos,
132 |             batch_size=hparams.batch_size),
133 |         src_vocab_table,
134 |         tgt_vocab_table,
135 |         reverse_tgt_vocab_table)
136 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/evaluation_utils_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Tests for evaluation_utils.py."""
17 | 
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import tensorflow as tf
23 | 
24 | from ..utils import evaluation_utils
25 | 
26 | 
class EvaluationUtilsTest(tf.test.TestCase):
  """Checks evaluation_utils.evaluate against files under nmt/testdata."""

  def testEvaluate(self):
    """BLEU and ROUGE match the expected scores for BPE and SPM references."""
    trans_path = "nmt/testdata/deen_output"
    bpe_ref_path = "nmt/testdata/deen_ref_bpe"
    spm_ref_path = "nmt/testdata/deen_ref_spm"

    want_bleu = 22.5855084573
    want_rouge = 50.8429782599

    # BPE-encoded reference.
    got_bleu = evaluation_utils.evaluate(
        bpe_ref_path, trans_path, "bleu", "bpe")
    got_rouge = evaluation_utils.evaluate(
        bpe_ref_path, trans_path, "rouge", "bpe")
    self.assertAlmostEqual(want_bleu, got_bleu)
    self.assertAlmostEqual(want_rouge, got_rouge)

    # SPM-encoded reference; the same scores are expected.
    got_bleu = evaluation_utils.evaluate(
        spm_ref_path, trans_path, "bleu", "spm")
    got_rouge = evaluation_utils.evaluate(
        spm_ref_path, trans_path, "rouge", "spm")
    self.assertAlmostEqual(want_rouge, got_rouge)
    self.assertAlmostEqual(want_bleu, got_bleu)

  def testAccuracy(self):
    """The "accuracy" metric yields 60.00 on the stored predictions."""
    predictions_path = "nmt/testdata/pred_output"
    labels_path = "nmt/testdata/label_ref"

    got_accuracy = evaluation_utils.evaluate(
        labels_path, predictions_path, "accuracy")
    self.assertAlmostEqual(60.00, got_accuracy)

  def testWordAccuracy(self):
    """The "word_accuracy" metric yields 60.00 on the stored predictions."""
    predictions_path = "nmt/testdata/pred_output"
    labels_path = "nmt/testdata/label_ref"

    got_word_accuracy = evaluation_utils.evaluate(
        labels_path, predictions_path, "word_accuracy")
    self.assertAlmostEqual(60.00, got_word_accuracy)
72 | 
73 | 
# Invoke tf.test.main() when this file is executed directly as a script.
if __name__ == "__main__":
  tf.test.main()
76 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/misc_utils_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
"""Tests for misc_utils."""
17 | 
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import tensorflow as tf
23 | 
24 | from ..utils import misc_utils
25 | 
26 | 
class MiscUtilsTest(tf.test.TestCase):
  """Checks the subword formatting helpers in misc_utils."""

  def testFormatBpeText(self):
    """format_bpe_text joins pieces that are marked with a trailing "@@"."""
    bpe_tokens = (
        b"En@@ ough to make already reluc@@ tant men hesitate to take screening"
        b" tests ."
    ).split(b" ")
    want = (
        b"Enough to make already reluctant men hesitate to take screening tests"
        b" ."
    )
    got = misc_utils.format_bpe_text(bpe_tokens)
    self.assertEqual(want, got)

  def testFormatSPMText(self):
    """format_spm_text rebuilds the sentence from U+2581-marked pieces."""
    spm_tokens = (
        u"\u2581This \u2581is \u2581a \u2581 te st .".encode("utf-8").split(b" "))
    want = "This is a test."
    got = misc_utils.format_spm_text(spm_tokens)
    self.assertEqual(want, got)
46 | 
47 | 
# Invoke tf.test.main() when this file is executed directly as a script.
if __name__ == "__main__":
  tf.test.main()
50 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/nmt_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | """Utility functions specifically for NMT."""
 17 | from __future__ import print_function
 18 | 
 19 | import codecs
 20 | import time
 21 | import numpy as np
 22 | import tensorflow as tf
 23 | 
 24 | from utils import evaluation_utils
 25 | from utils import misc_utils as utils
 26 | 
 27 | __all__ = ["decode_and_evaluate", "get_translation"]
 28 | 
 29 | 
 30 | def decode_and_evaluate(name,
 31 |                         model,
 32 |                         sess,
 33 |                         trans_file,
 34 |                         ref_file,
 35 |                         metrics,
 36 |                         subword_option,
 37 |                         beam_width,
 38 |                         tgt_eos,
 39 |                         num_translations_per_input=1,
 40 |                         decode=True):
 41 |   """Decode a test set and compute a score according to the evaluation task."""
 42 |   # Decode
 43 |   if decode:
 44 |     utils.print_out("  decoding to output %s." % trans_file)
 45 | 
 46 |     start_time = time.time()
 47 |     num_sentences = 0
 48 |     with codecs.getwriter("utf-8")(
 49 |         tf.gfile.GFile(trans_file, mode="wb")) as trans_f:
 50 |       trans_f.write("")  # Write empty string to ensure file is created.
 51 | 
 52 |       num_translations_per_input = max(
 53 |           min(num_translations_per_input, beam_width), 1)
 54 |       while True:
 55 |         try:
 56 |           nmt_outputs, _ = model.decode(sess)
 57 |           if beam_width == 0:
 58 |             nmt_outputs = np.expand_dims(nmt_outputs, 0)
 59 | 
 60 |           batch_size = nmt_outputs.shape[1]
 61 |           num_sentences += batch_size
 62 | 
 63 |           for sent_id in range(batch_size):
 64 |             for beam_id in range(num_translations_per_input):
 65 |               translation = get_translation(
 66 |                   nmt_outputs[beam_id],
 67 |                   sent_id,
 68 |                   tgt_eos=tgt_eos,
 69 |                   subword_option=subword_option)
 70 |               trans_f.write((translation + b"\n").decode("utf-8"))
 71 |         except tf.errors.OutOfRangeError:
 72 |           utils.print_time(
 73 |               "  done, num sentences %d, num translations per input %d" %
 74 |               (num_sentences, num_translations_per_input), start_time)
 75 |           break
 76 | 
 77 |   # Evaluation
 78 |   evaluation_scores = {}
 79 |   if ref_file and tf.gfile.Exists(trans_file):
 80 |     for metric in metrics:
 81 |       score = evaluation_utils.evaluate(
 82 |           ref_file,
 83 |           trans_file,
 84 |           metric,
 85 |           subword_option=subword_option)
 86 |       evaluation_scores[metric] = score
 87 |       utils.print_out("  %s %s: %.1f" % (metric, name, score))
 88 | 
 89 |   return evaluation_scores
 90 | 
 91 | 
 92 | def get_translation(nmt_outputs, sent_id, tgt_eos, subword_option):
 93 |   """Given batch decoding outputs, select a sentence and turn to text."""
 94 |   if tgt_eos: tgt_eos = tgt_eos.encode("utf-8")
 95 |   # Select a sentence
 96 |   output = nmt_outputs[sent_id, :].tolist()
 97 | 
 98 |   # If there is an eos symbol in outputs, cut them at that point.
 99 |   if tgt_eos and tgt_eos in output:
100 |     output = output[:output.index(tgt_eos)]
101 | 
102 |   if subword_option == "bpe":  # BPE
103 |     translation = utils.format_bpe_text(output)
104 |   elif subword_option == "spm":  # SPM
105 |     translation = utils.format_spm_text(output)
106 |   else:
107 |     translation = utils.format_text(output)
108 | 
109 |   return translation
110 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/standard_hparams_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | """standard hparams utils."""
 17 | 
 18 | from __future__ import absolute_import
 19 | from __future__ import division
 20 | from __future__ import print_function
 21 | 
 22 | import tensorflow as tf
 23 | 
 24 | 
 25 | def create_standard_hparams():
 26 |   return tf.contrib.training.HParams(
 27 |       # Data
 28 |       src="",
 29 |       tgt="",
 30 |       train_prefix="",
 31 |       dev_prefix="",
 32 |       test_prefix="",
 33 |       vocab_prefix="",
 34 |       embed_prefix="",
 35 |       out_dir="",
 36 | 
 37 |       # Networks
 38 |       num_units=512,
 39 |       num_layers=2,
 40 |       num_encoder_layers=2,
 41 |       num_decoder_layers=2,
 42 |       dropout=0.2,
 43 |       unit_type="lstm",
 44 |       encoder_type="bi",
 45 |       residual=False,
 46 |       time_major=True,
 47 |       num_embeddings_partitions=0,
 48 | 
 49 |       # Attention mechanisms
 50 |       attention="scaled_luong",
 51 |       attention_architecture="standard",
 52 |       output_attention=True,
 53 |       pass_hidden_state=True,
 54 | 
 55 |       # Train
 56 |       optimizer="sgd",
 57 |       batch_size=128,
 58 |       init_op="uniform",
 59 |       init_weight=0.1,
 60 |       max_gradient_norm=5.0,
 61 |       learning_rate=1.0,
 62 |       warmup_steps=0,
 63 |       warmup_scheme="t2t",
 64 |       decay_scheme="luong234",
 65 |       colocate_gradients_with_ops=True,
 66 |       num_train_steps=12000,
 67 | 
 68 |       # Data constraints
 69 |       num_buckets=5,
 70 |       max_train=0,
 71 |       src_max_len=50,
 72 |       tgt_max_len=50,
 73 |       src_max_len_infer=0,
 74 |       tgt_max_len_infer=0,
 75 | 
 76 |       # Data format
 77 |       sos="<s>",
 78 |       eos="</s>",
 79 |       subword_option="",
 80 |       check_special_token=True,
 81 | 
 82 |       # Misc
 83 |       forget_bias=1.0,
 84 |       num_gpus=1,
 85 |       epoch_step=0,  # record where we were within an epoch.
 86 |       steps_per_stats=100,
 87 |       steps_per_external_eval=0,
 88 |       share_vocab=False,
 89 |       metrics=["bleu"],
 90 |       log_device_placement=False,
 91 |       random_seed=None,
 92 |       # only enable beam search during inference when beam_width > 0.
 93 |       beam_width=0,
 94 |       length_penalty_weight=0.0,
 95 |       override_loaded_hparams=True,
 96 |       num_keep_ckpts=5,
 97 |       avg_ckpts=False,
 98 | 
 99 |       # For inference
100 |       inference_indices=None,
101 |       infer_batch_size=32,
102 |       sampling_temperature=0.0,
103 |       num_translations_per_input=1,
104 |   )
105 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/vocab_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | """Utility to handle vocabularies."""
 17 | 
 18 | from __future__ import absolute_import
 19 | from __future__ import division
 20 | from __future__ import print_function
 21 | 
 22 | import codecs
 23 | import os
 24 | import tensorflow as tf
 25 | 
 26 | from tensorflow.python.ops import lookup_ops
 27 | 
 28 | from utils import misc_utils as utils
 29 | 
 30 | 
# Default special tokens; check_vocab falls back to these when the caller does
# not supply its own unk/sos/eos.
UNK = "<unk>"
SOS = "<s>"
EOS = "</s>"
# Id passed as default_value to the lookup tables built in create_vocab_tables.
UNK_ID = 0
 35 | 
 36 | 
 37 | def load_vocab(vocab_file):
 38 |   vocab = []
 39 |   with codecs.getreader("utf-8")(tf.gfile.GFile(vocab_file, "rb")) as f:
 40 |     vocab_size = 0
 41 |     for word in f:
 42 |       vocab_size += 1
 43 |       vocab.append(word.strip())
 44 |   return vocab, vocab_size
 45 | 
 46 | 
 47 | def check_vocab(vocab_file, out_dir, check_special_token=True, sos=None,
 48 |                 eos=None, unk=None):
 49 |   """Check if vocab_file doesn't exist, create from corpus_file."""
 50 |   if tf.gfile.Exists(vocab_file):
 51 |     utils.print_out("# Vocab file %s exists" % vocab_file)
 52 |     vocab, vocab_size = load_vocab(vocab_file)
 53 |     if check_special_token:
 54 |       # Verify if the vocab starts with unk, sos, eos
 55 |       # If not, prepend those tokens & generate a new vocab file
 56 |       if not unk: unk = UNK
 57 |       if not sos: sos = SOS
 58 |       if not eos: eos = EOS
 59 |       assert len(vocab) >= 3
 60 |       if vocab[0] != unk or vocab[1] != sos or vocab[2] != eos:
 61 |         utils.print_out("The first 3 vocab words [%s, %s, %s]"
 62 |                         " are not [%s, %s, %s]" %
 63 |                         (vocab[0], vocab[1], vocab[2], unk, sos, eos))
 64 |         vocab = [unk, sos, eos] + vocab
 65 |         vocab_size += 3
 66 |         new_vocab_file = os.path.join(out_dir, os.path.basename(vocab_file))
 67 |         with codecs.getwriter("utf-8")(
 68 |             tf.gfile.GFile(new_vocab_file, "wb")) as f:
 69 |           for word in vocab:
 70 |             f.write("%s\n" % word)
 71 |         vocab_file = new_vocab_file
 72 |   else:
 73 |     raise ValueError("vocab_file '%s' does not exist." % vocab_file)
 74 | 
 75 |   vocab_size = len(vocab)
 76 |   return vocab_size, vocab_file
 77 | 
 78 | 
 79 | def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
 80 |   """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
 81 |   src_vocab_table = lookup_ops.index_table_from_file(
 82 |       src_vocab_file, default_value=UNK_ID)
 83 |   if share_vocab:
 84 |     tgt_vocab_table = src_vocab_table
 85 |   else:
 86 |     tgt_vocab_table = lookup_ops.index_table_from_file(
 87 |         tgt_vocab_file, default_value=UNK_ID)
 88 |   return src_vocab_table, tgt_vocab_table
 89 | 
 90 | 
 91 | def load_embed_txt(embed_file):
 92 |   """Load embed_file into a python dictionary.
 93 | 
 94 |   Note: the embed_file should be a Glove formated txt file. Assuming
 95 |   embed_size=5, for example:
 96 | 
 97 |   the -0.071549 0.093459 0.023738 -0.090339 0.056123
 98 |   to 0.57346 0.5417 -0.23477 -0.3624 0.4037
 99 |   and 0.20327 0.47348 0.050877 0.002103 0.060547
100 | 
101 |   Args:
102 |     embed_file: file path to the embedding file.
103 |   Returns:
104 |     a dictionary that maps word to vector, and the size of embedding dimensions.
105 |   """
106 |   emb_dict = dict()
107 |   emb_size = None
108 |   with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, 'rb')) as f:
109 |     for line in f:
110 |       tokens = line.strip().split(" ")
111 |       word = tokens[0]
112 |       vec = list(map(float, tokens[1:]))
113 |       emb_dict[word] = vec
114 |       if emb_size:
115 |         assert emb_size == len(vec), "All embedding size should be same."
116 |       else:
117 |         emb_size = len(vec)
118 |   return emb_dict, emb_size
119 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/nmt/utils/vocab_utils_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Tests for vocab_utils."""
17 | 
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 | 
22 | import codecs
23 | import os
24 | import tensorflow as tf
25 | 
26 | from ..utils import vocab_utils
27 | 
28 | 
class VocabUtilsTest(tf.test.TestCase):
  """Exercises vocab_utils.check_vocab on a vocab without special tokens."""

  def testCheckVocab(self):
    """check_vocab prepends <unk>, <s>, </s> and writes a new vocab file."""
    # Write a three-word vocab file into a fresh directory.
    words = ["a", "b", "c"]
    src_dir = os.path.join(tf.test.get_temp_dir(), "vocab_dir")
    os.makedirs(src_dir)
    src_path = os.path.join(src_dir, "vocab_file")
    utf8_writer = codecs.getwriter("utf-8")
    with utf8_writer(tf.gfile.GFile(src_path, "wb")) as f:
      for token in words:
        f.write("%s\n" % token)

    # Run check_vocab with a separate output directory.
    out_dir = os.path.join(tf.test.get_temp_dir(), "out_dir")
    os.makedirs(out_dir)
    got_size, got_path = vocab_utils.check_vocab(src_path, out_dir)

    # The three special tokens are added and a new file appears in out_dir.
    self.assertEqual(len(words) + 3, got_size)
    self.assertEqual(os.path.join(out_dir, "vocab_file"), got_path)
    reloaded, _ = vocab_utils.load_vocab(got_path)
    want_vocab = [vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS] + words
    self.assertEqual(want_vocab, reloaded)
54 | 
55 | 
# Invoke tf.test.main() when this file is executed directly as a script.
if __name__ == "__main__":
  tf.test.main()
58 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/simple/README.md:
--------------------------------------------------------------------------------
 1 | # Simple Example 
 2 | This is a basic distributed training example with parallax.
 3 | 
 4 | ## To Run
 5 | Set your resource information in the `resource_info` file.
 6 | 
 7 | Then execute:
 8 | ```shell
 9 | $ python simple_driver.py
10 | ```
11 | 
The command assumes that the simple example codebase is available at the same absolute path on every machine.
13 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/simple/resource_info:
--------------------------------------------------------------------------------
1 | 123.456.78.90:1,2,4,5
2 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/simple/simple_driver.py:
--------------------------------------------------------------------------------
  1 | # Copyright (C) 2018 Seoul National University
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | import os
 17 | import numpy as np
 18 | import tensorflow as tf
 19 | import argparse
 20 | 
 21 | import parallax
 22 | 
 23 | parser = argparse.ArgumentParser()
 24 | parser.add_argument('-lr', "--learning_rate", type=float, default=0.01,
 25 |                     help='Learning rate')
 26 | 
 27 | args = parser.parse_args()
 28 | 
 29 | [-0.880728357, -0.706550564],
 30 | [-0.179175969, 0.052373456],
 31 | [0.460992645, 0.328267666],
 32 | [-0.378916048, 0.86581809],
 33 | [-0.064562793, -0.755948805],
 34 | [-0.585833517, -0.46743004],
 35 | [-0.151177544, -0.582325109],
 36 | [-0.720116833, 0.834904979],
 37 | [-0.518939078, -0.670627318],
 38 | [-0.035878422, 0.750102543],
 39 | [-0.673400627, -0.919498322],
 40 | [-0.731202767, -0.159733489],
 41 | [-0.463404605, 0.697764632],
 42 | [0.706744043, 0.458026442],
 43 | [0.819940015, -0.867168658],
 44 | [-0.056113501, -0.602024627],
 45 | [0.213450484, -0.20133007],
 46 | [-0.358544296, -0.40380244],
 47 | 
 48 | train_x = np.array([
 49 |     [-0.880728357, -0.706550564],
 50 |     [-0.179175969, 0.052373456],
 51 |     [0.460992645, 0.328267666],
 52 |     [-0.378916048, 0.86581809],
 53 |     [-0.064562793, -0.755948805],
 54 |     [-0.585833517, -0.46743004],
 55 |     [-0.151177544, -0.582325109],
 56 |     [-0.720116833, 0.834904979],
 57 |     [-0.518939078, -0.670627318],
 58 |     [-0.035878422, 0.750102543],
 59 |     [-0.673400627, -0.919498322],
 60 |     [-0.731202767, -0.159733489],
 61 |     [-0.463404605, 0.697764632],
 62 |     [0.706744043, 0.458026442],
 63 |     [0.819940015, -0.867168658],
 64 |     [-0.056113501, -0.602024627],
 65 |     [0.213450484, -0.20133007],
 66 |     [-0.358544296, -0.40380244]
 67 | ])
 68 | 
 69 | train_y = np.array([
 70 |     [2.306799664],
 71 |     [1.825970013],
 72 |     [1.901374447],
 73 |     [0.909895597],
 74 |     [2.723102683],
 75 |     [2.145410027],
 76 |     [2.498034199],
 77 |     [0.844066487],
 78 |     [2.401599333],
 79 |     [1.274285598],
 80 |     [2.542184193],
 81 |     [1.81653423],
 82 |     [1.06511757],
 83 |     [1.891457798],
 84 |     [3.317388286],
 85 |     [2.579920223],
 86 |     [2.301286159],
 87 |     [2.197386858],
 88 | ])
 89 | 
 90 | num_samples = train_x.shape[0]
 91 | 
 92 | 
 93 | def main(_):
 94 |   single_gpu_graph = tf.Graph()
 95 |   with single_gpu_graph.as_default():
 96 |     global_step = tf.train.get_or_create_global_step()
 97 |     x = tf.placeholder(tf.float32, shape=(2))
 98 |     y = tf.placeholder(tf.float32, shape=(1))
 99 | 
100 |     w = tf.get_variable(name='w', shape=(2, 1))
101 |     b = tf.get_variable(name='b', shape=(1))
102 | 
103 |     pred = tf.nn.bias_add(tf.matmul(tf.expand_dims(x, axis=0), w), b)
104 |     loss = tf.reduce_sum(tf.pow(pred - tf.expand_dims(y, axis=0), 2)) / 2
105 | 
106 |     optimizer = tf.train.GradientDescentOptimizer(args.learning_rate)
107 |     train_op = optimizer.minimize(loss, global_step=global_step)
108 | 
109 |     # init = tf.global_variables_initializer()
110 | 
111 |   def run(sess, num_workers, worker_id, num_replicas_per_worker):
112 |     cursor = 0
113 |     for i in range(1000):
114 |       feed_dict = {}
115 |       feed_dict[x] = [train_x[(cursor + j) % num_samples] for j in \
116 |           range(num_replicas_per_worker)]
117 |       feed_dict[y] = [train_y[(cursor + j) % num_samples] for j in \
118 |           range(num_replicas_per_worker)]
119 |       cursor += num_replicas_per_worker
120 |       fetches = {
121 |           'global_step': global_step,
122 |           'loss': loss,
123 |           'train_op': train_op
124 |       }
125 | 
126 |       results = sess.run(fetches, feed_dict=feed_dict)
127 | 
128 |       if i % 5 == 0:
129 |         print("global step: %d, loss: %f"
130 |               % (results['global_step'][0], results['loss'][0]))
131 | 
132 |   resource_info = os.path.join(os.path.dirname(os.path.abspath(__file__)),
133 |                                'resource_info')
134 |   sess, num_workers, worker_id, num_replicas_per_worker = \
135 |       parallax.parallel_run(single_gpu_graph, resource_info)
136 |   run(sess, num_workers, worker_id, num_replicas_per_worker)
137 | 
# Hand control to tf.app.run() when this file is executed directly as a script.
if __name__ == '__main__':
  tf.app.run()
140 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/README.md:
--------------------------------------------------------------------------------
 1 | # Skip-Thought Vectors
 2 | This example implements the model described in [Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf). 
 3 | The original code comes from [here](https://github.com/tensorflow/models/tree/master/research/skip_thoughts).
We changed only a minimal amount of the original code: the import paths and the BUILD file.
We added the `skip_distributed_driver.py` file and modified the `ops/input_ops.py` file (for data sharding) to run the example on parallax.
 6 | 
 7 | ## Dataset
 8 | * Follow the instructions shown in [Prepare the Training Data](https://github.com/tensorflow/models/tree/master/research/skip_thoughts).
 9 | 
10 | ## To Run
11 | Set your resource information in the `resource_info` file.
12 | 
13 | Then execute:
14 | ```shell
15 | $ python skip_distributed_driver.py --input_file_pattern ${DATA_DIR}/data/train-?????-of-00100
16 | ```
17 | The command above runs a single Skip-Thought Vectors model on multiple devices specified in `resource_info`.
18 | The command assumes that the data directory and the Skip-Thought Vectors codebase are distributed and reachable in the same absolute path in each of the machines.
19 | 
20 | Also, we have a few more options you can choose for distributed running.
21 | 
22 | | Parameter Name       |  Default               | Description |
23 | | :------------------- |:-----------------------| :-----------|
| --data_path          | None                   | Where the training/test data is stored |
25 | | --input_file_pattern | ""                   	| File pattern of training data |
26 | | --batch_size         | 128                    | Batch size |
| --resource_info_file | `./resource_info`      | Name of the file containing the cluster information |
| --max_steps          | 1000000                | Number of iterations to run on each worker |
| --log_frequency      | 100                    | Number of steps between two consecutive run-op logs |
30 | | --sync               | True                   | Whether to synchronize learning or not |
31 | | --ckpt_dir           | None                   | Directory to save checkpoints |
32 | | --save_ckpt_steps    | 0                      | Number of steps between two consecutive checkpoints |
33 | | --run_option         | None                   | The run option whether PS or MPI, None utilizes both |
34 | 
35 | 
You can adapt the distributed run with the options above. For example, if you want to fix the communication model to MPI mode, you can add the `run_option` value as shown below.
37 | 
38 | ```shell
39 | $ python skip_distributed_driver.py --input_file_pattern ${DATA_DIR}/data/train-?????-of-00100 --run_option MPI
40 | ```
41 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/configuration.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Default configuration for model architecture and training.
 16 | 
 17 | Original source : https://github.com/tensorflow/models/tree/master/skip_thoughts
 18 | 
 19 | """
 20 | 
 21 | from __future__ import absolute_import
 22 | from __future__ import division
 23 | from __future__ import print_function
 24 | 
 25 | 
 26 | class _HParams(object):
 27 |     """Wrapper for configuration parameters."""
 28 |     pass
 29 | 
 30 | 
 31 | def model_config(input_file_pattern=None,
 32 |                  input_queue_capacity=640000,
 33 |                  num_input_reader_threads=1,
 34 |                  shuffle_input_data=True,
 35 |                  uniform_init_scale=0.1,
 36 |                  vocab_size=20000,
 37 |                  batch_size=128,
 38 |                  word_embedding_dim=620,
 39 |                  bidirectional_encoder=False,
 40 |                  encoder_dim=2400):
 41 |     """Creates a model configuration object.
 42 | 
 43 |     Args:
 44 |       input_file_pattern: File pattern of sharded TFRecord files containing
 45 |         tf.Example protobufs.
 46 |       input_queue_capacity: Number of examples to keep in the input queue.
 47 |       num_input_reader_threads: Number of threads for prefetching input
 48 |         tf.Examples.
 49 |       shuffle_input_data: Whether to shuffle the input data.
 50 |       uniform_init_scale: Scale of random uniform initializer.
 51 |       vocab_size: Number of unique words in the vocab.
 52 |       batch_size: Batch size (training and evaluation only).
 53 |       word_embedding_dim: Word embedding dimension.
 54 |       bidirectional_encoder: Whether to use a bidirectional or unidirectional
 55 |         encoder RNN.
 56 |       encoder_dim: Number of output dimensions of the sentence encoder.
 57 | 
 58 |     Returns:
 59 |       An object containing model configuration parameters.
 60 |     """
 61 |     config = _HParams()
 62 |     config.input_file_pattern = input_file_pattern
 63 |     config.input_queue_capacity = input_queue_capacity
 64 |     config.num_input_reader_threads = num_input_reader_threads
 65 |     config.shuffle_input_data = shuffle_input_data
 66 |     config.uniform_init_scale = uniform_init_scale
 67 |     config.vocab_size = vocab_size
 68 |     config.batch_size = batch_size
 69 |     config.word_embedding_dim = word_embedding_dim
 70 |     config.bidirectional_encoder = bidirectional_encoder
 71 |     config.encoder_dim = encoder_dim
 72 |     return config
 73 | 
 74 | 
 75 | def training_config(learning_rate=0.0008,
 76 |                     learning_rate_decay_factor=0.5,
 77 |                     learning_rate_decay_steps=400000,
 78 |                     number_of_steps=500000,
 79 |                     clip_gradient_norm=5.0,
 80 |                     save_model_secs=600,
 81 |                     save_summaries_secs=600):
 82 |     """Creates a training configuration object.
 83 | 
 84 |     Args:
 85 |       learning_rate: Initial learning rate.
 86 |       learning_rate_decay_factor: If > 0, the learning rate decay factor.
 87 |       learning_rate_decay_steps: The number of steps before the learning rate
 88 |         decays by learning_rate_decay_factor.
 89 |       number_of_steps: The total number of training steps to run. Passing None
 90 |         will cause the training script to run indefinitely.
 91 |       clip_gradient_norm: If not None, then clip gradients to this value.
 92 |       save_model_secs: How often (in seconds) to save model checkpoints.
 93 |       save_summaries_secs: How often (in seconds) to save model summaries.
 94 | 
 95 |     Returns:
 96 |       An object containing training configuration parameters.
 97 | 
 98 |     Raises:
 99 |       ValueError: If learning_rate_decay_factor is set and
100 |         learning_rate_decay_steps is unset.
101 |     """
102 |     if learning_rate_decay_factor and not learning_rate_decay_steps:
103 |         raise ValueError(
104 |             "learning_rate_decay_factor requires learning_rate_decay_steps.")
105 | 
106 |     config = _HParams()
107 |     config.learning_rate = learning_rate
108 |     config.learning_rate_decay_factor = learning_rate_decay_factor
109 |     config.learning_rate_decay_steps = learning_rate_decay_steps
110 |     config.number_of_steps = number_of_steps
111 |     config.clip_gradient_norm = clip_gradient_norm
112 |     config.save_model_secs = save_model_secs
113 |     config.save_summaries_secs = save_summaries_secs
114 |     return config
115 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/data/special_words.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Special word constants.
16 | 
17 | NOTE: The ids of the EOS and UNK constants should not be modified. It is assumed
18 | that these always occupy the first two ids.
19 | """
20 | 
# End-of-sentence token and its id. The id must stay 0 (see the module note).
EOS, EOS_ID = "<eos>", 0

# Unknown-word token and its id. The id must stay 1 (see the module note).
UNK, UNK_ID = "<unk>", 1
28 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/encoder_manager.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Manager class for loading and encoding with multiple skip-thoughts models.
 16 | 
 17 | If multiple models are loaded at once then the encode() function returns the
 18 | concatenation of the outputs of each model.
 19 | 
 20 | Example usage:
 21 |   manager = EncoderManager()
 22 |   manager.load_model(model_config_1, vocabulary_file_1, embedding_matrix_file_1,
 23 |                      checkpoint_path_1)
 24 |   manager.load_model(model_config_2, vocabulary_file_2, embedding_matrix_file_2,
 25 |                      checkpoint_path_2)
 26 |   encodings = manager.encode(data)
 27 | 
 28 | Original source : https://github.com/tensorflow/models/tree/master/skip_thoughts
 29 | 
 30 | """
 31 | 
 32 | from __future__ import absolute_import
 33 | from __future__ import division
 34 | from __future__ import print_function
 35 | 
 36 | import collections
 37 | 
 38 | import numpy as np
 39 | import tensorflow as tf
 40 | 
 41 | import skip_thoughts_encoder
 42 | 
 43 | 
 44 | class EncoderManager(object):
 45 |     """Manager class for loading and encoding with skip-thoughts models."""
 46 | 
 47 |     def __init__(self):
 48 |         self.encoders = []
 49 |         self.sessions = []
 50 | 
 51 |     def load_model(self, model_config, vocabulary_file, embedding_matrix_file,
 52 |                    checkpoint_path):
 53 |         """Loads a skip-thoughts model.
 54 | 
 55 |         Args:
 56 |           model_config: Object containing parameters for building the model.
 57 |           vocabulary_file: Path to vocabulary file containing a list of newline-
 58 |             separated words where the word id is the corresponding 0-based index in
 59 |             the file.
 60 |           embedding_matrix_file: Path to a serialized numpy array of shape
 61 |             [vocab_size, embedding_dim].
 62 |           checkpoint_path: SkipThoughtsModel checkpoint file or a directory
 63 |             containing a checkpoint file.
 64 |         """
 65 |         tf.logging.info("Reading vocabulary from %s", vocabulary_file)
 66 |         with tf.gfile.GFile(vocabulary_file, mode="r") as f:
 67 |             lines = list(f.readlines())
 68 |         reverse_vocab = [line.decode("utf-8").strip() for line in lines]
 69 |         tf.logging.info("Loaded vocabulary with %d words.", len(reverse_vocab))
 70 | 
 71 |         tf.logging.info("Loading embedding matrix from %s",
 72 |                         embedding_matrix_file)
 73 |         # Note: tf.gfile.GFile doesn't work here because np.load() calls f.seek()
 74 |         # with 3 arguments.
 75 |         with open(embedding_matrix_file, "r") as f:
 76 |             embedding_matrix = np.load(f)
 77 |         tf.logging.info("Loaded embedding matrix with shape %s",
 78 |                         embedding_matrix.shape)
 79 | 
 80 |         word_embeddings = collections.OrderedDict(
 81 |             zip(reverse_vocab, embedding_matrix))
 82 | 
 83 |         g = tf.Graph()
 84 |         with g.as_default():
 85 |             encoder = skip_thoughts_encoder.SkipThoughtsEncoder(word_embeddings)
 86 |             restore_model = encoder.build_graph_from_config(model_config,
 87 |                                                             checkpoint_path)
 88 | 
 89 |         sess = tf.Session(graph=g)
 90 |         restore_model(sess)
 91 | 
 92 |         self.encoders.append(encoder)
 93 |         self.sessions.append(sess)
 94 | 
 95 |     def encode(self,
 96 |                data,
 97 |                use_norm=True,
 98 |                verbose=False,
 99 |                batch_size=128,
100 |                use_eos=False):
101 |         """Encodes a sequence of sentences as skip-thought vectors.
102 | 
103 |         Args:
104 |           data: A list of input strings.
105 |           use_norm: If True, normalize output skip-thought vectors to unit
106 |             L2 norm.
107 |           verbose: Whether to log every batch.
108 |           batch_size: Batch size for the RNN encoders.
109 |           use_eos: If True, append the end-of-sentence word to each input
110 |             sentence.
111 | 
112 |         Returns:
113 |           thought_vectors: A list of numpy arrays corresponding to 'data'.
114 | 
115 |         Raises:
116 |           ValueError: If called before calling load_encoder.
117 |         """
118 |         if not self.encoders:
119 |             raise ValueError(
120 |                 "Must call load_model at least once before calling encode.")
121 | 
122 |         encoded = []
123 |         for encoder, sess in zip(self.encoders, self.sessions):
124 |             encoded.append(
125 |                 np.array(
126 |                     encoder.encode(
127 |                         sess,
128 |                         data,
129 |                         use_norm=use_norm,
130 |                         verbose=verbose,
131 |                         batch_size=batch_size,
132 |                         use_eos=use_eos)))
133 | 
134 |         return np.concatenate(encoded, axis=1)
135 | 
136 |     def close(self):
137 |         """Closes the active TensorFlow Sessions."""
138 |         for sess in self.sessions:
139 |             sess.close()
140 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/evaluate.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Script to evaluate a skip-thoughts model.
 16 | 
 17 | This script can evaluate a model with a unidirectional encoder ("uni-skip" in
 18 | the paper); or a model with a bidirectional encoder ("bi-skip"); or the
 19 | combination of a model with a unidirectional encoder and a model with a
 20 | bidirectional encoder ("combine-skip").
 21 | 
 22 | The uni-skip model (if it exists) is specified by the flags
 23 | --uni_vocab_file, --uni_embeddings_file, --uni_checkpoint_path.
 24 | 
 25 | The bi-skip model (if it exists) is specified by the flags
 26 | --bi_vocab_file, --bi_embeddings_file, --bi_checkpoint_path.
 27 | 
 28 | The evaluation tasks have different running times. SICK may take 5-10 minutes.
 29 | MSRP, TREC and CR may take 20-60 minutes. SUBJ, MPQA and MR may take 2+ hours.
 30 | 
 31 | Original source : https://github.com/tensorflow/models/tree/master/skip_thoughts
 32 | 
 33 | """
 34 | 
 35 | from __future__ import absolute_import
 36 | from __future__ import division
 37 | from __future__ import print_function
 38 | 
 39 | import tensorflow as tf
 40 | 
 41 | from skipthoughts import eval_classification
 42 | from skipthoughts import eval_msrp
 43 | from skipthoughts import eval_sick
 44 | from skipthoughts import eval_trec
 45 | from skip_thoughts import configuration
 46 | from skip_thoughts import encoder_manager
 47 | 
# Command-line flags for the evaluation script. All of them are strings; every
# flag except --eval_task defaults to None.
FLAGS = tf.flags.FLAGS

# Which benchmark to run.
tf.flags.DEFINE_string("eval_task", "CR",
                       "Name of the evaluation task to run. Available tasks: "
                       "MR, CR, SUBJ, MPQA, SICK, MSRP, TREC.")

# Location of the benchmark data; main() refuses to run without it.
tf.flags.DEFINE_string("data_dir", None, "Directory containing training data.")

# Vocabulary files for the unidirectional ("uni") and bidirectional ("bi")
# models.
tf.flags.DEFINE_string("uni_vocab_file", None,
                       "Path to vocabulary file containing a list of newline-"
                       "separated words where the word id is the "
                       "corresponding 0-based index in the file.")
tf.flags.DEFINE_string("bi_vocab_file", None,
                       "Path to vocabulary file containing a list of newline-"
                       "separated words where the word id is the "
                       "corresponding 0-based index in the file.")

# Word-embedding matrices matching the vocabularies above.
tf.flags.DEFINE_string("uni_embeddings_file", None,
                       "Path to serialized numpy array of shape "
                       "[vocab_size, embedding_dim].")
tf.flags.DEFINE_string("bi_embeddings_file", None,
                       "Path to serialized numpy array of shape "
                       "[vocab_size, embedding_dim].")

# Model checkpoints; main() loads a model only when its checkpoint flag is set.
tf.flags.DEFINE_string("uni_checkpoint_path", None,
                       "Checkpoint file or directory containing a checkpoint "
                       "file.")
tf.flags.DEFINE_string("bi_checkpoint_path", None,
                       "Checkpoint file or directory containing a checkpoint "
                       "file.")

# Emit INFO-level TensorFlow log messages.
tf.logging.set_verbosity(tf.logging.INFO)
 80 | 
 81 | 
 82 | def main(unused_argv):
 83 |     if not FLAGS.data_dir:
 84 |         raise ValueError("--data_dir is required.")
 85 | 
 86 |     encoder = encoder_manager.EncoderManager()
 87 | 
 88 |     # Maybe load unidirectional encoder.
 89 |     if FLAGS.uni_checkpoint_path:
 90 |         print("Loading unidirectional model...")
 91 |         uni_config = configuration.model_config()
 92 |         encoder.load_model(uni_config, FLAGS.uni_vocab_file,
 93 |                            FLAGS.uni_embeddings_file, FLAGS.uni_checkpoint_path)
 94 | 
 95 |     # Maybe load bidirectional encoder.
 96 |     if FLAGS.bi_checkpoint_path:
 97 |         print("Loading bidirectional model...")
 98 |         bi_config = configuration.model_config(bidirectional_encoder=True)
 99 |         encoder.load_model(bi_config, FLAGS.bi_vocab_file,
100 |                            FLAGS.bi_embeddings_file,
101 |                            FLAGS.bi_checkpoint_path)
102 | 
103 |     if FLAGS.eval_task in ["MR", "CR", "SUBJ", "MPQA"]:
104 |         eval_classification.eval_nested_kfold(
105 |             encoder, FLAGS.eval_task, FLAGS.data_dir, use_nb=False)
106 |     elif FLAGS.eval_task == "SICK":
107 |         eval_sick.evaluate(encoder, evaltest=True, loc=FLAGS.data_dir)
108 |     elif FLAGS.eval_task == "MSRP":
109 |         eval_msrp.evaluate(
110 |             encoder, evalcv=True, evaltest=True, use_feats=True,
111 |             loc=FLAGS.data_dir)
112 |     elif FLAGS.eval_task == "TREC":
113 |         eval_trec.evaluate(encoder, evalcv=True, evaltest=True,
114 |                            loc=FLAGS.data_dir)
115 |     else:
116 |         raise ValueError("Unrecognized eval_task: %s" % FLAGS.eval_task)
117 | 
118 |     encoder.close()
119 | 
120 | 
121 | if __name__ == "__main__":
122 |     tf.app.run()
123 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/skip_thoughts/ops/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/ops/gru_cell.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """GRU cell implementation for the skip-thought vectors model."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | 
 22 | import tensorflow as tf
 23 | 
# Module-private shorthand for the contrib layer-normalization function used
# by LayerNormGRUCell.__call__.
_layer_norm = tf.contrib.layers.layer_norm
 25 | 
 26 | 
class LayerNormGRUCell(tf.contrib.rnn.RNNCell):
  """GRU cell with layer normalization.

  Layer normalization is applied separately to each of the four matrix
  products (state and input projections for the gates, and for the candidate)
  before they are combined.

  The layer normalization implementation is based on:

    https://arxiv.org/abs/1607.06450.

  "Layer Normalization"
  Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
  """

  def __init__(self,
               num_units,
               w_initializer,
               u_initializer,
               b_initializer,
               activation=tf.nn.tanh):
    """Initializes the cell.

    Args:
      num_units: Number of cell units.
      w_initializer: Initializer for the "W" (input) parameter matrices.
      u_initializer: Initializer for the "U" (recurrent) parameter matrices.
      b_initializer: Initializer for the "b" (bias) parameter vectors.
      activation: Cell activation function.
    """
    self._num_units = num_units
    self._w_initializer = w_initializer
    self._u_initializer = u_initializer
    # Stored, but not referenced by any other method of this class.
    self._b_initializer = b_initializer
    self._activation = activation

  @property
  def state_size(self):
    """Size of the recurrent state: num_units."""
    return self._num_units

  @property
  def output_size(self):
    """Size of the cell output: num_units."""
    return self._num_units

  def _w_h_initializer(self):
    """Returns an initializer for the "W_h" parameter matrix.

    See equation (23) in the paper. The "W_h" parameter matrix is the
    concatenation of two parameter submatrices. The matrix returned is
    [U_z, U_r].

    Returns:
      A Tensor with shape [num_units, 2 * num_units] as described above.
    """

    def _initializer(shape, dtype=tf.float32, partition_info=None):
      num_units = self._num_units
      # Only the full, unpartitioned shape (as a list) is accepted.
      assert shape == [num_units, 2 * num_units]
      # Each half is drawn with its own call to the "U" initializer and the
      # two halves are then joined along the column axis.
      u_z = self._u_initializer([num_units, num_units], dtype, partition_info)
      u_r = self._u_initializer([num_units, num_units], dtype, partition_info)
      return tf.concat([u_z, u_r], 1)

    return _initializer

  def _w_x_initializer(self, input_dim):
    """Returns an initializer for the "W_x" parameter matrix.

    See equation (23) in the paper. The "W_x" parameter matrix is the
    concatenation of two parameter submatrices. The matrix returned is
    [W_z, W_r].

    Args:
      input_dim: The dimension of the cell inputs.

    Returns:
      A Tensor with shape [input_dim, 2 * num_units] as described above.
    """

    def _initializer(shape, dtype=tf.float32, partition_info=None):
      num_units = self._num_units
      # Only the full, unpartitioned shape (as a list) is accepted.
      assert shape == [input_dim, 2 * num_units]
      # Each half is drawn with its own call to the "W" initializer and the
      # two halves are then joined along the column axis.
      w_z = self._w_initializer([input_dim, num_units], dtype, partition_info)
      w_r = self._w_initializer([input_dim, num_units], dtype, partition_info)
      return tf.concat([w_z, w_r], 1)

    return _initializer

  def __call__(self, inputs, state, scope=None):
    """GRU cell with layer normalization.

    Args:
      inputs: Input Tensor; the second entry of its static shape is used as
        the input dimension.
      state: Previous state Tensor; it is multiplied by matrices with
        num_units rows.
      scope: Optional variable scope; defaults to "gru_cell".

    Returns:
      A pair (new_h, new_h): the new hidden state, returned twice.
    """
    input_dim = inputs.get_shape().as_list()[1]
    num_units = self._num_units

    with tf.variable_scope(scope or "gru_cell"):
      with tf.variable_scope("gates"):
        # The z and r gates share one fused matrix per operand, so a single
        # matmul per operand produces both gate pre-activations.
        w_h = tf.get_variable(
            "w_h", [num_units, 2 * num_units],
            initializer=self._w_h_initializer())
        w_x = tf.get_variable(
            "w_x", [input_dim, 2 * num_units],
            initializer=self._w_x_initializer(input_dim))
        # The state and input projections are normalized separately and then
        # summed.
        z_and_r = (_layer_norm(tf.matmul(state, w_h), scope="layer_norm/w_h") +
                   _layer_norm(tf.matmul(inputs, w_x), scope="layer_norm/w_x"))
        # Split the fused activations along axis 1 into the two gates.
        z, r = tf.split(tf.sigmoid(z_and_r), 2, 1)
      with tf.variable_scope("candidate"):
        w = tf.get_variable(
            "w", [input_dim, num_units], initializer=self._w_initializer)
        u = tf.get_variable(
            "u", [num_units, num_units], initializer=self._u_initializer)
        # The r gate scales the normalized recurrent term only; the
        # normalized input term is added unscaled.
        h_hat = (r * _layer_norm(tf.matmul(state, u), scope="layer_norm/u") +
                 _layer_norm(tf.matmul(inputs, w), scope="layer_norm/w"))
      # Blend the previous state and the activated candidate using z.
      new_h = (1 - z) * state + z * self._activation(h_hat)
    return new_h, new_h
135 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/ops/input_ops.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Input ops."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | import collections
 22 | 
 23 | 
 24 | import tensorflow as tf
 25 | from parallax import shard
 26 | 
 27 | # A SentenceBatch is a pair of Tensors:
 28 | #  ids: Batch of input sentences represented as sequences of word ids: an int64
 29 | #    Tensor with shape [batch_size, padded_length].
 30 | #  mask: Boolean mask distinguishing real words (1) from padded words (0): an
 31 | #    int32 Tensor with shape [batch_size, padded_length].
 32 | SentenceBatch = collections.namedtuple("SentenceBatch", ("ids", "mask"))
 33 | 
 34 | 
 35 | def parse_example_batch(serialized):
 36 |   """Parses a batch of tf.Example protos.
 37 | 
 38 |   Args:
 39 |     serialized: A 1-D string Tensor; a batch of serialized tf.Example protos.
 40 |   Returns:
 41 |     encode: A SentenceBatch of encode sentences.
 42 |     decode_pre: A SentenceBatch of "previous" sentences to decode.
 43 |     decode_post: A SentenceBatch of "post" sentences to decode.
 44 |   """
 45 |   features = tf.parse_example(
 46 |       serialized,
 47 |       features={
 48 |           "encode": tf.VarLenFeature(dtype=tf.int64),
 49 |           "decode_pre": tf.VarLenFeature(dtype=tf.int64),
 50 |           "decode_post": tf.VarLenFeature(dtype=tf.int64),
 51 |       })
 52 | 
 53 |   def _sparse_to_batch(sparse):
 54 |     ids = tf.sparse_tensor_to_dense(sparse)  # Padding with zeroes.
 55 |     mask = tf.sparse_to_dense(sparse.indices, sparse.dense_shape,
 56 |                               tf.ones_like(sparse.values, dtype=tf.int32))
 57 |     return SentenceBatch(ids=ids, mask=mask)
 58 | 
 59 |   output_names = ("encode", "decode_pre", "decode_post")
 60 |   return tuple(_sparse_to_batch(features[x]) for x in output_names)
 61 | 
 62 | 
def prefetch_input_data(reader,
                        file_pattern,
                        shuffle,
                        capacity,
                        num_reader_threads=1):
  """Prefetches string values from disk into an input queue.

  The matching files are sorted and split into contiguous shards; only the
  files of this process's shard are fed to the filename queue.

  Args:
    reader: Instance of tf.ReaderBase.
    file_pattern: Comma-separated list of file patterns (e.g.
        "/tmp/train_data-?????-of-00100", where '?' acts as a wildcard that
        matches any character).
    shuffle: Boolean; whether to randomly shuffle the input data.
    capacity: Queue capacity (number of records).
    num_reader_threads: Number of reader threads feeding into the queue.

  Returns:
    A Queue containing prefetched string values.
  """
  data_files = []
  for pattern in file_pattern.split(","):
    data_files.extend(tf.gfile.Glob(pattern))
  if not data_files:
    # NOTE(review): nothing in this function stops execution after this log
    # call, so the code below still runs with an empty file list - confirm
    # that is intended.
    tf.logging.fatal("Found no input files matching %s", file_pattern)
  else:
    tf.logging.info("Prefetching values from %d files matching %s",
                    len(data_files), file_pattern)
  # Sort so the file order, and therefore the slicing below, does not depend
  # on the order in which the patterns were expanded.
  data_files.sort()
  num_files = len(data_files)
  # Presumably both values are integer Tensors (they are passed to tf.less and
  # tf.cond below) - TODO confirm against parallax.shard.
  num_shards, shard_id = shard.create_num_shards_and_shard_id()
  # This module imports division from __future__, so "/" is true division;
  # the cast then converts the quotient to an int64 files-per-shard count.
  shard_size = num_files / num_shards
  shard_size = tf.cast(shard_size, dtype=tf.int64)
  # Files left over after giving every shard shard_size files.
  remainder = num_files % num_shards

  # Shards with an id below `remainder` take one extra file each, so a shard
  # starts after that many larger shards. Both branches produce the same
  # offset when shard_id == remainder.
  slice_begin = tf.cond(tf.less(shard_id, remainder + 1),
                        lambda: (shard_size + 1) * shard_id,
                        lambda: shard_size * shard_id + remainder)
  # The first `remainder` shards read shard_size + 1 files; the rest read
  # shard_size files.
  slice_size = tf.cond(tf.less(shard_id, remainder), lambda: shard_size + 1,
                       lambda: shard_size)
  # Keep only this shard's contiguous run of file names.
  data_files = tf.slice(data_files, [slice_begin], [slice_size])
  filename_queue = tf.train.string_input_producer(
      data_files, shuffle=shuffle, capacity=16, name="filename_queue")

  if shuffle:
    # Keep the queue at least 60% full after a dequeue.
    min_after_dequeue = int(0.6 * capacity)
    values_queue = tf.RandomShuffleQueue(
        capacity=capacity,
        min_after_dequeue=min_after_dequeue,
        dtypes=[tf.string],
        shapes=[[]],
        name="random_input_queue")
  else:
    values_queue = tf.FIFOQueue(
        capacity=capacity,
        dtypes=[tf.string],
        shapes=[[]],
        name="fifo_input_queue")

  # One read op and one enqueue op per reader thread, all feeding the same
  # values queue from the same filename queue.
  enqueue_ops = []
  for _ in range(num_reader_threads):
    _, value = reader.read(filename_queue)
    enqueue_ops.append(values_queue.enqueue([value]))
  tf.train.queue_runner.add_queue_runner(
      tf.train.queue_runner.QueueRunner(values_queue, enqueue_ops))
  # Summary of how full the values queue is, as a fraction of its capacity.
  tf.summary.scalar("queue/%s/fraction_of_%d_full" % (values_queue.name,
                                                      capacity),
                    tf.cast(values_queue.size(), tf.float32) * (1.0 / capacity))

  return values_queue
132 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/parallax_config.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2018 Seoul National University
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import tensorflow as tf
17 | import parallax
18 | 
19 | 
# Command-line flags that configure the Parallax run; build_config() turns
# them into a parallax.Config.
flags = tf.app.flags
# Parameter-server communication options.
flags.DEFINE_boolean('replicate_variables', True, """replicate_variables""")
flags.DEFINE_string('protocol', 'grpc', """The method for managing variables""")
# MPI communication options.
flags.DEFINE_string('mpirun_options', '', 'The option for mpirun')
flags.DEFINE_string('run_option', 'HYBRID',
                    'The run option whether PS, MPI or HYBRID')
flags.DEFINE_string('redirect_path', None, """redirect path to keep the log of distributed workers""")
# Checkpointing options.
flags.DEFINE_string('ckpt_dir', None, """Directory to save checkpoints""")
flags.DEFINE_integer('save_ckpt_steps', None,
                     """Number of steps between two consecutive checkpoints""")
# Profiling options.
flags.DEFINE_string('profile_dir', None, """Directory to save RunMetadata""")
flags.DEFINE_string('profile_steps', None, """Comma separated profile steps""")
flags.DEFINE_boolean('local_aggregation', True,
                     """Whether to use local aggregation or not""")
flags.DEFINE_boolean('boundary_among_servers', True,
                     """Whether to use operation placement among servers""")
flags.DEFINE_boolean('boundary_between_workers_and_servers', True,
                     """Whether to use operation placement between workers and servers""")
flags.DEFINE_string('export_graph_path', None, """export path to keep transformed graph definition""")
FLAGS = flags.FLAGS
40 | 
def build_config():
    """Build a `parallax.Config` from the command-line flags of this module.

    Returns:
      A `parallax.Config` whose run option, communication (PS + MPI),
      checkpoint, profile, redirect-path and graph-export settings are taken
      from `FLAGS`. `average_sparse` is always set to False.

    Raises:
      ValueError: if --profile_steps contains a non-integer token.
    """

    def get_profile_steps():
        """Parse --profile_steps into a list of ints ([] when unset/blank)."""
        if not FLAGS.profile_steps:
            return []
        # The flag value is normalized in place, as before.
        FLAGS.profile_steps = FLAGS.profile_steps.strip()
        # Skip blank tokens so that a whitespace-only value or a stray comma
        # (e.g. "10,20,") does not reach int('') and raise ValueError.
        return [int(step) for step in FLAGS.profile_steps.split(',')
                if step.strip()]

    ckpt_config = parallax.CheckPointConfig(
        ckpt_dir=FLAGS.ckpt_dir,
        save_ckpt_steps=FLAGS.save_ckpt_steps)
    ps_config = parallax.PSConfig(
        replicate_variables=FLAGS.replicate_variables,
        protocol=FLAGS.protocol,
        local_aggregation=FLAGS.local_aggregation,
        boundary_among_servers=FLAGS.boundary_among_servers,
        boundary_between_workers_and_servers=(
            FLAGS.boundary_between_workers_and_servers))
    mpi_config = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options)

    parallax_config = parallax.Config()
    parallax_config.run_option = FLAGS.run_option
    parallax_config.average_sparse = False
    parallax_config.communication_config = parallax.CommunicationConfig(
        ps_config, mpi_config)
    parallax_config.ckpt_config = ckpt_config
    parallax_config.profile_config = parallax.ProfileConfig(
        profile_dir=FLAGS.profile_dir,
        profile_steps=get_profile_steps())
    parallax_config.redirect_path = FLAGS.redirect_path
    parallax_config.export_graph_path = FLAGS.export_graph_path

    return parallax_config
69 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/resource_info:
--------------------------------------------------------------------------------
1 | 123.456.78.90:0
2 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/skip_distributed_driver.py:
--------------------------------------------------------------------------------
  1 | # Copyright (C) 2018 Seoul National University
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | import sys
 17 | import os
 18 | import json
 19 | import time
 20 | 
 21 | import tensorflow as tf
 22 | from tensorflow.core.protobuf import config_pb2
 23 | import parallax
 24 | import parallax_config
 25 | 
 26 | import configuration
 27 | import skip_thoughts_model
 28 | 
# Flags of the skip-thoughts distributed driver. Flags defined by the imported
# parallax_config module are exposed through the same FLAGS object.
FLAGS = tf.app.flags.FLAGS

# NOTE(review): data_path is not read by main() in this file.
tf.app.flags.DEFINE_string('data_path', None,
                           """Where to training/test data is stored.""")
# input_file_pattern and batch_size are passed to configuration.model_config().
tf.app.flags.DEFINE_string('input_file_pattern', '',
                           """File pattern of train data""")
tf.app.flags.DEFINE_integer('batch_size', 128,
                            """Batch_size""")
# Defaults to the `resource_info` file located next to this script.
tf.app.flags.DEFINE_string('resource_info_file',
                           os.path.abspath(
                               os.path.join(os.path.dirname(__file__), '.',
                                            'resource_info')),
                           'Filename containing cluster information')
# Number of sess.run() iterations executed by the training loop in main().
tf.app.flags.DEFINE_integer('max_steps', 1000000,
                            """Number of iterations to run for each workers.""")
# The training loop logs loss and throughput once every log_frequency steps.
tf.app.flags.DEFINE_integer('log_frequency', 100,
                            """How many steps between two runop logs.""")
# Passed to parallax.parallel_run(sync=...).
tf.app.flags.DEFINE_boolean('sync', True, '')
 47 | 
 48 | def main(_):
 49 |     single_gpu_graph = tf.Graph()
 50 |     with single_gpu_graph.as_default():
 51 |         model_config = configuration.model_config(
 52 |             input_file_pattern=FLAGS.input_file_pattern,
 53 |             batch_size=FLAGS.batch_size)
 54 |         training_config = configuration.training_config()
 55 |         model = skip_thoughts_model.SkipThoughtsModel(model_config,
 56 |                                                       mode="train")
 57 |         model.build()
 58 | 
 59 |         # Setup learning rate
 60 |         if training_config.learning_rate_decay_factor > 0:
 61 |             learning_rate = tf.train.exponential_decay(
 62 |                 learning_rate=float(training_config.learning_rate),
 63 |                 global_step=model.global_step,
 64 |                 decay_steps=training_config.learning_rate_decay_steps,
 65 |                 decay_rate=training_config.learning_rate_decay_factor,
 66 |                 staircase=False)
 67 |         else:
 68 |             learning_rate = tf.constant(training_config.learning_rate)
 69 | 
 70 |         optimizer = tf.train.AdamOptimizer(learning_rate)
 71 | 
 72 |         train_tensor = tf.contrib.slim.learning.create_train_op(
 73 |             total_loss=model.total_loss,
 74 |             optimizer=optimizer,
 75 |             global_step=model.global_step,
 76 |             clip_gradient_norm=training_config.clip_gradient_norm)
 77 | 
 78 |     def run(sess, num_workers, worker_id, num_replicas_per_worker):
 79 |         fetches = {
 80 |             'global_step':
 81 |                 model.global_step,
 82 |             'cost':
 83 |                 model.total_loss,
 84 |             'train_op':
 85 |                 train_tensor,
 86 |         }
 87 | 
 88 |         start = time.time()
 89 |         for i in range(FLAGS.max_steps):
 90 |             results = sess.run(fetches)
 91 |             if i % FLAGS.log_frequency == 0:
 92 |                 end = time.time()
 93 |                 throughput = float(FLAGS.log_frequency) / float(end - start)
 94 |                 parallax.log.info(
 95 |                     "global step: %d, loss: %f, throughput: %f steps/sec"
 96 |                     % (results['global_step'][0], results['cost'][0], throughput))
 97 |                 start = time.time()
 98 | 
 99 |     sess, num_workers, worker_id, num_replicas_per_worker = \
100 |         parallax.parallel_run(single_gpu_graph,
101 |                               FLAGS.resource_info_file,
102 |                               sync=FLAGS.sync,
103 |                               parallax_config=parallax_config.build_config())
104 |     run(sess, num_workers, worker_id, num_replicas_per_worker)
105 | 
# Script entry point: raise TensorFlow logging to INFO, then hand control to
# tf.app.run(), which invokes main().
if __name__ == "__main__":
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run()
109 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/skip_thoughts/train.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Train the skip-thoughts model."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | import tensorflow as tf
 22 | 
 23 | from skip_thoughts import configuration
 24 | from skip_thoughts import skip_thoughts_model
 25 | 
# Flags of the stand-alone (non-Parallax) training script; main() rejects an
# unset value for either of them.
FLAGS = tf.flags.FLAGS

# Passed to configuration.model_config(input_file_pattern=...).
tf.flags.DEFINE_string("input_file_pattern", None,
                       "File pattern of sharded TFRecord files containing "
                       "tf.Example protos.")
# Used as `logdir` of tf.contrib.slim.learning.train().
tf.flags.DEFINE_string("train_dir", None,
                       "Directory for saving and loading checkpoints.")

# Executed at import time: INFO-level TensorFlow logging for this module.
tf.logging.set_verbosity(tf.logging.INFO)
 35 | 
 36 | 
 37 | def _setup_learning_rate(config, global_step):
 38 |     """Sets up the learning rate with optional exponential decay.
 39 | 
 40 |     Args:
 41 |       config: Object containing learning rate configuration parameters.
 42 |       global_step: Tensor; the global step.
 43 | 
 44 |     Returns:
 45 |       learning_rate: Tensor; the learning rate with exponential decay.
 46 |     """
 47 |     if config.learning_rate_decay_factor > 0:
 48 |         learning_rate = tf.train.exponential_decay(
 49 |             learning_rate=float(config.learning_rate),
 50 |             global_step=global_step,
 51 |             decay_steps=config.learning_rate_decay_steps,
 52 |             decay_rate=config.learning_rate_decay_factor,
 53 |             staircase=False)
 54 |     else:
 55 |         learning_rate = tf.constant(config.learning_rate)
 56 |     return learning_rate
 57 | 
 58 | 
 59 | def main(unused_argv):
 60 |     if not FLAGS.input_file_pattern:
 61 |         raise ValueError("--input_file_pattern is required.")
 62 |     if not FLAGS.train_dir:
 63 |         raise ValueError("--train_dir is required.")
 64 | 
 65 |     model_config = configuration.model_config(
 66 |         input_file_pattern=FLAGS.input_file_pattern)
 67 |     training_config = configuration.training_config()
 68 | 
 69 |     tf.logging.info("Building training graph.")
 70 |     g = tf.Graph()
 71 |     with g.as_default():
 72 |         model = skip_thoughts_model.SkipThoughtsModel(model_config,
 73 |                                                       mode="train")
 74 |         model.build()
 75 | 
 76 |         learning_rate = _setup_learning_rate(training_config, model.global_step)
 77 |         optimizer = tf.train.AdamOptimizer(learning_rate)
 78 | 
 79 |         train_tensor = tf.contrib.slim.learning.create_train_op(
 80 |             total_loss=model.total_loss,
 81 |             optimizer=optimizer,
 82 |             global_step=model.global_step,
 83 |             clip_gradient_norm=training_config.clip_gradient_norm)
 84 | 
 85 |         saver = tf.train.Saver()
 86 | 
 87 |     tf.contrib.slim.learning.train(
 88 |         train_op=train_tensor,
 89 |         logdir=FLAGS.train_dir,
 90 |         graph=g,
 91 |         global_step=model.global_step,
 92 |         number_of_steps=training_config.number_of_steps,
 93 |         save_summaries_secs=training_config.save_summaries_secs,
 94 |         saver=saver,
 95 |         save_interval_secs=training_config.save_model_secs)
 96 | 
 97 | 
# Script entry point: tf.app.run() invokes main().
if __name__ == "__main__":
    tf.app.run()
100 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/CNNBenchmark_distributed_driver.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2018 Seoul National University
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import argparse
17 | import sys
18 | import os
19 | import json
20 | import time
21 | 
22 | from absl import flags
23 | import tensorflow as tf
24 | 
25 | import benchmark_cnn
26 | import cnn_util
27 | import parallax_config
28 | from cnn_util import log_fn
29 | from tensorflow.core.protobuf import config_pb2
30 | 
31 | import parallax
32 | 
# Register the flags of benchmark_cnn and adopt them as key flags of this
# module before adding the driver-specific flags below.
benchmark_cnn.define_flags()
flags.adopt_module_key_flags(benchmark_cnn)

FLAGS = tf.app.flags.FLAGS

# Defaults to the `resource_info` file located next to this script.
tf.app.flags.DEFINE_string('resource_info_file',
                           os.path.abspath(os.path.join(
                               os.path.dirname(__file__),
                               '.',
                               'resource_info')),
                           'Filename containing cluster information')
# Number of sess.run() iterations executed by the training loop in main().
tf.app.flags.DEFINE_integer('max_steps', 1000000,
                            """Number of iterations to run for each workers.""")
# main() logs loss and throughput once every log_frequency steps.
tf.app.flags.DEFINE_integer('log_frequency', 100,
                            """How many steps between two runop logs.""")
# Passed to parallax.parallel_run(sync=...).
tf.app.flags.DEFINE_boolean('sync', True, '')
49 | 
def main(_):
    """Build the single-GPU benchmark CNN graph and train it with Parallax.

    Args:
      _: Unused positional arguments supplied by `tf.app.run()`.
    """
    # Build benchmark_cnn model
    params, sess_config = benchmark_cnn.setup(
        benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Print information
    tf_version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tf_version[0], tf_version[1]))
    bench.print_info()

    # Build single-GPU benchmark_cnn model
    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        bench.build_model()

    # Parallax configuration from the flags, plus the session config
    # produced by benchmark_cnn.setup().
    config = parallax_config.build_config()
    config.sess_config = sess_config

    # Only the session is needed from the returned tuple.
    sess, _num_workers, _worker_id, _num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph,
                              FLAGS.resource_info_file,
                              sync=FLAGS.sync,
                              parallax_config=config)

    fetches = {
        'global_step': bench.global_step,
        'cost': bench.cost,
        'train_op': bench.train_op,
    }

    start = time.time()
    # 1-based step counter: a log line is emitted after every
    # FLAGS.log_frequency completed steps.
    for step in range(1, FLAGS.max_steps + 1):
        results = sess.run(fetches)
        if step % FLAGS.log_frequency != 0:
            continue
        end = time.time()
        throughput = float(FLAGS.log_frequency) / float(end - start)
        parallax.log.info(
            "global step: %d, loss: %f, throughput: %f steps/sec"
            % (results['global_step'][0]+1, results['cost'][0], throughput))
        start = time.time()
92 | 
# Script entry point: tf.app.run() invokes main().
if __name__ == '__main__':
    tf.app.run()
95 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/CNNBenchmark_eval.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2018 Seoul National University
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | from absl import flags
17 | import tensorflow as tf
18 | 
19 | import benchmark_cnn
20 | 
# Register the flags of benchmark_cnn and adopt them as key flags of this
# evaluation script.
benchmark_cnn.define_flags()
flags.adopt_module_key_flags(benchmark_cnn)

FLAGS = tf.app.flags.FLAGS
25 | 
def main(_):
  """Evaluate the benchmark CNN model described by the command-line flags.

  Args:
    _: Unused positional arguments supplied by `tf.app.run()`.
  """
  # Force evaluation mode before the params are derived from the flags.
  FLAGS.eval = True
  params, unused_config = benchmark_cnn.setup(
      benchmark_cnn.make_params_from_flags())
  benchmark_cnn.BenchmarkCNN(params).evaluate()
32 | 
# Script entry point: tf.app.run() invokes main().
if __name__ == '__main__':
  tf.app.run()
35 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/README.md:
--------------------------------------------------------------------------------
 1 | # TensorFlow CNN Benchmarks
 2 | The original code of this example comes from [tf_cnn_benchmarks](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks).
 3 | We modified this code to build a computation graph for a single-GPU environment instead of a multi-GPU, multi-machine environment (we removed the unnecessary communication-related files such as `variable_mgr.py` and `variable_mgr_util.py`).
 4 | We added `CNNBenchmark_distributed_driver.py` for training and `CNNBenchmark_eval.py` for evaluation.
 5 | 
 6 | ## Dataset
 7 | * Synthetic data or imagenet data can be used. To use imagenet data follow these [instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started).
 8 | 
 9 | ## Training
10 | Set your resource information in the `resource_info` file.
11 | 
12 | Then, execute:
13 | ```shell
14 | $ python CNNBenchmark_distributed_driver.py --model={model} --data_name={data_name} --data_dir={data_dir}
15 | ```
16 | 
17 | The command above runs a single CNN model on multiple devices specified in `resource_info`.
18 | The command assumes that the data directory and the TensorFlow CNN benchmark codebase are distributed and reachable in the same absolute path in each of the machines.
19 | 
20 | Also, we have a few more options you can choose for distributed running.
21 | 
22 | | Parameter Name       |  Default               | Description |
23 | | :------------------- |:-----------------------| :-----------|
24 | | --resource_info_file | `./resource_info`      | Name of the file containing cluster information |
25 | | --max_steps          | 1000000                | Number of iterations to run for each worker |
26 | | --log_frequency      | 100                    | Number of steps between two consecutive run-op logs |
27 | | --sync               | True                   | Whether to synchronize learning or not |
28 | | --ckpt_dir           | None                   | Directory to save checkpoints |
29 | | --save_ckpt_steps    | 0                      | Number of steps between two consecutive checkpoints |
30 | | --run_option         | None                   | The run option whether PS or MPI, None utilizes both |
31 | 
32 | You can customize distributed execution with the options above. For example, to fix the communication model to MPI, set the `run_option` flag as shown below.
33 | 
34 | ```shell
35 | $ python CNNBenchmark_distributed_driver.py --model={model} --data_name={data_name} --data_dir={data_dir} --run_option=MPI
36 | ```
37 | 
38 | ## Evaluation
39 | Execute:
40 | ```shell
41 | $ python CNNBenchmark_eval.py --eval=True --model={model} --data_name={data_name} --data_dir={data_dir} --checkpoint_dir={checkpoint_dir}
42 | ```
43 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/tf_cnn_benchmarks/models/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/alexnet_model.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Alexnet model configuration.
17 | 
18 | References:
19 |   Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton
20 |   ImageNet Classification with Deep Convolutional Neural Networks
21 |   Advances in Neural Information Processing Systems. 2012
22 | """
23 | 
24 | import tensorflow as tf
25 | 
26 | from models import model
27 | 
28 | 
class AlexnetModel(model.Model):
    """Alexnet cnn model."""

    def __init__(self):
        # NOTE(review): positional arguments are presumably (name, image
        # size, batch size, learning rate) -- confirm against model.Model.
        # The image size is 224 + 3 because of the VALID first convolution
        # (see add_inference).
        super(AlexnetModel, self).__init__('alexnet', 224 + 3, 512, 0.005)

    def add_inference(self, cnn):
        """Add the Alexnet layers to `cnn` by calling its builder methods."""
        # Note: VALID requires padding the images by 3 in width and height
        cnn.conv(64, 11, 11, 4, 4, 'VALID')
        cnn.mpool(3, 3, 2, 2)
        cnn.conv(192, 5, 5)
        cnn.mpool(3, 3, 2, 2)
        # Three consecutive 3x3 convolutions.
        for depth in (384, 384, 256):
            cnn.conv(depth, 3, 3)
        cnn.mpool(3, 3, 2, 2)
        cnn.reshape([-1, 256 * 6 * 6])
        # Two identical affine + dropout stages.
        for _ in range(2):
            cnn.affine(4096)
            cnn.dropout()
50 | 
51 | 
class AlexnetCifar10Model(model.Model):
    """Alexnet cnn model for cifar datasets.

    The model architecture follows the one defined in the tensorflow tutorial
    model.

    Reference model: tensorflow/models/tutorials/image/cifar10/cifar10.py
    Paper: http://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf
    """

    def __init__(self):
        # NOTE(review): positional arguments are presumably (name, image
        # size, batch size, learning rate) -- confirm against model.Model.
        super(AlexnetCifar10Model, self).__init__('alexnet', 32, 128, 0.1)

    def add_inference(self, cnn):
        """Add the cifar Alexnet layers to `cnn` via its builder methods."""
        # Both local-response-normalization layers share these settings.
        lrn_kwargs = dict(depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75)

        cnn.conv(64, 5, 5, 1, 1, 'SAME', stddev=5e-2)
        cnn.mpool(3, 3, 2, 2, mode='SAME')
        cnn.lrn(**lrn_kwargs)
        cnn.conv(64, 5, 5, 1, 1, 'SAME', bias=0.1, stddev=5e-2)
        cnn.lrn(**lrn_kwargs)
        cnn.mpool(3, 3, 2, 2, mode='SAME')

        # Flatten every dimension after the first one before the affine
        # layers.
        dims = cnn.top_layer.get_shape().as_list()
        cnn.reshape([-1, dims[1] * dims[2] * dims[3]])
        for width in (384, 192):
            cnn.affine(width, stddev=0.04, bias=0.1)

    def get_learning_rate(self, global_step, batch_size):
        """Return a staircase exponential decay of `self.learning_rate`.

        The rate is multiplied by 0.1 every 100 epochs, where one epoch is
        hard-coded to 50000 examples.
        """
        epochs_per_decay = 100
        examples_per_epoch = 50000
        steps_per_decay = int(epochs_per_decay * examples_per_epoch /
                              batch_size)
        return tf.train.exponential_decay(
            self.learning_rate, global_step, steps_per_decay, 0.1,
            staircase=True)
87 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/densenet_model.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | """Densenet model configuration.
 17 | 
 18 | References:
 19 |   "Densely Connected Convolutional Networks": https://arxiv.org/pdf/1608.06993
 20 | """
 21 | import numpy as np
 22 | from six.moves import xrange  # pylint: disable=redefined-builtin
 23 | import tensorflow as tf
 24 | 
 25 | from models import model as model_lib
 26 | 
 27 | 
 28 | class DensenetCifar10Model(model_lib.Model):
 29 |     """Densenet cnn network configuration."""
 30 | 
 31 |     def __init__(self, model, layer_counts, growth_rate):
 32 |         self.growth_rate = growth_rate
 33 |         super(DensenetCifar10Model, self).__init__(model, 32, 64, 0.1,
 34 |                                                    layer_counts=layer_counts)
 35 |         self.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True}
 36 | 
 37 |     def dense_block(self, cnn, growth_rate):
 38 |         input_layer = cnn.top_layer
 39 |         c = cnn.batch_norm(input_layer, **self.batch_norm_config)
 40 |         c = tf.nn.relu(c)
 41 |         c = cnn.conv(growth_rate, 3, 3, 1, 1,
 42 |                      stddev=np.sqrt(2.0 / 9 / growth_rate),
 43 |                      activation=None, input_layer=c)
 44 |         channel_index = 3 if cnn.channel_pos == 'channels_last' else 1
 45 |         cnn.top_layer = tf.concat([input_layer, c], channel_index)
 46 |         cnn.top_size += growth_rate
 47 | 
 48 |     def transition_layer(self, cnn):
 49 |         in_size = cnn.top_size
 50 |         cnn.batch_norm(**self.batch_norm_config)
 51 |         cnn.top_layer = tf.nn.relu(cnn.top_layer)
 52 |         cnn.conv(in_size, 1, 1, 1, 1, stddev=np.sqrt(2.0 / 9 / in_size))
 53 |         cnn.apool(2, 2, 2, 2)
 54 | 
 55 |     def add_inference(self, cnn):
 56 |         if self.layer_counts is None:
 57 |             raise ValueError(
 58 |                 'Layer counts not specified for %s' % self.get_model())
 59 |         if self.growth_rate is None:
 60 |             raise ValueError(
 61 |                 'Growth rate not specified for %s' % self.get_model())
 62 | 
 63 |         cnn.conv(16, 3, 3, 1, 1, activation=None)
 64 |         # Block 1
 65 |         for _ in xrange(self.layer_counts[0]):
 66 |             self.dense_block(cnn, self.growth_rate)
 67 |         self.transition_layer(cnn)
 68 |         # Block 2
 69 |         for _ in xrange(self.layer_counts[1]):
 70 |             self.dense_block(cnn, self.growth_rate)
 71 |         self.transition_layer(cnn)
 72 |         # Block 3
 73 |         for _ in xrange(self.layer_counts[2]):
 74 |             self.dense_block(cnn, self.growth_rate)
 75 |         cnn.batch_norm(**self.batch_norm_config)
 76 |         cnn.top_layer = tf.nn.relu(cnn.top_layer)
 77 |         channel_index = 3 if cnn.channel_pos == 'channels_last' else 1
 78 |         cnn.top_size = cnn.top_layer.get_shape().as_list()[channel_index]
 79 |         cnn.spatial_mean()
 80 | 
 81 |     def get_learning_rate(self, global_step, batch_size):
 82 |         num_batches_per_epoch = int(50000 / batch_size)
 83 |         boundaries = num_batches_per_epoch * np.array([150, 225, 300],
 84 |                                                       dtype=np.int64)
 85 |         boundaries = [x for x in boundaries]
 86 |         values = [0.1, 0.01, 0.001, 0.0001]
 87 |         return tf.train.piecewise_constant(global_step, boundaries, values)
 88 | 
 89 | 
 90 | def create_densenet40_k12_model():
 91 |     return DensenetCifar10Model('densenet40_k12', (12, 12, 12), 12)
 92 | 
 93 | 
 94 | def create_densenet100_k12_model():
 95 |     return DensenetCifar10Model('densenet100_k12', (32, 32, 32), 12)
 96 | 
 97 | 
 98 | def create_densenet100_k24_model():
 99 |     return DensenetCifar10Model('densenet100_k24', (32, 32, 32), 24)
100 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/googlenet_model.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Googlenet model configuration.
17 | 
18 | References:
19 |   Szegedy, Christian, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
20 |   Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich
21 |   Going deeper with convolutions
22 |   arXiv preprint arXiv:1409.4842 (2014)
23 | """
24 | 
25 | from models import model
26 | 
27 | 
class GooglenetModel(model.Model):
    """Googlenet cnn model."""

    def __init__(self):
        # NOTE(review): positional arguments are presumably (name, image
        # size, batch size, learning rate) -- confirm against model.Model.
        super(GooglenetModel, self).__init__('googlenet', 224, 32, 0.005)

    def add_inference(self, cnn):
        """Add the Googlenet layers to `cnn` via its builder methods."""

        def inception_v1(net, n1x1, n3x3_reduce, n3x3, n5x5_reduce, n5x5,
                         pool_proj):
            """Add one inception module made of four parallel columns."""
            branch_1x1 = [('conv', n1x1, 1, 1)]
            branch_3x3 = [('conv', n3x3_reduce, 1, 1), ('conv', n3x3, 3, 3)]
            branch_5x5 = [('conv', n5x5_reduce, 1, 1), ('conv', n5x5, 5, 5)]
            branch_pool = [('mpool', 3, 3, 1, 1, 'SAME'),
                           ('conv', pool_proj, 1, 1)]
            net.inception_module(
                'incept_v1',
                [branch_1x1, branch_3x3, branch_5x5, branch_pool])

        # Stem.
        cnn.conv(64, 7, 7, 2, 2)
        cnn.mpool(3, 3, 2, 2, mode='SAME')
        cnn.conv(64, 1, 1)
        cnn.conv(192, 3, 3)
        cnn.mpool(3, 3, 2, 2, mode='SAME')

        # Three groups of inception modules; a max pool follows the first
        # two groups.
        for filters in ((64, 96, 128, 16, 32, 32),
                        (128, 128, 192, 32, 96, 64)):
            inception_v1(cnn, *filters)
        cnn.mpool(3, 3, 2, 2, mode='SAME')

        for filters in ((192, 96, 208, 16, 48, 64),
                        (160, 112, 224, 24, 64, 64),
                        (128, 128, 256, 24, 64, 64),
                        (112, 144, 288, 32, 64, 64),
                        (256, 160, 320, 32, 128, 128)):
            inception_v1(cnn, *filters)
        cnn.mpool(3, 3, 2, 2, mode='SAME')

        for filters in ((256, 160, 320, 32, 128, 128),
                        (384, 192, 384, 48, 128, 128)):
            inception_v1(cnn, *filters)

        cnn.apool(7, 7, 1, 1, mode='VALID')
        cnn.reshape([-1, 1024])
58 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/lenet_model.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Lenet model configuration.
17 | 
18 | References:
19 |   LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner
20 |   Gradient-based learning applied to document recognition
21 |   Proceedings of the IEEE (1998)
22 | """
23 | 
24 | from models import model
25 | 
26 | 
class Lenet5Model(model.Model):
    """LeNet-5 style configuration for the CNN benchmarks."""

    def __init__(self):
        super(Lenet5Model, self).__init__('lenet5', 28, 32, 0.005)

    def add_inference(self, cnn):
        """Append two conv/max-pool pairs and a 512-unit affine layer."""
        # Note: This matches TF's MNIST tutorial model
        for num_filters in (32, 64):
            cnn.conv(num_filters, 5, 5)
            cnn.mpool(2, 2)
        flat_size = 64 * 7 * 7
        cnn.reshape([-1, flat_size])
        cnn.affine(512)
39 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/model.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Base model configuration for CNN benchmarks."""
16 | 
17 | 
class Model(object):
    """Base model configuration for CNN benchmarks.

    Stores the per-model settings passed to the constructor and exposes
    them through simple accessors. Subclasses must override
    `add_inference`.
    """

    def __init__(self,
                 model,
                 image_size,
                 batch_size,
                 learning_rate,
                 layer_counts=None,
                 fp16_loss_scale=128):
        """Record the model settings.

        Args:
          model: value returned by `get_model`.
          image_size: value returned by `get_image_size`.
          batch_size: initial batch size; also kept as the default batch
            size, which `set_batch_size` does not change.
          learning_rate: value returned by `get_learning_rate`.
          layer_counts: optional value returned by `get_layer_counts`.
          fp16_loss_scale: value returned by `get_fp16_loss_scale`.
        """
        self.model = model
        self.image_size = image_size
        # Both start out equal; only `batch_size` can be changed later.
        self.batch_size = self.default_batch_size = batch_size
        self.learning_rate = learning_rate
        self.layer_counts = layer_counts
        # TODO(reedwm) Set custom loss scales for each model instead of using
        # the default of 128.
        self.fp16_loss_scale = fp16_loss_scale

    def get_model(self):
        """Return the `model` value given to the constructor."""
        return self.model

    def get_image_size(self):
        """Return the configured image size."""
        return self.image_size

    def get_batch_size(self):
        """Return the current batch size."""
        return self.batch_size

    def set_batch_size(self, batch_size):
        """Override the current batch size; the default is left untouched."""
        self.batch_size = batch_size

    def get_default_batch_size(self):
        """Return the batch size originally given to the constructor."""
        return self.default_batch_size

    def get_layer_counts(self):
        """Return the configured layer counts (None unless provided)."""
        return self.layer_counts

    def get_fp16_loss_scale(self):
        """Return the configured fp16 loss scale."""
        return self.fp16_loss_scale

    def get_learning_rate(self, global_step, batch_size):
        """Return the fixed learning rate; both arguments are ignored."""
        del global_step, batch_size  # Unused by the base implementation.
        return self.learning_rate

    def add_inference(self, unused_cnn):
        """Build the model's layers; must be overridden.

        Raises:
          ValueError: always, in this base class.
        """
        raise ValueError('Must be implemented in derived classes')
66 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/model_config.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Model configurations for CNN benchmarks.
17 | """
18 | 
19 | from models import alexnet_model
20 | from models import densenet_model
21 | from models import googlenet_model
22 | from models import inception_model
23 | from models import lenet_model
24 | from models import overfeat_model
25 | from models import resnet_model
26 | from models import trivial_model
27 | from models import vgg_model
28 | 
# Registry used for the 'imagenet' and 'synthetic' datasets: maps a model name
# to a zero-argument callable (a model class or a factory function) that
# returns the model configuration.
_model_name_to_imagenet_model = {
    'vgg11': vgg_model.Vgg11Model,
    'vgg16': vgg_model.Vgg16Model,
    'vgg19': vgg_model.Vgg19Model,
    'lenet': lenet_model.Lenet5Model,
    'googlenet': googlenet_model.GooglenetModel,
    'overfeat': overfeat_model.OverfeatModel,
    'alexnet': alexnet_model.AlexnetModel,
    'trivial': trivial_model.TrivialModel,
    'inception3': inception_model.Inceptionv3Model,
    'inception4': inception_model.Inceptionv4Model,
    'resnet50': resnet_model.create_resnet50_model,
    'resnet50_v2': resnet_model.create_resnet50_v2_model,
    'resnet101': resnet_model.create_resnet101_model,
    'resnet101_v2': resnet_model.create_resnet101_v2_model,
    'resnet152': resnet_model.create_resnet152_model,
    'resnet152_v2': resnet_model.create_resnet152_v2_model,
}
47 | 
# Registry used for the 'cifar10' dataset: maps a model name to a
# zero-argument callable (a model class or a factory function) that returns
# the model configuration.
_model_name_to_cifar_model = {
    'alexnet': alexnet_model.AlexnetCifar10Model,
    'resnet20': resnet_model.create_resnet20_cifar_model,
    'resnet20_v2': resnet_model.create_resnet20_v2_cifar_model,
    'resnet32': resnet_model.create_resnet32_cifar_model,
    'resnet32_v2': resnet_model.create_resnet32_v2_cifar_model,
    'resnet44': resnet_model.create_resnet44_cifar_model,
    'resnet44_v2': resnet_model.create_resnet44_v2_cifar_model,
    'resnet56': resnet_model.create_resnet56_cifar_model,
    'resnet56_v2': resnet_model.create_resnet56_v2_cifar_model,
    'resnet110': resnet_model.create_resnet110_cifar_model,
    'resnet110_v2': resnet_model.create_resnet110_v2_cifar_model,
    'trivial': trivial_model.TrivialCifar10Model,
    'densenet40_k12': densenet_model.create_densenet40_k12_model,
    'densenet100_k12': densenet_model.create_densenet100_k12_model,
    'densenet100_k24': densenet_model.create_densenet100_k24_model,
}
65 | 
66 | 
67 | def _get_model_map(dataset_name):
68 |     if 'cifar10' == dataset_name:
69 |         return _model_name_to_cifar_model
70 |     elif dataset_name in ('imagenet', 'synthetic'):
71 |         return _model_name_to_imagenet_model
72 |     else:
73 |         raise ValueError('Invalid dataset name: %s' % dataset_name)
74 | 
75 | 
def get_model_config(model_name, dataset):
    """Map model name to model network configuration.

    Args:
      model_name: key to look up in the registry of the dataset.
      dataset: object whose `name` attribute selects the registry.

    Returns:
      The result of calling the registered constructor with no arguments.

    Raises:
      ValueError: if the dataset name is unsupported or the model name is
        not registered for that dataset.
    """
    registry = _get_model_map(dataset.name)
    if model_name in registry:
        make_model = registry[model_name]
        return make_model()
    raise ValueError('Invalid model name \'%s\' for dataset \'%s\'' %
                     (model_name, dataset.name))
84 | 
85 | 
def register_model(model_name, dataset_name, model_func):
    """Register a new model that can be obtained with `get_model_config`.

    Args:
      model_name: name under which the model is registered.
      dataset_name: dataset whose registry receives the entry.
      model_func: callable stored in the registry; `get_model_config` calls
        it with no arguments.

    Raises:
      ValueError: if the dataset name is unsupported or the model name is
        already registered for that dataset.
    """
    registry = _get_model_map(dataset_name)
    already_registered = model_name in registry
    if already_registered:
        raise ValueError('Model "%s" is already registered for dataset "%s"' %
                         (model_name, dataset_name))
    registry[model_name] = model_func
93 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/overfeat_model.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Overfeat model configuration.
17 | 
18 | References:
19 |   OverFeat: Integrated Recognition, Localization and Detection using
20 |   Convolutional Networks
21 |   Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus,
22 |   Yann LeCun, 2014
23 |   http://arxiv.org/abs/1312.6229
24 | """
25 | 
26 | from models import model
27 | 
28 | 
class OverfeatModel(model.Model):
    """OverFeat configuration for the CNN benchmarks."""

    def __init__(self):
        super(OverfeatModel, self).__init__('overfeat', 231, 32, 0.005)

    def add_inference(self, cnn):
        """Append the OverFeat convolutional and affine layers to `cnn`."""
        # Note: VALID requires padding the images by 3 in width and height
        cnn.conv(96, 11, 11, 4, 4, mode='VALID')
        cnn.mpool(2, 2)
        cnn.conv(256, 5, 5, 1, 1, mode='VALID')
        cnn.mpool(2, 2)
        for num_filters in (512, 1024, 1024):
            cnn.conv(num_filters, 3, 3)
        cnn.mpool(2, 2)
        flat_size = 1024 * 6 * 6
        cnn.reshape([-1, flat_size])
        for num_units in (3072, 4096):
            cnn.affine(num_units)
47 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/trivial_model.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Trivial model configuration."""
16 | 
17 | from models import model
18 | 
19 | 
class TrivialModel(model.Model):
    """Trivial model configuration: a reshape followed by two affine layers."""

    def __init__(self):
        image_size = 224 + 3
        super(TrivialModel, self).__init__('trivial', image_size, 32, 0.005)

    def add_inference(self, cnn):
        """Flatten the 227x227x3 input and append affine(1), affine(4096)."""
        flat_size = 227 * 227 * 3
        cnn.reshape([-1, flat_size])
        for num_units in (1, 4096):
            cnn.affine(num_units)
29 |         cnn.affine(4096)
30 | 
31 | 
class TrivialCifar10Model(model.Model):
    """Trivial cifar10 model configuration: a reshape and two affine layers."""

    def __init__(self):
        image_size = 32
        super(TrivialCifar10Model, self).__init__(
            'trivial', image_size, 32, 0.005)

    def add_inference(self, cnn):
        """Flatten the 32x32x3 input and append affine(1), affine(4096)."""
        flat_size = 32 * 32 * 3
        cnn.reshape([-1, flat_size])
        for num_units in (1, 4096):
            cnn.affine(num_units)
42 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/models/vgg_model.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Vgg model configuration.
17 | 
18 | Includes multiple models: vgg11, vgg16, vgg19, corresponding to
19 |   model A, D, and E in Table 1 of [1].
20 | 
21 | References:
22 | [1]  Simonyan, Karen, Andrew Zisserman
23 |      Very Deep Convolutional Networks for Large-Scale Image Recognition
24 |      arXiv:1409.1556 (2014)
25 | """
26 | 
27 | from six.moves import xrange  # pylint: disable=redefined-builtin
28 | from models import model
29 | 
30 | 
31 | def _construct_vgg(cnn, num_conv_layers):
32 |     """Build vgg architecture from blocks."""
33 |     assert len(num_conv_layers) == 5
34 |     for _ in xrange(num_conv_layers[0]):
35 |         cnn.conv(64, 3, 3)
36 |     cnn.mpool(2, 2)
37 |     for _ in xrange(num_conv_layers[1]):
38 |         cnn.conv(128, 3, 3)
39 |     cnn.mpool(2, 2)
40 |     for _ in xrange(num_conv_layers[2]):
41 |         cnn.conv(256, 3, 3)
42 |     cnn.mpool(2, 2)
43 |     for _ in xrange(num_conv_layers[3]):
44 |         cnn.conv(512, 3, 3)
45 |     cnn.mpool(2, 2)
46 |     for _ in xrange(num_conv_layers[4]):
47 |         cnn.conv(512, 3, 3)
48 |     cnn.mpool(2, 2)
49 |     cnn.reshape([-1, 512 * 7 * 7])
50 |     cnn.affine(4096)
51 |     cnn.dropout()
52 |     cnn.affine(4096)
53 |     cnn.dropout()
54 | 
55 | 
class Vgg11Model(model.Model):
    """VGG-11 configuration (model A in Table 1 of the VGG paper)."""

    def __init__(self):
        super(Vgg11Model, self).__init__('vgg11', 224, 64, 0.005)

    def add_inference(self, cnn):
        """Append the VGG-11 layers to `cnn`."""
        convs_per_block = [1, 1, 2, 2, 2]
        _construct_vgg(cnn, convs_per_block)
63 | 
64 | 
class Vgg16Model(model.Model):
    """VGG-16 configuration (model D in Table 1 of the VGG paper)."""

    def __init__(self):
        super(Vgg16Model, self).__init__('vgg16', 224, 64, 0.005)

    def add_inference(self, cnn):
        """Append the VGG-16 layers to `cnn`."""
        convs_per_block = [2, 2, 3, 3, 3]
        _construct_vgg(cnn, convs_per_block)
72 | 
73 | 
class Vgg19Model(model.Model):
    """VGG-19 configuration (model E in Table 1 of the VGG paper)."""

    def __init__(self):
        super(Vgg19Model, self).__init__('vgg19', 224, 64, 0.005)

    def add_inference(self, cnn):
        """Append the VGG-19 layers to `cnn`."""
        convs_per_block = [2, 2, 4, 4, 4]
        _construct_vgg(cnn, convs_per_block)
81 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/parallax_config.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2018 Seoul National University
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import tensorflow as tf
17 | import parallax
18 | 
19 | 
# Command-line flags consumed by `build_config` below. Flag names and default
# values are part of the command-line interface and must stay stable.
flags = tf.app.flags

# Parameter-server / MPI communication settings.
flags.DEFINE_boolean('replicate_variables', True, """replicate_variables""")
flags.DEFINE_string('protocol', 'grpc', """The method for managing variables""")
flags.DEFINE_string('mpirun_options', '',
                    'option for mpirun')
flags.DEFINE_string('run_option', 'HYBRID',
                    'The run option whether PS, MPI or HYBRID')
flags.DEFINE_string('redirect_path', None,
                    """redirect path to keep the log of distributed workers""")

# Checkpointing.
flags.DEFINE_integer('save_ckpt_steps', None,
                     """Number of steps between two consecutive checkpoints""")
flags.DEFINE_integer('save_n_ckpts_per_epoch', -1,
                     """Save n checkpoints per every epoch""")
flags.DEFINE_string('ckpt_dir', None, """Directory to save checkpoints""")

# Profiling.
flags.DEFINE_string('profile_dir', None, """Directory to save RunMetadata""")
flags.DEFINE_string('profile_steps', None, """Comma separated profile steps""")
flags.DEFINE_string('profile_range', None,
                    """profile_start_step,profile_end_step""")
flags.DEFINE_integer('profile_worker', None, """The worker to profile""")

# Operation placement and graph export.
flags.DEFINE_boolean('local_aggregation', True,
                     """Whether to use local aggregation or not""")
flags.DEFINE_boolean('boundary_among_servers', True,
                     """Whether to use operation placement among servers""")
flags.DEFINE_boolean('boundary_between_workers_and_servers', True,
                     """Whether to use operation placement between workers and servers""")
flags.DEFINE_string('export_graph_path', None,
                    """export path to keep transformed graph definition""")

FLAGS = flags.FLAGS
45 | 
def calculate_ckpt_steps():
    """Return the number of steps between two consecutive checkpoints.

    Returns:
      The value of --save_ckpt_steps (None when the flag is not set).

    Raises:
      ValueError: if --save_n_ckpts_per_epoch is positive. The per-epoch
        computation previously found here depended on `json`, `math` and
        `language_model_graph`, none of which this module imports, so that
        path could only ever fail with a NameError. It now fails with an
        explicit message instead.
    """
    if FLAGS.save_n_ckpts_per_epoch > 0:
        # NOTE(review): supporting this flag needs the number of iterations
        # per epoch for the selected dataset, which is not available in this
        # module. Confirm the intended source before implementing it.
        raise ValueError(
            '--save_n_ckpts_per_epoch is not supported by this example; '
            'use --save_ckpt_steps instead')
    return FLAGS.save_ckpt_steps
58 | 
59 | 
def build_config():
    """Assemble a `parallax.Config` from the command-line flags.

    Returns:
      A `parallax.Config` with its run option, communication, checkpoint,
      profile, log-redirect and graph-export settings taken from FLAGS.
    """

    def get_profile_steps():
        # Parse --profile_steps ("1,2,3") into a list of ints, or None when
        # the flag is unset/empty. The stripped value is written back to FLAGS.
        if not FLAGS.profile_steps:
            return None
        FLAGS.profile_steps = FLAGS.profile_steps.strip()
        return [int(token) for token in FLAGS.profile_steps.split(',')]

    def get_profile_range():
        # Parse --profile_range ("start,end") into an (int, int) tuple, or
        # None when the flag is unset/empty. The stripped value is written
        # back to FLAGS.
        if not FLAGS.profile_range:
            return None
        FLAGS.profile_range = FLAGS.profile_range.strip()
        pieces = FLAGS.profile_range.split(',')
        range_start = int(pieces[0])
        range_end = int(pieces[1])
        return (range_start, range_end)

    ckpt_config = parallax.CheckPointConfig(
        ckpt_dir=FLAGS.ckpt_dir,
        save_ckpt_steps=calculate_ckpt_steps())
    ps_config = parallax.PSConfig(
        replicate_variables=FLAGS.replicate_variables,
        protocol=FLAGS.protocol,
        local_aggregation=FLAGS.local_aggregation,
        boundary_among_servers=FLAGS.boundary_among_servers,
        boundary_between_workers_and_servers=(
            FLAGS.boundary_between_workers_and_servers))
    mpi_config = parallax.MPIConfig(mpirun_options=FLAGS.mpirun_options)
    profile_config = parallax.ProfileConfig(
        profile_dir=FLAGS.profile_dir,
        profile_steps=get_profile_steps(),
        profile_range=get_profile_range(),
        profile_worker=FLAGS.profile_worker)

    config = parallax.Config()
    config.run_option = FLAGS.run_option
    config.average_sparse = False
    config.communication_config = parallax.CommunicationConfig(
        ps_config, mpi_config)
    config.ckpt_config = ckpt_config
    config.profile_config = profile_config
    config.redirect_path = FLAGS.redirect_path
    config.export_graph_path = FLAGS.export_graph_path
    return config
99 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/platforms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/tf_cnn_benchmarks/platforms/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/platforms/default/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snuspl/parallax/351a913877df3ae03f1b1b52320ee4536b17a667/parallax/parallax/examples/tf_cnn_benchmarks/platforms/default/__init__.py


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/platforms/default/util.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Utility code for the default platform."""
17 | 
18 | import cnn_util
19 | 
20 | 
def get_platform_params():
    """Returns a dict of platform-specific params.

    No platform-specific flags are needed for the default platform, so this
    returns an empty dict.

    Returns:
      A dict that maps from param name to ParamSpec.
    """
    platform_params = {}
    return platform_params
31 | 
32 | 
def get_cluster_manager(params, config_proto):
    """Returns the cluster manager to be used.

    Args:
      params: forwarded unchanged to `cnn_util.GrpcClusterManager`.
      config_proto: forwarded unchanged to `cnn_util.GrpcClusterManager`.

    Returns:
      A new `cnn_util.GrpcClusterManager` instance.
    """
    cluster_manager = cnn_util.GrpcClusterManager(params, config_proto)
    return cluster_manager
36 | 
37 | 
def _initialize(params, config_proto):
    """Perform the platform initialization; the default platform has none."""
    # Currently, no platform initialization needs to be done.
    del config_proto
    del params


# Flipped to True by the first `initialize` call so later calls are no-ops.
_is_initalized = False


def initialize(params, config_proto):
    """Run the platform initialization at most once per process.

    Args:
      params: forwarded to `_initialize` on the first call only.
      config_proto: forwarded to `_initialize` on the first call only.
    """
    global _is_initalized
    if not _is_initalized:
        # Mark as done before delegating, exactly once.
        _is_initalized = True
        _initialize(params, config_proto)
52 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/platforms/util.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Utility code for a certain platform.
17 | 
18 | This file simply imports everything from the default platform. To switch to a
19 | different platform, the import statement can be changed to point to a new
20 | platform.
21 | 
22 | Creating a custom platform can be useful to, e.g., run some initialization code
23 | required by the platform or register a platform-specific model.
24 | """
25 | 
26 | from platforms.default.util import *  # pylint: disable=unused-import,wildcard-import
27 | 


--------------------------------------------------------------------------------
/parallax/parallax/examples/tf_cnn_benchmarks/resource_info:
--------------------------------------------------------------------------------
1 | 123.456.78.90:1,2
2 | 


--------------------------------------------------------------------------------
/parallax/parallax/util/BUILD:
--------------------------------------------------------------------------------
 1 | licenses(["notice"])  # Apache 2.0
 2 | 
 3 | package(
 4 |     default_visibility = [
 5 |         "//visibility:public",
 6 |     ],
 7 | )
 8 | 
# Wrapper target for build_pip_package.sh. The targets listed in `data` are
# declared as runtime dependencies of the script, which copies the parallax
# tree out of build_pip_package.runfiles when assembling the wheel.
sh_binary(
    name = "build_pip_package",
    srcs = ["build_pip_package.sh"],
    data = [
        "//parallax:parallax",
        "//parallax/core/python/tools:tools",
    ],
)
17 | 


--------------------------------------------------------------------------------
/parallax/parallax/util/build_pip_package.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Copyright 2017 Google Inc. All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # Script for building a pip package.
18 | #
19 | # Based on tensorflow/tools/pip_package/build_pip_package.sh.
20 | set -e
21 | 
# Builds the parallax wheel and copies it into a destination directory.
#
# Usage: build_pip_package.sh [-p|-py PYTHON] [--py=PYTHON|--python=PYTHON] DEST
#
#   PYTHON  interpreter used to run setup.py (default: "python")
#   DEST    directory that receives the wheel; created if missing
#
# Must be run from the root of the build tree (bazel-bin/parallax must exist).
function main() {
  PYTHON="python"
  POSITIONAL=()
  while [[ $# -gt 0 ]]
  do
    key="$1"
    case "${key}" in
      -p|-py)
        # Without this check a missing value makes the second `shift` fail
        # and, under `set -e`, the script exits with no message.
        if [[ $# -lt 2 ]]; then
          echo "Option ${key} requires an argument"
          exit 1
        fi
        PYTHON="$2"
        shift
        shift
        ;;
      --py=*|--python=*)
        PYTHON="${key#*=}"
        shift
        ;;
      *)
        POSITIONAL+=("$1")
        shift
        ;;
    esac
  done
  set -- "${POSITIONAL[@]}" # restore positional parameters

  if [ $# -lt 1 ] ; then
    echo "No destination dir provided"
    exit 1
  fi

  DEST=$1

  # Check the build tree before creating anything, so a failed check does not
  # leave a stray temporary directory behind.
  if [ ! -d bazel-bin/parallax ]; then
    echo "Could not find bazel-bin.  Did you run from the root of the build tree?"
    exit 1
  fi

  # Resolve DEST to an absolute path while still in the caller's directory.
  # The wheel is copied after `pushd` into the temporary directory, so a
  # relative DEST would otherwise point inside the directory that is removed
  # at the end.
  mkdir -p "${DEST}"
  DEST="$(cd "${DEST}" && pwd)"

  TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX)

  echo $(date) : "=== Using tmpdir: ${TMPDIR}"

  cp -R \
    bazel-bin/parallax/util/build_pip_package.runfiles/parallax/parallax \
    "${TMPDIR}"

  cp parallax/util/setup.py "${TMPDIR}"

  # Before we leave the top-level directory, make sure we know how to
  # call python.
  #source tensorflow/tools/python_bin_path.sh

  pushd "${TMPDIR}"
  echo $(date) : "=== Building wheel"
  # PYTHON is left unquoted on purpose so a value with extra interpreter
  # arguments keeps working as before.
  ${PYTHON} setup.py bdist_wheel >/dev/null
  cp dist/* "${DEST}"
  popd
  rm -rf "${TMPDIR}"
  echo $(date) : "=== Output wheel file is in: ${DEST}"
  echo ${PYTHON}
}
81 | 
82 | main "$@"
83 | 


--------------------------------------------------------------------------------
/tools/bazel.rc:
--------------------------------------------------------------------------------
1 | import %workspace%/tensorflow/tools/bazel.rc
2 | import %workspace%/tensorflow/.tf_configure.bazelrc
3 | 
4 | build --define PYTHON_LIB_PATH=$PYTHON_BINARY/../../lib/python$PYTHON_MAJOR_VERSION/site-packages
5 | 
6 | build --package_path=%workspace%:%workspace%/tensorflow/
7 | 


--------------------------------------------------------------------------------
/tools/style_check.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2018 Seoul National University
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Simple Python style check script.
16 |    pycodestyle checks code against style
17 |    conventions in PEP8. Do not check
18 |    example files.
19 |    requirements: pycodestyle"""
20 | import os
21 | 
import sys

# os.system returns 0 when the command succeeds. Any other status means
# pycodestyle reported violations or could not be run, so report the failure
# and exit non-zero to let callers (e.g. CI) detect it.
# The paths are relative, so the script must be run from its own directory.
status = os.system(
    "pycodestyle --statistics ../parallax/parallax/ "
    "--exclude=../parallax/parallax/examples/")
if status == 0:
    print("PASS")
else:
    print("FAIL")
    sys.exit(1)
27 | 


--------------------------------------------------------------------------------