├── .dockerignore ├── .github └── workflows │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── inference_options.md └── rest_api.md ├── frameworks ├── baidu_translate │ ├── Dockerfile │ ├── README.md │ └── entrypoint.py ├── deepl_translate │ ├── Dockerfile │ ├── README.md │ └── entrypoint.py ├── google_translate │ ├── Dockerfile │ ├── README.md │ ├── entrypoint.py │ └── requirements.txt ├── naver_translate │ ├── Dockerfile │ ├── README.md │ └── entrypoint.py ├── opennmt_py │ ├── Dockerfile │ ├── README.md │ ├── entrypoint.py │ └── requirements.txt ├── opennmt_tf │ ├── Dockerfile │ ├── README.md │ ├── entrypoint.py │ ├── requirements.txt │ └── test │ │ └── test.py ├── sogou_translate │ ├── Dockerfile │ ├── README.md │ └── entrypoint.py ├── tencent_translate │ ├── Dockerfile │ ├── README.md │ └── entrypoint.py └── youdao_translate │ ├── Dockerfile │ ├── README.md │ └── entrypoint.py ├── nmtwizard ├── __init__.py ├── beat_service.py ├── cloud_translation_framework.py ├── config.py ├── data.py ├── framework.py ├── logger.py ├── preprocess │ ├── __init__.py │ ├── consumer.py │ ├── loader.py │ ├── operators │ │ ├── __init__.py │ │ ├── align_perplexity_filter.py │ │ ├── alignment.py │ │ ├── identity_filter.py │ │ ├── length_filter.py │ │ ├── noise.py │ │ ├── parentheses_filter.py │ │ ├── similarity_filter.py │ │ └── tokenization.py │ ├── prepoperator.py │ ├── preprocess.py │ ├── sampler.py │ ├── tokenizer.py │ └── tu.py ├── serving.py ├── utility.py └── utils.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── test ├── conftest.json ├── conftest.py ├── corpus │ ├── eval │ │ ├── testset1.out │ │ ├── testset1.ref │ │ ├── testset2.out │ │ ├── testset2.ref.1 │ │ ├── testset2.ref.2 │ │ ├── testset3.out │ │ └── testset3.ref │ ├── resources │ │ ├── alignment │ │ │ ├── ende_backward.probs │ │ │ └── ende_forward.probs │ │ ├── embeddings │ │ │ └── dbpedia.ftz │ │ └── subword │ │ │ └── en_de.sp │ ├── train │ │ ├── europarl-v7.de-en.10K.tok.de │ │ └── europarl-v7.de-en.10K.tok.en │ └── vocab │ │ ├── de-vocab.txt │ │ ├── en-vocab.txt │ │ └── vocab-extra.txt ├── pytest.ini ├── requirements.txt ├── test_cloud_translation_framework.py ├── test_config.py ├── test_data.py ├── test_framework.py ├── test_operators.py ├── test_preprocess.py ├── test_serving.py ├── test_tokenizer.py └── test_utility.py ├── tools └── docker_images.py └── utilities ├── score ├── BLEU │ └── multi-bleu-detok_cjk.perl ├── Dockerfile ├── NIST │ ├── mteval-v14.pl │ └── xml_wrap.pl ├── README.md ├── TER │ └── tercom.7.25.jar └── entrypoint.py ├── similarity ├── Dockerfile ├── README.md └── entrypoint.py └── tuminer ├── Dockerfile ├── README.md └── entrypoint.py /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !frameworks 3 | !nmtwizard 4 | !requirements.txt 5 | !utilities 6 | **/*.pyc 7 | **/__pycache__ 8 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: CI tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | test-generic-framework: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Set up Python 3.8 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.8 21 | 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install -e .[tests] 25 | 26 | - name: Check code format with Black 
27 | run: |
28 | black --check .
29 | 
30 | - name: Check code style with Flake8
31 | if: ${{ always() }}
32 | run: |
33 | flake8 .
34 | 
35 | - name: Test with pytest
36 | run: |
37 | python -m pytest test/
38 | 
39 | 
40 | test-opennmt-tf-framework:
41 | 
42 | runs-on: ubuntu-latest
43 | 
44 | steps:
45 | - uses: actions/checkout@v2
46 | 
47 | - name: Set up Python 3.8
48 | uses: actions/setup-python@v2
49 | with:
50 | python-version: 3.8
51 | 
52 | - name: Install dependencies
53 | run: |
54 | python -m pip install -e .[tests]
55 | python -m pip install -r frameworks/opennmt_tf/requirements.txt
56 | 
57 | - name: Test with pytest
58 | run: |
59 | python -m pytest frameworks/opennmt_tf/test/test.py
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | *.pyc
2 | .pytest_cache
3 | 
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017-present The OpenNMT Authors.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # nmt-wizard-docker
2 | 
3 | The goal of this project is to encapsulate MT frameworks in Docker containers and expose a single interface for preprocessing, training, translating, and serving models. The project targets advanced users who are looking to automate the production and validation of translation models.
4 | 
5 | The [available Docker images](https://hub.docker.com/u/nmtwizard) extend the original frameworks with the following features:
6 | 
7 | * Data weighting, sampling, and preprocessing from raw training files.
8 | * File synchronization from and to remote storages such as Amazon S3, Swift, or any server reachable via SSH.
9 | * Metadata on model history such as the parent model, the training data that was used, the training time, etc.
10 | * Regular HTTP requests to a URL to report the running status.
11 | 
12 | It supports the [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py/) and [OpenNMT-tf](https://github.com/OpenNMT/OpenNMT-tf/) training frameworks, and provides translate-only frameworks using the online translation APIs of DeepL, Google, Baidu, and others.
13 | 
14 | ## Usage
15 | 
16 | We recommend using the Docker images that are available on [Docker Hub](https://hub.docker.com/u/nmtwizard).
17 | 
18 | The Docker image entrypoint is a Python script that exposes the same command line interface for all frameworks. For example, run the commands below to download the latest OpenNMT-tf image and list the available options:
19 | 
20 | ```bash
21 | docker pull nmtwizard/opennmt-tf
22 | docker run nmtwizard/opennmt-tf -h
23 | ```
24 | 
25 | [JSON configuration files](#configuration) are used to define the training data, the data preprocessing, and framework-specific options (model, hyperparameters, etc.).
26 | 
27 | As an example, let's train an English-German Transformer model with OpenNMT-tf.
28 | 
29 | **1\. Prepare the data.**
30 | 
31 | For simplicity, we assume that the training data is already tokenized and the current directory contains a `data/` subdirectory with the following structure:
32 | 
33 | ```
34 | $ tree data/
35 | .
36 | ├── corpus
37 | │   ├── train.de
38 | │   └── train.en
39 | └── vocab
40 |     └── shared-vocab.txt
41 | ```
42 | 
43 | where:
44 | 
45 | * `train.de` and `train.en` are tokenized training files
46 | * `shared-vocab.txt` contains one token per line and no special tokens
47 | 
48 | **2\. Define the configuration.**
49 | 
50 | The JSON configuration file is used to describe where to read the data (`data`) and how to transform it (`preprocess`). The `options` block is **specific to each framework** and defines the model and training hyperparameters. See [Configuration](#configuration) for more details.
51 | 
52 | ```json
53 | {
54 | "source": "en",
55 | "target": "de",
56 | "data": {
57 | "sample_dist": [
58 | {
59 | "path": "/data/corpus",
60 | "distribution": [
61 | ["train", "*"]
62 | ]
63 | }
64 | ]
65 | },
66 | "preprocess": [
67 | {
68 | "op": "tokenization",
69 | "source": {
70 | "mode": "space"
71 | },
72 | "target": {
73 | "mode": "space"
74 | }
75 | }
76 | ],
77 | "vocabulary": {
78 | "source": {
79 | "path": "/data/vocab/shared-vocab.txt"
80 | },
81 | "target": {
82 | "path": "/data/vocab/shared-vocab.txt"
83 | }
84 | },
85 | "options": {
86 | "model_type": "Transformer",
87 | "auto_config": true
88 | }
89 | }
90 | ```
91 | 
92 | **3\. Train the model.**
93 | 
94 | The `train` command is used to start the training:
95 | 
96 | ```bash
97 | cat config.json | docker run -i --gpus all \
98 | -v $PWD/data:/data -v $PWD/models:/models nmtwizard/opennmt-tf \
99 | --model_storage /models --task_id my_model_1 --config - train
100 | ```
101 | 
102 | This command runs the training for one epoch and produces the model `models/my_model_1`. The model contains the latest checkpoint, the JSON configuration, and all required resources (such as the vocabulary files).
103 | 
104 | You can run the next epoch by passing the model name as argument:
105 | 
106 | ```bash
107 | docker run --gpus all -v $PWD/data:/data -v $PWD/models:/models nmtwizard/opennmt-tf \
108 | --model_storage /models --task_id my_model_2 --model my_model_1 train
109 | ```
110 | 
111 | Alternatively, you can run the full training in one command by setting the training steps option of the selected framework.
112 | 
113 | **4\. Translate test files.**
114 | 
115 | Once a model is saved, you can start translating files with the `trans` command:
116 | 
117 | ```bash
118 | docker run --gpus all -v $PWD/data:/data -v $PWD/models:/models nmtwizard/opennmt-tf \
119 | --model_storage /models --model my_model_2 trans -i /data/test.en -o /data/output.de
120 | ```
121 | 
122 | This command translates `data/test.en` and saves the result to `data/output.de`.
123 | 
124 | **5\. Serve the model.**
125 | 
126 | At some point, you may want to turn your trained model into a translation service with the `serve` command:
127 | 
128 | ```bash
129 | docker run -p 4000:4000 --gpus all -v $PWD/models:/models nmtwizard/opennmt-tf \
130 | --model_storage /models --model my_model_2 serve --host 0.0.0.0 --port 4000
131 | ```
132 | 
133 | This command starts a translation server that accepts HTTP requests:
134 | 
135 | ```bash
136 | curl -X POST http://localhost:4000/translate -d '{"src":[{"text": "Hello world !"}]}'
137 | ```
138 | 
139 | See the [REST translation API](docs/rest_api.md) for more details.
140 | 
141 | To optimize the model size and loading latency, you can also `release` the model before serving. This removes training-only information and possibly runs additional optimizations:
142 | 
143 | ```bash
144 | docker run -v $PWD/models:/models nmtwizard/opennmt-tf \
145 | --model_storage /models --model my_model_2 release
146 | ```
147 | 
148 | This command produces the serving-only model `models/my_model_2_release`.
149 | 
150 | ## Remote storages
151 | 
152 | Files and directories can be automatically downloaded from remote storages such as Amazon S3, Swift, or any server reachable via SSH. This includes training data, models, and resources used in the configuration.
153 | 
154 | Remote storages should be configured in a JSON file and passed to the `--storage_config` command line option:
155 | 
156 | ```json
157 | {
158 | "storage_id_1": {
159 | "type": "s3",
160 | "bucket": "model-catalog",
161 | "aws_credentials": {
162 | "access_key_id": "...",
163 | "secret_access_key": "...",
164 | "region_name": "..."
165 | }
166 | },
167 | "storage_id_2": {
168 | "type": "ssh",
169 | "server": "my-server.com",
170 | "basedir": "myrepo",
171 | "user": "root",
172 | "password": "root"
173 | }
174 | }
175 | ```
176 | 
177 | *See the supported services and their parameters in [SYSTRAN/storages](https://github.com/SYSTRAN/storages).*
178 | 
179 | Paths on the command line and in the configuration can reference remote paths with the syntax `<storage_id>:<path>`, where:
180 | 
181 | * `<storage_id>` is the storage identifier in the JSON file above (e.g. `storage_id_1`)
182 | * `<path>` is a file path on the remote storage
183 | 
184 | Files will be downloaded to the `/root/workspace/shared` directory within the Docker image. To minimize the download cost, it is possible to mount this directory when running the Docker image. Future runs will reuse the local file if the remote file has not changed.
185 | 
186 | ## Configuration
187 | 
188 | The JSON configuration file contains all parameters necessary to train and run models.
It has the following general structure:
189 | 
190 | ```json
191 | {
192 | "source": "string",
193 | "target": "string",
194 | "data": {},
195 | "preprocess": [
196 | {
197 | "op": "tokenization",
198 | "source": {
199 | "mode": "string"
200 | },
201 | "target": {
202 | "mode": "string"
203 | }
204 | }
205 | ],
206 | "vocabulary": {
207 | "source": {
208 | "path": "string"
209 | },
210 | "target": {
211 | "path": "string"
212 | }
213 | },
214 | "options": {},
215 | "serving": {}
216 | }
217 | ```
218 | 
219 | ### Description
220 | 
221 | #### `source` and `target`
222 | 
223 | These fields define the source and target languages (e.g. "en", "de", etc.).
224 | 
225 | #### `data`
226 | 
227 | The `data` section of the JSON configuration can be used to select data based on file patterns. The distribution is a list where each element contains:
228 | 
229 | * `path`: the path to a directory where the distribution applies
230 | * `distribution`: a list of filename patterns and weights
231 | 
232 | For example, the configuration below will randomly select 10,000 training examples in the directory `data/en_de/train` from files that have `News`, `IT`, or `Dialog` in their name:
233 | 
234 | 
235 | ```json
236 | {
237 | "data": {
238 | "sample": 10000,
239 | "sample_dist": [
240 | {
241 | "path": "/data/en_de/train",
242 | "distribution": [
243 | ["News", 0.7],
244 | ["IT", 0.3],
245 | ["Dialog", 0.5]
246 | ]
247 | }
248 | ]
249 | }
250 | }
251 | ```
252 | 
253 | Weights define the relative proportion that each pattern should take in the final sampling. They do not need to sum to 1. The special weight `"*"` can be used to force using all the examples associated with the pattern.
254 | 
255 | **Source and target files should have the same name and be suffixed by the language code.**
256 | 
257 | #### `preprocess`
258 | 
259 | This block applies preprocessing operations, such as tokenization, to the data.
260 | 
261 | The tokenization operator accepts any tokenization options from [OpenNMT/Tokenizer](https://github.com/OpenNMT/Tokenizer/blob/master/docs/options.md).
262 | 
263 | #### `vocabulary`
264 | 
265 | This block contains the vocabularies used by the translation framework.
266 | 
267 | The vocabulary file must have the following format:
268 | 
269 | * one token per line
270 | * no special tokens (such as `<s>`, `</s>`, `<unk>`, `<blank>`, etc.)
271 | 
272 | We plan to add a `buildvocab` command to automatically generate it from the data.
273 | 
274 | #### `options`
275 | 
276 | This block contains the parameters that are **specific to the selected framework**: the model architecture, the training parameters, etc.
277 | 
278 | See the file `frameworks/<framework_name>/README.md` of the selected framework for more information.
279 | 
280 | #### `serving`
281 | 
282 | Serving reads additional values from the JSON configuration:
283 | 
284 | ```json
285 | {
286 | "serving": {
287 | "max_batch_size": 64
288 | }
289 | }
290 | ```
291 | 
292 | where:
293 | 
294 | * `max_batch_size` is the maximum batch size to execute at once
295 | 
296 | These values can be overridden for [each request](docs/rest_api.md).
297 | 
298 | ### Overriding the model configuration
299 | 
300 | When a model is set on the command line with `--model`, its configuration will be used. You can pass a partial configuration to `--config` in order to override some fields from the model configuration.
301 | 
302 | ### Environment variables
303 | 
304 | Values in the configuration can use environment variables with the syntax `${VARIABLE_NAME}`.
305 | 
306 | This is especially useful to avoid hardcoding a remote storage identifier in the configuration. For example, one can define a data path as `${DATA_DIR}/en_de/corpus` in the configuration and then configure the storage identifier when running the Docker image with `-e DATA_DIR=storage_id_1:`.
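For instance, here is a minimal sketch of this pattern. The remote directory name `datasets` and the file `storages.json` are illustrative, and the storage configuration file is assumed to be available to the container. The configuration references the variable:

```json
{
    "data": {
        "sample_dist": [
            {
                "path": "${DATA_DIR}/en_de/train",
                "distribution": [["train", "*"]]
            }
        ]
    }
}
```

and the variable is resolved when starting the container:

```bash
cat config.json | docker run -i -e DATA_DIR=storage_id_1:datasets \
  -v $PWD/models:/models nmtwizard/opennmt-tf \
  --storage_config storages.json --model_storage /models \
  --task_id my_model_1 --config - train
```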
307 | 
308 | ## Add or extend frameworks
309 | 
310 | This repository consists of a Python module `nmtwizard` that implements the shared interface and extended features mentioned above. Each framework should then:
311 | 
312 | * extend the `nmtwizard.Framework` class and implement the logic that is specific to the framework
313 | * define a Python entrypoint script that invokes `Framework.run()`
314 | * define a `Dockerfile` that includes the framework, the `nmtwizard` module, and all dependencies.
315 | 
316 | Advanced users can extend existing frameworks to implement custom behavior.
317 | 
318 | ## Run tests
319 | 
320 | ```bash
321 | # Install test requirements:
322 | pip install -e .[tests]
323 | 
324 | # Run unit tests:
325 | pytest test/
326 | 
327 | # Automatically reformat code:
328 | black .
329 | 
330 | # Check code for errors:
331 | flake8 .
332 | ```
333 | 
-------------------------------------------------------------------------------- /docs/inference_options.md: --------------------------------------------------------------------------------
1 | ## Inference options
2 | 
3 | Custom frameworks can declare options they accept during inference. Suppose a translation model was trained to support different domains based on an input tag: the inference options mechanism can be used to pass this information at serving time.
4 | 
5 | 1\. The mechanism expects two components to be configured in the global model configuration:
6 | 
7 | * a [JSON Schema](https://json-schema.org/) describing the accepted options and the value constraints
8 | * a mapping between inference options and configuration fields using a path-like representation
9 | 
10 | ```json
11 | {
12 | "inference_options": {
13 | "json_schema": {
14 | "type": "object",
15 | "title": "Domain",
16 | "description": "Domain to use for the translation",
17 | "properties": {
18 | "domain": {
19 | "type": "string",
20 | "title": "Domain",
21 | "enum": ["IT", "News", "Medical"]
22 | }
23 | }
24 | },
25 | "options": [
26 | {
27 | "option_path": "domain",
28 | "config_path": "preprocess/domain/value"
29 | }
30 | ]
31 | },
32 | "preprocess": {
33 | "domain": {
34 | "some fields used during training": {}
35 | }
36 | },
37 | "other_section": {...}
38 | }
39 | ```
40 | 
41 | 2\. When releasing the model, the `inference_options` configuration is validated for correctness. To simplify the integration with external tools, the JSON Schema is also exported to the file `options.json` in the model directory.
42 | 
43 | 3\. During inference, the options can be passed in the request:
44 | 
45 | ```json
46 | {
47 | "src": [{
48 | "text": "Source sentence 1",
49 | "options": {"domain": "IT"}
50 | }]
51 | }
52 | ```
53 | 
54 | They will be validated against the schema, and the values will be injected into the global configuration:
55 | 
56 | ```json
57 | {
58 | "inference_options": {...},
59 | "preprocess": {
60 | "domain": {
61 | "value": "IT",
62 | "some fields used during training": {}
63 | }
64 | },
65 | "other_section": {...}
66 | }
67 | ```
68 | 
69 | It is up to the custom preprocessing module to correctly use the injected `value` field during inference. If the field has not been marked as required in the JSON Schema, the module should also work without this runtime value (e.g. it should apply a default value).
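As an illustration, here is a minimal sketch of how a preprocessing step could consume the injected field. Everything below is hypothetical (the function name and the tagging scheme are not part of `nmtwizard`); it only shows the pattern of reading `preprocess/domain/value` and falling back to a default when the option was not sent:

```python
def apply_domain_tag(source_tokens, preprocess_config):
    """Prepend a domain tag to the tokenized source sentence."""
    domain_config = preprocess_config.get("domain", {})
    # "value" only exists when the request carried {"options": {"domain": ...}}
    # and the mapping above injected it into the configuration.
    domain = domain_config.get("value")
    if domain is None:
        # Option not marked as required: work without the runtime value.
        return source_tokens
    return ["<%s>" % domain.lower()] + source_tokens
```

With the request above, `["Source", "sentence", "1"]` would become `["<it>", "Source", "sentence", "1"]` before translation.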
70 | 
-------------------------------------------------------------------------------- /docs/rest_api.md: --------------------------------------------------------------------------------
1 | # REST translation API
2 | 
3 | ### `GET /status`
4 | 
5 | **Output:**
6 | 
7 | Status 200 if the model is ready to run translations.
8 | 
9 | ### `POST /translate`
10 | 
11 | **Input (minimum required):**
12 | 
13 | ```json
14 | {
15 | "src": [
16 | {"text": "Source sentence 1"},
17 | {"text": "Source sentence 2"}
18 | ]
19 | }
20 | ```
21 | 
22 | **Input (with optional fields):**
23 | 
24 | ```json
25 | {
26 | "options": {
27 | "max_batch_size": 32,
28 | "config": {}
29 | },
30 | "src": [
31 | {"text": "Source sentence 1", "config": {}, "options": {}},
32 | {"text": "Source sentence 2", "config": {}, "options": {}}
33 | ]
34 | }
35 | ```
36 | 
37 | * The `config` fields define request-specific and sentence-specific overrides to the global JSON configuration file.
38 | * The `options` fields (in `src`) define [inference options](inference_options.md) to be mapped to the global configuration file.
39 | 
40 | **Output:**
41 | 
42 | ```json
43 | {
44 | "tgt": [
45 | [{
46 | "text": "Phrase cible 1",
47 | "score": -2.16,
48 | "align": [
49 | {"tgt": [ {"range": [0, 5], "id": 0} ],
50 | "src": [ {"range": [9, 14], "id": 1} ]},
51 | {"tgt": [ {"range": [7, 11], "id": 1} ],
52 | "src": [ {"range": [0, 5], "id": 0} ]},
53 | {"tgt": [ {"range": [13, 13], "id": 2} ],
54 | "src": [ {"range": [16, 16], "id": 2} ]}
55 | ]
56 | }],
57 | [{
58 | "text": "Phrase cible 2",
59 | "score": -2.17,
60 | "align": [
61 | {"tgt": [ {"range": [0, 5], "id": 0} ],
62 | "src": [ {"range": [9, 14], "id": 1} ]},
63 | {"tgt": [ {"range": [7, 11], "id": 1} ],
64 | "src": [ {"range": [0, 5], "id": 0} ]},
65 | {"tgt": [ {"range": [13, 13], "id": 2} ],
66 | "src": [ {"range": [16, 16], "id": 2} ]}
67 | ]
68 | }]
69 | ]
70 | }
71 | ```
72 | 
73 | The `tgt` field is a list with one entry per batch example. Each entry is itself a list of hypotheses (the N-best list) ordered from best to worst (a higher score means a better prediction).
74 | 
75 | Note that the `score` and `align` fields might not be set by all frameworks and model types.
76 | 
77 | **Errors:**
78 | 
79 | * **HTTP 400**
80 | * The input data is missing.
81 | * The input data is not a JSON object.
82 | * The input data does not contain the `src` field.
83 | * The `src` field is not a list.
84 | * An inference option is unexpected or invalid.
85 | * **HTTP 500**
86 | * Internal server exception.
87 | * **HTTP 503**
88 | * The backend service is unavailable.
89 | * **HTTP 504**
90 | * The translation request timed out.
91 | 
92 | ### `POST /unload_model`
93 | 
94 | Unload the model from the reserved resource. In its simplest form, this route will terminate the backend translation service.
95 | 
96 | ### `POST /reload_model`
97 | 
98 | Reload the model on the reserved resource. In its simplest form, this route will terminate the backend translation service if it is still running and start a new instance.
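To show the request/response cycle end to end, here is a minimal client sketch (assuming a server started with `serve --host 0.0.0.0 --port 4000` as in the README; the function name and the reduced error handling are illustrative):

```python
import requests

def translate(sentences, host="localhost", port=4000, timeout=60):
    """Send a batch of sentences to /translate and return the best hypothesis for each."""
    response = requests.post(
        "http://%s:%d/translate" % (host, port),
        json={"src": [{"text": sentence} for sentence in sentences]},
        timeout=timeout,
    )
    response.raise_for_status()  # Surfaces the HTTP 400/500/503/504 errors listed above.
    # "tgt" has one N-best list per input sentence, ordered from best to worst.
    return [nbest[0]["text"] for nbest in response.json()["tgt"]]

print(translate(["Hello world !"]))
```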
99 | -------------------------------------------------------------------------------- /frameworks/baidu_translate/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | WORKDIR /root 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y --no-install-recommends \ 7 | python3-pip \ 8 | && \ 9 | apt-get clean && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | ADD requirements.txt /root 13 | RUN python3 -m pip --no-cache-dir install -r /root/requirements.txt 14 | 15 | ADD frameworks/baidu_translate/entrypoint.py /root 16 | ADD nmtwizard /root/nmtwizard 17 | 18 | ENTRYPOINT ["python3", "entrypoint.py"] 19 | -------------------------------------------------------------------------------- /frameworks/baidu_translate/README.md: -------------------------------------------------------------------------------- 1 | # Baidu Translate framework 2 | 3 | This a translate-only framework using the [Baidu translation API](https://fanyi-api.baidu.com/api/trans/product/index). 4 | 5 | Credentials should be configured with the following environment variables: 6 | 7 | * `BAIDU_APPID` 8 | * `BAIDU_KEY` 9 | -------------------------------------------------------------------------------- /frameworks/baidu_translate/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | import random 4 | import requests 5 | 6 | from nmtwizard.cloud_translation_framework import CloudTranslationFramework 7 | 8 | 9 | baidu_lang_dict_map = { 10 | "ar": "ara", 11 | "bg": "bul", 12 | "zh": "zh", 13 | "cs": "cs", 14 | "da": "dan", 15 | "nl": "nl", 16 | "en": "en", 17 | "et": "est", 18 | "fi": "fin", 19 | "fr": "fra", 20 | "de": "de", 21 | "el": "el", 22 | "hu": "hu", 23 | "it": "it", 24 | "ja": "jp", 25 | "ko": "kor", 26 | "pl": "pl", 27 | "pt": "pt", 28 | "ro": "rom", 29 | "ru": "ru", 30 | "sl": "slo", 31 | "es": "spa", 32 | "sv": "swe", 33 | "th": "th", 34 | "zt": "cht", 35 | "vi": "vie", 36 | } 37 | 38 | 39 | class BaiduTranslateFramework(CloudTranslationFramework): 40 | def __init__(self): 41 | super(BaiduTranslateFramework, self).__init__() 42 | self._appid = os.getenv("BAIDU_APPID") 43 | self._key = os.getenv("BAIDU_KEY") 44 | if self._appid is None: 45 | raise ValueError("missing app id") 46 | if self._key is None: 47 | raise ValueError("missing key") 48 | 49 | def translate_batch(self, batch, source_lang, target_lang): 50 | query = "\n".join(batch) 51 | salt = str(random.randint(10000, 99999)) 52 | sign = self._appid + query + salt + self._key 53 | m1 = hashlib.md5() 54 | m1.update(sign.encode("utf-8")) 55 | sign = m1.hexdigest() 56 | 57 | params = { 58 | "appid": self._appid, 59 | "q": query, 60 | "from": baidu_lang_dict_map[source_lang.lower()], 61 | "to": baidu_lang_dict_map[target_lang.lower()], 62 | "salt": salt, 63 | "sign": sign, 64 | } 65 | 66 | url = "http://api.fanyi.baidu.com/api/trans/vip/translate" 67 | result = self.send_request(lambda: requests.get(url, params=params)) 68 | for trans in result["trans_result"]: 69 | yield trans["dst"] 70 | 71 | def supported_languages(self): 72 | return [ 73 | "ar", 74 | "bg", 75 | "cs", 76 | "da", 77 | "de", 78 | "el", 79 | "en", 80 | "es", 81 | "et", 82 | "fi", 83 | "fr", 84 | "hu", 85 | "it", 86 | "ja", 87 | "ko", 88 | "nl", 89 | "pl", 90 | "pt", 91 | "ro", 92 | "ru", 93 | "sl", 94 | "sv", 95 | "th", 96 | "vi", 97 | "zh", 98 | "zt", 99 | ] 100 | 101 | 102 | if __name__ == "__main__": 103 | BaiduTranslateFramework().run() 104 | 
-------------------------------------------------------------------------------- /frameworks/deepl_translate/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | WORKDIR /root 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y --no-install-recommends \ 7 | python3-pip \ 8 | && \ 9 | apt-get clean && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | ADD requirements.txt /root 13 | RUN python3 -m pip --no-cache-dir install -r /root/requirements.txt 14 | 15 | ADD frameworks/deepl_translate/entrypoint.py /root 16 | ADD nmtwizard /root/nmtwizard 17 | 18 | ENTRYPOINT ["python3", "entrypoint.py"] 19 | -------------------------------------------------------------------------------- /frameworks/deepl_translate/README.md: -------------------------------------------------------------------------------- 1 | # DeepL Translate framework 2 | 3 | This a translate-only framework using the [DeepL translation API](https://www.deepl.com/pro.html). 4 | 5 | Credentials should be configured with the environment variable `DEEPL_CREDENTIALS`. 6 | -------------------------------------------------------------------------------- /frameworks/deepl_translate/entrypoint.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | 4 | from nmtwizard.cloud_translation_framework import CloudTranslationFramework 5 | 6 | 7 | class DeepLTranslateFramework(CloudTranslationFramework): 8 | def __init__(self): 9 | super(DeepLTranslateFramework, self).__init__() 10 | self._credentials = os.getenv("DEEPL_CREDENTIALS") 11 | if self._credentials is None: 12 | raise ValueError("missing credentials") 13 | 14 | def translate_batch(self, batch, source_lang, target_lang): 15 | params = { 16 | "text": batch, 17 | "source_lang": source_lang.upper(), 18 | "target_lang": target_lang.upper(), 19 | "split_sentences": 0, 20 | "auth_key": self._credentials, 21 | } 22 | 23 | url = "https://api.deepl.com/v2/translate" 24 | result = self.send_request(lambda: requests.post(url, data=params)) 25 | for trans in result["translations"]: 26 | yield trans["text"] 27 | 28 | def supported_languages(self): 29 | return ["en", "de", "fr", "es", "pt", "it", "nl", "pl", "ru"] 30 | 31 | 32 | if __name__ == "__main__": 33 | DeepLTranslateFramework().run() 34 | -------------------------------------------------------------------------------- /frameworks/google_translate/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | WORKDIR /root 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y --no-install-recommends \ 7 | python3-pip \ 8 | && \ 9 | apt-get clean && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | ADD requirements.txt /root/base_requirements.txt 13 | ADD frameworks/google_translate/requirements.txt /root 14 | RUN python3 -m pip --no-cache-dir install -r /root/requirements.txt -r /root/base_requirements.txt 15 | 16 | ENV PYTHONWARNINGS="ignore" 17 | 18 | ADD frameworks/google_translate/entrypoint.py /root 19 | ADD nmtwizard /root/nmtwizard 20 | 21 | ENTRYPOINT ["python3", "entrypoint.py"] 22 | -------------------------------------------------------------------------------- /frameworks/google_translate/README.md: -------------------------------------------------------------------------------- 1 | # Google Translate framework 2 | 3 | This a translate-only framework using the [Google Translate API](https://cloud.google.com/translate/docs/quickstart). 
4 | 5 | Credentials should be configured with the environment variable `GOOGLE_APPLICATION_CREDENTIALS`, which defines either: 6 | 7 | * the path to the JSON credential file 8 | * **or** the content of the JSON credential file 9 | -------------------------------------------------------------------------------- /frameworks/google_translate/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | 5 | from google.cloud import translate_v2 as translate 6 | from google.oauth2 import service_account 7 | 8 | from nmtwizard.cloud_translation_framework import CloudTranslationFramework 9 | from nmtwizard.logger import get_logger 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | class GoogleTranslateFramework(CloudTranslationFramework): 15 | def __init__(self): 16 | super(GoogleTranslateFramework, self).__init__() 17 | credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") 18 | if credentials is None: 19 | raise ValueError("missing credentials") 20 | if credentials.startswith("{"): 21 | credentials = service_account.Credentials.from_service_account_info( 22 | json.loads(credentials) 23 | ) 24 | else: 25 | credentials = None 26 | self._client = translate.Client(credentials=credentials) 27 | self.max_retry = 5 28 | self._GOOGLE_LIMIT_TIME = 100 29 | 30 | def translate_batch(self, batch, source_lang, target_lang): 31 | translation = None 32 | retry = 0 33 | while retry < self.max_retry: 34 | try: 35 | translation = self._client.translate( 36 | batch, 37 | source_language=source_lang, 38 | target_language=target_lang, 39 | format_="text", 40 | ) 41 | except Exception as e: 42 | if e.code == 403 and "User Rate Limit Exceeded" in e.message: 43 | logger.warning( 44 | "Exceeding the Google API limit, retrying in %d seconds ..." 45 | % self._GOOGLE_LIMIT_TIME 46 | ) 47 | time.sleep(self._GOOGLE_LIMIT_TIME) 48 | retry += 1 49 | continue 50 | else: 51 | raise 52 | break 53 | 54 | for trans in translation: 55 | yield trans["translatedText"] 56 | 57 | 58 | if __name__ == "__main__": 59 | GoogleTranslateFramework().run() 60 | -------------------------------------------------------------------------------- /frameworks/google_translate/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-translate==3.* 2 | -------------------------------------------------------------------------------- /frameworks/naver_translate/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | WORKDIR /root 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y --no-install-recommends \ 7 | python3-pip \ 8 | && \ 9 | apt-get clean && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | ADD requirements.txt /root 13 | RUN python3 -m pip --no-cache-dir install -r /root/requirements.txt 14 | 15 | ADD frameworks/naver_translate/entrypoint.py /root 16 | ADD nmtwizard /root/nmtwizard 17 | 18 | ENTRYPOINT ["python3", "entrypoint.py"] 19 | -------------------------------------------------------------------------------- /frameworks/naver_translate/README.md: -------------------------------------------------------------------------------- 1 | # Naver Translate framework 2 | 3 | This a translate-only framework using the [Naver translation API](https://developers.naver.com/docs/nmt/reference/). 
4 | 5 | Credentials should be configured with the following environment variables: 6 | 7 | * `NAVER_CLIENT_ID` 8 | * `NAVER_SECRET` 9 | -------------------------------------------------------------------------------- /frameworks/naver_translate/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | from nmtwizard.cloud_translation_framework import CloudTranslationFramework 5 | 6 | 7 | naver_lang_dict_map = { 8 | "zh": "zh-CN", 9 | "en": "en", 10 | "fr": "fr", 11 | "id": "id", 12 | "ko": "ko", 13 | "es": "es", 14 | "th": "th", 15 | "zt": "zh-TW", 16 | "vi": "vi", 17 | } 18 | 19 | 20 | class NaverTranslateFramework(CloudTranslationFramework): 21 | def __init__(self): 22 | super(NaverTranslateFramework, self).__init__() 23 | self._appid = os.getenv("NAVER_CLIENT_ID") 24 | self._key = os.getenv("NAVER_SECRET") 25 | if self._appid is None: 26 | raise ValueError("missing app id") 27 | if self._key is None: 28 | raise ValueError("missing key") 29 | 30 | def translate_batch(self, batch, source_lang, target_lang): 31 | url = "https://naveropenapi.apigw.ntruss.com/nmt/v1/translation" 32 | data = { 33 | "source": naver_lang_dict_map[source_lang.lower()], 34 | "target": naver_lang_dict_map[target_lang.lower()], 35 | "text": "\n".join(batch), 36 | } 37 | headers = { 38 | "X-NCP-APIGW-API-KEY-ID": self._appid, 39 | "X-NCP-APIGW-API-KEY": self._key, 40 | } 41 | 42 | result = self.send_request( 43 | lambda: requests.post(url, data=data, headers=headers) 44 | ) 45 | yield result["message"]["result"]["translatedText"] 46 | 47 | def supported_languages(self): 48 | return ["zh", "en", "fr", "id", "ko", "es", "th", "zt", "vi"] 49 | 50 | 51 | if __name__ == "__main__": 52 | NaverTranslateFramework().run() 53 | -------------------------------------------------------------------------------- /frameworks/opennmt_py/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 2 | 3 | WORKDIR /root 4 | 5 | ENV PYTHONDONTWRITEBYTECODE=1 6 | ENV LANG=C.UTF-8 7 | 8 | RUN apt-get update && \ 9 | apt-get install -y --no-install-recommends \ 10 | python3-pip \ 11 | && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | ADD requirements.txt /root/base_requirements.txt 16 | ADD frameworks/opennmt_py/requirements.txt /root 17 | RUN python3 -m pip --no-cache-dir install -r /root/base_requirements.txt -r /root/requirements.txt 18 | 19 | ADD frameworks/opennmt_py/entrypoint.py /root 20 | ADD nmtwizard /root/nmtwizard 21 | 22 | ENTRYPOINT ["python3", "entrypoint.py"] 23 | -------------------------------------------------------------------------------- /frameworks/opennmt_py/README.md: -------------------------------------------------------------------------------- 1 | # OpenNMT-py framework 2 | 3 | This framework is based on [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py/). 4 | 5 | [Preprocessing](https://opennmt.net/OpenNMT-py/options/preprocess.html), [training](https://opennmt.net/OpenNMT-py/options/train.html), and [translation](https://opennmt.net/OpenNMT-py/options/translate.html) options specific to OpenNMT-py can be configured in the `options` block of the configuration. 
6 | 7 | Example: 8 | 9 | ```json 10 | { 11 | "options": { 12 | "config": { 13 | "preprocess": { 14 | }, 15 | "train": { 16 | "batch_size": 64, 17 | "optim": "sgd", 18 | "dropout": 0.3, 19 | "learning_rate": 1.0, 20 | "src_word_vec_size": 512, 21 | "tgt_word_vec_size": 512, 22 | "encoder_type": "rnn", 23 | "decoder_type": "rnn", 24 | "layers": 2, 25 | "enc_layers": 2, 26 | "dec_layers": 2, 27 | "rnn_size": 512 28 | }, 29 | "trans": { 30 | "replace_unk": true 31 | } 32 | } 33 | } 34 | } 35 | ``` 36 | -------------------------------------------------------------------------------- /frameworks/opennmt_py/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import requests 4 | 5 | from nmtwizard.framework import Framework 6 | from nmtwizard.logger import get_logger 7 | from nmtwizard import utils, serving 8 | 9 | logger = get_logger(__name__) 10 | 11 | _MODEL_NAME = "model.pt" 12 | _RELEASED_MODEL_NAME = "model_released.pt" 13 | 14 | 15 | class OpenNMTPYFramework(Framework): 16 | def train( 17 | self, 18 | config, 19 | src_file, 20 | tgt_file, 21 | src_vocab_info, 22 | tgt_vocab_info, 23 | align_file=None, 24 | example_weights_file=None, 25 | model_path=None, 26 | gpuid=0, 27 | models_to_average=None, 28 | ): 29 | if models_to_average: 30 | logger.warning("Checkpoint averaging is not implemented in this framework") 31 | 32 | # Preprocess training files. 33 | options = config["options"].get("config", {}) 34 | options_preprocess = options.get("preprocess", {}).copy() 35 | options_preprocess["src_vocab"] = self._convert_vocab( 36 | config["vocabulary"]["source"]["path"] 37 | ) 38 | options_preprocess["tgt_vocab"] = self._convert_vocab( 39 | config["vocabulary"]["target"]["path"] 40 | ) 41 | bin_file = os.path.join(self._data_dir, "bin") 42 | cmd = [ 43 | "onmt_preprocess", 44 | "-train_src", 45 | src_file, 46 | "-train_tgt", 47 | tgt_file, 48 | "-save_data", 49 | bin_file, 50 | ] 51 | cmd += _build_cmd(options_preprocess) 52 | utils.run_cmd(cmd) 53 | 54 | # Train. 55 | options_train = options.get("train", {}).copy() 56 | if "train_steps" not in options_train: 57 | options_train["single_pass"] = True 58 | options_train["train_steps"] = 0 59 | options_train["save_checkpoint_steps"] = 0 60 | options_train["data"] = bin_file 61 | options_train["save_model"] = self._output_dir + "/model" 62 | if isinstance(gpuid, list): 63 | options_train["world_size"] = len(gpuid) 64 | options_train["gpu_ranks"] = " ".join(str(i - 1) for i in gpuid) 65 | elif gpuid > 0: 66 | options_train["gpu_ranks"] = gpuid - 1 67 | if model_path is not None: 68 | options_train["train_from"] = os.path.join(model_path, _MODEL_NAME) 69 | utils.run_cmd(["onmt_train"] + _build_cmd(options_train)) 70 | 71 | # Select model. 
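# The training is expected to produce exactly one checkpoint in the output
# directory (the single-pass settings above save only one); anything else is
# treated as an error below.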
72 | models = os.listdir(self._output_dir)
73 | if not models:
74 | raise RuntimeError("no model generated by the training")
75 | if len(models) > 1:
76 | raise RuntimeError("more than one model generated by the training")
77 | model_file = os.path.join(self._output_dir, models[0])
78 | return {_MODEL_NAME: model_file}
79 | 
80 | def trans(self, config, model_path, input, output, gpuid=0):
81 | options_trans = _trans_options(config, gpuid)
82 | options_trans["model"] = os.path.join(model_path, _MODEL_NAME)
83 | options_trans["src"] = input
84 | options_trans["output"] = output
85 | utils.run_cmd(["onmt_translate"] + _build_cmd(options_trans))
86 | 
87 | def serve(self, config, model_path, gpuid=0):
88 | server_config_path = os.path.join(self._output_dir, "conf.json")
89 | with open(server_config_path, "w") as server_config_file:
90 | json.dump(
91 | {
92 | "models_root": model_path,
93 | "models": [
94 | {
95 | "id": 0,
96 | "model": _RELEASED_MODEL_NAME,
97 | "opt": _trans_options(config, gpuid),
98 | }
99 | ],
100 | },
101 | server_config_file,
102 | )
103 | port = serving.pick_free_port()
104 | process = utils.run_cmd(
105 | [
106 | "onmt_server",
107 | "--ip",
108 | "127.0.0.1",
109 | "--port",
110 | str(port),
111 | "--url_root",
112 | "/translator-backend",
113 | "--config",
114 | server_config_path,
115 | ],
116 | background=True,
117 | )
118 | return process, {"port": port}
119 | 
120 | def release(self, config, model_path, gpuid=0):
121 | model = os.path.join(model_path, _MODEL_NAME)
122 | released_model = os.path.join(self._output_dir, _RELEASED_MODEL_NAME)
123 | utils.run_cmd(["onmt_release_model", "-m", model, "-o", released_model])
124 | return {_RELEASED_MODEL_NAME: released_model}
125 | 
126 | def forward_request(self, model_info, inputs, outputs=None, options=None):
127 | if options is None:
128 | options = {}
129 | data = [{"src": " ".join(tokens), "id": 0} for tokens in inputs]
130 | try:
131 | result = requests.post(
132 | "http://127.0.0.1:%d/translator-backend/translate" % model_info["port"],
133 | json=data,
134 | timeout=options.get("timeout"),
135 | )
136 | return [
137 | [serving.TranslationOutput(r["tgt"].split(), score=r["pred_score"])]
138 | for r in result.json()[0]
139 | ]
140 | except requests.exceptions.Timeout as e:
141 | logger.error("%s", e)
142 | return None
143 | 
144 | def _map_vocab_entry(self, index, token, vocab):
145 | if index == 0:
146 | vocab.write("<unk>\n")
147 | vocab.write("<blank>\n")
148 | vocab.write("<s>\n")
149 | vocab.write("</s>\n")
150 | vocab.write("%s\n" % token)
151 | 
152 | 
153 | def _trans_options(config, gpuid):
154 | options = config["options"].get("config", {})
155 | opt = options.get("trans", {}).copy()
156 | if gpuid > 0:
157 | opt["gpu"] = gpuid - 1
158 | return opt
159 | 
160 | 
161 | def _build_cmd(options):
162 | opts = []
163 | for k, v in options.items():
164 | if isinstance(v, bool):
165 | if v:
166 | opts.append("-%s" % k)
167 | else:
168 | opts.append("-%s" % k)
169 | opts.append(str(v))
170 | return opts
171 | 
172 | 
173 | if __name__ == "__main__":
174 | OpenNMTPYFramework().run()
175 | 
-------------------------------------------------------------------------------- /frameworks/opennmt_py/requirements.txt: --------------------------------------------------------------------------------
1 | OpenNMT-py==1.2.0
2 | 
-------------------------------------------------------------------------------- /frameworks/opennmt_tf/Dockerfile: --------------------------------------------------------------------------------
1 | FROM
nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04 2 | 3 | WORKDIR /root 4 | 5 | ENV PYTHONDONTWRITEBYTECODE=1 6 | ENV LANG=C.UTF-8 7 | 8 | RUN apt-get update && \ 9 | apt-get install -y --no-install-recommends \ 10 | cuda-minimal-build-11-2 \ 11 | python3-pip \ 12 | && \ 13 | apt-get clean && \ 14 | rm -rf /var/lib/apt/lists/* 15 | 16 | ADD requirements.txt /root/base_requirements.txt 17 | ADD frameworks/opennmt_tf/requirements.txt /root 18 | RUN python3 -m pip --no-cache-dir install -r /root/base_requirements.txt -r /root/requirements.txt 19 | 20 | ADD frameworks/opennmt_tf/entrypoint.py /root 21 | ADD nmtwizard /root/nmtwizard 22 | 23 | ENTRYPOINT ["python3", "entrypoint.py"] 24 | -------------------------------------------------------------------------------- /frameworks/opennmt_tf/README.md: -------------------------------------------------------------------------------- 1 | # OpenNMT-tf framework 2 | 3 | This framework is based on [OpenNMT-tf](https://github.com/OpenNMT/OpenNMT-tf/). 4 | 5 | The `options` block should define the [model](https://opennmt.net/OpenNMT-tf/model.html) and the [run configuration](https://opennmt.net/OpenNMT-tf/configuration.html) (or `auto_config` if it applied). 6 | 7 | Example with `auto_config`: 8 | 9 | ```json 10 | { 11 | "options": { 12 | "model_type": "Transformer", 13 | "auto_config": true 14 | } 15 | } 16 | ``` 17 | 18 | Example using a custom model and configuration: 19 | 20 | ```json 21 | "options": { 22 | "model": "/path/to/model/definition.py", 23 | "config": { 24 | "params": { 25 | "optimizer": "SGD", 26 | "learning_rate": 0.1, 27 | "beam_width": 5 28 | }, 29 | "train": { 30 | "batch_size": 64, 31 | "length_bucket_width": 1, 32 | "maximum_features_length": 50, 33 | "maximum_labels_length": 50 34 | }, 35 | "infer": { 36 | "batch_size": 32 37 | } 38 | } 39 | } 40 | } 41 | ``` 42 | -------------------------------------------------------------------------------- /frameworks/opennmt_tf/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | # Import logger before TensorFlow to register the global config and avoid duplicated logs. 
5 | from nmtwizard.logger import get_logger 6 | 7 | import tensorflow as tf 8 | 9 | import opennmt 10 | import yaml 11 | 12 | from nmtwizard.framework import Framework 13 | from nmtwizard import config as config_util 14 | from nmtwizard import utils 15 | from nmtwizard import serving 16 | 17 | logger = get_logger(__name__) 18 | tf.get_logger().setLevel(logger.level) 19 | 20 | _V1_SAVED_MODEL_DIR = "1" 21 | _SAVED_MODEL_DIR = "saved_model" 22 | 23 | 24 | class OpenNMTTFFramework(Framework): 25 | def train( 26 | self, 27 | config, 28 | src_file, 29 | tgt_file, 30 | src_vocab_info, 31 | tgt_vocab_info, 32 | align_file=None, 33 | example_weights_file=None, 34 | model_path=None, 35 | gpuid=0, 36 | models_to_average=None, 37 | ): 38 | if model_path is None or tf.train.latest_checkpoint(model_path) is None: 39 | prev_src_vocab = None 40 | prev_tgt_vocab = None 41 | else: 42 | prev_src_vocab = src_vocab_info.previous 43 | prev_tgt_vocab = tgt_vocab_info.previous 44 | 45 | runner = self._build_runner( 46 | config, 47 | src_vocab=prev_src_vocab or src_vocab_info.current, 48 | tgt_vocab=prev_tgt_vocab or tgt_vocab_info.current, 49 | src_file=src_file, 50 | tgt_file=tgt_file, 51 | align_file=align_file, 52 | example_weights_file=example_weights_file, 53 | model_path=model_path, 54 | ) 55 | 56 | if prev_src_vocab or prev_tgt_vocab: 57 | previous_model_dir = runner.model_dir 58 | # Update the model vocabulary on CPU to avoid initializing the GPU context 59 | # and allow batch size autotuning to run properly afterwards. 60 | with tf.device("cpu"): 61 | runner.update_vocab( 62 | os.path.join(self._output_dir, "new_vocab_checkpoint"), 63 | src_vocab=src_vocab_info.current if prev_src_vocab else None, 64 | tgt_vocab=tgt_vocab_info.current if prev_tgt_vocab else None, 65 | ) 66 | shutil.rmtree(previous_model_dir) 67 | 68 | output_dir, summary = runner.train( 69 | num_devices=utils.count_devices(gpuid), 70 | return_summary=True, 71 | fallback_to_cpu=not isinstance(gpuid, list) and gpuid == -1, 72 | ) 73 | 74 | if models_to_average: 75 | models_to_average.append(output_dir) 76 | checkpoint_paths = [ 77 | tf.train.latest_checkpoint(path) for path in models_to_average 78 | ] 79 | 80 | with tf.device("cpu"): 81 | average_dir = runner.average_checkpoints( 82 | os.path.join(self._output_dir, "averaged_model"), 83 | max_count=len(checkpoint_paths), 84 | checkpoint_paths=checkpoint_paths, 85 | ) 86 | 87 | shutil.rmtree(output_dir) 88 | output_dir = average_dir 89 | summary["num_averaged_checkpoints"] = len(checkpoint_paths) 90 | 91 | return _list_checkpoint_files(output_dir), summary 92 | 93 | def export(self, config, model_path, output_dir): 94 | options = config["options"] 95 | 96 | # Copy checkpoint files. 97 | for filename, path in _list_checkpoint_files(model_path).items(): 98 | shutil.copy(path, os.path.join(output_dir, filename)) 99 | 100 | # Convert vocabularies. 101 | src_vocab = self._convert_vocab( 102 | config["vocabulary"]["source"]["path"], output_dir=output_dir 103 | ) 104 | tgt_vocab = self._convert_vocab( 105 | config["vocabulary"]["target"]["path"], output_dir=output_dir 106 | ) 107 | 108 | # Save run configuration. 
109 | run_config = { 110 | "auto_config": options.get("auto_config", True), 111 | "data": { 112 | "source_vocabulary": os.path.basename(src_vocab), 113 | "target_vocabulary": os.path.basename(tgt_vocab), 114 | }, 115 | } 116 | run_config = config_util.merge_config(run_config, options.get("config", {})) 117 | config_path = os.path.join(output_dir, "config.yml") 118 | with open(config_path, "w") as config_file: 119 | yaml.dump(run_config, config_file) 120 | 121 | # Save model description. 122 | opennmt.load_model( 123 | output_dir, 124 | model_file=options.get("model"), 125 | model_name=options.get("model_type"), 126 | ) 127 | 128 | def score(self, config, model_path, source, target, output, gpuid=0): 129 | runner = self._build_runner(config, model_path=model_path) 130 | runner.score(source, target, output_file=output) 131 | return utils.ScoreType.CUMULATED_NLL 132 | 133 | def trans(self, config, model_path, input, output, gpuid=0): 134 | runner = self._build_runner(config, model_path=model_path) 135 | runner.infer(input, predictions_file=output) 136 | with_scores = ( 137 | config["options"].get("config", {}).get("infer", {}).get("with_scores") 138 | ) 139 | return utils.ScoreType.CUMULATED_LL if with_scores else None 140 | 141 | def release(self, config, model_path, optimization_level=None, gpuid=0): 142 | export_dir = os.path.join(self._output_dir, _SAVED_MODEL_DIR) 143 | runner = self._build_runner(config, model_path=model_path) 144 | runner.export(export_dir) 145 | return {os.path.basename(export_dir): export_dir} 146 | 147 | def serve(self, config, model_path, gpuid=0): 148 | v1_export_dir = os.path.join(model_path, _V1_SAVED_MODEL_DIR) 149 | if os.path.exists(v1_export_dir): 150 | raise ValueError( 151 | "SavedModel exported with OpenNMT-tf 1.x are no longer supported. " 152 | "They include ops from tf.contrib which is not included in " 153 | "TensorFlow 2.x binaries. To upgrade automatically, you can release " 154 | "or serve from a OpenNMT-tf 1.x training checkpoint." 
155 | )
156 | export_dir = os.path.join(model_path, _SAVED_MODEL_DIR)
157 | translate_fn = tf.saved_model.load(export_dir).signatures["serving_default"]
158 | return None, translate_fn
159 | 
160 | def forward_request(self, model_info, inputs, outputs=None, options=None):
161 | translate_fn = model_info
162 | 
163 | tokens, lengths = utils.pad_lists(inputs, padding_value="")
164 | outputs = translate_fn(
165 | tokens=tf.constant(tokens, dtype=tf.string),
166 | length=tf.constant(lengths, dtype=tf.int32),
167 | )
168 | 
169 | batch_predictions = outputs["tokens"].numpy()
170 | batch_lengths = outputs["length"].numpy()
171 | batch_log_probs = outputs["log_probs"].numpy()
172 | 
173 | batch_outputs = []
174 | for predictions, lengths, log_probs in zip(
175 | batch_predictions, batch_lengths, batch_log_probs
176 | ):
177 | outputs = []
178 | for prediction, length, log_prob in zip(predictions, lengths, log_probs):
179 | prediction = prediction[:length].tolist()
180 | prediction = [token.decode("utf-8") for token in prediction]
181 | score = float(log_prob)
182 | outputs.append(serving.TranslationOutput(prediction, score=score))
183 | batch_outputs.append(outputs)
184 | return batch_outputs
185 | 
186 | def _map_vocab_entry(self, index, token, vocab):
187 | if index == 0:
188 | vocab.write("<blank>\n")
189 | vocab.write("<s>\n")
190 | vocab.write("</s>\n")
191 | vocab.write("%s\n" % token)
192 | 
193 | def _build_runner(
194 | self,
195 | config,
196 | src_vocab=None,
197 | tgt_vocab=None,
198 | src_file=None,
199 | tgt_file=None,
200 | align_file=None,
201 | example_weights_file=None,
202 | model_path=None,
203 | ):
204 | model_dir = os.path.join(self._output_dir, "model")
205 | if os.path.exists(model_dir):
206 | shutil.rmtree(model_dir)
207 | os.makedirs(model_dir)
208 | 
209 | # Copy checkpoint files into the temporary model dir.
210 | if model_path is not None:
211 | checkpoint_files = _list_checkpoint_files(model_path)
212 | for filename, path in checkpoint_files.items():
213 | shutil.copy(path, os.path.join(model_dir, filename))
214 | 
215 | # Prepare vocabulary if not already done.
216 | if src_vocab is None: 217 | src_vocab = self._convert_vocab(config["vocabulary"]["source"]["path"]) 218 | if tgt_vocab is None: 219 | tgt_vocab = self._convert_vocab(config["vocabulary"]["target"]["path"]) 220 | 221 | options = config["options"] 222 | run_config = _build_run_config( 223 | options.get("config"), 224 | model_dir, 225 | src_vocab, 226 | tgt_vocab, 227 | src_file=src_file, 228 | tgt_file=tgt_file, 229 | align_file=align_file, 230 | example_weights_file=example_weights_file, 231 | ) 232 | model = opennmt.load_model( 233 | model_dir, 234 | model_file=options.get("model"), 235 | model_name=options.get("model_type"), 236 | as_builder=True, 237 | ) 238 | return opennmt.Runner( 239 | model, 240 | run_config, 241 | auto_config=options.get("auto_config", True), 242 | mixed_precision=options.get("mixed_precision", True), 243 | ) 244 | 245 | 246 | def _build_run_config( 247 | config, 248 | model_dir, 249 | src_vocab, 250 | tgt_vocab, 251 | src_file=None, 252 | tgt_file=None, 253 | align_file=None, 254 | example_weights_file=None, 255 | ): 256 | """Builds the final configuration for OpenNMT-tf.""" 257 | config = opennmt.convert_to_v2_config(config) if config else {} 258 | config["model_dir"] = model_dir 259 | 260 | data = config.setdefault("data", {}) 261 | data["source_vocabulary"] = src_vocab 262 | data["target_vocabulary"] = tgt_vocab 263 | if src_file is not None: 264 | data["train_features_file"] = src_file 265 | if tgt_file is not None: 266 | data["train_labels_file"] = tgt_file 267 | if align_file is not None and os.path.exists(align_file): 268 | data["train_alignments"] = align_file 269 | params = config.setdefault("params", {}) 270 | params.setdefault("guided_alignment_type", "ce") 271 | if example_weights_file is not None and os.path.exists(example_weights_file): 272 | data["example_weights"] = example_weights_file 273 | 274 | train = config.setdefault("train", {}) 275 | train.setdefault("batch_size_autotune_scale", 0.7) 276 | train.setdefault("sample_buffer_size", -1) 277 | # No need to keep multiple checkpoints as only the last one will be pushed. 278 | train.setdefault("save_checkpoints_steps", None) 279 | if train.setdefault("average_last_checkpoints", 0) == 0: 280 | train["keep_checkpoint_max"] = 1 281 | if train.setdefault("max_step", None) is None: 282 | # Force a single pass if the number of training steps in unspecified. 283 | train["single_pass"] = True 284 | 285 | return config 286 | 287 | 288 | def _list_checkpoint_files(model_dir): 289 | """Lists the checkpoint files that should be bundled in the model package.""" 290 | latest = tf.train.latest_checkpoint(model_dir) 291 | if latest is None: 292 | return {} 293 | objects = { 294 | "checkpoint": os.path.join(model_dir, "checkpoint"), # Checkpoint state file. 
295 | } 296 | for filename in os.listdir(model_dir): 297 | path = os.path.join(model_dir, filename) 298 | if os.path.isfile(path) and path.startswith(latest): 299 | objects[filename] = path 300 | return objects 301 | 302 | 303 | if __name__ == "__main__": 304 | OpenNMTTFFramework().run() 305 | -------------------------------------------------------------------------------- /frameworks/opennmt_tf/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.11.* 2 | tensorflow-probability==0.19.* 3 | OpenNMT-tf==2.31.0 4 | -------------------------------------------------------------------------------- /frameworks/opennmt_tf/test/test.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import os 4 | import sys 5 | 6 | import tensorflow as tf 7 | 8 | current_dir = os.path.dirname(os.path.abspath(__file__)) 9 | test_dir = os.path.join(current_dir, "..", "..", "..", "test") 10 | framework_path = os.path.join(current_dir, "..") 11 | sys.path.insert(0, framework_path) 12 | sys.path.insert(0, test_dir) 13 | 14 | from entrypoint import OpenNMTTFFramework # noqa: E402 15 | from test_framework import _read_config, _run_framework # noqa: E402 16 | 17 | 18 | config_base = { 19 | "source": "en", 20 | "target": "de", 21 | "data": { 22 | "sample": 4, 23 | "sample_dist": [ 24 | { 25 | "path": os.path.join(test_dir, "corpus", "train"), 26 | "distribution": [["europarl", 1]], 27 | } 28 | ], 29 | }, 30 | "preprocess": [ 31 | { 32 | "op": "tokenization", 33 | "source": {"mode": "space"}, 34 | "target": {"mode": "space"}, 35 | } 36 | ], 37 | "vocabulary": { 38 | "source": {"path": "${CORPUS_DIR}/vocab/en-vocab.txt"}, 39 | "target": {"path": "${CORPUS_DIR}/vocab/de-vocab.txt"}, 40 | }, 41 | "options": { 42 | "model_type": "TransformerTiny", 43 | "auto_config": True, 44 | "config": { 45 | "train": { 46 | "batch_size": 2, 47 | "batch_type": "examples", 48 | "effective_batch_size": None, 49 | "length_bucket_width": None, 50 | }, 51 | }, 52 | }, 53 | } 54 | 55 | 56 | def test_train(tmpdir): 57 | sample_size = config_base["data"]["sample"] 58 | batch_size = config_base["options"]["config"]["train"]["batch_size"] 59 | num_iterations = 3 60 | 61 | for iteration in range(num_iterations): 62 | last_iteration = iteration == num_iterations - 1 63 | 64 | command = "train" 65 | if last_iteration: 66 | command += " --average_models %d" % num_iterations 67 | 68 | model_dir, result = _run_framework( 69 | tmpdir, 70 | "model%d" % iteration, 71 | command, 72 | parent="model%d" % (iteration - 1) if iteration > 0 else None, 73 | config=config_base, 74 | framework_fn=OpenNMTTFFramework, 75 | return_output=True, 76 | ) 77 | 78 | assert result["num_sentences"] == sample_size 79 | assert result["num_steps"] == sample_size // batch_size 80 | assert result["last_step"] == result["num_steps"] * (iteration + 1) 81 | if last_iteration: 82 | assert result["num_averaged_checkpoints"] == num_iterations 83 | 84 | config = _read_config(model_dir) 85 | assert "last_learning_rate" in config["build"]["trainingSummary"] 86 | 87 | assert "model_description.py" not in os.listdir(model_dir) 88 | 89 | checkpoint_state = tf.train.get_checkpoint_state(model_dir) 90 | assert len(checkpoint_state.all_model_checkpoint_paths) == 1 91 | 92 | last_checkpoint = tf.train.latest_checkpoint(model_dir) 93 | assert int(last_checkpoint.split("-")[-1]) == result["last_step"] 94 | 95 | 96 | def _copy_lines(src, dst, begin, end): 97 | with 
open(src) as src_file, open(dst, "w") as dst_file: 98 | for line in itertools.islice(src_file, begin, end): 99 | dst_file.write(line) 100 | 101 | 102 | def test_eval(tmpdir): 103 | _run_framework( 104 | tmpdir, 105 | "model0", 106 | "train", 107 | config=config_base, 108 | framework_fn=OpenNMTTFFramework, 109 | ) 110 | 111 | data_dir = os.path.join(test_dir, "corpus", "train") 112 | src_train = os.path.join(data_dir, "europarl-v7.de-en.10K.tok.en") 113 | tgt_train = os.path.join(data_dir, "europarl-v7.de-en.10K.tok.de") 114 | 115 | src_valids = [] 116 | tgt_valids = [] 117 | 118 | for i in range(3): 119 | begin_line = 10 * i 120 | end_line = 10 * (i + 1) 121 | 122 | src_valid = str(tmpdir.join("valid.src.%d" % i)) 123 | tgt_valid = str(tmpdir.join("valid.tgt.%d" % i)) 124 | 125 | _copy_lines(src_train, src_valid, begin_line, end_line) 126 | _copy_lines(tgt_train, tgt_valid, begin_line, end_line) 127 | 128 | src_valids.append(src_valid) 129 | tgt_valids.append(tgt_valid) 130 | 131 | output_path = str(tmpdir.join("output")) 132 | 133 | _run_framework( 134 | tmpdir, 135 | "eval", 136 | "eval -s %s -t %s -o %s" 137 | % (" ".join(src_valids), " ".join(tgt_valids), output_path), 138 | parent="model0", 139 | config=config_base, 140 | framework_fn=OpenNMTTFFramework, 141 | ) 142 | 143 | with open(output_path) as output_file: 144 | result = json.load(output_file) 145 | 146 | assert 0 < result["all"]["loss"] < 10 147 | for tgt_valid in tgt_valids: 148 | assert 0 < result["files"][tgt_valid]["loss"] < 10 149 | -------------------------------------------------------------------------------- /frameworks/sogou_translate/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | WORKDIR /root 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y --no-install-recommends \ 7 | python3-pip \ 8 | && \ 9 | apt-get clean && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | ADD requirements.txt /root 13 | RUN python3 -m pip --no-cache-dir install -r /root/requirements.txt 14 | 15 | ADD frameworks/sogou_translate/entrypoint.py /root 16 | ADD nmtwizard /root/nmtwizard 17 | 18 | ENTRYPOINT ["python3", "entrypoint.py"] 19 | -------------------------------------------------------------------------------- /frameworks/sogou_translate/README.md: -------------------------------------------------------------------------------- 1 | # Sogou Translate framework 2 | 3 | This is a translate-only framework using the [Sogou translation API](http://deepi.sogou.com/fanyi). 
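Requests are signed with an MD5 digest over the concatenation of the application ID, the query text, a random salt, and the secret key (see `entrypoint.py` below). A minimal sketch of the signing step, using placeholder credentials:

    import hashlib

    pid, key = "<SOGOU_PID>", "<SOGOU_KEY>"  # placeholders, not real credentials
    salt, query = "12345", "Hello world"
    sign = hashlib.md5((pid + query + salt + key).encode("utf-8")).hexdigest()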
4 | 5 | Credentials should be configured with the following environment variables: 6 | 7 | * `SOGOU_PID` 8 | * `SOGOU_KEY` 9 | -------------------------------------------------------------------------------- /frameworks/sogou_translate/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import hashlib 4 | import requests 5 | 6 | from nmtwizard.cloud_translation_framework import CloudTranslationFramework 7 | 8 | 9 | sogou_lang_dict_map = { 10 | "ar": "ar", 11 | "bn": "bn", 12 | "bg": "bg", 13 | "zh": "zh-CHS", 14 | "hr": "hr", 15 | "cs": "cs", 16 | "da": "da", 17 | "nl": "nl", 18 | "en": "en", 19 | "et": "et", 20 | "fi": "fil", 21 | "fr": "fr", 22 | "de": "de", 23 | "el": "el", 24 | "he": "he", 25 | "hi": "hi", 26 | "hu": "hu", 27 | "id": "id", 28 | "it": "it", 29 | "ja": "ja", 30 | "ko": "ko", 31 | "lv": "lv", 32 | "lt": "lt", 33 | "ms": "ms", 34 | "no": "no", 35 | "fa": "fa", 36 | "pl": "pl", 37 | "pt": "pt", 38 | "ro": "ro", 39 | "ru": "ru", 40 | "sr": "sr-Cyrl", 41 | "sb": "sr-Latn", 42 | "sk": "sk", 43 | "sl": "sl", 44 | "es": "es", 45 | "sv": "sv", 46 | "th": "th", 47 | "zt": "zh-CHT", 48 | "tr": "tr", 49 | "uk": "uk", 50 | "ur": "ur", 51 | "vi": "vi", 52 | "cy": "cy", 53 | } 54 | 55 | 56 | class SogouTranslateFramework(CloudTranslationFramework): 57 | def __init__(self): 58 | super(SogouTranslateFramework, self).__init__() 59 | self._appid = os.getenv("SOGOU_PID") 60 | self._key = os.getenv("SOGOU_KEY") 61 | if self._appid is None: 62 | raise ValueError("missing pid") 63 | if self._key is None: 64 | raise ValueError("missing key") 65 | 66 | def translate_batch(self, batch, source_lang, target_lang): 67 | query = "\n".join(batch) 68 | salt = str(random.randint(10000, 99999)) 69 | sign = self._appid + query + salt + self._key 70 | sign = hashlib.md5(sign.encode("utf-8")).hexdigest() 71 | 72 | url = "http://fanyi.sogou.com:80/reventondc/api/sogouTranslate" 73 | data = { 74 | "from": sogou_lang_dict_map[source_lang.lower()], 75 | "to": sogou_lang_dict_map[target_lang.lower()], 76 | "pid": self._appid, 77 | "q": query, 78 | "sign": sign, 79 | "salt": salt, 80 | } 81 | headers = { 82 | "content-type": "application/x-www-form-urlencoded", 83 | "accept": "application/json", 84 | } 85 | 86 | result = self.send_request( 87 | lambda: requests.post(url, data=data, headers=headers) 88 | ) 89 | yield result["translation"] 90 | 91 | def supported_languages(self): 92 | return [ 93 | "ar", 94 | "bg", 95 | "bn", 96 | "cs", 97 | "cy", 98 | "da", 99 | "de", 100 | "el", 101 | "en", 102 | "es", 103 | "et", 104 | "fa", 105 | "fi", 106 | "fr", 107 | "he", 108 | "hi", 109 | "hr", 110 | "hu", 111 | "id", 112 | "it", 113 | "ja", 114 | "ko", 115 | "lt", 116 | "lv", 117 | "ms", 118 | "nl", 119 | "no", 120 | "pl", 121 | "pt", 122 | "ro", 123 | "ru", 124 | "sb", 125 | "sk", 126 | "sl", 127 | "sr", 128 | "sv", 129 | "th", 130 | "tr", 131 | "uk", 132 | "ur", 133 | "vi", 134 | "zh", 135 | "zt", 136 | ] 137 | 138 | 139 | if __name__ == "__main__": 140 | SogouTranslateFramework().run() 141 | -------------------------------------------------------------------------------- /frameworks/tencent_translate/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | WORKDIR /root 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y --no-install-recommends \ 7 | python3-pip \ 8 | && \ 9 | apt-get clean && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | ADD requirements.txt /root 13 | RUN 
python3 -m pip --no-cache-dir install -r /root/requirements.txt 14 | 15 | ADD frameworks/tencent_translate/entrypoint.py /root 16 | ADD nmtwizard /root/nmtwizard 17 | 18 | ENTRYPOINT ["python3", "entrypoint.py"] 19 | -------------------------------------------------------------------------------- /frameworks/tencent_translate/README.md: -------------------------------------------------------------------------------- 1 | # Tencent Translate framework 2 | 3 | This is a translate-only framework using the [Tencent translation API](https://cloud.tencent.com/product/tmt). 4 | 5 | Credentials should be configured with the following environment variables: 6 | 7 | * `TENCENT_SecretId` 8 | * `TENCENT_SecretKey` 9 | -------------------------------------------------------------------------------- /frameworks/tencent_translate/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import hashlib 4 | import hmac 5 | import base64 6 | import random 7 | import sys 8 | import binascii 9 | import requests 10 | import urllib.parse 11 | 12 | from nmtwizard.cloud_translation_framework import CloudTranslationFramework 13 | 14 | 15 | class TencentTranslateFramework(CloudTranslationFramework): 16 | def __init__(self): 17 | super(TencentTranslateFramework, self).__init__() 18 | self._appid = os.getenv("TENCENT_SecretId") 19 | self._key = os.getenv("TENCENT_SecretKey") 20 | if self._appid is None: 21 | raise ValueError("missing app id") 22 | if self._key is None: 23 | raise ValueError("missing key") 24 | 25 | def translate_batch(self, batch, source_lang, target_lang): 26 | # The Tencent API does not support translating multiple lines in one request. 27 | for line in batch: 28 | yield self._translate_line(line, source_lang, target_lang) 29 | 30 | def _translate_line(self, line, source_lang, target_lang): 31 | url = "tmt.na-siliconvalley.tencentcloudapi.com" 32 | signature_method = "HmacSHA256" 33 | params = [ 34 | ("Action", "TextTranslate"), 35 | ("Nonce", random.randint(1, sys.maxsize)), 36 | ("ProjectId", 0), 37 | ("Region", "na-siliconvalley"), 38 | ("SecretId", self._appid), 39 | ("SignatureMethod", signature_method), 40 | ("Source", source_lang.lower()), 41 | ("SourceText", line), 42 | ("Target", target_lang.lower()), 43 | ("Timestamp", int(time.time())), 44 | ("Version", "2018-03-21"), 45 | ] 46 | request = "GET%s/?%s" % (url, urllib.parse.urlencode(params)) 47 | params.append( 48 | ("Signature", _sign_request(self._key, request, signature_method)) 49 | ) 50 | headers = { 51 | "content-type": "application/x-www-form-urlencoded", 52 | "accept": "application/json", 53 | } 54 | 55 | result = self.send_request( 56 | lambda: requests.get("https://" + url, params=params, headers=headers) 57 | ) 58 | return result["Response"]["TargetText"] 59 | 60 | def supported_languages(self): 61 | return [ 62 | "de", 63 | "en", 64 | "es", 65 | "fr", 66 | "id", 67 | "it", 68 | "ja", 69 | "ko", 70 | "ms", 71 | "pt", 72 | "ru", 73 | "th", 74 | "tr", 75 | "vi", 76 | "zh", 77 | ] 78 | 79 | 80 | def _sign_request(secretKey, signStr, signMethod): 81 | signStr = bytes(signStr, "utf-8") 82 | secretKey = bytes(secretKey, "utf-8") 83 | 84 | digestmod = None 85 | if signMethod == "HmacSHA256": 86 | digestmod = hashlib.sha256 87 | elif signMethod == "HmacSHA1": 88 | digestmod = hashlib.sha1 89 | else: 90 | raise NotImplementedError( 91 | "signMethod invalid", "signMethod only supports (HmacSHA1, HmacSHA256)" 92 | ) 93 | 94 | hashed = hmac.new(secretKey, signStr, digestmod) 95 | 
return binascii.b2a_base64(hashed.digest())[:-1].decode() 96 | 97 | 98 | if __name__ == "__main__": 99 | TencentTranslateFramework().run() 100 | -------------------------------------------------------------------------------- /frameworks/youdao_translate/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | WORKDIR /root 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y --no-install-recommends \ 7 | python3-pip \ 8 | && \ 9 | apt-get clean && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | ADD requirements.txt /root 13 | RUN python3 -m pip --no-cache-dir install -r /root/requirements.txt 14 | 15 | ADD frameworks/youdao_translate/entrypoint.py /root 16 | ADD nmtwizard /root/nmtwizard 17 | 18 | ENTRYPOINT ["python3", "entrypoint.py"] 19 | -------------------------------------------------------------------------------- /frameworks/youdao_translate/README.md: -------------------------------------------------------------------------------- 1 | # Youdao Translate framework 2 | 3 | This is a translate-only framework using the [Youdao translation API](https://ai.youdao.com/gw.s). 4 | 5 | Credentials should be configured with the following environment variables: 6 | 7 | * `YOUDAO_APPID` 8 | * `YOUDAO_KEY` 9 | -------------------------------------------------------------------------------- /frameworks/youdao_translate/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | import random 4 | import requests 5 | 6 | from nmtwizard.cloud_translation_framework import CloudTranslationFramework 7 | 8 | 9 | youdao_lang_dict_map = { 10 | "zh": "zh-CHS", 11 | "en": "en", 12 | "fr": "fr", 13 | "ja": "ja", 14 | "ko": "ko", 15 | "pt": "pt", 16 | "ru": "ru", 17 | "es": "es", 18 | "vi": "vi", 19 | } 20 | 21 | 22 | class YoudaoTranslateFramework(CloudTranslationFramework): 23 | def __init__(self): 24 | super(YoudaoTranslateFramework, self).__init__() 25 | self._appid = os.getenv("YOUDAO_APPID") 26 | self._key = os.getenv("YOUDAO_KEY") 27 | if self._appid is None: 28 | raise ValueError("missing app id") 29 | if self._key is None: 30 | raise ValueError("missing key") 31 | 32 | def translate_batch(self, batch, source_lang, target_lang): 33 | query = "\n".join(batch) 34 | salt = str(random.randint(10000, 99999)) 35 | sign = self._appid + query + salt + self._key 36 | m1 = hashlib.md5() 37 | m1.update(sign.encode("utf-8")) 38 | sign = m1.hexdigest() 39 | 40 | url = "http://openapi.youdao.com/api" 41 | params = { 42 | "appKey": self._appid, 43 | "q": query, 44 | "from": youdao_lang_dict_map[source_lang.lower()], 45 | "to": youdao_lang_dict_map[target_lang.lower()], 46 | "salt": salt, 47 | "sign": sign, 48 | } 49 | 50 | result = self.send_request(lambda: requests.get(url, params=params)) 51 | for trans in result["translation"]: 52 | yield trans 53 | 54 | def supported_languages(self): 55 | return ["zh", "en", "fr", "ja", "ko", "pt", "ru", "es", "vi"] 56 | 57 | 58 | if __name__ == "__main__": 59 | YoudaoTranslateFramework().run() 60 | -------------------------------------------------------------------------------- /nmtwizard/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenNMT/nmt-wizard-docker/a8469b2a292c1f38adfc3cffa35162ed9919f08e/nmtwizard/__init__.py -------------------------------------------------------------------------------- /nmtwizard/beat_service.py: 
-------------------------------------------------------------------------------- 1 | """Define the beat service to interact with the task launcher.""" 2 | 3 | import contextlib 4 | import time 5 | import threading 6 | import requests 7 | 8 | from nmtwizard.logger import get_logger 9 | 10 | logger = get_logger(__name__) 11 | 12 | _stop_beat = threading.Event() 13 | _beat_thread = None 14 | _last_activity = None 15 | _last_activity_lock = threading.Lock() 16 | 17 | 18 | def start_beat_service( 19 | container_id, url, task_id, interval=30, inactivity_timeout=None 20 | ): 21 | """Start a background service that sends HTTP PUT requests to: 22 | 23 | url/task/beat/task_id?container_id=container_id&duration=interval 24 | 25 | every `interval` seconds. 26 | """ 27 | # If no URL is set, consider the beat service as disabled. 28 | if url is None or task_id is None: 29 | logger.warning( 30 | "CALLBACK_URL or task_id is unset; beat service will be disabled" 31 | ) 32 | return 33 | if beat_service_is_running(): 34 | logger.warning("The beat service is already running") 35 | return 36 | 37 | request_params = {"container_id": container_id, "duration": str(interval * 2)} 38 | 39 | def _beat(): 40 | requests.put("%s/task/beat/%s" % (url, task_id), params=request_params) 41 | 42 | def _beat_loop(): 43 | while True: 44 | if _stop_beat.wait(interval): 45 | break 46 | if inactivity_timeout is not None: 47 | with _last_activity_lock: 48 | if ( 49 | _last_activity is not None 50 | and time.time() - _last_activity > inactivity_timeout 51 | ): 52 | logger.warning( 53 | "No activity reported after %d seconds. Stopping the beat requests.", 54 | inactivity_timeout, 55 | ) 56 | break 57 | _beat() 58 | 59 | _beat() # First beat in the main thread to fail for wrong url. 60 | logger.info("Starting the beat service to %s with interval %d", url, interval) 61 | global _beat_thread 62 | _beat_thread = threading.Thread(target=_beat_loop) 63 | _beat_thread.daemon = True 64 | _beat_thread.start() 65 | 66 | 67 | def stop_beat_service(): 68 | """Stop the beat service.""" 69 | if beat_service_is_running(): 70 | _stop_beat.set() 71 | _beat_thread.join() 72 | _stop_beat.clear() 73 | 74 | 75 | def beat_service_is_running(): 76 | """Returns True if the beat service is currently running.""" 77 | return _beat_thread is not None and _beat_thread.is_alive() 78 | 79 | 80 | @contextlib.contextmanager 81 | def monitor_activity(): 82 | monitor = _ActivityMonitor() 83 | monitor.notify() 84 | yield monitor 85 | monitor.stop() 86 | 87 | 88 | class _ActivityMonitor: 89 | def notify(self): 90 | global _last_activity 91 | with _last_activity_lock: 92 | _last_activity = time.time() 93 | 94 | def stop(self): 95 | global _last_activity 96 | with _last_activity_lock: 97 | _last_activity = None 98 | -------------------------------------------------------------------------------- /nmtwizard/cloud_translation_framework.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import time 3 | 4 | from nmtwizard.framework import Framework 5 | from nmtwizard.serving import TranslationOutput 6 | 7 | 8 | def _batch_iter(iterable, size): 9 | batch = [] 10 | for x in iterable: 11 | batch.append(x.strip()) 12 | if len(batch) == size: 13 | yield batch 14 | batch = [] 15 | if batch: 16 | yield batch 17 | 18 | 19 | class CloudTranslationFramework(Framework): 20 | def __init__(self): 21 | super(CloudTranslationFramework, self).__init__(stateless=True) 22 | 23 | def supported_languages(self): 24 | return None 25 | 26 | 
@abc.abstractmethod 27 | def translate_batch(self, batch, source_lang, target_lang): 28 | raise NotImplementedError() 29 | 30 | def _check_lang(self, lang): 31 | supported_languages = self.supported_languages() 32 | if supported_languages is not None and lang not in supported_languages: 33 | raise ValueError("unsupported language: %s" % lang) 34 | 35 | def send_request(self, request_fn, max_retry=5, retry_delay=5): 36 | retry = 0 37 | while retry < max_retry: 38 | r = request_fn() 39 | if r.status_code == 429: 40 | retry += 1 41 | time.sleep(retry_delay) 42 | else: 43 | break 44 | if r.status_code != 200: 45 | raise RuntimeError("Error status %d: %s" % (r.status_code, r.text)) 46 | return r.json() 47 | 48 | def trans(self, config, model_path, input, output, gpuid=0): 49 | self._check_lang(config["source"]) 50 | self._check_lang(config["target"]) 51 | with open(input, "r") as input_file, open(output, "w") as output_file: 52 | for batch in _batch_iter(input_file, 10): 53 | translations = self.translate_batch( 54 | batch, config["source"], config["target"] 55 | ) 56 | for translation in translations: 57 | output_file.write(translation) 58 | output_file.write("\n") 59 | 60 | def train(self, *args, **kwargs): 61 | raise NotImplementedError("This framework can only be used for translation") 62 | 63 | def release(self, *args, **kwargs): 64 | raise NotImplementedError("This framework does not require a release step") 65 | 66 | def serve(self, config, model_path, gpuid=0): 67 | self._check_lang(config["source"]) 68 | self._check_lang(config["target"]) 69 | return None, {"source": config["source"], "target": config["target"]} 70 | 71 | def forward_request(self, model_info, inputs, outputs=None, options=None): 72 | return [ 73 | [TranslationOutput(translation)] 74 | for translation in self.translate_batch( 75 | inputs, model_info["source"], model_info["target"] 76 | ) 77 | ] 78 | 79 | def _get_preprocessor(self, *args, **kwargs): 80 | return None 81 | 82 | def _get_postprocessor(self, *args, **kwargs): 83 | return None 84 | -------------------------------------------------------------------------------- /nmtwizard/data.py: -------------------------------------------------------------------------------- 1 | """Data utilities.""" 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | def merge_files(files, output): 8 | """Merge all files in output.""" 9 | with open(output, "wb") as output_file: 10 | for f in files: 11 | with open(f, "rb") as fd: 12 | shutil.copyfileobj(fd, output_file, 1024 * 1024 * 10) # Chunk of 10MB. 13 | 14 | 15 | def merge_files_in_directory(input_dir, output_dir, src_suffix, tgt_suffix): 16 | """Merge all source and target files in the directory input_dir to a single 17 | parallel file in output_dir. 
18 | """ 19 | if not os.path.exists(output_dir): 20 | os.makedirs(output_dir) 21 | files = [ 22 | f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f)) 23 | ] 24 | src_files = sorted( 25 | os.path.join(input_dir, f) for f in files if f.endswith(src_suffix) 26 | ) 27 | if tgt_suffix: 28 | tgt_files = sorted( 29 | os.path.join(input_dir, f) for f in files if f.endswith(tgt_suffix) 30 | ) 31 | align_files = sorted( 32 | os.path.join(input_dir, f) for f in files if f.endswith("align") 33 | ) 34 | weight_files = sorted( 35 | os.path.join(input_dir, f) for f in files if f.endswith("weights") 36 | ) 37 | merge_files(src_files, os.path.join(output_dir, "train.%s" % src_suffix)) 38 | if tgt_suffix: 39 | merge_files(tgt_files, os.path.join(output_dir, "train.%s" % tgt_suffix)) 40 | if align_files: 41 | merge_files(align_files, os.path.join(output_dir, "train.align")) 42 | if weight_files: 43 | merge_files(weight_files, os.path.join(output_dir, "train.weights")) 44 | 45 | 46 | def paste_files(input_files, output_file, separator="\t"): 47 | input_fhs = [open(f, "r") for f in input_files] 48 | output_fb = open(output_file, "w") 49 | while True: 50 | line = [] 51 | for fh in input_fhs: 52 | line.append(fh.readline()) 53 | if "" in line: 54 | break 55 | output_fb.write("%s\n" % separator.join([s.strip() for s in line])) 56 | output_fb.close() 57 | for fh in input_fhs: 58 | fh.close() 59 | -------------------------------------------------------------------------------- /nmtwizard/logger.py: -------------------------------------------------------------------------------- 1 | """Logging utilities.""" 2 | 3 | import os 4 | import logging 5 | 6 | logging.basicConfig( 7 | format="%(asctime)s.%(msecs)03d000 UTC [%(module)s@%(processName)s] %(levelname)s %(message)s", 8 | datefmt="%Y-%b-%d %H:%M:%S", 9 | level=os.getenv("LOG_LEVEL", "INFO"), 10 | ) 11 | 12 | 13 | def get_logger(name=None): 14 | """Returns a logger with configured level.""" 15 | logger = logging.getLogger(name) 16 | return logger 17 | -------------------------------------------------------------------------------- /nmtwizard/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | from nmtwizard.preprocess import operators 2 | -------------------------------------------------------------------------------- /nmtwizard/preprocess/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from nmtwizard.preprocess.operators import align_perplexity_filter 2 | from nmtwizard.preprocess.operators import alignment 3 | from nmtwizard.preprocess.operators import identity_filter 4 | from nmtwizard.preprocess.operators import length_filter 5 | from nmtwizard.preprocess.operators import noise 6 | from nmtwizard.preprocess.operators import similarity_filter 7 | from nmtwizard.preprocess.operators import tokenization 8 | from nmtwizard.preprocess.operators import parentheses_filter 9 | -------------------------------------------------------------------------------- /nmtwizard/preprocess/operators/align_perplexity_filter.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from nmtwizard.preprocess import prepoperator 4 | from nmtwizard.logger import get_logger 5 | 6 | logger = get_logger(__name__) 7 | 8 | 9 | @prepoperator.register_operator("align_perplexity_filter") 10 | class AlignPerplexityFilter(prepoperator.Filter): 11 | @classmethod 12 | def _config_schema(cls): 13 | schema = 
super(AlignPerplexityFilter, cls)._config_schema() 14 | 15 | threshold_block = { 16 | "type": "object", 17 | "properties": { 18 | "lower": {"type": ["number", "null"]}, 19 | "upper": {"type": ["number", "null"]}, 20 | }, 21 | "additionalProperties": False, 22 | } 23 | 24 | schema["properties"].update( 25 | {"hard_threshold": threshold_block, "percent_threshold": threshold_block} 26 | ) 27 | return schema 28 | 29 | def __init__(self, config, process_type, build_state): 30 | super().__init__([]) 31 | self._hard_threshold = config.get("hard_threshold") 32 | self._percent_threshold = config.get("percent_threshold") 33 | 34 | if self._hard_threshold is not None: 35 | self._lower = _get_hard_threshold(self._hard_threshold, "lower") 36 | self._upper = _get_hard_threshold(self._hard_threshold, "upper") 37 | if ( 38 | self._lower is not None 39 | and self._upper is not None 40 | and self._upper <= self._lower 41 | ): 42 | raise ValueError( 43 | "align_perplexity_filter: hard threshold 'upper' should be " 44 | "greater than 'lower'" 45 | ) 46 | 47 | elif self._percent_threshold is not None: 48 | self._lower = _get_percent_threshold(self._percent_threshold, "lower") 49 | self._upper = _get_percent_threshold(self._percent_threshold, "upper") 50 | total_removed = self._lower + self._upper 51 | if total_removed >= 1: 52 | raise ValueError( 53 | "align_perplexity_filter: percent threshold values will filter " 54 | "all sentences (lower=%.2f and upper=%.2f mean %.2f%% of sentences " 55 | "will be filtered)" 56 | % (self._lower, self._upper, total_removed * 100) 57 | ) 58 | 59 | def _preprocess(self, tu_batch): 60 | if self._hard_threshold is None and self._percent_threshold is None: 61 | return tu_batch 62 | 63 | tu_list, meta_batch = tu_batch 64 | batch_size = len(tu_list) 65 | perplexity = list(map(_compute_perplexity, tu_list)) 66 | new_tu_list = [] 67 | 68 | if self._hard_threshold is not None: 69 | # Filter TUs on perplexity value. 70 | for tu, perplexity in zip(tu_list, perplexity): 71 | if (self._lower is None or perplexity >= self._lower) and ( 72 | self._upper is None or perplexity <= self._upper 73 | ): 74 | new_tu_list.append(tu) 75 | elif self._verbose: 76 | message = ( 77 | f"Perplexity value ({perplexity:.2f}) outside hard thresholds" 78 | ) 79 | logger.info( 80 | f"{message} : '{self.name}' operator filters the following sentence \tSRC : {tu.src_detok}\tTGT : {tu.tgt_detok}" 81 | ) 82 | 83 | elif self._percent_threshold is not None: 84 | # Remove the worst $lower percent and the best $upper percent perplexity values. 85 | keep_ids = range(batch_size) 86 | keep_ids = list( 87 | sorted(keep_ids, key=lambda i: perplexity[i], reverse=True) 88 | ) # From best to worst. 
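            # Worked example (values assumed for illustration): with batch_size=10,
            # lower=0.1 and upper=0.2, int(0.1 * 10) = 1 TU is trimmed from the worst
            # end and int(0.2 * 10) = 2 TUs from the best end, keeping the 7 middle values.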
89 | worst_to_remove = int(self._lower * batch_size) 90 | best_to_remove = int(self._upper * batch_size) 91 | worst_ids = [] 92 | best_ids = [] 93 | if worst_to_remove != 0: 94 | worst_ids = keep_ids[-worst_to_remove:] 95 | keep_ids = keep_ids[:-worst_to_remove] 96 | if best_to_remove != 0: 97 | best_ids = keep_ids[:best_to_remove] 98 | keep_ids = keep_ids[best_to_remove:] 99 | for i in sorted(keep_ids): 100 | new_tu_list.append(tu_list[i]) 101 | if self._verbose: 102 | for i in worst_ids: 103 | message = f"Perplexity value ({perplexity[i]:.2f}) worse than percentage threshold" 104 | logger.info( 105 | f"{message} : '{self.name}' operator filters the following sentence \tSRC : {tu_list[i].src_detok}\tTGT : {tu_list[i].tgt_detok}" 106 | ) 107 | for i in best_ids: 108 | message = f"Perplexity value ({perplexity[i]:.2f}) better than percentage threshold" 109 | logger.info( 110 | f"{message} : '{self.name}' operator filters the following sentence \tSRC : {tu_list[i].src_detok}\tTGT : {tu_list[i].tgt_detok}" 111 | ) 112 | 113 | return new_tu_list, meta_batch 114 | 115 | 116 | def _get_hard_threshold(config, field): 117 | value = config.get(field) 118 | if value is not None and value > 0: 119 | raise ValueError( 120 | "align_perplexity_filter: perplexity values range from " 121 | "-inf (worst perplexity) to 0 (best perplexity), but hard " 122 | "threshold '%s' is set to %.2f" % (field, value) 123 | ) 124 | return value 125 | 126 | 127 | def _get_percent_threshold(config, field): 128 | value = config.get(field, 0) 129 | if value < 0 or value >= 1: 130 | raise ValueError( 131 | "align_perplexity_filter: percent threshold should be between " 132 | "0 (included) and 1 (excluded), but '%s' is set to %.2f" % (field, value) 133 | ) 134 | return value 135 | 136 | 137 | def _compute_perplexity(tu): 138 | # Compute the average source/target perplexity. 139 | fwd, bwd = _get_log_probs(tu) 140 | 141 | src_size = len(tu.src_tok.tokens[0]) 142 | tgt_size = len(tu.tgt_tok.tokens[0]) 143 | 144 | min_size = min(src_size, tgt_size) or 1 145 | 146 | return math.log((math.exp(fwd / min_size) + math.exp(bwd / min_size)) / 2) 147 | 148 | 149 | def _get_log_probs(tu): 150 | log_probs = tu.alignment_log_probs 151 | if log_probs is None: 152 | raise ValueError("Alignment log probs are not set") 153 | return log_probs[0] 154 | -------------------------------------------------------------------------------- /nmtwizard/preprocess/operators/alignment.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nmtwizard.preprocess import prepoperator 4 | 5 | 6 | @prepoperator.register_operator("alignment") 7 | class Aligner(prepoperator.Operator): 8 | @classmethod 9 | def _config_schema(cls): 10 | schema = super(Aligner, cls)._config_schema() 11 | 12 | alignment_block = { 13 | "type": "object", 14 | "properties": {"probs": {"type": "string", "is_file": True}}, 15 | "additionalProperties": False, 16 | "required": ["probs"], 17 | } 18 | 19 | schema["properties"].update( 20 | { 21 | "forward": alignment_block, 22 | "backward": alignment_block, 23 | "write_alignment": {"type": "boolean"}, 24 | "sample": {"type": "integer", "minimum": 0}, 25 | } 26 | ) 27 | 28 | return schema 29 | 30 | @staticmethod 31 | def is_applied_for(process_type): 32 | return process_type.training 33 | 34 | # Alignment models can take several GB in memory so we need to share an Aligner 35 | # instance across workers. 
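    # As a sketch of the mechanism used below: get_shared_builders() returns a
    # mapping from a state name ("aligner") to a (class, constructor args) tuple;
    # the preprocessing pipeline builds that instance once and hands it back to
    # each worker through the shared_state argument of __init__.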
36 | 37 | @staticmethod 38 | def get_shared_classes(): 39 | import systran_align 40 | 41 | return [systran_align.Aligner] 42 | 43 | @staticmethod 44 | def get_shared_builders(config, process_type): 45 | import systran_align 46 | 47 | forward_probs_path = config.get("forward", {}).get("probs") 48 | backward_probs_path = config.get("backward", {}).get("probs") 49 | if forward_probs_path is None or backward_probs_path is None: 50 | return None 51 | if not os.path.isfile(forward_probs_path): 52 | raise ValueError( 53 | "Forward probs file for alignment doesn't exist: %s" 54 | % (forward_probs_path) 55 | ) 56 | if not os.path.isfile(backward_probs_path): 57 | raise ValueError( 58 | "Backward probs file for alignment doesn't exist: %s" 59 | % (backward_probs_path) 60 | ) 61 | return { 62 | "aligner": ( 63 | systran_align.Aligner, 64 | (forward_probs_path, backward_probs_path), 65 | ) 66 | } 67 | 68 | def __init__(self, align_config, process_type, build_state, shared_state=None): 69 | self._aligner = shared_state["aligner"] if shared_state else None 70 | self._write_alignment = align_config.get("write_alignment", False) 71 | 72 | def _preprocess(self, tu_batch): 73 | tu_list, meta_batch = tu_batch 74 | if self.process_type.training: 75 | meta_batch["write_alignment"] = self._write_alignment 76 | 77 | src_tokens = [] 78 | tgt_tokens = [] 79 | for tu in tu_list: 80 | src_tok = tu.src_tok 81 | tgt_tok = tu.tgt_tok 82 | if src_tok.tokenizer is None or tgt_tok.tokenizer is None: 83 | raise RuntimeError("Cannot set alignment if no tokenization is set") 84 | if len(src_tok.tokens) != 1 or len(tgt_tok.tokens) != 1: 85 | raise RuntimeError("Alignment operator only supports single-part TUs") 86 | src_tokens.append(src_tok.tokens[0]) 87 | tgt_tokens.append(tgt_tok.tokens[0]) 88 | 89 | results = self._aligner.align_batch(src_tokens, tgt_tokens) 90 | for tu, result in zip(tu_list, results): 91 | tu.set_alignment( 92 | result["alignments"], 93 | result["forward_log_prob"], 94 | result["backward_log_prob"], 95 | ) 96 | 97 | return tu_list, meta_batch 98 | -------------------------------------------------------------------------------- /nmtwizard/preprocess/operators/identity_filter.py: -------------------------------------------------------------------------------- 1 | from nmtwizard.preprocess import prepoperator 2 | 3 | 4 | @prepoperator.register_operator("identity_filter") 5 | class IdentityFilter(prepoperator.Filter): 6 | """Ignore TU with the same source and target.""" 7 | 8 | @classmethod 9 | def _config_schema(cls): 10 | schema = super(IdentityFilter, cls)._config_schema() 11 | 12 | schema["properties"].update( 13 | {"min_characters": {"type": "integer", "minimum": 0}} 14 | ) 15 | return schema 16 | 17 | def __init__(self, config, *args, **kwargs): 18 | # Do not ignore identity TU if it has less than this number of characters. 
19 | min_characters = config.get("min_characters", 0) 20 | 21 | filter_fn = ( 22 | lambda tu: len(tu.src_detok) > min_characters 23 | and tu.src_detok == tu.tgt_detok 24 | ) 25 | super().__init__([filter_fn]) 26 | -------------------------------------------------------------------------------- /nmtwizard/preprocess/operators/length_filter.py: -------------------------------------------------------------------------------- 1 | from nmtwizard.preprocess import prepoperator 2 | 3 | 4 | @prepoperator.register_operator("length_filter") 5 | class LengthFilter(prepoperator.Filter): 6 | @classmethod 7 | def _config_schema(cls): 8 | schema = super(LengthFilter, cls)._config_schema() 9 | 10 | length_mono_block = { 11 | "type": "object", 12 | "properties": { 13 | "lang": {"type": "string"}, 14 | "max_characters": {"type": "integer", "minimum": 0}, 15 | "max_words": {"type": "integer", "minimum": 0}, 16 | "min_words": {"type": "integer", "minimum": 0}, 17 | }, 18 | "additionalProperties": False, 19 | } 20 | schema["properties"].update( 21 | { 22 | "source": length_mono_block, 23 | "target": length_mono_block, 24 | "min_words_ratio": {"type": "number"}, 25 | "max_words_ratio": {"type": "number"}, 26 | "min_num_words_for_ratio": {"type": "integer"}, 27 | } 28 | ) 29 | return schema 30 | 31 | def __init__(self, config, process_type, build_state): 32 | source_config = _get_side_config(config, "source") 33 | target_config = _get_side_config(config, "target") 34 | self._verbose = config.get("verbose", False) 35 | 36 | filters = [] 37 | filters.extend( 38 | _get_side_filters( 39 | source_config, 40 | lambda tu: tu.src_detok, 41 | lambda tu: tu.src_tok.tokens[0], 42 | self._verbose, 43 | ) 44 | ) 45 | filters.extend( 46 | _get_side_filters( 47 | target_config, 48 | lambda tu: tu.tgt_detok, 49 | lambda tu: tu.tgt_tok.tokens[0], 50 | self._verbose, 51 | ) 52 | ) 53 | 54 | min_words_ratio = config.get("min_words_ratio") 55 | min_num_words_for_ratio = config.get("min_num_words_for_ratio", 0) 56 | 57 | if min_words_ratio is not None: 58 | message_min_words_ratio = "Inferior to min word length ratio (%.2f) (Src length : %d Tgt length : %d Ratio : %.2f)" 59 | filters.append( 60 | lambda tu: ( 61 | len(tu.src_tok.tokens[0]) >= min_num_words_for_ratio 62 | and len(tu.tgt_tok.tokens[0]) >= min_num_words_for_ratio 63 | and len(tu.src_tok.tokens[0]) / len(tu.tgt_tok.tokens[0]) 64 | < min_words_ratio, 65 | message_min_words_ratio 66 | % ( 67 | min_words_ratio, 68 | len(tu.src_tok.tokens[0]), 69 | len(tu.tgt_tok.tokens[0]), 70 | len(tu.src_tok.tokens[0]) / len(tu.tgt_tok.tokens[0]), 71 | ), 72 | ) 73 | ) 74 | 75 | max_words_ratio = config.get("max_words_ratio") 76 | if max_words_ratio is not None: 77 | message_max_words_ratio = "Exceeds max word length ratio (%.2f) (Src length : %d Tgt length : %d Ratio : %.2f)" 78 | filters.append( 79 | lambda tu: ( 80 | len(tu.src_tok.tokens[0]) >= min_num_words_for_ratio 81 | and len(tu.tgt_tok.tokens[0]) >= min_num_words_for_ratio 82 | and len(tu.src_tok.tokens[0]) / len(tu.tgt_tok.tokens[0]) 83 | > max_words_ratio, 84 | message_max_words_ratio 85 | % ( 86 | max_words_ratio, 87 | len(tu.src_tok.tokens[0]), 88 | len(tu.tgt_tok.tokens[0]), 89 | len(tu.src_tok.tokens[0]) / len(tu.tgt_tok.tokens[0]), 90 | ), 91 | ) 92 | ) 93 | 94 | super(LengthFilter, self).__init__(filters) 95 | 96 | 97 | def _get_side_config(config, side): 98 | config = config.get(side, {}) 99 | # Filter empty sentences by default. 
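    # With the default min_words=1, a segment that tokenizes to zero words on this
    # side always fails the min_words check in _get_side_filters, unless the
    # configuration explicitly overrides the value.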
100 | config.setdefault("min_words", 1) 101 | return config 102 | 103 | 104 | def _get_side_filters(config, chars_fn, words_fn, verbose): 105 | filters = [] 106 | 107 | max_chars = config.get("max_characters") 108 | if max_chars is not None: 109 | message_max_chars = f"Longer than max chars ({max_chars})" 110 | filters.append(lambda tu: (len(chars_fn(tu)) > max_chars, message_max_chars)) 111 | 112 | max_words = config.get("max_words") 113 | if max_words is not None: 114 | message_max_words = f"Longer than max words ({max_words})" 115 | filters.append(lambda tu: (len(words_fn(tu)) > max_words, message_max_words)) 116 | 117 | min_words = config.get("min_words") 118 | if min_words is not None: 119 | message_min_words = f"Shorter than min words ({min_words})" 120 | filters.append(lambda tu: (len(words_fn(tu)) < min_words, message_min_words)) 121 | 122 | return filters 123 | -------------------------------------------------------------------------------- /nmtwizard/preprocess/operators/parentheses_filter.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import collections 3 | 4 | 5 | from nmtwizard.preprocess import prepoperator 6 | 7 | 8 | @prepoperator.register_operator("parentheses") 9 | class ParenthesesFilter(prepoperator.Filter): 10 | @classmethod 11 | def _config_schema(cls): 12 | schema = super(ParenthesesFilter, cls)._config_schema() 13 | 14 | schema["properties"].update( 15 | { 16 | "side": {"type": "string", "enum": ["source", "target", "both"]}, 17 | "type": { 18 | "type": "array", 19 | "items": { 20 | "type": "array", 21 | "items": {"type": "string"}, 22 | "minItems": 2, 23 | "maxItems": 2, 24 | }, 25 | }, 26 | } 27 | ) 28 | 29 | schema["required"] = ["side"] 30 | 31 | return schema 32 | 33 | def __init__(self, config, process_type, build_state): 34 | side = config.get("side") 35 | self._remove_src = side == "source" or side == "both" 36 | self._remove_tgt = side == "target" or side == "both" 37 | 38 | self._parentheses_types = {("(", ")")} 39 | 40 | parentheses_types = config.get("type") 41 | if parentheses_types: 42 | for par in parentheses_types: 43 | self._parentheses_types.add(tuple(par)) 44 | 45 | filters = [self._filter_parentheses] 46 | 47 | super(ParenthesesFilter, self).__init__(filters) 48 | 49 | def _discover_parentheses(self, tokens): 50 | replacements = collections.defaultdict(list) 51 | for parentheses_type in self._parentheses_types: 52 | opening, closing = parentheses_type 53 | i = 0 54 | while i < len(tokens): 55 | tok = tokens[i] 56 | if closing in tok: 57 | return None 58 | if opening in tok: 59 | for j in range(i + 1, len(tokens)): 60 | tok_after = tokens[j] 61 | if closing in tok_after: 62 | joiner_marker = "■" 63 | repl = [] 64 | if tok.startswith(joiner_marker) and tok_after.endswith( 65 | joiner_marker 66 | ): 67 | repl.append(joiner_marker) 68 | replacements[parentheses_type].append((i, j - i + 1, repl)) 69 | i = j 70 | break 71 | if j == len(tokens) - 1: # Didn't find a pair 72 | return None 73 | for par in itertools.chain( 74 | *self._parentheses_types 75 | ): # Found a nested/mismatched parenthesis after 76 | if par in tok_after: 77 | return None 78 | i += 1 79 | return replacements 80 | 81 | def _filter_parentheses(self, tu): 82 | src_tokens = tu.src_tok.tokens[0] 83 | src_replacements = self._discover_parentheses(src_tokens) 84 | if src_replacements is None: # Unbalanced or nested parentheses in source 85 | return True 86 | 87 | tgt_tokens = tu.tgt_tok.tokens[0] 88 | tgt_replacements = 
self._discover_parentheses(tgt_tokens) 89 | if tgt_replacements is None: # Unbalanced or nested parentheses in target 90 | return True 91 | 92 | src_replacements_to_keep = [] 93 | tgt_replacements_to_keep = [] 94 | for parentheses_type in self._parentheses_types: 95 | src_repl = src_replacements[parentheses_type] 96 | tgt_repl = tgt_replacements[parentheses_type] 97 | length_src_repl = len(src_repl) 98 | length_tgt_repl = len(tgt_repl) 99 | if length_src_repl != length_tgt_repl and ( 100 | length_src_repl > 1 or length_tgt_repl > 1 101 | ): # Unbalanced source/target 102 | return True 103 | 104 | if self._remove_src and length_src_repl == 1 and length_tgt_repl == 0: 105 | src_replacements_to_keep.append(src_repl[0]) 106 | 107 | if self._remove_tgt and length_tgt_repl == 1 and length_src_repl == 0: 108 | tgt_replacements_to_keep.append(tgt_repl[0]) 109 | 110 | src_replacements_to_keep.sort(key=lambda tup: tup[0]) 111 | tgt_replacements_to_keep.sort(key=lambda tup: tup[0]) 112 | 113 | for repl in reversed(src_replacements_to_keep): 114 | tu.replace_tokens_side("source", repl) 115 | 116 | for repl in reversed(tgt_replacements_to_keep): 117 | tu.replace_tokens_side("target", repl) 118 | 119 | return False 120 | -------------------------------------------------------------------------------- /nmtwizard/preprocess/operators/similarity_filter.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | from nmtwizard.preprocess import prepoperator 5 | 6 | 7 | @prepoperator.register_operator("similarity_filter") 8 | class SimilarityFilter(prepoperator.Filter): 9 | @classmethod 10 | def _config_schema(cls): 11 | schema = super(SimilarityFilter, cls)._config_schema() 12 | 13 | schema["properties"].update( 14 | { 15 | "threshold": {"type": "number"}, 16 | "mode": { 17 | "type": "string", 18 | "enum": ["hard", "soft_linear", "soft_sigmoid"], 19 | }, 20 | "factor": {"type": "number"}, 21 | } 22 | ) 23 | 24 | return schema 25 | 26 | def __init__(self, config, process_type, build_state): 27 | threshold = config.get("threshold", 0) 28 | mode = config.get("mode") 29 | factor = config.get("factor", 1) 30 | self._verbose = config.get("verbose", False) 31 | 32 | if mode is None: 33 | raise ValueError("Missing mode field in similarity configuration") 34 | if mode not in ("hard", "soft_linear", "soft_sigmoid"): 35 | raise ValueError("Invalid mode %s in similarity configuration" % mode) 36 | 37 | def _filter(tu): 38 | annotations = tu.annotations 39 | if annotations is None: 40 | return False 41 | similarity = annotations.get("similarity") 42 | if similarity is None: 43 | return False 44 | v = float(similarity) 45 | norm_v = ((v - threshold) * factor + 1) / 2 46 | if mode == "hard": 47 | p = 0.5 48 | else: 49 | p = random.random() 50 | if mode == "soft_sigmoid": 51 | norm_v = 1 / (1 + math.exp(-norm_v)) 52 | to_filter = p > norm_v 53 | return ( 54 | (to_filter, f"Similarity score {norm_v} lower than {p}") 55 | if self._verbose 56 | else to_filter 57 | ) 58 | 59 | super().__init__([_filter]) 60 | -------------------------------------------------------------------------------- /nmtwizard/preprocess/operators/tokenization.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nmtwizard.preprocess import prepoperator 4 | from nmtwizard.preprocess import tokenizer 5 | 6 | 7 | @prepoperator.register_operator("tokenization") 8 | class Tokenizer(prepoperator.MonolingualOperator): 9 | @classmethod 
10 | def _config_schema(cls): 11 | schema = super(Tokenizer, cls)._config_schema() 12 | 13 | tokenization_block = { 14 | "type": "object", 15 | "properties": { 16 | "mode": { 17 | "type": "string", 18 | "enum": ["aggressive", "conservative", "char", "space", "none"], 19 | }, 20 | "no_substitution": {"type": "boolean"}, 21 | "case_feature": {"type": "boolean"}, 22 | "case_markup": {"type": "boolean"}, 23 | "soft_case_regions": {"type": "boolean"}, 24 | "lang": {"type": "string"}, 25 | "bpe_model_path": {"type": "string", "is_file": True}, 26 | "bpe_dropout": {"type": "number"}, 27 | "sp_model_path": {"type": "string", "is_file": True}, 28 | "sp_nbest_size": {"type": "integer"}, 29 | "sp_alpha": {"type": "number"}, 30 | "joiner_annotate": {"type": "boolean"}, 31 | "joiner": {"type": "string"}, 32 | "joiner_new": {"type": "boolean"}, 33 | "spacer_annotate": {"type": "boolean"}, 34 | "spacer_new": {"type": "boolean"}, 35 | "preserve_placeholders": {"type": "boolean"}, 36 | "preserve_segmented_tokens": {"type": "boolean"}, 37 | "support_prior_joiners": {"type": "boolean"}, 38 | "segment_case": {"type": "boolean"}, 39 | "segment_numbers": {"type": "boolean"}, 40 | "segment_alphabet": {"type": "array", "items": {"type": "string"}}, 41 | "segment_alphabet_change": {"type": "boolean"}, 42 | "restrict_subword_vocabulary": {"type": "boolean"}, 43 | "build_vocabulary": {"type": "object"}, 44 | "build_subword": {"type": ["object", "null"]}, 45 | }, 46 | "additionalProperties": False, 47 | } 48 | 49 | schema["properties"].update( 50 | { 51 | "source": {**tokenization_block, "required": ["mode"]}, 52 | "target": {**tokenization_block, "required": ["mode"]}, 53 | "multi": tokenization_block, 54 | } 55 | ) 56 | return schema 57 | 58 | @property 59 | def _detok(self): 60 | return False 61 | 62 | @property 63 | def _apply_in_postprocess(self): 64 | return True 65 | 66 | def _build_process(self, config, side, build_state): 67 | # Disable subword regularization in inference. 
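        # BPE dropout and SentencePiece sampling (sp_nbest_size / sp_alpha) randomize
        # the segmentation as a training-time regularization; inference requires a
        # deterministic tokenization, so the three options are zeroed out here.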
68 | if not self.process_type.training: 69 | config["bpe_dropout"] = 0 70 | config["sp_nbest_size"] = 0 71 | config["sp_alpha"] = 0 72 | 73 | if config.get("restrict_subword_vocabulary", False): 74 | vocabulary_path = build_state.get( 75 | "src_vocabulary" if side == "source" else "tgt_vocabulary" 76 | ) 77 | if vocabulary_path is None: 78 | raise ValueError( 79 | "restrict_subword_vocabulary is set but no vocabulary is set" 80 | ) 81 | 82 | config["vocabulary"] = list(tokenizer.load_vocabulary(vocabulary_path)) 83 | 84 | current_tokenizer = tokenizer.build_tokenizer(config) 85 | 86 | previous_tokenizer = None 87 | if build_state: 88 | if side == "source": 89 | previous_tokenizer = build_state["src_tokenizer"] 90 | build_state["src_tokenizer"] = current_tokenizer 91 | else: 92 | previous_tokenizer = build_state["tgt_tokenizer"] 93 | build_state["tgt_tokenizer"] = current_tokenizer 94 | if self.process_type.postprocess and not self._postprocess_only: 95 | return previous_tokenizer 96 | return current_tokenizer 97 | 98 | def _apply_process(self, tokenizer, tok): 99 | return (tokenizer, None) 100 | -------------------------------------------------------------------------------- /nmtwizard/preprocess/tokenizer.py: -------------------------------------------------------------------------------- 1 | """Tokenization utilities.""" 2 | 3 | import pyonmttok 4 | 5 | _ALLOWED_TOKENIZER_ARGS = set( 6 | [ 7 | "allow_isolated_marks", 8 | "bpe_dropout", 9 | "bpe_model_path", 10 | "case_feature", 11 | "case_markup", 12 | "joiner", 13 | "joiner_annotate", 14 | "joiner_new", 15 | "lang", 16 | "mode", 17 | "no_substitution", 18 | "preserve_placeholders", 19 | "preserve_segmented_tokens", 20 | "segment_alphabet", 21 | "segment_alphabet_change", 22 | "segment_case", 23 | "segment_numbers", 24 | "soft_case_regions", 25 | "sp_alpha", 26 | "sp_model_path", 27 | "sp_nbest_size", 28 | "spacer_annotate", 29 | "spacer_new", 30 | "support_prior_joiners", 31 | "vocabulary", 32 | "vocabulary_path", 33 | "vocabulary_threshold", 34 | ] 35 | ) 36 | 37 | 38 | def build_tokenizer(args): 39 | """Builds a tokenizer based on user arguments.""" 40 | args = { 41 | name: value for name, value in args.items() if name in _ALLOWED_TOKENIZER_ARGS 42 | } 43 | if not args: 44 | return None 45 | lang = args.get("lang") 46 | if lang is not None and not pyonmttok.is_valid_language(lang): 47 | args.pop("lang") 48 | return pyonmttok.Tokenizer(**args) 49 | 50 | 51 | def make_subword_learner(subword_config, subword_dir, tokenizer=None): 52 | params = subword_config.get("params") 53 | if params is None: 54 | raise ValueError( 55 | "'params' field should be specified for subword model learning." 56 | ) 57 | subword_type = subword_config.get("type") 58 | if subword_type is None: 59 | raise ValueError("'type' field should be specified for subword model learning.") 60 | vocab_size = params.get("vocab_size") 61 | if vocab_size is None: 62 | raise ValueError( 63 | "'vocab_size' parameter should be specified for subword model learning." 64 | ) 65 | 66 | if subword_type == "bpe": 67 | learner = pyonmttok.BPELearner( 68 | tokenizer=tokenizer, 69 | symbols=vocab_size, 70 | min_frequency=params.get("min-frequency", 0), 71 | total_symbols=params.get("total_symbols", False), 72 | ) 73 | elif subword_type == "sp": 74 | learner = pyonmttok.SentencePieceLearner(tokenizer=tokenizer, **params) 75 | else: 76 | raise ValueError("Invalid subword type : '%s'." 
% subword_type) 77 | 78 | return {"learner": learner, "subword_type": subword_type, "size": vocab_size} 79 | 80 | 81 | def vocabulary_iterator(vocabulary_path): 82 | """Iterates over each token included in the vocabulary file.""" 83 | with open(vocabulary_path, encoding="utf-8") as vocabulary_file: 84 | header = True 85 | for line in vocabulary_file: 86 | # The vocabulary file might start with some comments prefixed with '#'. 87 | if header and line[0] == "#": 88 | continue 89 | header = False 90 | line = line.rstrip("\n\r") 91 | fields = line.split(" ") 92 | if len(fields) == 1: 93 | # No frequency value, the line is just the token. 94 | yield fields[0] 95 | else: 96 | # The code below checks the last field is a frequency and not a part of 97 | # a badly formatted token. 98 | try: 99 | float(fields[-1]) 100 | fields.pop() 101 | except ValueError: 102 | pass 103 | yield " ".join(fields) 104 | 105 | 106 | def load_vocabulary(vocabulary_path): 107 | if vocabulary_path and isinstance(vocabulary_path, str): 108 | return set(vocabulary_iterator(vocabulary_path)) 109 | return vocabulary_path 110 | -------------------------------------------------------------------------------- /nmtwizard/utils.py: -------------------------------------------------------------------------------- 1 | """Various utilities.""" 2 | 3 | import hashlib 4 | import subprocess 5 | import os 6 | import gzip 7 | import enum 8 | 9 | from nmtwizard.logger import get_logger 10 | 11 | logger = get_logger(__name__) 12 | 13 | context_placeholder = "⦅mrk_context⦆" 14 | 15 | 16 | class Task(enum.Enum): 17 | TRAINING = 0 18 | TRANSLATION = 1 19 | SCORING = 2 20 | 21 | 22 | class ScoreType(enum.Enum): 23 | CUMULATED_LL = 0 24 | CUMULATED_NLL = 1 25 | NORMALIZED_LL = 2 26 | NORMALIZED_NLL = 3 27 | 28 | 29 | def md5file(path, buffer_size=16777216): 30 | """Computes the MD5 hash of the given file.""" 31 | md5 = hashlib.md5() 32 | _update_hash(path, md5, buffer_size) 33 | return md5.hexdigest() 34 | 35 | 36 | def md5files(files, buffer_size=16777216): 37 | """Computes the combined MD5 hash of multiple files, represented as a list 38 | of (key, path). 39 | """ 40 | m = hashlib.md5() 41 | for key, path in sorted(files, key=lambda x: x[0]): 42 | m.update(key.encode("utf-8")) 43 | if os.path.isdir(path): 44 | sub_md5 = md5files( 45 | [ 46 | (os.path.join(key, filename), os.path.join(path, filename)) 47 | for filename in os.listdir(path) 48 | if not filename.startswith(".") 49 | ] 50 | ) 51 | m.update(sub_md5.encode("utf-8")) 52 | else: 53 | _update_hash(path, m, buffer_size) 54 | return m.hexdigest() 55 | 56 | 57 | def _update_hash(path, hash_object, buffer_size): 58 | with open(path, "rb") as f: 59 | while True: 60 | data = f.read(buffer_size) 61 | if not data: 62 | break 63 | hash_object.update(data) 64 | 65 | 66 | def run_cmd(cmd, cwd=None, background=False): 67 | """Runs the command.""" 68 | logger.debug("RUN %s", " ".join(cmd)) 69 | if background: 70 | return subprocess.Popen(cmd, cwd=cwd) 71 | else: 72 | return subprocess.call(cmd, cwd=cwd) 73 | 74 | 75 | def count_devices(gpuid): 76 | if isinstance(gpuid, list): 77 | return len(gpuid) 78 | else: 79 | return 1 80 | 81 | 82 | def pad_lists(lists, padding_value=None, max_length=None): 83 | """Pads a list of lists. 84 | 85 | Args: 86 | lists: A list of lists. 87 | 88 | Returns: 89 | A tuple with the padded collection of lists and the original length of each 90 | list. 
91 | """ 92 | if max_length is None: 93 | max_length = max(len(lst) for lst in lists) 94 | lengths = [] 95 | for lst in lists: 96 | length = len(lst) 97 | lst += [padding_value] * (max_length - length) 98 | lengths.append(length) 99 | return lists, lengths 100 | 101 | 102 | def get_file_path(path): 103 | if os.path.isfile(path): 104 | return path 105 | elif os.path.isfile(path + ".gz"): 106 | return path + ".gz" 107 | else: 108 | return None 109 | 110 | 111 | def is_gzip_file(path): 112 | return path.endswith(".gz") 113 | 114 | 115 | def open_file(path, *args, **kwargs): 116 | if path is None: 117 | return None 118 | if is_gzip_file(path): 119 | return gzip.open(path, *args, **kwargs) 120 | else: 121 | return open(path, *args, **kwargs) 122 | 123 | 124 | def open_and_check_unicode(path, encoding="utf-8"): 125 | with open_file(path, "rb") as f: 126 | for index, line in enumerate(f): 127 | try: 128 | yield line.decode(encoding) 129 | except UnicodeError as e: 130 | raise RuntimeError( 131 | "Invalid Unicode character (shown as � below) in file '%s' on line %d:\n%s" 132 | % ( 133 | os.path.basename(path), 134 | index + 1, 135 | line.decode(encoding, errors="replace").strip(), 136 | ) 137 | ) from e 138 | 139 | 140 | def count_lines(path, buffer_size=65536): 141 | with open_file(path, "rb") as f: 142 | num_lines = 0 143 | eol = False 144 | while True: 145 | data = f.read(buffer_size) 146 | if not data: 147 | if not eol: 148 | num_lines += 1 149 | return num_lines 150 | num_lines += data.count(b"\n") 151 | eol = True if data.endswith(b"\n") else False 152 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | systran-storages @ https://github.com/SYSTRAN/storages/archive/a082f0e151f07671dbb2629b416863877354e433.tar.gz 2 | jsonschema==3.* 3 | json-stream==2.2.* 4 | pyonmttok==1.37.1 5 | requests==2.* 6 | systran-align>=3.3,<4;platform_system=='Linux' 7 | fasttext-wheel==0.9.2 8 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | E203, 4 | E501, 5 | E731, 6 | F401, 7 | W503, 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | 6 | def _load_requirements(path): 7 | with open(path, encoding="utf-8") as requirements: 8 | return [requirement.strip() for requirement in requirements] 9 | 10 | 11 | base_dir = os.path.dirname(os.path.abspath(__file__)) 12 | install_requires = _load_requirements(os.path.join(base_dir, "requirements.txt")) 13 | tests_require = _load_requirements(os.path.join(base_dir, "test", "requirements.txt")) 14 | 15 | setup( 16 | name="nmt-wizard-docker", 17 | version="0.1.0", 18 | license="MIT", 19 | description="Dockerization of NMT frameworks", 20 | author="OpenNMT", 21 | python_requires=">=3.6", 22 | install_requires=install_requires, 23 | extras_require={ 24 | "tests": tests_require, 25 | }, 26 | packages=find_packages(), 27 | ) 28 | -------------------------------------------------------------------------------- /test/conftest.json: -------------------------------------------------------------------------------- 1 | { 2 | "storages": { 3 | "local_abs": { 4 | "type": "local" 5 | }, 6 | "local_rel": { 7 | 
"type": "local", 8 | "basedir": "/tmp" 9 | }, 10 | "_s3_test": { 11 | "description": "test repository on s3", 12 | "type": "s3", 13 | "bucket": "auto-test-bucket-systran", 14 | "aws_credentials": { 15 | "access_key_id": "[[ACCESSKEYID]]", 16 | "secret_access_key": "[[SECRETACCESSKEY]]" 17 | } 18 | }, 19 | "_local_ssh_wbasedir": { 20 | "description": "localhost server", 21 | "type": "ssh", 22 | "server": "localhost", 23 | "user": "[[LOGIN]]", 24 | "password": "[[PASSWORD]]", 25 | "basedir": "/Users/test/sshtest" 26 | }, 27 | "_remote_ssh_wobasedir": { 28 | "description": "localhost server", 29 | "type": "ssh", 30 | "server": "[[SERVER]]", 31 | "user": "[[LOGIN]]", 32 | "pkey": "[[PRIVATEKEY]]" 33 | } 34 | } 35 | } -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | import os 4 | 5 | 6 | def pytest_generate_tests(metafunc): 7 | with open( 8 | os.path.join(os.path.dirname(os.path.realpath(__file__)), "conftest.json") 9 | ) as f: 10 | config = json.load(f) 11 | 12 | if "storages" in config: 13 | if "storage_id" in metafunc.fixturenames: 14 | metafunc.parametrize("storage_id", config["storages"].keys()) 15 | 16 | if "storages" in metafunc.fixturenames: 17 | metafunc.parametrize("storages", [config["storages"]]) 18 | -------------------------------------------------------------------------------- /test/corpus/eval/testset2.out: -------------------------------------------------------------------------------- 1 | Gutach: Increased safety for pedestrians 2 | They are not even 100 metres apart: On Tuesday, the new B 33 pedestrian lights in Dorfparkplatz in Gutach became operational - within view of the existing Town Hall traffic lights. 3 | Two sets of lights so close to one another: intentional or just a silly error? 4 | Yesterday, Gutacht's Mayor gave a clear answer to this question. 5 | "At the time, the Town Hall traffic lights were installed because this was a school route," explained Eckert yesterday. 6 | The Kluser lights protect cyclists, as well as those travelling by bus and the residents of Bergle. 7 | The system, which officially became operational yesterday, is of importance to the Sulzbachweg/Kirchstrasse junction. 8 | We have the museum, two churches, the spa gardens, the bus stop, a doctor's practice and a bank, not to mention the traffic from the 'Grub' residential area. 9 | "At times of high road and pedestrian traffic, an additional set of lights were required to ensure safety," said Eckert. 10 | This was also confirmed by Peter Arnold from the Offenburg District Office. 11 | "According to current measurements, around 12,000 vehicles travel through the town of Gutach on the B33 on a daily basis, of which heavy goods traffic accounts for around ten per cent," emphasised Arnold. 12 | Therefore the construction of an additional set of lights was more than necessary: "Here safety comes first, it's that simple," said Arnold. 13 | A total of four road safety inspections were carried out and a roundabout was also considered, however, this idea was rejected on account of the narrowness of the Sulzbachweg/Kirchstrasse junctions. 
14 | According to Arnold, every possible test was carried out prior to the selection of the location for the traffic light posts: "Using a goods vehicle loaded with particularly long tree trunks, we also tested whether such vehicles could access the B 33 from the Sulzbachweg without knocking over the traffic light posts". 15 | The traffic light system itself, which cost around EUR 15,000, is the "most modern system that is currently available on the market," explained Arnold. 16 | The system is fitted with coloured LEDs, which are bright enough that drivers can easily see the lights, even when the sun is low in the sky. 17 | And they are also energy-efficient: The older light systems consume around 100 Watts, with the new ones consuming just eight. 18 | There are three sets of lights per direction of travel. 19 | Arnold explained the technology used by the new system: It is fitted with two radar sensors. 20 | If the pedestrian presses the button at the traffic lights, the top radar sensor checks the traffic status. 21 | If the street is clear, the pedestrian obtains a green light immediately, if not, there is a delay of around 15 seconds. 22 | An additional radar sensor checks whether the green phase for the pedestrian can be ended. 23 | "If a group of people or if disabled persons are crossing the street, the green phase is extended, thus ensuring that everyone gets safely across the street," explained Arnold. 24 | Of course, drivers must also play their part and keep their eyes on the road. 25 | Yesterday this was not the case: The light had barely turned green for pedestrians when a luxury vehicle sped through on a red light. 26 | For more than 30 years, Josef Winkler has been writing from the heart, telling of the hardships of his childhood and youth. 27 | The catastrophes of his Catholic village upbringing - the speechlessness, his tendency towards brute force and dulled sexuality, the confinement and lack of joy - all of this has been described many times by the Kaernten-born poet. 28 | The Büchner prizewinner is known primarily as a writer of prose, with theatre texts something of a rarity for him. 29 | In a collage of prose texts For his performance piece, "Wetterleuchten auf der Zungenspitze" (Summer lightning on the tip of your tongue), which can now be seen in Garage X on Petersplatz, Gerhard Fresacher ,creates a collage of prose texts. 30 | The theatre producer has thus combined elements from the autobiographically inspired novel "Der Leibeigene" (1987) [The Bondsman] featuring prose miniatures from "Leichnam, seine Familie belauernd" (2003) [Corpse, stalking his family]. 31 | On the predominantly empty stage - with one important requirement: a crumpled sofa, on which cast members allude to copulating and masturbating - the eight-person ensemble work their way through the text material. 32 | However, Director Fresacher seems to have little trust in the text. 33 | The 70-minute performance glosses over the script with a host of director's additions, well-known from the repertoire of post-dramatic theatrical styles. 34 | In particular, the actresses play a major role in the sometimes rather dubious staging. 35 | They are manhandled, their heads held under water, tacked to the wall by their evening gowns. 36 | Wrapped in cellophane or in girdles, they stumble on dangerously high heels across the set, either delivering monologues at the top of their voices or lying completely silent on the stage. 37 | However, the source text makes barely any reference to this intense delivery. 
38 | The best moments of the evening is when the singing starts - tracks range from Deep Purple to traditional folk songs. 39 | Only towards the end does the highly charged performance start to wind down, and we see flashes of Winkler's somewhat absurd sense of humour. 40 | A black box in your car? 41 | As America's road planners struggle to find the cash to mend a crumbling highway system, many are beginning to see a solution in a little black box that fits neatly by the dashboard of your car. 42 | The devices, which track every mile a motorist drives and transmit that information to bureaucrats, are at the center of a controversial attempt in Washington and state planning offices to overhaul the outdated system for funding America's major roads. 43 | The usually dull arena of highway planning has suddenly spawned intense debate and colorful alliances. 44 | Libertarians have joined environmental groups in lobbying to allow government to use the little boxes to keep track of the miles you drive, and possibly where you drive them - then use the information to draw up a tax bill. 45 | The tea party is aghast. 46 | The American Civil Liberties Union is deeply concerned, too, raising a variety of privacy issues. 47 | And while Congress can't agree on whether to proceed, several states are not waiting. 48 | They are exploring how, over the next decade, they can move to a system in which drivers pay per mile of road they roll over. 49 | Thousands of motorists have already taken the black boxes, some of which have GPS monitoring, for a test drive. 50 | This really is a must for our nation. 51 | "It is not a matter of something we might choose to do," said Hasan Ikhrata, executive director of the Southern California Assn. of Governments, which is planning for the state to start tracking miles driven by every California motorist by 2025. 52 | There is going to be a change in how we pay these taxes. 53 | The technology is there to do it. 54 | The push comes as the country's Highway Trust Fund, financed with taxes Americans pay at the gas pump, is broke. 55 | Americans don't buy as much gas as they used to. 56 | Cars get many more miles to the gallon. 57 | The federal tax itself, 18.4 cents per gallon, hasn't gone up in 20 years. 58 | Politicians are loath to raise the tax even one penny when gas prices are high. 59 | "The gas tax is just not sustainable," said Lee Munnich, a transportation policy expert at the University of Minnesota. 60 | His state recently put tracking devices on 500 cars to test out a pay-by-mile system. 61 | "This works out as the most logical alternative over the long term," he said. 62 | Wonks call it a mileage-based user fee. 63 | It is no surprise that the idea appeals to urban liberals, as the taxes could be rigged to change driving patterns in ways that could help reduce congestion and greenhouse gases, for example. 64 | California planners are looking to the system as they devise strategies to meet the goals laid out in the state's ambitious global warming laws. 65 | But Rep. Bill Shuster (R-Pa.), chairman of the House Transportation Committee, has said he, too, sees it as the most viable long-term alternative. 66 | The free marketeers at the Reason Foundation are also fond of having drivers pay per mile. 67 | "This is not just a tax going into a black hole," said Adrian Moore, vice president of policy at Reason. 68 | People are paying more directly into what they are getting. 69 | The movement is also bolstered by two former U.S. 
Transportation secretaries, who in a 2011 report urged Congress to move in the pay-per-mile direction. 70 | The U.S. Senate approved a $90-million pilot project last year that would have involved about 10,000 cars. 71 | But the House leadership killed the proposal, acting on concerns of rural lawmakers representing constituents whose daily lives often involve logging lots of miles to get to work or into town. 72 | Several states and cities are nonetheless moving ahead on their own. 73 | The most eager is Oregon, which is enlisting 5,000 drivers in the country's biggest experiment. 74 | Those drivers will soon pay the mileage fees instead of gas taxes to the state. 75 | Nevada has already completed a pilot. 76 | New York City is looking into one. 77 | Illinois is trying it on a limited basis with trucks. 78 | And the I-95 Coalition, which includes 17 state transportation departments along the Eastern Seaboard (including Maryland, Pennsylvania, Virginia and Florida), is studying how they could go about implementing the change. 79 | The concept is not a universal hit. 80 | In Nevada, where about 50 volunteers' cars were equipped with the devices not long ago, drivers were uneasy about the government being able to monitor their every move. 81 | "Concerns about Big Brother and those sorts of things were a major problem," said Alauddin Khan, who directs strategic and performance management at the Nevada Department of Transportation. 82 | It was not something people wanted. 83 | As the trial got underway, the ACLU of Nevada warned on its website: "It would be fairly easy to turn these devices into full-fledged tracking devices." 84 | There is no need to build an enormous, unwieldy technological infrastructure that will inevitably be expanded to keep records of individuals' everyday comings and goings. 85 | Nevada is among several states now scrambling to find affordable technology that would allow the state to keep track of how many miles a car is being driven, but not exactly where and at what time. 86 | If you can do that, Khan said, the public gets more comfortable. 87 | The hunt for that technology has led some state agencies to a small California startup called True Mileage. 88 | The firm was not originally in the business of helping states tax drivers. 89 | It was seeking to break into an emerging market in auto insurance, in which drivers would pay based on their mileage. 90 | But the devices it is testing appeal to highway planners because they don't use GPS and deliver a limited amount of information, uploaded periodically by modem. 91 | "People will be more willing to do this if you do not track their speed and you do not track their location," said Ryan Morrison, chief executive of True Mileage. 92 | There have been some big mistakes in some of these state pilot programs. 93 | There are a lot less expensive and less intrusive ways to do this. 94 | In Oregon, planners are experimenting with giving drivers different choices. 95 | They can choose a device with or without GPS. 96 | Or they can choose not to have a device at all, opting instead to pay a flat fee based on the average number of miles driven by all state residents. 97 | Other places are hoping to sell the concept to a wary public by having the devices do more, not less. 
98 | In New York City, transportation officials are seeking to develop a taxing device that would also be equipped to pay parking meter fees, provide "pay-as-you-drive" insurance, and create a pool of real-time speed data from other drivers that motorists could use to avoid traffic. 99 | "Motorists would be attracted to participate because of the value of the benefits it offers to them," says a city planning document. 100 | Some transportation planners, though, wonder if all the talk about paying by the mile is just a giant distraction. 101 | -------------------------------------------------------------------------------- /test/corpus/eval/testset2.ref.1: -------------------------------------------------------------------------------- 1 | Good : More safety for pedestrians ! 2 | They are not 100 metres from each other : on Tuesday the new B 33 pedestrian lamp on the village parking lot was put into operation in Gutach , within sight of the older town hall lights . 3 | Two installations so close to one another : intention or centrifugal ? 4 | This question was answered clearly yesterday by Gutach 's mayor . 5 | " The Rathausampel was installed at the time because it secures the school path " , explains Eckert yesterday . 6 | Car traffic is safe for cyclists , bus passengers and mountain residents . 7 | The plant , officially opened yesterday , was important for the Crusade of Sulzbachweg / Kirchstraße . 8 | We have the museum , two churches , a spa park , the bus stop , a doctor and a bank as well as the traffic flow from the ‘ Grub ’ residential area . 9 | " In view of the high traffic and pedestrian traffic volumes , a further traffic light had to be produced for their safety " , Eckert says . 10 | This is also confirmed by Peter Arnold of the State Council in Offenburg . 11 | " According to current measurements , approximately 12 000 vehicles pass through the municipal district of Gutach on the B 33 daily , of which about ten percent are heavy goods traffic , " Arnold says . 12 | This is why the construction of another light is more than necessary : " Security is easy here " , says Arnold . 13 | In total , four traffic surveys were carried out , and a roundabout was also planned , but were discarded because of the congestion at the Sulzbachweg / Kirchstraße intersection . 14 | According to Arnold , everything was tested before at the choice of traffic lights : " By means of an extra transporter loaded for us with particularly long wooden trunks we could check out on the lights . 15 | The traffic light system , which costs approximately 15,000 euros , is the " most modern that exists on the market " , explained Arnold . 16 | The system is equipped with coloured LEDs that shine so strongly that the lights are visible from the motorists , for example , even in the deep sun . 17 | And it is also economical : the older lighting systems consume about 100 watts , the new just eight watts . 18 | There are three light installations per direction of travel . 19 | Arnold explained the technique of the new system : This is equipped with two radar sensors . 20 | If the pedestrian hits the traffic light , the upper radar sensor tests the traffic situation . 21 | If the road is free , there is direct green for the pedestrian , if not it takes about 15 seconds . 22 | Another radar sensor checks whether the initial phase can be stopped for the pedestrian . 
23 | " If a group or disabled people go across the road , the green period will be extended , so everyone will come safely over the lane , " said Arnold . 24 | Of course , the driver had to remember as a partner and watch the track . 25 | This was not the case yesterday : the traffic light for pedestrians showed green , the telescope smoked through an upper class vehicle - at glowing red . 26 | Josef Winkler has been writing away the needs of his childhood and youth for more than 30 years . 27 | The catastrophes of his Catholic village kindness - voicelessness , inclination to raw violence and stumbling sexuality , the duck and joy - have been described many times by the Carinthian poet . 28 | The prize-winner of the Büchner Prize is primarily known as a prosaist , and the theatre texts are rare in his work . 29 | Collage of prose texts Gerhard Fresacher presents for his performance " Weather luminaires on the tip of the tongue " , which can now be seen in Garage X on Petersplatz . 30 | The theater maker , for example , combines elements from the autobiographical novel " Der Leibeigene " ( 1987 ) with prosamines from " Leichnam , reviving his family " ( 2003 ) . 31 | On the largely empty stage - important Requisit : a think-nautical sofa , which is impressively copied and masturbated - the eight-headed ensemble is adorned . 32 | But director Fresacher seems to have little faith in the text . 33 | The 70-minute performance surpasses the original with a plethora of rainfalls , known from the repertoire of post-dramatic forms of play . 34 | The actresses , in particular , come to the attention of somewhat questionable enslavement . 35 | They are tightly attached , dipped with the head under water , stained with their evening albums to the wall . 36 | In cellophane or moody , they are forced into dangerously high sticks by the staging , either monologise them loudly or lie completely silent on the stage . 37 | The text is hardly conveyed in this narrow way . 38 | The best moments are the evening when sung - the bandwidth ranges from Deep Purple to popular song . 39 | It is only towards the end that the above performance comes to rest , and Winklers barely lighten humour . 40 | A black box in the car ? 41 | US street planners are looking for a source of money to fix the decaying Highway system , and believe they have found the solution in a small black box that can fit in the dashboard of each car . 42 | The devices that record each kilometre and report information to the authorities are at the heart of a controversial attempt by Washington and the state planning offices to overhaul the obsolete system for financing US roads . 43 | The normally rather boring area of road planning has suddenly sparked an intense debate with colourful alliances . 44 | Liberals have allied with environmental groups and argue that the government can use the small boxes to record the driven kilometers – and possibly even where they were driven – and then use the information modestly for calculation . 45 | The Tea Party is horrified . 46 | The American Civil Rights Association ( ACLU ) is also deeply concerned and expresses a number of concerns about data protection . 47 | But while it is not possible to agree on a course of action in Congress , several states are no longer waiting . 48 | They are currently considering ways to switch over the next ten years to a system where drivers pay per mile . 49 | Thousands of drivers have already tested the tachograph , some of which are equipped with GPS monitoring . 
50 | This is a must for our country . 51 | “ It ’ s not something we ’ ll possibly be using , ” said Hasan Ikhrata , Managing Director of Southern California Assn. of Governments , who plans to record Mileage for all California ’ s motorists . 52 | The way we pay these taxes will change . 53 | The technology is there . 54 | The initiative comes at a time when the Highway Trust Fund , financed from the taxes paid by Americans at the fuel column , is bust . 55 | In America , however , there is no longer as much mowing as before . 56 | Cars consume less petrol . 57 | The state fuel tax of 18.4 cents per gallon ( less than 4 cents per litre ) has not risen for 20 years . 58 | Politicians do not dare to raise the tax even to raise one cent at high cost . 59 | " The gasoline tax is simply not sustainable , " says Lee Munnich , an expert on traffic legislation at the University of Minnesota . 60 | Its state recently equipped 500 cars with tachographs to test a most-based payment system . 61 | “ This is the most viable alternative in the long term , ” he said . 62 | Bureaucrats call it a most-based user fee . 63 | It is not surprising that the idea is popular with urban liberals , for example , the tax could be used to influence driving behaviour in such a way that congestion and climate-damaging exhaust gases are reduced . 64 | California planners rely on the system to develop strategies to meet the Federal State 's ambitious , legally binding climate change targets . 65 | But the Republican Bill Shuster of Pennsylvania , Chairman of the House Transportation Committee , has also declared that he sees this as the most viable long-term alternative . 66 | The free marketers of the Reason Foundation are also inspired by the idea of having drivers pay after a reduced distance . 67 | “ This is not a tax that disappears in a black hole , ” explains Adrian Moore , Vice President for Policy at Reason . 68 | People pay directly for what they get . 69 | The movement is also supported by two former US Transport Ministers who , in a report in 2011 , called on Congress to move in the direction of mile-based accounting . 70 | The U.S. Senate approved a $ 90 million pilot project last year that would have included 10,000 cars . 71 | However , the majority in the House of Representatives prevented the process and responded to the concerns of MEPs from rural areas who represent people who often have to travel many miles on their way to work or to the city . 72 | Several states and large cities are moving in this direction . 73 | Oregon is currently the most dedicated driver to the country 's largest experiment . 74 | These drivers will soon pay the mileage tax instead of the fuel tax to the state . 75 | Nevada has already completed a pilot project . 76 | New York City is also considering this . 77 | Illinois uses trucks to test them . 78 | And the I-95 coalition , which includes the transport ministries of 17 states on the east coast ( including Maryland , Pennsylvania , Virginia and Florida ) , is currently investigating how to introduce the change . 79 | The concept is not a universal hit . 80 | In Nevada , where recently 50 volunteers were equipped with the equipment , motorists were sceptical about the idea that the government could follow any of its movements . 81 | " Concerns about Big Brother and such things were a major problem , " says Alauddin Khan , Head of Strategy and Rescue Management at the Ministry of Transport in Nevada . 82 | People did not want it . 
83 | When the test started , the ACLU warned of Nevada on their website : “ It would be relatively easy to turn the tachograph into full-grown surveillance devices . ” 84 | There is no need to build a gigantic , cumbersome technological infrastructure that would inevitably be used to collect data about the daily movements of individuals . 85 | Nevada is one of a number of states that are now looking for affordable technology to cover the kilometers passed , but not exactly when and where . 86 | According to Khan , the public would be more reassured . 87 | The search for this technology led some authorities to a small start-up company called True Mileage in California . 88 | The company did not initially intervene to help states tax their car drivers . 89 | Rather , it was their goal to gain a foothold in an emerging market for car insurance , in which drivers would pay on the basis of the mileage they had travelled . 90 | But the devices she tested are also interesting for street planners , because they do not work with GPS and provide limited information that is regularly uploaded via modem . 91 | " People are more willing to participate if their speed and locations are not recorded , " said Ryan Morrison , Managing Director of True Mileage . 92 | Major mistakes were made in some of these public pilot programmes . 93 | There are much cheaper and less intrusive ways to implement this . 94 | In Oregon , the planers experiment with giving drivers a number of choices . 95 | You can choose a device with or without GPS . 96 | Or they don ’ t choose any equipment at all and pay a flat rate instead on the basis of the mileage averaged by all inhabitants of the state . 97 | Others hope to be able to sell the concept of a suspicious public by providing the equipment with more functions than with lesser . 98 | In New York City , traffic officials want to develop a tool for taxation , which also pays parking fees , pays insurance for miles only , and allows drivers to leak speed data from other vehicles in real time . 99 | " Drivers would be motivated to participate by the added value of the benefits the system offers , " says a city planning document . 100 | Some traffic planners are wondering , however , if all the talk about paying per mile is not just a huge diversion exercise . 101 | -------------------------------------------------------------------------------- /test/corpus/eval/testset2.ref.2: -------------------------------------------------------------------------------- 1 | Good : More safety for pedestrians ! 2 | They are not 100 metres from one another : on Tuesday the new B 33 pedestrian lamp on the village car park was put into operation in Gutach , within sight of the older town hall platform . 3 | Two installations so close to one another : intention or bourgeois empire ? 4 | This question was answered clearly yesterday by Gutachs Mayor . 5 | " The town hall lamp was installed at the time because it ensures the school path " , according to Eckert yesterday . 6 | The local traffic lights are safe for cyclists , bus passengers and mountain residents . 7 | The plant , officially opened yesterday , was important for the Sulzbachweg / Kirchstraße crusade . 8 | We have the museum , two churches , spa park , bus stop , a doctor and a bank as well as the traffic flow from the residential area ‘ Grub ’ . 9 | " The high traffic and pedestrian traffic volumes had to produce another traffic light for their safety " , Eckert says . 
10 | This is also confirmed by Peter Arnold , the State Council of Offenburg . 11 | " According to current measurements , approximately 12 000 vehicles pass the municipal district of Gutach on the B 33 daily , of which about ten percent are heavy goods traffic , " Arnold says . 12 | That is why the construction of another light is more than necessary : " Security is easy here " , says Arnold . 13 | In total , four traffic surveys were carried out , and a roundabout was also planned , but were discarded because of the duct in the Sulzbachweg / Kirchstraße intersection . 14 | According to Arnold , everything was tested before at the choice of traffic lights : " By means of an extra transporter with particularly long wooden trunks we could test out whether these vehicles had been taken out of the vehicle . 15 | The 15,000 Euros traffic light system itself is the " most modern that exists on the market " , explained Arnold . 16 | The system is equipped with coloured LEDs that shine so strongly that the lights are visible from the motorists , for example , even when the sun is deep . 17 | And it is also economical : the older lighting systems consume about 100 watts , the new just eight watts . 18 | There are three light installations per direction . 19 | Arnold explained the technique of the new system : This is equipped with two radar sensors . 20 | If the pedestrian hits the traffic light , the upper radar sensor tests the traffic situation . 21 | If the road is free , there is direct green for the pedestrian , if not it takes about 15 seconds . 22 | Another radar sensor checks whether the initial phase can be finished for the pedestrian . 23 | " If a group or disabled people go across the road , the green period is extended , so everyone comes safely over the road , " said Arnold . 24 | Of course , the driver had to remember as a partner and watch the track . 25 | This was not the case yesterday : there was hardly any show of traffic lights for pedestrians green , there was a clamping vehicle wiping through - at glowing red . 26 | Josef Winkler has written the needs of his childhood and youth from the soul for more than 30 years . 27 | The catastrophes of his catholic village kindness - voicelessness , inclination to raw violence and stumbling sexuality , the ducks and joy - have been described many times by the Carinthian poet . 28 | The prize-winner of the Büchner Prize is known primarily as a protector , and theater texts are rare in his work . 29 | Collage from prose texts Gerhard Fresacher presents for his performance " Wetterleuchten auf der Tongenspitze " , which is now shown in Garage X on Petersplatz , a collage of prosaken texts . 30 | The theater maker , for example , combines elements from the autobiographical novel " The Sergeant " ( 1987 ) with " Leichnam , reviving his family " ( 2003 ) . 31 | On the largely empty stage - important Requisit : a tender nautical couch , which is impressively copied and masturbated - the eight-headed ensemble pins itself . 32 | Director Fresacher seems to have little faith in the text . 33 | The 70-minute performance embodies the original with a wealth of rainfalls , known from the repertoire of post-dramatic forms of play . 34 | The actresses , in particular , come to the attention of the somewhat questionable ensemble . 35 | They are pumped hard , dipped with the head under water , with their evening robes thrown to the wall . 
36 | Embedded in cellophane or moody , they rely on dangerously high sticks by the staging , either monologise them loudly or lie completely silent on the stage . 37 | The text is hardly conveyed in this narrow way . 38 | The best moments are the evening when sung - the bandwidth ranges from Deep Purple to popular song . 39 | It is not until the end that the overturned performance comes to rest , and Winklers barely lighten humour . 40 | A black box in the car ? 41 | US street planners are looking for a source of money to fix the decaying Highway system , and believe that they have found the solution in a small black box that fits in the car ’ s dashboard . 42 | The devices that record each kilometre and report information to the authorities are at the heart of a controversial attempt by Washington and the state planning offices to overhaul the outdated US street financing system . 43 | The normally rather boring area of road planning suddenly sparked an intense debate with colourful alliances . 44 | Libertarians have allied with environmental groups and argue that the government can use the small boxes to record the driving kilometers – and possibly also where they were driven – and then use the information modestly for the calculation . 45 | The Tea Party is appalled . 46 | The American Civil Rights Association ( ACLU ) is also deeply concerned and expresses a number of concerns about data protection . 47 | But while it is not possible to agree on a course of action in Congress , several states are no longer waiting . 48 | They are currently examining ways to switch over the next ten years to a system where drivers pay per mile . 49 | Thousands of drivers have already tested the tachograph , some of which are equipped with GPS monitoring . 50 | This is a must for our country . 51 | “ It ’ s not something we can only possibly use , ” said Hasan Ikhrata , Managing Director of Southern California Assn. of Governments , who plans to record miles for all California drivers in the federal state of California . 52 | The way we pay these taxes will change . 53 | The technology is there . 54 | The initiative comes at a time when the Highway Trust Fund , financed from the taxes paid by Americans at the fuel column , is dead . 55 | But in America there is no longer as much mowing as before . 56 | Cars consume less petrol . 57 | The state fuel tax of 18.4 cents per gallon ( less than 4 cents per litre ) has not risen for 20 years . 58 | Politicians do not dare to raise the tax just to raise one cent at high cost . 59 | “ The gasoline tax is simply not sustainable ” , says Lee Munnich , an expert on traffic legislation at the University of Minnesota . 60 | Its state recently equipped 500 cars with tachographs to test a most-based payment system . 61 | “ This is the most viable alternative in the long term , ” he said . 62 | Bureaucrats call it a role-based user fee . 63 | It is not surprising that the idea is popular with urban liberals , for example , because the tax could be used to influence driving behaviour in such a way that congestion and climate-damaging exhaust gases are reduced . 64 | California planners rely on the system to develop strategies to achieve the federal state 's ambitious , legally binding climate change targets . 65 | But the Republican Bill Shuster of Pennsylvania , Chairman of the House Transportation Committee , has also declared that he sees this as the most viable long-term alternative . 
66 | The free marketers of the Reason Foundation are also inspired by the idea of paying drivers after a reduced distance . 67 | “ This is not a tax that disappears in a black hole , ” explains Adrian Moore , Vice President of Reason Policy . 68 | People pay directly for what they get . 69 | The movement is also supported by two former US Transport Ministers who , in a report in 2011 , called on Congress to move towards the most-based settlement . 70 | The U.S. Senate approved a $ 90 million pilot project last year that would have included 10,000 cars . 71 | But the majority in the House of Representatives prevented the process and responded to the concerns of MEPs from rural areas who represent people who often travel many miles on their way to work or to the city . 72 | Several states and cities are moving in this direction . 73 | Oregon is currently the most dedicated driver to the country 's largest experiment . 74 | These drivers will soon pay the mileage tax instead of the fuel tax to the state . 75 | Nevada has already completed a pilot project . 76 | New York City is also considering this . 77 | Illinois limited testing of vehicles . 78 | And the I-95 coalition , which includes the transport ministries of 17 states on the east coast ( including Maryland , Pennsylvania , Virginia and Florida ) , is currently investigating how to introduce the change . 79 | The concept is not a universal hit . 80 | In Nevada , where recently 50 volunteers were equipped with the equipment , drivers were sceptical about the government being able to follow any of its movements . 81 | “ Concerns about Big Brother and such things were a major problem , ” explains Alauddin Khan , Head of Strategy and Results Management at the Ministry of Transport in Nevada . 82 | People did not want it . 83 | When the test started , the ACLU warned Nevada to its website : “ It would be relatively easy to turn the tachograph into full-grown surveillance devices . ” 84 | There is no need to build a gigantic , cumbersome technological infrastructure that would inevitably be used to collect data on the daily movements of individuals . 85 | Nevada is one of a number of states that are now looking for affordable technology to cover the kilometers passed , but not exactly when and where . 86 | According to Khan , the public would be more reassured . 87 | The search for this technology led some authorities to a small start-up company called True Mileage in California . 88 | The company did not initially intervene to help states tax their car drivers . 89 | Instead , their aim was to gain a foothold in an emerging market for motor insurance , in which drivers would pay on the basis of the mileage they had travelled . 90 | However , the devices she tested are also interesting for street planners , because they do not work with GPS and provide limited information that is regularly uploaded via modem . 91 | " People are more willing to participate if their speed and locations are not recorded , " said Ryan Morrison , Managing Director of True Mileage . 92 | Major mistakes were made in some of these public pilot programmes . 93 | There are much cheaper and less intrusive ways to implement this . 94 | In Oregon , the planers experiment with giving drivers a number of choices . 95 | You can choose a device with or without GPS . 96 | Or you choose no equipment at all and pay a lump sum instead on the basis of the mileage averaged by all inhabitants of the state . 
97 | Others hope to be able to sell the concept of a suspicious public by providing the equipment with more functions than with lesser ones . 98 | In New York City , traffic officials want to develop a tool for taxation , which also pays parking fees , pays insurance only for driving miles , and allows drivers to leverage speed data from other vehicles in real time . 99 | “ Drivers would be motivated to participate by the added value of the benefits the system offers , ” says a city planning document . 100 | However , some traffic planners wonder if all the talk about the amount per mile is not just a huge diversion . 101 | -------------------------------------------------------------------------------- /test/corpus/eval/testset3.out: -------------------------------------------------------------------------------- 1 | 然而,知情人士表示,潜在竞购者提出的指示性报价与美银的预期不符。 2 | 许多人期望DTCC能“履行”自己的职位,但他们不是成员,没有任何要求,只是被迫单独管理交易的清算。 3 | 此举被广泛视为中国政府努力解决公众对与外国援助项目相关的腐败指控的戒心的一部分,尤其是那些由中国资助的项目。中国在没有竞标的情况下提名供应商和承包商。 4 | 那不是借口。 5 | TPG联合创始人戴维・邦德曼在上世纪80年代末和90年代初的储蓄和贷款危机之后帮助他树立了声誉,他于1996年以巨额利润参与了收购美国储蓄银行及其后出售给华盛顿互惠银行。 6 | 从这个数字来看,经过3年的强劲增长,中国目前的资产总量为2836亿元人民币。 7 | 这将帮助我们留住用户。” 8 | 共有37家外国基金管理公司通过与中资集团的合资企业中的少数股权,在中国开展业务。 9 | “你会认为,有些参议员想给他一个奖。” 10 | 该地区人口超过6亿,国内生产总值总和超过巴西或印度。 11 | 图克韦尔创立的ETF Securities目前提供300多种交易所交易产品,这些产品原本是禁止所有人交易的,但只有那些足够富有的人才能直接交易期货。 12 | 2006年6月,在新总行召开的一次战略会议上,一些董事会成员质疑该行是否雄心勃勃。 13 | 就连竞选美国总统的共和党地产大亨唐纳德・特朗普也对对冲基金经理大加抨击,称他们是“送纸人”,不缴纳应得税。 14 | 欧盟正涉入震动伦敦金融城的利率操纵丑闻,提出一项提议,拟非法操纵欧盟各地的市场指数,并对有关Libor设定规则的基本审查。 15 | 他表示,此举是人民币全面自由化的一部分,人民币最终目标是完全可兑换-尽管这距离实现完全可兑换还有很长的路要走。 16 | 过去几个月,至少两名员工已表明,他们愿意泄露监管信息。传统上,监管信息被认为是严格保密的: 17 | 花旗集团发言人香农・贝尔对这笔交易“没有置评”。 18 | 在债市,2012年巴西企业创下了此前所有纪录,在美国上市发行的债券筹资433亿美元,而去年同期为305亿美元。 19 | ICAP在5月份的年报中表示,该集团的部分部门“收到一些政府机构的请求,要求它们提供有关Libor如何设定的调查信息”。 20 | 巴克莱资本估计,爱尔兰高级银行债务占国内生产总值的38%,低于比利时、西班牙、英国和荷兰。 21 | 当然,华盛顿和华尔街之间紧密的联系并不新鲜,尤其是在高盛方面。 22 | 澳大利亚的澳新银行和华侨银行以及新加坡的大华银行等都有报道。 23 | 奥斯本正考虑对这起事件展开公开调查,这个法庭可能会迫使银行家解释奥斯本在过去10年中期所称的金融城“系统贪婪”文化。 24 | 欧智华表示,汇丰著名的广告口号-“全球本地银行”-聚焦于零售分行,如今已具有误导性。 25 | 彼等拥有直接的交易及管理市场风险的职业生涯经验。 26 | 高级监管人士称,中国政府还试图压低飙升的住宅地产价格,并可能在未来几个月上调抵押贷款最低存款要求。 27 | 向银行购买有毒资产提供财务援助,并提供担保,以弥补这些资产的部分损失。 28 | 但他补充称,在处理不同国家的不同规则方面,存在法律障碍。 29 | Omfif计划5月在吉隆坡举行类似的会议。 30 | 民主是发展中国家的奢侈品。 31 | 威尔在回忆录中说,普林斯一直在“成长为这个职位”,而时代华纳首席执行官、花旗集团董事帕森斯今年早些时候对《财富》杂志表示,普林斯在被任命时没有足够的运营经验,但最终却成为“比我们希望的更好的领导人”。 32 | 根据从今年3月到12月的第一份合同,只要纽约交易的油价保持在每桶63.50美元以上,J. 
aron就必须每月支付这家中国集团30万美元。 33 | 瑞信预计,全球最富有的人中,有40%位于亚洲。该行希望通过利用资产管理和投行部门,从这些目标客户中获取更多收入,实际上,该行将加快向富有私人客户的产品分销。 34 | 这使得20%的估值相对好消息,即便这只是银行自己在今年银行业监管机构进行更正式评估之前提供的初步数据。 35 | 中国央行表示,正在评估这种行为是否“如果得到证实”,是否对香港银行同业拆借利率的设定产生“重大影响”。 36 | 这让香港和新加坡金融中心今年的奖金发放和裁员感到失望。 37 | “印度是同性恋相关法律没有改变但社会没有改变的地方之一。 38 | 他表示: “那些不急于在香港注册基金产品的基金经理可能会暂时松一口气。” 39 | 据一位知情人士透露,马斯特斯决定不去摩科瑞,但尚未决定下一步该怎么做。在备忘录中,她被形容为“把业务打造成了当今行业领先的特许经营”。 40 | 美林经历了两年的动荡。此前,美国房地产市场崩盘导致大规模资产减记,导致大量高管离职,令员工和客户士气以及经营业绩受挫。 41 | 如今,从美国银行从中国建设银行退出的无情部分来看,中国高管们年纪大了,也更聪明了。 42 | 美国财政部昨日禁止美国银行与汇业银行进行交易,此前一项调查得出结论,该澳门银行曾帮助朝鲜洗钱。 43 | AIGFP在其售出的120亿美元次级抵押贷款证券信用保险中,亏损4.54亿美元,表明该业务仍有多严重。 44 | 尼日利亚银行的未来发展可能对整个非洲大陆产生影响。 45 | 对投资者来说,最关键的因素之一是了解他们基金的开曼董事是谁。 46 | 最终,这是一种不双赢的局面。” 47 | 这份名单并不公开,它包含许多跨国银行的名字,这些名字将是人们普遍预期的: 48 | 香港银行同业拆借利率周三大幅上扬,加大了小银行从规模更大的竞争对手那里获取贷款的难度。 49 | 没有一群人在抽长烟。” 50 | 瑞银强调,该行“致力于”与中行及其它中国内地业务的关系。 51 | Mediobanca分析师克里斯托弗・惠勒表示: “这感觉像是一笔200亿至300亿美元的发行。” 52 | 它一直能够为投资者募资,从一只当地基金-亚洲另类基金-到美国华盛顿州。 53 | 由阿里巴巴旗下企业浙江蚂蚁金服旗下的库,持有6张新的银行牌照,其中一张是根据一项试点计划授予民营企业的6张银行牌照,该计划旨在实现中国金融体系多元化。 54 | 去年,大学毕业生报告的薪资中值为2.05万英镑。 55 | 他很有能力,而且非常亲力亲为。” 56 | 2000年代初格林斯潘担任美联储主席时,我曾报道过美联储,我对以这种教授风格与格林斯潘长期交谈的记忆很快又回到了我脑海中。 57 | 过去6个月,纽约五大投行中的3家因投资者蜂拥而倒闭,此举也被视为某种投降。 58 | “许多银行今年已经进行了6轮裁员,这让高管们感到恐慌,”一家大型投行部门的负责人表示。 59 | 与此同时,爱尔兰等欧洲国家正艰难地为过去银行纾困带来的公共债务负担再融资。 60 | 这家百年老交易所将此举作为香港加大投资力度、以人民币计价的举措的一部分,也是利用内地黄金需求旺盛而赚钱的尝试。与2010年相比,中国对黄金的需求仍在增长,截至2011年第二季度,黄金需求已跃升近40%。 61 | 这位官员不愿透露需要多少钱。 62 | 我们希望与那些经得起时间考验的公司合作,而不是那些只想赚钱的公司。” 63 | 22岁的儿子李兆辉剪短了朋克发型,开始经营这家公司。 64 | 高盛的私人股本部门、新加坡主权财富基金淡马锡和中国私人股本集团新视野持有少量股份。 65 | 香港、台湾和新加坡也有类似的配额协议。 66 | 面对主权债务危机,欧洲领导人争相恢复团结。德国单方面禁止裸卖空令盟国失望。 67 | 结果,投资者往往把债券和信托产品视为政府的隐性支持,这帮助中国自2009年以来迅速实现了信贷繁荣。 68 | 能源公司和其他大宗商品交易公司内部的交易量也令人担忧。 69 | 近年来,由于缺乏支付基础设施,中国信用卡市场开局缓慢,增长速度开始加快。 70 | 该委员会还希望,如果最大的审计网络将成员公司分割成独立的审计和咨询业务,如果这些业务超过一定的门槛,那么它就会超越美国等其它地区对“非审计”工作的限制。 71 | 事实上,依赖易得易得的资金是5年前许多银行需要纾困的一大原因。 72 | 随着基准的10年期美国国债收益率飙升至3%,大银行证券投资组合的利润从年初的近400亿美元骤降。 73 | 该财团的晚期变动将惹恼兴业银行,该行表示,其自己的集团仍保持坚挺。 74 | 遗憾的是,全球金融危机让许多投资者措手不及,富通的股价也未能幸免。” 75 | 在就职演说中,中国央行行长刘安东表示,将特别努力向农村家庭和企业提供信贷,以“促进”中国农村的发展。 76 | 一家全球航空公司的想法被吹捧为一个奇特但非常合乎逻辑的想法有多久了? 
77 | 《日经商报》报道称,预计该行将融资约4000亿日元,成为日本本财年规模最大的融资发行。 78 | 然而,中国银行本周表示,其他主要外国投资者中没有一家计划出售所持股份。中国银行目前仍持有中国政府的多数股权。 79 | 但摩根士丹利和咨询师奥纬咨询去年在一份报告中写道,当新规出台时,抵押品转换的收入“很可能抵消掉大部分损失”,这可能会抑制这些银行过去的一些赚钱行为。 80 | 换句话说,Countrywide和美林实际上都是破产机构,甚至支付1美元购入股本,这很可能是过头了。 81 | 随着自动化取代人,大约3000至4000名员工也将被从消费行业裁减。 82 | 建行表示: 83 | 但在金融危机之后,它们已走向全球监管努力的前沿,以改革衍生品市场,降低大银行构成的风险。 84 | 银行自己声称,这种转变是真实而持久的。 85 | 特里表示: “我们将看到一些波动,高薪银行家将离职,低薪人员将被招聘到同一个职位,以使商业模式在这一特定领域发挥作用。” 86 | 自我建议的大型并购的兴起,与总部位于加州的风险投资业崛起为科技公司最重要的单一资金来源,并非巧合。 87 | 这不可能是一种孤立的不当行为。 88 | 花旗的经营过程加剧了这种担忧。 89 | 美银授权去年收购的美林和瑞银安排在1月份出售这28亿美元的部分。 90 | 这笔道歉和纾困贷款是凯雷试图为凯雷资本(Carlyle Capital corporation,CCC)的危机划清界限。自7月4日上市以来,凯雷一直遭受着一连串坏消息。 91 | 在欧洲,法兰克福上升5位,超过卢森堡,华沙上升26位,第38位,都柏林上升6位,第46位。 92 | 他的首站将会见Mittelstand的几位主要成员。 mittelstand是德国经济支柱的家族企业。 93 | 这曾帮助印度企业走出国门,美国企业来印度。 94 | 相反,政治障碍似乎来自欧洲,人们担心即将出台的“另类投资基金管理公司”指令,可能会让欧盟投资者难以利用位于27个成员国之外的私人股本基金。 95 | 但是,出于对改革的渴望,欧洲政策制定者已经提出了从反效果到彻底行不通的提案草案。 96 | “好消息,” 97 | 传统银行正在反击,敦促加强监管,推出类似产品,提高存款利率,但迄今收效甚微。 98 | 高盛前中国区主管、中国最古老、最知名的私人股本公司之一Primavera的创始人胡祖六表示,国企内部有一些非常优秀、有才华的人,但他们缺乏推动改革的激励-无论是金融还是其他。 99 | -------------------------------------------------------------------------------- /test/corpus/eval/testset3.ref: -------------------------------------------------------------------------------- 1 | 但知情人士表示,潜在竞购方提出的指示性报价与美国银行的期望不符。 2 | 许多人期望DTCC“兑现”他们的头寸,但由于不是会员,他们没有任何索取权,只能被迫个人独自管理自己的交易清算事宜。 3 | 外界普遍认为,此举是菲律宾政府的对策之一,旨在应对公众对外国援助项目(尤其是中国提供资金支持的项目)有关的腐败指控的关注。 这些项目在没有竞争性竞标的情况下就指定了供应商和承包商。 4 | 这是无法开脱的。 5 | 上世纪80年代末、90年代初的储蓄和贷款危机之后,TPG创始人之一庞德文(David Bonderman)参与收购了美国储蓄银行(American Savings Bank),并在1996年将其出售给华盛顿互惠银行(Washington Mutual)并获利巨大,由此在这个领域一举成名。 6 | 在经历了三年强劲增长之后,信达目前的总资产规模为2836亿元人民币,由此可见彼时债务剥离的力度之大。 7 | 这将有助于我们留住用户。” 8 | 总共有37家外资资产管理公司通过与中国公司组建合资公司并掌控少数股权的形式,在中国从事经营活动。 9 | 你会感到有些参议员简直想给戴蒙颁个大奖。” 10 | 该地区人口超过6亿,合计国内生产总值(GDP)超过巴西或印度。 11 | 塔克韦尔创立的ETF证券公司(ETF Securities)现提供300只交易所交易型产品,它们的投资标的曾是富人的专利,只有财富多到可以直接交易期货的投资者才能接触到。 12 | 2006年6月,在新总部召开的一个战略会议上,一些董事会成员质疑该银行的理想是否足够远大。 13 | 就连正在竞选美国总统的地产大亨、共和党人唐纳德・特朗普(Donald Trump)都已向对冲基金经理们开了火,把他们称为“文员”,说他们缴纳的税款太少。 14 | 布鲁塞尔方面正介入撼动伦敦金融城的利率操纵丑闻,提议在整个欧盟范围内将操纵市场指数的尝试列为非法,并对伦敦银行间同业拆借利率(Libor)的设定方式展开根本性的审议。 15 | 陶冬表示,此举是向进一步放开人民币汇率迈出的一步,最终目标是实现人民币完全自由兑换,尽管这还有很长的路要走。 16 | 过去几个月里,有至少两名纽约联储银行自己的员工,表示出愿意披露以往一向被视为应严格保密的监管信息。 17 | 花旗发言人香农・贝尔(Shannon Bell)表示对此交易“无可奉告”。 18 | 在债券市场,2012年巴西公司在美国合计发债筹资433亿美元,创历史最高纪录,2011年则为305亿美元。 19 | 在其去年5月公布的年报中,ICAP表示该集团部分部门“收到了一些政府机构披露某些信息的要求,以调查Libor是如何制定的”。 20 | 巴克莱资本(Barclays Capital)估计,爱尔兰优先银行债务占到国内生产总值(GDP)的38%,低于比利时、西班牙、英国与荷兰。 21 | 华盛顿与华尔街关系密切当然不是新闻,尤其是就高盛来说。 22 | 据报道对永亨银行感兴趣的有澳大利亚的澳新银行(ANZ)以及新加坡的华侨银行(OCBC)和大华银行(UOB)。 23 | 眼下,奥斯本正考虑就此事开展公共调查,这一决定将迫使银行家对这样一个现象作出解释,即: 2000年至2010年这十年的中期,被奥斯本称为“系统性贪婪”的伦敦金融城文化。 24 | 欧智华表示,汇丰重点宣传零售分支机构的著名广告词——“全球金融、地方智慧”——现在存在误导作用。 25 | 他们在交易及管理市场风险方面都有着直接的工作经验。 26 | 据高层监管消息人士称,中国政府还有意降低不断飙升的住房价格,未来数月有可能提高住房抵押贷款的首付比例。 27 | 为购买银行不良资产提供财政援助,以及担保对这些资产的部分损失进行补偿。 28 | 但童道驰补充表示,在处理不同国家的不同法规方面存在一些司法障碍。 29 | OMFIF计划于5月份在吉隆坡召开类似会议。 30 | 对于一个发展中国家而言,民主政治是个奢侈品。 31 | 威尔在其自传中表示,普林斯对这项工作日益得心应手。 而帕森斯今年早些时候接受《财富》(Fortune)杂志采访时表示,普林斯刚刚就任的时候,缺乏足够的运营经验,但他最终成了“我们期望的更为优秀的领导者”。 帕森斯是时代华纳的首席执行官,兼任花旗集团董事。 32 | 根据从今年3月至12月有效的第一份合约,只要在纽约交易的油价保持在每桶63.50美元以上,J. 
Aron就必须每月向深南电支付30万美元。 33 | 据瑞信称,全球最富有的人群有40%在亚洲。 瑞信的目标是借助资产管理和投资银行部门加大对富有私人客户的产品销售力度,从亚洲的这一目标客户群体身上赚到更多收入。 34 | 这使20%的估计成为相对而言的好消息,即便这只是各银行自己提供的初步数据,中国银行业监管机构今年还将进行更为正式的评估。 35 | 香港金管局表示,它正在评估此类行为“如果得到证实”,是否对Hibor的设定产生“重大不利影响”。 36 | 其结果是,在香港和新加坡这两个金融中心,今年的奖金发放和裁员动作引起了业内人士的失望。 37 | “与有些地方一样,印度有关同性恋的法律没有变,但社会对待同性恋的态度已经改变。 38 | 他说: “没有赶紧在香港注册基金产品的基金公司可能会暂时松一口气。” 39 | 根据知情人士的说法,马斯特斯决定不去摩科瑞,不过她并未确定离职后做什么。 40 | 此前,由于与美国住房市场崩溃相关的巨额减记导致大量高管离职,员工及客户士气和运营绩效均遭受打击,美林曾经历了两年多的煎熬。 41 | 如今,对于美国银行(Bank of America)出售在中国建设银行所持的一半股份,这一近乎冷血的撤退行为,中国的高管们所采取的应对措施,明显已成熟睿智了许多。 42 | 美国财政部昨日宣布,禁止美国各家银行与汇业银行(Banco Delta Asia)进行业务往来。 此前有调查发现,这家澳门银行曾帮助朝鲜洗钱。 43 | 针对为次级抵押贷款证券提供的剩余120亿美元信用保险,该公司进行了4.54亿美元的价值减记,显示出这项业务仍然具有破坏性。 44 | 尼日利亚银行业未来的发展可能对整个非洲大陆具有借鉴意义。 45 | 对于投资者而言,最重要的问题之一已变成弄明白他们所投资的基金的开曼董事是谁。 46 | 最终,这是某种必输的局面。” 47 | 这份未公开的名单中,包含了人们普遍预期的诸多跨国银行: 48 | 香港银行同业拆息(HIBOR)周三大幅上涨,提高了小型银行从规模更大的竞争对手那里贷款的难度。 49 | 没有几个人在享受长时间的抽烟休息时间。” 50 | 瑞银强调称,仍“致力于”与中行的业务关系,及其在中国大陆的其它业务。 51 | 意大利中期银行(Mediobanca)分析师克里斯托弗・惠勒(Christopher Wheeler)说: “解决这一问题可能需要200亿至300亿美元。” 52 | 该公司曾为多种投资者成功筹资,从中国本土的基金的基金Asia Alternatives,到美国华盛顿州。 53 | 网商银行(MYBank)的母公司为阿里巴巴的附属公司——浙江蚂蚁小微金融服务集团有限公司(Zhejiang Ant Small & Micro Financial Services Group,简称“蚂蚁金服”)。 蚂蚁金服是中国6家获得新的银行牌照的民营企业之一。 授予民企银行牌照是旨在推动中国金融体系多元化的一个试点项目的一部分。 54 | 据报道,去年大学毕业生的年薪中值为2.05万英镑。 55 | 他很有能耐,也常常身体力行。” 56 | 在本世纪初格林斯潘还在担任主席时,我曾负责报道美联储,当年与他以这种专业风格长谈的记忆迅速重现于我的脑海。 57 | 此举也被视为某种意义上的投降。 在过去6个月,由于投资者慌张撤离,纽约五大投行已消失了三家。 58 | 一个大型投行部门的主管表示: “许多银行今年进行了六轮裁员,这把公司主管们吓坏了。 59 | 与此同时,爱尔兰等欧洲国家正在艰难地为之前银行纾困造成的公共债务负担进行再融资。 60 | 具有百年历史的香港金银业贸易场的这一举措,既体现了香港加大推出人民币计价投资产品的努力,也是试图从内地对黄金的强劲需求中分一杯羹。 中国对黄金的需求仍在增长,2011年第二季度需求同比增长了近40%。 61 | 这位官员没有指出具体将需要多少资金。 62 | 我们希望与经得起时间考验的公司合作,而不愿与那些只想圈钱的公司打交道。” 63 | 他的儿子、时年22岁的李兆会削短自己的朋克发型,开始管理业务。 64 | 高盛(Goldman Sachs)旗下私人股本公司、新加坡主权财富基金淡马锡(Temasek)以及中国私人股本集团新天域资本(New Horizons)持有双汇少量股权。 65 | 香港、台湾和新加坡也有类似的额度协议。 66 | 在德国单方面实行“裸卖空”禁令、引起盟国震惊后,欧洲各领导人急忙出面修补主权债务危机下的团结局面。 67 | 结果,投资者经常认为债券和信托产品得到政府的隐性担保,这也对2009年以来中国快速的信贷增长起了推波助澜的作用。 68 | 此外,人们对能源公司和其它大宗商品交易机构内部的交易量也感到担心。 69 | 中国的信用卡市场曾因欠缺支付基础设施而起步较慢,但近年已开始以更快速度增长。 70 | 欧盟委员会还希望超越美国等其它地区对“非审计”工作的限制,如果越过某个界限,四大将被迫把其成员所分拆为独立的审计和咨询业务。 71 | 事实上,依赖这些“来得快去得也快”的资金,是5年前大批银行需要纾困的一大原因。 72 | 随着基准10年期美国国债收益率蹿升至3%,各大银行的证券投资组合收益较年初的近400亿美元急剧减少。 73 | 财团的这个最新变化将令法国兴业银行感到苦恼。 法国兴业银行表示,它牵头的财团成员构成一直十分稳定。 74 | 但全球性金融海啸的席卷,令所有投资者始料未及,平安投资的富通也未能幸免,股价大幅下跌。” 75 | 中国邮政储蓄银行董事长刘安东在挂牌仪式上表示,通过为农村家庭和企业提供信贷,该银行将促进农村地区的发展。 76 | 建立全球航空公司这种大胆而又完全合乎逻辑的构想存在了多久? 
77 | 据日本《日经产业新闻》(Nikkei business daily)报道,预计瑞穗将筹集约4000亿日元,从而成为日本本财政年度最大的一笔融资交易。 78 | 不过,中行上周表示,该行的其他主要外国投资者没有出售所持股份的计划。 中国政府仍持有中行多数股权。 79 | 但摩根士丹利(Morgan Stanley)和咨询公司奥纬咨询(Oliver Wyman)在去年的一份报告中写道,在监管新规冲击银行以往的获利方式之时,来自抵押品转换的收入“可能将弥补了大部分损失”。 80 | 也就是说,Countrywide和美林事实上已经破产,即便支付1美元购买它们的股权,也很可能付出了过高的代价。 81 | 消费者业务部门也将裁减3000至4000名员工,得益于自动化取代人工。 82 | 建行表示: 83 | 但在金融危机以后,清算所却出现在一场全球监管运动的前沿——这场运动旨在改革衍生品市场、降低大银行带来的风险。 84 | 这些银行自称,转型是真实的、永久的。 85 | 特里表示: “我们将看到一些人才流失,薪资较高的银行家将离开,而薪资较低的人将被聘用到同样的职位,使这种业务模式在这个具体领域发挥作用。” 86 | 自我咨询的大型并购的兴起,和位于加州的风险投资业作为科技公司最重要的单一投资来源的崛起,发生在同一时间,这并非巧合。 87 | 巴克莱的过错不可能是孤立事件。 88 | 花旗的运作过程加深了这种担心。 89 | 美国银行今年1月出售28亿美元建行股份时,曾委任美林(Merrill Lynch)以及瑞银(UBS)安排相关事宜。 90 | 此次道歉和近几周来的第二笔纾困贷款,表明凯雷集团试图了解凯雷资本公司(Carlyle Capital Corporation)的危机。 自7月4日上市以来,凯雷资本公司连续遭遇了一系列坏消息的打击。 91 | 在欧洲,法兰克福的排名上升5位,超过了卢森堡,而华沙跃升26位至第38名,都柏林上升6位至第46名。 92 | 他的第一站将与德国中小型企业(Mittelstand)中的一些领先成员会面——这些家族企业构成了德国经济的支柱。 93 | 过去的方式是帮助印度企业走出去,美国企业走进来。 94 | 相反,欧洲似乎正开始出现政治障碍。 人们担心,即将颁布的“另类投资基金经理指令”(Alternative Investment Fund Managers directive),可能会使欧盟27个成员国的投资者难以利用欧盟外的私募股权基金。 95 | 但为了实行改革,欧洲的政策制定者已形成了多份提议草案,可这些草案不是效果适得其反,就是根本没有可行性。 96 | “好消息,” 97 | 传统银行正在反击,它们敦促实施更严厉的监管,并推出了类似的产品和提高了自己的存款利率,但迄今收效甚微。 98 | 胡祖六(Fred Hu)是高盛中国区前主管、中国成立最久、知名度最高的私募股权公司之一春华资本(Primavera Capital)的创始人。 他表示,国有企业拥有一些非常出色、极有才干的经营人才,但这些人缺乏推动变革的动力——不论是财务还是其他动力。 99 | -------------------------------------------------------------------------------- /test/corpus/resources/alignment/ende_backward.probs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenNMT/nmt-wizard-docker/a8469b2a292c1f38adfc3cffa35162ed9919f08e/test/corpus/resources/alignment/ende_backward.probs -------------------------------------------------------------------------------- /test/corpus/resources/alignment/ende_forward.probs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenNMT/nmt-wizard-docker/a8469b2a292c1f38adfc3cffa35162ed9919f08e/test/corpus/resources/alignment/ende_forward.probs -------------------------------------------------------------------------------- /test/corpus/resources/embeddings/dbpedia.ftz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenNMT/nmt-wizard-docker/a8469b2a292c1f38adfc3cffa35162ed9919f08e/test/corpus/resources/embeddings/dbpedia.ftz -------------------------------------------------------------------------------- /test/corpus/resources/subword/en_de.sp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenNMT/nmt-wizard-docker/a8469b2a292c1f38adfc3cffa35162ed9919f08e/test/corpus/resources/subword/en_de.sp -------------------------------------------------------------------------------- /test/corpus/vocab/de-vocab.txt: -------------------------------------------------------------------------------- 1 | , 2 | . 
3 | die 4 | der 5 | und 6 | in 7 | zu 8 | den 9 | für 10 | von 11 | ist 12 | das 13 | dass 14 | wir 15 | des 16 | nicht 17 | auf 18 | eine 19 | werden 20 | es 21 | im 22 | mit 23 | ich 24 | auch 25 | dem 26 | ein 27 | sich 28 | wird 29 | Ich 30 | Die 31 | haben 32 | sind 33 | hat 34 | um 35 | - 36 | Kommission 37 | über 38 | daß 39 | wie 40 | sie 41 | als 42 | an 43 | Herr 44 | zur 45 | einer 46 | Wir 47 | diese 48 | Sie 49 | Union 50 | uns 51 | Europäischen 52 | bei 53 | müssen 54 | einen 55 | dieser 56 | möchte 57 | Präsident 58 | Es 59 | noch 60 | Das 61 | können 62 | diesem 63 | vor 64 | nur 65 | ( 66 | kann 67 | ) 68 | : 69 | aber 70 | so 71 | nach 72 | Parlament 73 | zum 74 | wenn 75 | ! 76 | aus 77 | Bericht 78 | oder 79 | sehr 80 | sein 81 | Mitgliedstaaten 82 | durch 83 | " 84 | einem 85 | Der 86 | Frau 87 | dieses 88 | muss 89 | was 90 | wurde 91 | Europäische 92 | Europa 93 | ? 94 | mehr 95 | alle 96 | ihre 97 | Rat 98 | hier 99 | In 100 | diesen 101 | dies 102 | habe 103 | eines 104 | EU 105 | europäischen 106 | man 107 | zwischen 108 | gibt 109 | er 110 | damit 111 | keine 112 | unsere 113 | mich 114 | Frage 115 | gegen 116 | jedoch 117 | sondern 118 | Herrn 119 | war 120 | am 121 | anderen 122 | Maßnahmen 123 | heute 124 | sollte 125 | sollten 126 | unter 127 | dann 128 | sowie 129 | bereits 130 | da 131 | Menschen 132 | immer 133 | Entwicklung 134 | Parlaments 135 | – 136 | denn 137 | weil 138 | wurden 139 | bin 140 | meine 141 | denen 142 | darauf 143 | Vorschlag 144 | / 145 | vom 146 | unserer 147 | dafür 148 | sagen 149 | ganz 150 | geht 151 | jetzt 152 | ; 153 | dazu 154 | Wenn 155 | Kollegen 156 | Bürger 157 | ihrer 158 | mir 159 | neuen 160 | insbesondere 161 | Kommissar 162 | Arbeit 163 | würde 164 | muß 165 | Ihnen 166 | Zusammenarbeit 167 | Länder 168 | doch 169 | % 170 | wichtig 171 | Diese 172 | allem 173 | wollen 174 | Bereich 175 | Zeit 176 | machen 177 | Jahr 178 | Unterstützung 179 | einige 180 | bis 181 | allen 182 | ohne 183 | neue 184 | seine 185 | andere 186 | Rahmen 187 | Richtlinie 188 | Jahren 189 | ob 190 | wäre 191 | Dies 192 | Land 193 | tun 194 | Recht 195 | meiner 196 | Im 197 | Problem 198 | Namen 199 | Ländern 200 | schon 201 | Präsidentin 202 | einmal 203 | zwei 204 | Unternehmen 205 | Thema 206 | selbst 207 | Fall 208 | Sicherheit 209 | soll 210 | Wie 211 | also 212 | politischen 213 | Bedeutung 214 | darüber 215 | Fragen 216 | europäische 217 | viele 218 | etwas 219 | Meinung 220 | Lage 221 | worden 222 | nun 223 | Rates 224 | ihren 225 | besteht 226 | geben 227 | unterstützen 228 | Ziel 229 | Was 230 | viel 231 | Fraktion 232 | Politik 233 | wieder 234 | Aussprache 235 | Probleme 236 | daher 237 | Staaten 238 | wissen 239 | Zukunft 240 | Frauen 241 | Situation 242 | wirklich 243 | Regierung 244 | natürlich 245 | Zusammenhang 246 | gesagt 247 | Rolle 248 | Deshalb 249 | zwar 250 | Mittel 251 | möglich 252 | Abstimmung 253 | glaube 254 | nämlich 255 | Welt 256 | letzten 257 | unseren 258 | seiner 259 | davon 260 | betrifft 261 | Menschenrechte 262 | Weise 263 | könnte 264 | ihr 265 | Europas 266 | Punkt 267 | Herren 268 | daran 269 | Gemeinschaft 270 | Artikel 271 | Aber 272 | gegenüber 273 | Teil 274 | hoffe 275 | Schutz 276 | ja 277 | stellen 278 | darf 279 | während 280 | Und 281 | dabei 282 | große 283 | deren 284 | Berichterstatter 285 | Weg 286 | drei 287 | Vorschläge 288 | nationalen 289 | waren 290 | Damen 291 | kein 292 | Jahre 293 | politische 294 | Ansicht 295 | stehen 296 | sowohl 297 | Möglichkeit 298 | gut 299 | kommen 300 | 
lassen 301 | Rechte 302 | Kolleginnen 303 | darin 304 | Lösung 305 | Debatte 306 | führen 307 | Art 308 | Ebene 309 | besonders 310 | brauchen 311 | seit 312 | ihnen 313 | Euro 314 | bringen 315 | internationalen 316 | Umsetzung 317 | Änderungsanträge 318 | steht 319 | Beispiel 320 | Änderungsantrag 321 | gemeinsamen 322 | weiter 323 | Bezug 324 | Auch 325 | dürfen 326 | liegt 327 | Abgeordneten 328 | großen 329 | klar 330 | wo 331 | danken 332 | erhalten 333 | sozialen 334 | Hinblick 335 | Eine 336 | Demokratie 337 | Millionen 338 | Ein 339 | weiterhin 340 | aller 341 | deshalb 342 | innerhalb 343 | welche 344 | Grund 345 | ' 346 | gestimmt 347 | denke 348 | Daher 349 | Ausschusses 350 | Verhandlungen 351 | gemeinsame 352 | Ziele 353 | Entschließung 354 | Hilfe 355 | Wirtschaft 356 | sollen 357 | Ihre 358 | erreichen 359 | einigen 360 | Verantwortung 361 | Abkommen 362 | ersten 363 | handelt 364 | Verordnung 365 | wichtige 366 | gerade 367 | deutlich 368 | alles 369 | Ende 370 | Verfahren 371 | Grundlage 372 | finden 373 | Regionen 374 | schaffen 375 | werde 376 | notwendig 377 | solche 378 | Ausschuss 379 | Mit 380 | Institutionen 381 | Tatsache 382 | beiden 383 | sprechen 384 | beim 385 | Vertrag 386 | weitere 387 | gilt 388 | Auswirkungen 389 | gemacht 390 | Strategie 391 | würden 392 | will 393 | indem 394 | nichts 395 | Ausdruck 396 | unterstützt 397 | wirtschaftlichen 398 | kommt 399 | stellt 400 | Informationen 401 | Nach 402 | unser 403 | ebenfalls 404 | darum 405 | keinen 406 | 1 407 | weniger 408 | beispielsweise 409 | vielen 410 | Entscheidung 411 | Interesse 412 | Förderung 413 | eigenen 414 | Dank 415 | dessen 416 | Türkei 417 | verschiedenen 418 | seinen 419 | Als 420 | Beziehungen 421 | ab 422 | sehen 423 | hatte 424 | vielleicht 425 | Behörden 426 | Aus 427 | Kommissarin 428 | Vereinigten 429 | hinaus 430 | bedeutet 431 | Er 432 | genau 433 | Schaffung 434 | Erachtens 435 | Verbraucher 436 | Antwort 437 | besser 438 | nehmen 439 | öffentlichen 440 | Region 441 | Erweiterung 442 | einfach 443 | sogar 444 | aufgrund 445 | sei 446 | Notwendigkeit 447 | hätte 448 | erforderlich 449 | Zugang 450 | dort 451 | Bedingungen 452 | nächsten 453 | Bekämpfung 454 | angenommen 455 | könnten 456 | Gebiet 457 | sicher 458 | Da 459 | Dieser 460 | meinen 461 | Schritt 462 | Aufgabe 463 | Bevölkerung 464 | Beitrag 465 | Anwendung 466 | System 467 | all 468 | Verbesserung 469 | schriftlich 470 | hinsichtlich 471 | meines 472 | Reihe 473 | gehen 474 | soziale 475 | heißt 476 | ihrem 477 | Bei 478 | erreicht 479 | gute 480 | weiß 481 | wichtigen 482 | Krise 483 | Auf 484 | Fortschritte 485 | Standpunkt 486 | allerdings 487 | Kosten 488 | USA 489 | Forschung 490 | ihm 491 | Für 492 | begrüße 493 | gehört 494 | Kontrolle 495 | Interessen 496 | Berichts 497 | Beschäftigung 498 | Bereichen 499 | halte 500 | Wort 501 | -------------------------------------------------------------------------------- /test/corpus/vocab/en-vocab.txt: -------------------------------------------------------------------------------- 1 | the 2 | , 3 | . 
4 | of 5 | to 6 | and 7 | in 8 | that 9 | is 10 | a 11 | for 12 | I 13 | on 14 | this 15 | be 16 | we 17 | are 18 | have 19 | it 20 | not 21 | as 22 | with 23 | which 24 | European 25 | The 26 | will 27 | by 28 | ' 29 | has 30 | Mr 31 | Commission 32 | s 33 | an 34 | at 35 | would 36 | also 37 | should 38 | all 39 | - 40 | from 41 | We 42 | our 43 | must 44 | President 45 | but 46 | been 47 | Union 48 | Parliament 49 | you 50 | It 51 | can 52 | was 53 | Member 54 | more 55 | or 56 | report 57 | its 58 | States 59 | their 60 | Council 61 | there 62 | do 63 | like 64 | very 65 | ( 66 | ) 67 | Europe 68 | This 69 | these 70 | In 71 | they 72 | EU 73 | one 74 | countries 75 | about 76 | what 77 | other 78 | us 79 | so 80 | need 81 | who 82 | important 83 | : 84 | my 85 | people 86 | only 87 | new 88 | policy 89 | time 90 | because 91 | if 92 | no 93 | ? 94 | am 95 | up 96 | now 97 | support 98 | such 99 | out 100 | take 101 | make 102 | into 103 | between 104 | those 105 | work 106 | rights 107 | when 108 | therefore 109 | some 110 | being 111 | any 112 | them 113 | political 114 | made 115 | were 116 | ; 117 | way 118 | than 119 | Commissioner 120 | believe 121 | economic 122 | say 123 | many 124 | proposal 125 | against 126 | fact 127 | debate 128 | first 129 | social 130 | market 131 | That 132 | think 133 | Committee 134 | just 135 | development 136 | issue 137 | point 138 | years 139 | ’ 140 | situation 141 | human 142 | future 143 | order 144 | question 145 | cannot 146 | right 147 | country 148 | does 149 | citizens 150 | here 151 | want 152 | same 153 | had 154 | measures 155 | possible 156 | even 157 | two 158 | As 159 | could 160 | well 161 | / 162 | said 163 | public 164 | today 165 | most 166 | agreement 167 | already 168 | vote 169 | where 170 | much 171 | me 172 | good 173 | national 174 | clear 175 | too 176 | within 177 | – 178 | Mrs 179 | part 180 | still 181 | how 182 | financial 183 | There 184 | know 185 | However 186 | see 187 | year 188 | particular 189 | use 190 | common 191 | both 192 | cooperation 193 | course 194 | Community 195 | view 196 | % 197 | hope 198 | taken 199 | your 200 | under 201 | world 202 | international 203 | ensure 204 | level 205 | able 206 | over 207 | system 208 | area 209 | his 210 | case 211 | own 212 | place 213 | why 214 | process 215 | then 216 | put 217 | rapporteur 218 | means 219 | information 220 | energy 221 | problem 222 | number 223 | example 224 | may 225 | last 226 | action 227 | gentlemen 228 | without 229 | through 230 | House 231 | however 232 | If 233 | problems 234 | directive 235 | Madam 236 | particularly 237 | budget 238 | necessary 239 | he 240 | great 241 | give 242 | issues 243 | before 244 | protection 245 | areas 246 | end 247 | resolution 248 | given 249 | security 250 | respect 251 | Treaty 252 | regard 253 | position 254 | matter 255 | legal 256 | women 257 | law 258 | Group 259 | whether 260 | amendments 261 | ladies 262 | again 263 | far 264 | Members 265 | sector 266 | thank 267 | United 268 | behalf 269 | role 270 | since 271 | further 272 | proposals 273 | programme 274 | set 275 | terms 276 | basis 277 | services 278 | certain 279 | next 280 | favour 281 | trade 282 | done 283 | rules 284 | aid 285 | come 286 | adopted 287 | legislation 288 | different 289 | health 290 | better 291 | agree 292 | What 293 | help 294 | needs 295 | present 296 | framework 297 | crisis 298 | But 299 | continue 300 | opinion 301 | ask 302 | institutions 303 | change 304 | For 305 | after 306 | progress 307 | concerned 308 | wish 309 
| going 310 | decision 311 | group 312 | result 313 | State 314 | whole 315 | towards 316 | few 317 | welcome 318 | something 319 | approach 320 | working 321 | long 322 | voted 323 | forward 324 | three 325 | principle 326 | resources 327 | conditions 328 | A 329 | including 330 | current 331 | subject 332 | importance 333 | shall 334 | proposed 335 | go 336 | opportunity 337 | really 338 | did 339 | employment 340 | used 341 | specific 342 | million 343 | policies 344 | authorities 345 | together 346 | provide 347 | On 348 | increase 349 | Presidency 350 | especially 351 | every 352 | during 353 | transport 354 | regulation 355 | strategy 356 | industry 357 | negotiations 358 | account 359 | find 360 | clearly 361 | state 362 | general 363 | products 364 | another 365 | become 366 | environmental 367 | greater 368 | itself 369 | responsibility 370 | safety 371 | quite 372 | second 373 | research 374 | serious 375 | rather 376 | down 377 | least 378 | They 379 | No 380 | democracy 381 | always 382 | reason 383 | regions 384 | difficult 385 | making 386 | once 387 | access 388 | attention 389 | call 390 | essential 391 | effective 392 | These 393 | fundamental 394 | deal 395 | things 396 | interests 397 | achieve 398 | real 399 | freedom 400 | society 401 | -------------------------------------------------------------------------------- /test/corpus/vocab/vocab-extra.txt: -------------------------------------------------------------------------------- 1 | vocab_mama 2 | vocab_papa 3 | -------------------------------------------------------------------------------- /test/pytest.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenNMT/nmt-wizard-docker/a8469b2a292c1f38adfc3cffa35162ed9919f08e/test/pytest.ini -------------------------------------------------------------------------------- /test/requirements.txt: -------------------------------------------------------------------------------- 1 | black==22.* 2 | flake8==3.9.* 3 | pytest==7.* 4 | requests-mock==1.* 5 | -------------------------------------------------------------------------------- /test/test_cloud_translation_framework.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pytest 4 | import json 5 | import functools 6 | 7 | from nmtwizard.cloud_translation_framework import CloudTranslationFramework 8 | from nmtwizard import serving 9 | 10 | 11 | def _generate_numbers_file(path, max_count=12): 12 | with open(path, "w") as f: 13 | for i in range(max_count): 14 | f.write("%d\n" % i) 15 | return path 16 | 17 | 18 | def _count_lines(path): 19 | with open(path, "rb") as f: 20 | i = 0 21 | for _ in f: 22 | i += 1 23 | return i 24 | 25 | 26 | class _CopyTranslationFramework(CloudTranslationFramework): 27 | def translate_batch(self, batch, source_lang, target_lang): 28 | return batch 29 | 30 | 31 | def _test_framework(tmpdir, framework_class): 32 | os.environ["WORKSPACE_DIR"] = str(tmpdir.join("workspace")) 33 | framework = framework_class() 34 | config = {"source": "en", "target": "fr"} 35 | input_path = str(tmpdir.join("input.txt")) 36 | output_path = str(tmpdir.join("output.txt")) 37 | _generate_numbers_file(input_path) 38 | args = [ 39 | "-c", 40 | json.dumps(config), 41 | "trans", 42 | "-i", 43 | input_path, 44 | "-o", 45 | output_path, 46 | ] 47 | framework.run(args=args) 48 | assert os.path.isfile(output_path) 49 | assert _count_lines(input_path) == _count_lines(output_path) 50 | 51 | 52 | 
def _test_real_framework(tmpdir, directory): 53 | root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 54 | sys.path.insert(0, os.path.join(root_dir, "frameworks", directory)) 55 | import entrypoint 56 | 57 | class_name = None 58 | for symbol in dir(entrypoint): 59 | if symbol.endswith("Framework") and symbol != "CloudTranslationFramework": 60 | class_name = symbol 61 | _test_framework(tmpdir, getattr(entrypoint, class_name)) 62 | sys.path.pop(0) 63 | del sys.modules["entrypoint"] 64 | 65 | 66 | def test_cloud_translation_framework(tmpdir): 67 | _test_framework(tmpdir, _CopyTranslationFramework) 68 | 69 | 70 | def test_serve_cloud_translation_framework(): 71 | class _ReverseTranslationFramework(CloudTranslationFramework): 72 | def translate_batch(self, batch, source_lang, target_lang): 73 | assert source_lang == "en" 74 | assert target_lang == "fr" 75 | return ["".join(reversed(list(text))) for text in batch] 76 | 77 | framework = _ReverseTranslationFramework() 78 | config = {"source": "en", "target": "fr"} 79 | _, service_info = framework.serve(config, None) 80 | request = {"src": [{"text": "Hello"}]} 81 | result = serving.run_request( 82 | request, functools.partial(framework.forward_request, service_info) 83 | ) 84 | assert result["tgt"][0][0]["text"] == "olleH" 85 | 86 | 87 | @pytest.mark.skipif( 88 | "BAIDU_APPID" not in os.environ or "BAIDU_KEY" not in os.environ, 89 | reason="missing Baidu credentials", 90 | ) 91 | def test_baidu_translate(tmpdir): 92 | _test_real_framework(tmpdir, "baidu_translate") 93 | 94 | 95 | @pytest.mark.skipif( 96 | "DEEPL_CREDENTIALS" not in os.environ, reason="missing DeepL credentials" 97 | ) 98 | def test_deepl_translate(tmpdir): 99 | _test_real_framework(tmpdir, "deepl_translate") 100 | 101 | 102 | @pytest.mark.skipif( 103 | "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ, 104 | reason="missing Google credentials", 105 | ) 106 | def test_google_translate(tmpdir): 107 | _test_real_framework(tmpdir, "google_translate") 108 | 109 | 110 | @pytest.mark.skipif( 111 | "NAVER_CLIENT_ID" not in os.environ or "NAVER_SECRET" not in os.environ, 112 | reason="missing Naver credentials", 113 | ) 114 | def test_naver_translate(tmpdir): 115 | _test_real_framework(tmpdir, "naver_translate") 116 | 117 | 118 | @pytest.mark.skipif( 119 | "SOGOU_PID" not in os.environ or "SOGOU_KEY" not in os.environ, 120 | reason="missing Sogou credentials", 121 | ) 122 | def test_sogou_translate(tmpdir): 123 | _test_real_framework(tmpdir, "sogou_translate") 124 | 125 | 126 | @pytest.mark.skipif( 127 | "TENCENT_SecretId" not in os.environ or "TENCENT_SecretKey" not in os.environ, 128 | reason="missing Tencent credentials", 129 | ) 130 | def test_tencent_translate(tmpdir): 131 | _test_real_framework(tmpdir, "tencent_translate") 132 | 133 | 134 | @pytest.mark.skipif( 135 | "YOUDAO_APPID" not in os.environ or "YOUDAO_KEY" not in os.environ, 136 | reason="missing Youdao credentials", 137 | ) 138 | def test_youdao_translate(tmpdir): 139 | _test_real_framework(tmpdir, "youdao_translate") 140 | -------------------------------------------------------------------------------- /test/test_config.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import pytest 3 | import jsonschema 4 | 5 | from nmtwizard import config 6 | 7 | 8 | def test_key_override(): 9 | a = {"a": {"b": 42, "c": "d"}, "e": "f"} 10 | b = {"a": None} 11 | c = config.merge_config(a, b) 12 | assert c == {"a": None, "e": "f"} 13 | 14 | 15 | def 
test_key_replace(): 16 | a = {"a": {"b": 42, "c": "d"}, "e": "f"} 17 | b = {"e": {"x": "y"}} 18 | c = config.merge_config(a, b) 19 | assert c == {"a": {"b": 42, "c": "d"}, "e": {"x": "y"}} 20 | 21 | 22 | _base_schema = { 23 | "json_schema": { 24 | "title": "Translation options", 25 | "description": "Translation options", 26 | "type": "object", 27 | "required": ["bpreprocess"], 28 | "properties": { 29 | "bpreprocess": { 30 | "type": "object", 31 | "title": "User friendly name", 32 | "properties": { 33 | "politeness": { 34 | "type": "string", 35 | "title": "Politeness Mode", 36 | "default": "neutral", 37 | "enum": ["formal", "informal", "neutral"], 38 | }, 39 | "domain": { 40 | "type": "string", 41 | "title": "Domain", 42 | "enum": ["IT", "News"], 43 | }, 44 | }, 45 | } 46 | }, 47 | }, 48 | } 49 | 50 | _test_inference_options = _base_schema.copy() 51 | _test_inference_options.update( 52 | { 53 | "options": [ 54 | { 55 | "option_path": "bpreprocess/politeness", 56 | "config_path": "bpreprocess/classifiers/0/value", 57 | }, 58 | { 59 | "option_path": "bpreprocess/domain", 60 | "config_path": "bpreprocess/classifiers/1/value", 61 | }, 62 | ] 63 | } 64 | ) 65 | 66 | _test_config = { 67 | "bpreprocess": {"classifiers": [{"name": "politeness"}, {"name": "domain"}]} 68 | } 69 | 70 | _test_inference_options_v2 = _base_schema.copy() 71 | _test_inference_options_v2.update( 72 | { 73 | "options": [ 74 | { 75 | "option_path": "bpreprocess/politeness", 76 | "config_path": "preprocess/my-politeness-op/value", 77 | }, 78 | { 79 | "option_path": "bpreprocess/domain", 80 | "config_path": "preprocess/my-domain-op/value", 81 | }, 82 | ] 83 | } 84 | ) 85 | 86 | _test_config_v2 = { 87 | "preprocess": [ 88 | { 89 | "op": "domain-classifier", 90 | "name": "my-domain-op", 91 | }, 92 | { 93 | "op": "politeness-classifier", 94 | "name": "my-politeness-op", 95 | }, 96 | ], 97 | } 98 | 99 | 100 | def test_inference_options_index_schema(): 101 | schema = _test_inference_options["json_schema"] 102 | politeness_schema = schema["properties"]["bpreprocess"]["properties"]["politeness"] 103 | assert config.index_schema(schema, "bpreprocess/politeness") == politeness_schema 104 | with pytest.raises(ValueError, match="Invalid path"): 105 | config.index_schema(schema, "bpreprocess/domains") 106 | 107 | 108 | def test_inference_options_index_config(): 109 | cfg = _test_config 110 | assert ( 111 | config.index_config(cfg, "bpreprocess/classifiers/1") 112 | == cfg["bpreprocess"]["classifiers"][1] 113 | ) 114 | with pytest.raises(ValueError, match="Invalid path"): 115 | config.index_config(cfg, "bpreprocess/annotate") 116 | assert config.index_config( 117 | cfg, "bpreprocess/classifiers/1/value", index_structure=False 118 | ) == (cfg["bpreprocess"]["classifiers"][1], "value") 119 | with pytest.raises(ValueError, match="Invalid path"): 120 | config.index_config(cfg, "bpreprocess/classifiers/1/value") 121 | 122 | 123 | def test_inference_options_index_config_v2(): 124 | cfg = _test_config_v2 125 | assert config.index_config(cfg, "preprocess/my-domain-op") == cfg["preprocess"][0] 126 | 127 | 128 | def test_inference_options_validation(): 129 | schema = config.validate_inference_options(_test_inference_options, _test_config) 130 | assert isinstance(schema, dict) 131 | 132 | 133 | def test_inference_options_invalid_schema(): 134 | opt = copy.deepcopy(_test_inference_options) 135 | opt["json_schema"]["type"] = "objects" 136 | with pytest.raises(jsonschema.SchemaError): 137 | config.validate_inference_options(opt, _test_config) 138 | 139 |
140 | def test_read_options(): 141 | cfg = copy.deepcopy(_test_config) 142 | cfg["inference_options"] = copy.deepcopy(_test_inference_options) 143 | 144 | with pytest.raises(ValueError): 145 | options = {"bpreprocess": {"domain": "Technology"}} 146 | config.read_options(cfg, options) 147 | 148 | options = {"bpreprocess": {"domain": "IT"}} 149 | assert config.read_options(cfg, options) == { 150 | "bpreprocess": { 151 | "classifiers": [ 152 | {"name": "politeness"}, 153 | { 154 | "name": "domain", 155 | "value": "IT", 156 | }, 157 | ] 158 | } 159 | } 160 | 161 | 162 | def test_read_options_v2(): 163 | cfg = copy.deepcopy(_test_config_v2) 164 | cfg["inference_options"] = copy.deepcopy(_test_inference_options_v2) 165 | options = {"bpreprocess": {"domain": "IT"}} 166 | assert config.read_options(cfg, options) == {"my-domain-op": {"value": "IT"}} 167 | 168 | 169 | def test_build_override(): 170 | c = {"a": {"b": {"c": 42}, "d": [{"e": 43}, {"f": 44}]}} 171 | assert config.build_override(c, "a/z", 45) == {"a": {"z": 45}} 172 | assert config.build_override(c, "a/d/1/g", 45) == { 173 | "a": {"d": [{"e": 43}, {"f": 44, "g": 45}]} 174 | } 175 | -------------------------------------------------------------------------------- /test/test_data.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from nmtwizard import data 4 | 5 | 6 | def test_paste_files(tmpdir): 7 | a = str(tmpdir.join("a.txt")) 8 | b = str(tmpdir.join("b.txt")) 9 | c = str(tmpdir.join("c.txt")) 10 | with open(a, "w") as af: 11 | af.write("1 2 3\n4 5\n") 12 | with open(b, "w") as bf: 13 | bf.write("7 8\n9\n") 14 | data.paste_files([a, b], c, separator="|") 15 | with open(c) as c: 16 | assert c.read() == "1 2 3|7 8\n4 5|9\n" 17 | -------------------------------------------------------------------------------- /test/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from nmtwizard.preprocess import tokenizer 4 | 5 | 6 | def test_vocabulary_iterator(tmpdir): 7 | vocab_path = str(tmpdir.join("vocab.txt")) 8 | with open(vocab_path, "w") as vocab_file: 9 | vocab_file.write("# Comment 1\n") 10 | vocab_file.write("# Comment 2\n") 11 | vocab_file.write("\n") 12 | vocab_file.write("hello\n") 13 | vocab_file.write("world 42\n") 14 | vocab_file.write("toto 0.0224656\n") 15 | vocab_file.write("titi 2.8989e-08\n") 16 | vocab_file.write("hello world\n") # Bad token with a space. 
17 | 18 | tokens = list(tokenizer.vocabulary_iterator(vocab_path)) 19 | assert tokens == ["", "hello", "world", "toto", "titi", "hello world"] 20 | -------------------------------------------------------------------------------- /test/test_utility.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from systran_storages import StorageClient 6 | 7 | from nmtwizard import utility 8 | 9 | 10 | def test_resolve_env(): 11 | config = {"a": "${A_DIR}/a", "b": ["${B_DIR}/b", "${A_TRAIN_DIR}/a"]} 12 | os.environ["A_DIR"] = "foo" 13 | os.environ["B_DIR"] = "bar" 14 | config = utility.resolve_environment_variables(config) 15 | assert config["a"] == "foo/a" 16 | assert config["b"] == ["bar/b", "foo/a"] 17 | del os.environ["A_DIR"] 18 | del os.environ["B_DIR"] 19 | 20 | 21 | def test_resolve_env_no_training(): 22 | config = {"a": "${A_DIR}/a", "b": "${A_TRAIN_DIR}/a"} 23 | os.environ["A_DIR"] = "foo" 24 | config = utility.resolve_environment_variables(config, training=False) 25 | assert config["a"] == "foo/a" 26 | assert config["b"] == "${A_TRAIN_DIR}/a" 27 | 28 | 29 | def test_resolve_remote_files(tmpdir): 30 | tmpdir.join("remote").join("dir").join("a.txt").write("toto", ensure=True) 31 | tmpdir.join("local").ensure_dir() 32 | storage_config = { 33 | "tmp": {"type": "local", "basedir": str(tmpdir)}, 34 | "tmp2": {"type": "local", "basedir": str(tmpdir.join("remote"))}, 35 | } 36 | client = StorageClient(config=storage_config) 37 | config = { 38 | "a": "/home/ubuntu/a.txt", 39 | "b": "non_storage:b.txt", 40 | "c": "tmp:remote/dir/a.txt", 41 | "d": "tmp2:/dir/a.txt", 42 | "e": True, 43 | "f": "tmp:", 44 | } 45 | config = utility.resolve_remote_files(config, str(tmpdir.join("local")), client) 46 | c_path = tmpdir.join("local").join("tmp/remote/dir/a.txt") 47 | d_path = tmpdir.join("local").join("tmp2/dir/a.txt") 48 | f_path = tmpdir.join("local").join("tmp") 49 | assert config["a"] == "/home/ubuntu/a.txt" 50 | assert config["b"] == "non_storage:b.txt" 51 | assert config["c"] == str(c_path) 52 | assert config["d"] == str(d_path) 53 | assert c_path.check(file=1) 54 | assert d_path.check(file=1) 55 | assert f_path.check(dir=1) 56 | -------------------------------------------------------------------------------- /tools/docker_images.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import argparse 4 | import os 5 | import subprocess 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument( 9 | "--org", default="nmtwizard", help="Organization name on Docker Hub." 10 | ) 11 | parser.add_argument( 12 | "--path", default="frameworks", help="Path to a directory or a Dockerfile to build." 
13 | ) 14 | parser.add_argument("--build", action="store_true", help="Build the image.") 15 | parser.add_argument("--version", default="latest", help="Image version.") 16 | parser.add_argument("--push", action="store_true", help="Push the image.") 17 | parser.add_argument("--sudo", action="store_true", help="Run commands with sudo.") 18 | args = parser.parse_args() 19 | 20 | 21 | def run(cmd): 22 | print("+ %s" % " ".join(cmd)) 23 | if args.sudo: 24 | cmd = ["sudo"] + cmd 25 | exit_code = subprocess.call(cmd) 26 | if exit_code != 0: 27 | exit(exit_code) 28 | 29 | 30 | if os.path.isfile(args.path): 31 | dockerfiles = [args.path] 32 | else: 33 | dockerfiles = [] 34 | for filename in os.listdir(args.path): 35 | path = os.path.join(args.path, filename) 36 | if filename == "Dockerfile": 37 | dockerfiles.append(path) 38 | elif not os.path.isdir(path): 39 | continue 40 | else: 41 | dockerfile = os.path.join(path, "Dockerfile") 42 | if os.path.isfile(dockerfile): 43 | dockerfiles.append(dockerfile) 44 | 45 | for dockerfile in dockerfiles: 46 | framework_dir = os.path.basename(os.path.split(dockerfile)[0]) 47 | repo_name = framework_dir.replace("_", "-") 48 | image_name = "%s/%s" % (args.org, repo_name) 49 | image_latest = "%s:latest" % image_name 50 | image_full_name = "%s:%s" % (image_name, args.version) 51 | if args.build: 52 | run(["docker", "build", "--pull", "-t", image_latest, "-f", dockerfile, "."]) 53 | run(["docker", "tag", image_latest, image_full_name]) 54 | if args.push: 55 | run(["docker", "push", image_latest]) 56 | run(["docker", "push", image_full_name]) 57 | -------------------------------------------------------------------------------- /utilities/score/BLEU/multi-bleu-detok_cjk.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # This file uses the internal tokenization of mteval-v13a.pl, 7 | # giving the exact same (case-sensitive) results on untokenized text. 8 | # 9 | # like multi-bleu.perl, it supports plain text input and multiple references.
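#
# In addition, this CJK variant splits Han, Hiragana, Katakana, Hangul, Thai
# and Burmese characters into single-character tokens before scoring (see the
# tokenization() subroutine below), i.e. BLEU is character-based for those scripts.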
10 | 11 | # $Id$ 12 | use warnings; 13 | use strict; 14 | binmode STDIN, ':utf8'; 15 | 16 | my $lowercase = 0; 17 | if ($ARGV[0] eq "-lc") { 18 | $lowercase = 1; 19 | shift; 20 | } 21 | 22 | my $stem = $ARGV[0]; 23 | if (!defined $stem) { 24 | print STDERR "usage: multi-bleu-detok.pl [-lc] reference < hypothesis\n"; 25 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 26 | exit(1); 27 | } 28 | 29 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 30 | 31 | my @REF; 32 | my $ref=0; 33 | while(-e "$stem$ref") { 34 | &add_to_ref("$stem$ref",\@REF); 35 | $ref++; 36 | } 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | die("ERROR: could not find reference file $stem") unless scalar @REF; 39 | 40 | # add additional references explicitly specified on the command line 41 | shift; 42 | foreach my $stem (@ARGV) { 43 | &add_to_ref($stem,\@REF) if -e $stem; 44 | } 45 | 46 | 47 | 48 | sub add_to_ref { 49 | my ($file,$REF) = @_; 50 | my $s=0; 51 | if ($file =~ /\.gz$/) { 52 | open(REF, '<:encoding(UTF-8)', "gzip -dc $file|") or die "Can't read $file"; 53 | } else { 54 | open(REF, '<:encoding(UTF-8)', $file) or die "Can't read $file"; 55 | } 56 | while(<REF>) { 57 | chop; 58 | $_ = tokenization($_); 59 | push @{$$REF[$s++]}, $_; 60 | } 61 | close(REF); 62 | } 63 | 64 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 65 | my $s=0; 66 | while(<STDIN>) { 67 | chop; 68 | $_ = lc if $lowercase; 69 | $_ = tokenization($_); 70 | my @WORD = split; 71 | my %REF_NGRAM = (); 72 | my $length_translation_this_sentence = scalar(@WORD); 73 | my ($closest_diff,$closest_length) = (9999,9999); 74 | foreach my $reference (@{$REF[$s]}) { 75 | # print "$s $_ <=> $reference\n"; 76 | $reference = lc($reference) if $lowercase; 77 | my @WORD = split(' ',$reference); 78 | my $length = scalar(@WORD); 79 | my $diff = abs($length_translation_this_sentence-$length); 80 | if ($diff < $closest_diff) { 81 | $closest_diff = $diff; 82 | $closest_length = $length; 83 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 84 | } elsif ($diff == $closest_diff) { 85 | $closest_length = $length if $length < $closest_length; 86 | # from two references with the same closeness to me 87 | # take the *shorter* into account, not the "first" one. 88 | } 89 | for(my $n=1;$n<=4;$n++) { 90 | my %REF_NGRAM_N = (); 91 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 92 | my $ngram = "$n"; 93 | for(my $w=0;$w<$n;$w++) { 94 | $ngram .= " ".$WORD[$start+$w]; 95 | } 96 | $REF_NGRAM_N{$ngram}++; 97 | } 98 | foreach my $ngram (keys %REF_NGRAM_N) { 99 | if (!defined($REF_NGRAM{$ngram}) || 100 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 101 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 102 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
103 | } 104 | } 105 | } 106 | } 107 | $length_translation += $length_translation_this_sentence; 108 | $length_reference += $closest_length; 109 | for(my $n=1;$n<=4;$n++) { 110 | my %T_NGRAM = (); 111 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 112 | my $ngram = "$n"; 113 | for(my $w=0;$w<$n;$w++) { 114 | $ngram .= " ".$WORD[$start+$w]; 115 | } 116 | $T_NGRAM{$ngram}++; 117 | } 118 | foreach my $ngram (keys %T_NGRAM) { 119 | $ngram =~ /^(\d+) /; 120 | my $n = $1; 121 | # my $corr = 0; 122 | # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
123 | $TOTAL[$n] += $T_NGRAM{$ngram}; 124 | if (defined($REF_NGRAM{$ngram})) { 125 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 126 | $CORRECT[$n] += $T_NGRAM{$ngram}; 127 | # $corr = $T_NGRAM{$ngram}; 128 | # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
129 | } 130 | else { 131 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 132 | # $corr = $REF_NGRAM{$ngram}; 133 | # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
134 | } 135 | } 136 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 137 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 138 | } 139 | } 140 | $s++; 141 | } 142 | my $brevity_penalty = 1; 143 | my $bleu = 0; 144 | 145 | my @bleu=(); 146 | 147 | for(my $n=1;$n<=4;$n++) { 148 | if (defined ($TOTAL[$n])){ 149 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 150 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 151 | }else{ 152 | $bleu[$n]=0; 153 | } 154 | } 155 | 156 | if ($length_reference==0){ 157 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 158 | exit(1); 159 | } 160 | 161 | if ($length_translation<$length_reference) { 162 | $brevity_penalty = exp(1-$length_reference/$length_translation); 163 | } 164 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 165 | my_log( $bleu[2] ) + 166 | my_log( $bleu[3] ) + 167 | my_log( $bleu[4] ) ) / 4) ; 168 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 169 | 100*$bleu, 170 | 100*$bleu[1], 171 | 100*$bleu[2], 172 | 100*$bleu[3], 173 | 100*$bleu[4], 174 | $brevity_penalty, 175 | $length_translation / $length_reference, 176 | $length_translation, 177 | $length_reference; 178 | 179 | sub my_log { 180 | return -9999999999 unless $_[0]; 181 | return log($_[0]); 182 | } 183 | 184 | 185 | 186 | sub tokenization 187 | { 188 | my ($norm_text) = @_; 189 | 190 | # language-independent part: 191 | $norm_text =~ s/<skipped>//g; # strip "skipped" tags 192 | $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines 193 | $norm_text =~ s/\n/ /g; # join lines 194 | $norm_text =~ s/&quot;/\"/g; # convert SGML tag for quote to " 195 | $norm_text =~ s/&amp;/&/g; # convert SGML tag for ampersand to & 196 | $norm_text =~ s/&lt;/</g; # convert SGML tag for less-than to < 197 | $norm_text =~ s/&gt;/>/g; # convert SGML tag for greater-than to > 198 | 199 | # language-dependent part (assuming Western languages): 200 | $norm_text = " $norm_text "; 201 | $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation 202 | $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit 203 | $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit 204 | $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit 205 | 206 | # for CJK, Thai, Burmese 207 | $norm_text =~ s/([\p{Sc=Han}\p{Sc=Hiragana}\p{Sc=Katakana}\p{Sc=Hangul}\p{Sc=Thai}\p{Sc=Mymr}])/ $1 /g; 208 | $norm_text =~ s/\s+/ /g; # one space only between words 209 | $norm_text =~ s/^\s+//; # no leading space 210 | $norm_text =~ s/\s+$//; # no trailing space 211 | 212 | return $norm_text; 213 | } 214 | -------------------------------------------------------------------------------- /utilities/score/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | WORKDIR /root 4 | 5 | ENV LANG=C.UTF-8 6 | RUN apt-get update && apt-get install -y --no-install-recommends \ 7 | python \ 8 | python3 \ 9 | python3-setuptools \ 10 | git \ 11 | perl \ 12 | default-jre \ 13 | libsort-naturally-perl \ 14 | libxml-parser-perl \ 15 | libxml-twig-perl \ 16 | wget && \ 17 | wget -nv https://bootstrap.pypa.io/pip/3.6/get-pip.py && \ 18 | python3 get-pip.py && \ 19 | rm get-pip.py && \ 20 | rm -rf /var/lib/apt/lists/* 21 | 22 | RUN mkdir /root/tools 23 | 24 | # rely on python2 25 | ARG OTEMUTEM_URL 26 | ENV OTEMUTEM_URL=${OTEMUTEM_URL:-https://github.com/DeepLearnXMU/Otem-Utem.git} 27 | ARG
OTEMUTEM_REF 28 | ENV OTEMUTEM_REF=${OTEMUTEM_REF:-8b4891827d6e4894ebb364284eb38b4cce57cb5e} 29 | 30 | RUN git clone --depth 1 --single-branch ${OTEMUTEM_URL} /root/OTEMUTEM && \ 31 | cd OTEMUTEM && git checkout ${OTEMUTEM_REF} && cd / && \ 32 | mkdir /root/tools/Otem-Utem && \ 33 | cp /root/OTEMUTEM/*.py /root/tools/Otem-Utem && \ 34 | rm -rf /root/OTEMUTEM 35 | 36 | ARG METEOR_URL 37 | ENV METEOR_URL=${METEOR_URL:-http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz} 38 | 39 | RUN wget $METEOR_URL && \ 40 | tar xvf meteor-1.5.tar.gz && \ 41 | mv meteor-1.5 /root/tools/METEOR && \ 42 | rm meteor-1.5.tar.gz 43 | 44 | ADD utilities/score/BLEU /root/tools/BLEU 45 | ADD utilities/score/TER /root/tools/TER 46 | ADD utilities/score/NIST /root/tools/NIST 47 | 48 | ADD requirements.txt /root 49 | RUN pip --no-cache-dir install --upgrade pip 50 | RUN pip --no-cache-dir install -r /root/requirements.txt --use-feature=2020-resolver 51 | 52 | ADD nmtwizard /root/nmtwizard 53 | ADD utilities/score/entrypoint.py /root/ 54 | 55 | ENTRYPOINT ["python3", "entrypoint.py"] 56 | -------------------------------------------------------------------------------- /utilities/score/NIST/xml_wrap.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | my $type = shift; 4 | my $file = @ARGV; 5 | 6 | 7 | print '<?xml version="1.0" encoding="UTF-8"?>'."\n"; 8 | print '<!DOCTYPE mteval SYSTEM "ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-xml-v1.3.dtd">'."\n"; 9 | print '<mteval>'."\n"; 10 | 11 | if($type eq "src") 12 | { 13 | print '<srcset setid="set" srclang="src">'."\n"; 14 | wrap_each_file($ARGV[0]); 15 | print '</srcset>'."\n"; 16 | } 17 | if($type eq "ref") 18 | { 19 | for(my $i = 1; $i <= scalar @ARGV; $i++) 20 | { 21 | print '<refset setid="set" srclang="src" trglang="tgt" refid="ref'.$i.'">'."\n"; 24 | wrap_each_file($ARGV[$i-1]); 25 | print '</refset>'."\n"; 26 | } 27 | } 28 | if($type eq "tst") 29 | { 30 | print '<tstset setid="set" srclang="src" trglang="tgt" sysid="sys">'."\n"; 31 | wrap_each_file($ARGV[0]); 32 | print '</tstset>'."\n"; 33 | } 34 | 35 | print '</mteval>'."\n"; 36 | 37 | 38 | sub wrap_each_file 39 | { 40 | my ($file) = @_; 41 | 42 | print '<doc docid="doc">'."\n"; 43 | 44 | open(F, $file); 45 | my $n = 1; 46 | while(my $line = <F>) 47 | { 48 | chomp($line); 49 | 50 | print "<seg id=\"$n\">".escape_xml($line)."<\/seg>\n"; 51 | $n++; 52 | } 53 | 54 | close(F); 55 | 56 | print '</doc>'."\n"; 57 | } 58 | 59 | sub escape_xml 60 | { 61 | my ($text) = @_; 62 | 63 | $text =~ s/\&/\&amp;/g; # escape escape 64 | $text =~ s/\|/\&#124;/g; # factor separator 65 | $text =~ s/\</\&lt;/g; # xml 66 | $text =~ s/\>/\&gt;/g; # xml 67 | $text =~ s/\'/\&apos;/g; # xml 68 | $text =~ s/\"/\&quot;/g; # xml 69 | $text =~ s/\[/\&#91;/g; # syntax non-terminal 70 | $text =~ s/\]/\&#93;/g; # syntax non-terminal 71 | 72 | return $text; 73 | } 74 | -------------------------------------------------------------------------------- /utilities/score/README.md: -------------------------------------------------------------------------------- 1 | # Score Utility 2 | 3 | ## Metrics 4 | - BLEU: [multi-bleu-detok.perl](https://github.com/OpenNMT/OpenNMT-tf/blob/master/third_party/multi-bleu-detok.perl) with CJK tokenization (character-based). 5 | - TER: [Version 0.7.25](http://www.cs.umd.edu/~snover/tercom/) 6 | - Otem-Utem: [Over- and Under-Translation Evaluation Metric for NMT](https://github.com/DeepLearnXMU/Otem-Utem) 7 | - NIST: [mteval-v14.pl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl) from Moses 8 | - Meteor: [Version 1.5](http://www.cs.cmu.edu/~alavie/METEOR) for **cz de en es fr ru** only 9 | 10 | ## Direct run 11 | 12 | 1\. Install dependencies: 13 | 14 | ```bash 15 | virtualenv utilities/score/env 16 | source utilities/score/env/bin/activate 17 | pip install -r utilities/score/requirements.txt 18 | ``` 19 | 20 | 2\.
Define local environment: 21 | 22 | ```bash 23 | export WORKSPACE_DIR=/tmp/workspace 24 | export TOOLS_DIR=$PWD/utilities/score 25 | export PYTHONPATH=$PWD:$PYTHONPATH 26 | ``` 27 | 28 | 3\. Run: 29 | 30 | ### Local run 31 | 32 | If you run this utility locally, you need some additional packages: 33 | ```bash 34 | # For Otem-Utem 35 | cd utilities/score; git clone https://github.com/DeepLearnXMU/Otem-Utem.git 36 | # For NIST 37 | apt-get install libsort-naturally-perl libxml-parser-perl libxml-twig-perl 38 | # For Meteor 39 | wget http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz; tar xvf meteor-1.5.tar.gz; mv meteor-1.5 utilities/score/METEOR 40 | ``` 41 | ```bash 42 | python utilities/score/entrypoint.py score \ 43 | -o test/corpus/eval/testset1.out \ 44 | test/corpus/eval/testset2.out \ 45 | -r test/corpus/eval/testset1.ref \ 46 | test/corpus/eval/testset2.ref.1,test/corpus/eval/testset2.ref.2 \ 47 | -l en \ 48 | -f scores.json 49 | ``` 50 | ```bash 51 | python utilities/score/entrypoint.py score \ 52 | -o test/corpus/eval/testset3.out \ 53 | -r test/corpus/eval/testset3.ref \ 54 | -l zh 55 | ``` 56 | 57 | 58 | ### Docker run 59 | 60 | ```bash 61 | docker run -i \ 62 | -v $PWD/test/corpus:/root/corpus \ 63 | nmtwizard/score \ 64 | score \ 65 | -o /root/corpus/eval/testset1.out \ 66 | /root/corpus/eval/testset2.out \ 67 | -r /root/corpus/eval/testset1.ref \ 68 | /root/corpus/eval/testset2.ref.1,/root/corpus/eval/testset2.ref.2 \ 69 | -l en 70 | ``` 71 | -------------------------------------------------------------------------------- /utilities/score/TER/tercom.7.25.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenNMT/nmt-wizard-docker/a8469b2a292c1f38adfc3cffa35162ed9919f08e/utilities/score/TER/tercom.7.25.jar -------------------------------------------------------------------------------- /utilities/similarity/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.14.0-gpu 2 | 3 | WORKDIR /root 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | git cmake && \ 7 | rm -rf /var/lib/apt/lists/* 8 | 9 | ADD requirements.txt /root 10 | RUN pip --no-cache-dir install -r /root/requirements.txt 11 | 12 | ARG SIMILARITY_URL 13 | ENV SIMILARITY_URL=${SIMILARITY_URL:-https://github.com/systran/similarity.git} 14 | ARG SIMILARITY_REF 15 | ENV SIMILARITY_REF=${SIMILARITY_REF:-master} 16 | 17 | RUN git clone --depth 1 --single-branch ${SIMILARITY_URL} /root/similarity && \ 18 | cd similarity && git checkout ${SIMILARITY_REF} && cd .. && \ 19 | rm -rf similarity/.git/* && \ 20 | cp /root/similarity/src/*.py /root/similarity/requirements.txt /root && \ 21 | rm -rf /root/similarity 22 | 23 | RUN pip --no-cache-dir install -r /root/requirements.txt 24 | 25 | RUN mkdir /root/tools 26 | 27 | ARG FAST_ALIGN_URL 28 | ENV FAST_ALIGN_URL=${FAST_ALIGN_URL:-https://github.com/clab/fast_align} 29 | ARG FAST_ALIGN_REF 30 | ENV FAST_ALIGN_REF=${FAST_ALIGN_REF:-master} 31 | 32 | RUN git clone --depth 1 --single-branch ${FAST_ALIGN_URL} /root/fast_align && \ 33 | cd fast_align && git checkout ${FAST_ALIGN_REF} && \ 34 | mkdir build && cd build && cmake .. 
&& make && \ 35 | cp fast_align /root/tools && \ 36 | rm -rf /root/fast_align 37 | 38 | ADD nmtwizard /root/nmtwizard 39 | 40 | ADD utilities/similarity/entrypoint.py /root/ 41 | 42 | ENTRYPOINT ["python", "entrypoint.py"] -------------------------------------------------------------------------------- /utilities/similarity/README.md: -------------------------------------------------------------------------------- 1 | # Similarity Utility 2 | 3 | ## Direct run 4 | 5 | 1\. Install dependencies: 6 | 7 | ```bash 8 | virtualenv utilities/similarity/env 9 | source utilities/similarity/env/bin/activate 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | 2\. Define local environment: 14 | 15 | ```bash 16 | export MODELS_DIR=/tmp/models 17 | export WORKSPACE_DIR=/tmp/workspace 18 | export CORPUS_DIR=$PWD/test/corpus/ 19 | export PYTHONPATH=$PWD:$PYTHONPATH 20 | ``` 21 | 22 | 3\. Run: 23 | 24 | ### Local run 25 | 26 | If you run this utility locally, you need [similarity project](https://github.com/SYSTRAN/similarity): 27 | ```bash 28 | git clone https://github.com/SYSTRAN/similarity utilities/similarity/similarity 29 | pip install -r utilities/similarity/similarity/requirements.txt 30 | ``` 31 | Train 32 | 33 | ```bash 34 | python utilities/similarity/entrypoint.py \ 35 | -g 1 \ 36 | simtrain \ 37 | -mdir ${MODELS_DIR} \ 38 | -trn_src ${TESTSRC} \ 39 | -trn_tgt ${TESTTGT} \ 40 | -build_data_mode puid \ 41 | -src_emb_size 64 \ 42 | -tgt_emb_size 64 \ 43 | -max_sents 1000000 \ 44 | -lr_method adam -lr 0.001 \ 45 | -n_epochs 1 46 | ``` 47 | 48 | Inference 49 | 50 | ```bash 51 | python utilities/similarity/entrypoint.py \ 52 | simapply \ 53 | -mdir ${MODELS_DIR} \ 54 | -tst_src ${CORPUS_DIR}/test.en \ 55 | -tst_tgt ${CORPUS_DIR}/test.fr \ 56 | -output $PWD/output 57 | ``` 58 | 59 | ### Docker run 60 | 61 | Train 62 | 63 | ```bash 64 | nvidia-docker run -i --rm \ 65 | -v ${MODELS_DIR}:/modelpath \ 66 | -v ${CORPUS_DIR}:/data \ 67 | nmtwizard/similarity:latest \ 68 | -g 1 \ 69 | simtrain \ 70 | -mdir /modelpath \ 71 | -trn_src /data/train.en \ 72 | -trn_tgt /data/train.fr \ 73 | -build_data_mode puid \ 74 | -src_emb_size 64 \ 75 | -tgt_emb_size 64 \ 76 | -max_sents 1000000 \ 77 | -lr_method adam -lr 0.001 \ 78 | -n_epochs 1 79 | ``` 80 | 81 | Inference 82 | ```bash 83 | docker run -i --rm \ 84 | -v ${MODELS_DIR}:/modelpath \ 85 | -v ${CORPUS_DIR}:/data \ 86 | nmtwizard/similarity:latest \ 87 | simapply \ 88 | -mdir /modelpath \ 89 | -tst_src /data/test.en \ 90 | -tst_tgt /data/test.fr \ 91 | -output /modelpath/output 92 | ``` 93 | 94 | -------------------------------------------------------------------------------- /utilities/tuminer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel 2 | # python3.7 pip 20.0.2 3 | ENV LANG C.UTF-8 4 | 5 | ##################################################################################################### 6 | # Install LASER 7 | ##################################################################################################### 8 | RUN apt-get update && \ 9 | apt-get install --no-install-recommends -y unzip \ 10 | g++ wget cpio \ 11 | libgtest-dev swig3.0 \ 12 | libopenblas-dev \ 13 | git \ 14 | libssl-dev \ 15 | wget && \ 16 | git clone https://github.com/Kitware/CMake ~/cmake && \ 17 | cd ~/cmake && \ 18 | ./bootstrap && \ 19 | make -j 8 install 20 | 21 | # python modules 22 | RUN pip install transliterate jieba 23 | 24 | #install faiss 25 | RUN cd /opt && \ 26 | git 
clone https://github.com/facebookresearch/faiss.git && \ 27 | cd faiss && \ 28 | cmake -B build . && \ 29 | make -C build -j 8 && \ 30 | cd build/faiss/python && python setup.py install 31 | 32 | #install LASER 33 | RUN cd /opt && \ 34 | git clone https://github.com/facebookresearch/LASER.git && \ 35 | cd LASER && \ 36 | LASER=/opt/LASER bash ./install_models.sh 37 | 38 | #install LASER tools-external 39 | RUN cd /opt/LASER && \ 40 | sed -i "s#cd fastBPE#cd fastBPE; ln -s main.cc fastBPE/fastBPE.cpp#g" install_external_tools.sh && \ 41 | sed -i "s#python setup.py install##g" install_external_tools.sh && \ 42 | LASER=/opt/LASER bash ./install_external_tools.sh 43 | 44 | #install fastBPE 45 | RUN pip install fastBPE 46 | 47 | #install mecab 48 | RUN cd /tmp && \ 49 | wget -O mecab-0.996.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE" && \ 50 | tar xvzf mecab-0.996.tar.gz && \ 51 | cd mecab-0.996 && \ 52 | ./configure --prefix /opt/LASER/tools-external/mecab --with-charset=utf8 && \ 53 | make install && \ 54 | rm -rf mecab-0.996.tar.gz mecab-0.996 55 | 56 | RUN cd /tmp && \ 57 | wget -O mecab-ipadic-2.7.0-XXXX.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM" && \ 58 | tar xvzf mecab-ipadic-2.7.0-XXXX.tar.gz && \ 59 | cd mecab-ipadic-2.7.0-20070801/ && \ 60 | ./configure --prefix=/opt/LASER/tools-external/mecab --with-mecab-config=/opt/LASER/tools-external/mecab/bin/mecab-config --with-charset=utf8 && \ 61 | make install && \ 62 | rm -rf mecab-ipadic-2.7.0-XXXX.tar.gz mecab-ipadic-2.7.0-20070801 63 | 64 | RUN echo "export LASER=/opt/LASER" >> /etc/bash.bashrc && echo "export LC_ALL=C.UTF-8" >> /etc/bash.bashrc 65 | ##################################################################################################### 66 | # config & cleanup 67 | ##################################################################################################### 68 | RUN ldconfig && \ 69 | apt-get clean && \ 70 | apt-get autoremove && \ 71 | rm -rf /var/lib/apt/lists/* /tmp/* ~/* 72 | ##################################################################################################### 73 | # nmt-wizard-docker 74 | ##################################################################################################### 75 | ADD nmtwizard /nmtwizard 76 | ADD requirements.txt / 77 | RUN pip --no-cache-dir install -r /requirements.txt && rm /requirements.txt 78 | 79 | ADD utilities/tuminer/entrypoint.py / 80 | ENTRYPOINT ["python3", "/entrypoint.py"] 81 | -------------------------------------------------------------------------------- /utilities/tuminer/README.md: -------------------------------------------------------------------------------- 1 | # TU Miner Utility 2 | 3 | ## Build docker 4 | 5 | 6 | ```bash 7 | (within nmt-wizard-docker top directory) 8 | docker build -t nmtwizard/tuminer ${PWD} -f utilities/tuminer/Dockerfile 9 | ``` 10 | 11 | 12 | ## Run docker 13 | 14 | ### Command line options ### 15 | 16 | ```bash 17 | usage: entrypoint.py tuminer [-h] --tumode {score,mine} [--srclang SRCLANG] 18 | --srcfile SRCFILE [--tgtlang TGTLANG] --tgtfile 19 | TGTFILE --output OUTPUT [--encoding ENCODING] 20 | [--verbose] [--threshold THRESHOLD] 21 | [--bpecodes BPECODES] [--encoder ENCODER] 22 | [--encoderdim ENCODERDIM] 23 | [--encoderbuffersize ENCODERBUFFERSIZE] 24 | [--encodermaxtokens ENCODERMAXTOKENS] 25 | 26 | optional arguments: 27 | -h, --help show this help message and exit 28 | --tumode {score,mine} 29 | Tuminer mode 30 | --srclang SRCLANG 
Source language (two-letter language code; ISO 639-1). 31 | --srcfile SRCFILE Source language file. 32 | --tgtlang TGTLANG Target language (two-letter language code; ISO 639-1). 33 | --tgtfile TGTFILE Target language file. 34 | --output OUTPUT Output file. 35 | --encoding ENCODING Encoding of the input and output text files. 36 | --verbose Increase output verbosity. 37 | --threshold THRESHOLD 38 | When in `mine` mode, threshold value for mined TUs 39 | --bpecodes BPECODES BPE code to be applied to both source and target 40 | files. (default model provided in docker) 41 | --encoder ENCODER Multi-lingual encoder to be used to encode both source 42 | and target files. (default model provided in docker) 43 | --encoderdim ENCODERDIM 44 | Encoder output dimension 45 | --encoderbuffersize ENCODERBUFFERSIZE 46 | Encoder buffer size 47 | --encodermaxtokens ENCODERMAXTOKENS 48 | Encoder max_token size 49 | ``` 50 | 51 | ### Sample command line ### 52 | 53 | ```bash 54 | nvidia-docker run -it \ 55 | -v $PWD/test/corpus:/corpus \ 56 | -v /tmp/output:/output \ 57 | nmtwizard/tuminer \ 58 | tuminer \ 59 | --tumode score \ 60 | --srclang de --srcfile /corpus/train/europarl-v7.de-en.10K.tok.de \ 61 | --tgtlang en --tgtfile /corpus/train/europarl-v7.de-en.10K.tok.en \ 62 | --output /output/europarl-v7.de-en.10K.tok.deen.tuminer-score 63 | ``` 64 | 65 | ### Output format ### 66 | 67 | #### Score mode #### 68 | 69 | The output file will contain the same number of lines as the input files, where each line contains a real number: the score of the corresponding sentence pair. 70 | 71 | 72 | #### Mine mode #### 73 | 74 | In `mine` mode, the output file contains zero or more lines of text, where each line is formatted as below: 75 | 76 | ``` 77 | (real number score) \t (source sentence) \t (target sentence) 78 | ``` 79 | 80 | 81 | #### How to interpret the score #### 82 | For both `score` and `mine` modes, a score is associated with a given sentence pair. 83 | 84 | This value typically ranges between 0 and 1.5. 85 | However, a really bad pair of source and target sentences may produce a value below 0, and likewise a really good pair may score above 1.5. 86 | 87 | Values above 1.0 may indicate a really good translation unit pair. 88 | If mining TUs from comparable corpora or when scoring translated texts, values above 0.7 or 0.8 may indicate that the pair is useful. 89 | 90 | 91 | 92 | ### Selecting the sentence encoder ### 93 | The docker image contains two pre-trained multi-lingual encoders. 94 | The following encoder model and its associated BPE code are used by default: 95 | 96 | ``` 97 | /opt/LASER/models/bilstm.93langs.2018-12-26.pt 98 | /opt/LASER/models/93langs.fcodes 99 | ``` 100 | 101 | If you wish to try the other model included in the docker image, you can add the following arguments to your command: 102 | 103 | ``` 104 | --encoder /opt/LASER/models/bilstm.eparl21.2018-11-19.pt 105 | --bpecodes /opt/LASER/models/eparl21.fcodes 106 | ``` 107 | 108 | 109 | 110 | ### If there is an error ### 111 | 112 | If your process is terminated without generating an output file (and there was no useful error message), 113 | it may be due to the process being killed when the sentence encoder was not able to allocate enough buffer space in memory. 114 | 115 | The default values for the arguments `--encoderbuffersize` and `--encodermaxtokens` are 10000 and 12000, respectively, 116 | and these values are known to work on a server-grade machine with 256G of RAM and NVIDIA GPUs with 12G of memory.
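On machines with less memory, lowering both values is usually enough to get past the allocation failure. A minimal sketch of such a run (the values 5000 and 6000 are illustrative assumptions, not tuned recommendations; use whatever fits your hardware):

```bash
nvidia-docker run -it \
    -v $PWD/test/corpus:/corpus \
    -v /tmp/output:/output \
    nmtwizard/tuminer \
    tuminer \
    --tumode score \
    --srclang de --srcfile /corpus/train/europarl-v7.de-en.10K.tok.de \
    --tgtlang en --tgtfile /corpus/train/europarl-v7.de-en.10K.tok.en \
    --output /output/europarl-v7.de-en.10K.tok.deen.tuminer-score \
    --encoderbuffersize 5000 \
    --encodermaxtokens 6000
```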
117 | 118 | For a MacBook Pro laptop with 16G of RAM and no GPU, scoring 1,000 pairs of sentences ran successfully by setting the argument `--encodermaxtokens` to 7500. 119 | 120 | 121 | 122 | 123 | 124 | --------------------------------------------------------------------------------