├── CS2S+BPE+Emb ├── software │ ├── nbest-reranker │ │ ├── lib │ │ │ ├── __init__.py │ │ │ ├── m2scorer │ │ │ │ ├── __init__.py │ │ │ │ ├── token_offsets.py │ │ │ │ ├── nuclesgmlparser.py │ │ │ │ ├── combiner.py │ │ │ │ ├── m2scorer.py │ │ │ │ ├── util.py │ │ │ │ └── Tokenizer.py │ │ │ └── kenlm_python │ │ │ │ ├── __init__.py │ │ │ │ ├── example.py │ │ │ │ └── _kenlm.pxd │ │ ├── .gitignore │ │ ├── README.md │ │ ├── configreader.py │ │ ├── augmenter.py │ │ ├── rerank.py │ │ ├── train.py │ │ ├── log_utils.py │ │ └── candidatesreader.py │ ├── fairseq-py │ │ ├── requirements.txt │ │ ├── fairseq.gif │ │ ├── fairseq │ │ │ ├── __init__.py │ │ │ ├── temporal_convolution_tbc │ │ │ │ └── __init__.py │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ ├── beamable_mm.py │ │ │ │ ├── linearized_convolution.py │ │ │ │ └── conv_tbc.py │ │ │ ├── criterions │ │ │ │ ├── __init__.py │ │ │ │ ├── cross_entropy.py │ │ │ │ ├── fairseq_criterion.py │ │ │ │ └── label_smoothed_cross_entropy.py │ │ │ ├── clib │ │ │ │ ├── temporal_convolution_tbc │ │ │ │ │ ├── temporal_convolution_tbc.h │ │ │ │ │ └── temporal_convolution_tbc.cpp │ │ │ │ └── libbleu │ │ │ │ │ ├── module.cpp │ │ │ │ │ └── libbleu.cpp │ │ │ ├── models │ │ │ │ └── __init__.py │ │ │ ├── multiprocessing_pdb.py │ │ │ ├── progress_bar.py │ │ │ ├── nag.py │ │ │ ├── meters.py │ │ │ ├── tokenizer.py │ │ │ ├── bleu.py │ │ │ ├── dictionary.py │ │ │ ├── nccl.py │ │ │ ├── indexed_dataset.py │ │ │ ├── utils.py │ │ │ ├── multiprocessing_event_loop.py │ │ │ └── options.py │ │ ├── scripts │ │ │ ├── convert_dictionary.lua │ │ │ ├── convert_model.lua │ │ │ └── build_sym_alignment.py │ │ ├── CONTRIBUTING.md │ │ ├── tests │ │ │ └── test_label_smoothing.py │ │ ├── LICENSE │ │ ├── .gitignore │ │ ├── PATENTS │ │ ├── setup.py │ │ ├── score.py │ │ ├── data │ │ │ └── prepare-iwslt14.sh │ │ ├── preprocess.py │ │ └── generate.py │ ├── subword-nmt │ │ ├── get_vocab.py │ │ ├── LICENSE │ │ ├── README.md │ │ ├── bpe_toy.py │ │ ├── segment-char-ngrams.py │ │ ├── chrF.py │ │ ├── apply_bpe.py │ │ └── learn_bpe.py │ └── download.sh ├── paths.sh ├── scripts │ ├── get_diff.py │ ├── nbest_reformat.py │ ├── convert_m2_to_parallel.py │ └── apply_bpe.py ├── training │ ├── train.sh │ ├── train_embed.sh │ └── preprocess.sh └── run.sh ├── scripts ├── remove_spac_pkunlp_segment.sh └── pkunlp_segment.py └── README.md /CS2S+BPE+Emb/software/nbest-reranker/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/kenlm_python/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | torch 3 | tqdm 4 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YingyWang/NLPCC_2018_TASK2_GEC/HEAD/CS2S+BPE+Emb/software/fairseq-py/fairseq.gif -------------------------------------------------------------------------------- /CS2S+BPE+Emb/paths.sh: -------------------------------------------------------------------------------- 1 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 2 | DATA_DIR=$BASE_DIR/data 3 | MODEL_DIR=$BASE_DIR/models 4 | SCRIPTS_DIR=$BASE_DIR/scripts 5 | SOFTWARE_DIR=$BASE_DIR/software 6 | TRAINING_DIR=$BASE_DIR/training 7 | 8 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/get_vocab.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | from collections import Counter 5 | 6 | c = Counter() 7 | 8 | for line in sys.stdin: 9 | for word in line.split(): 10 | c[word] += 1 11 | 12 | for key,f in sorted(c.items(), key=lambda x: x[1], reverse=True): 13 | print key, f 14 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/scripts/get_diff.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | prefix = sys.argv[1] 5 | src = sys.argv[2] 6 | trg = sys.argv[3] 7 | 8 | with open(prefix + '.' + src) as f_src, open(prefix + '.' + trg) as f_trg: 9 | for sline, tline in zip(f_src, f_trg): 10 | sline = sline.strip() 11 | tline = tline.strip() 12 | if sline != tline: 13 | print sline+'\t'+tline 14 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from .multiprocessing_pdb import pdb 10 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/temporal_convolution_tbc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._temporal_convolution_tbc import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | locals[symbol] = _wrap_function(fn, _ffi) 10 | __all__.append(symbol) 11 | 12 | _import_symbols(locals()) 13 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/README.md: -------------------------------------------------------------------------------- 1 | # N-best Reranker 2 | 3 | Re-ranking N-best lists (MOSES format) using features like language models, edit operations etc. It is also easy to implement custom features. 4 | 5 | Currently, tuning with BLEU and M2Scorer with MERT are supported 6 | 7 | ## Running the re-ranker 8 | 9 | 1. First augment the new feature using augment.py script 10 | 11 | 2. Then train the re-ranker using train.py script 12 | 13 | 3. 
Then rerank using rerank.py script 14 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/configreader.py: -------------------------------------------------------------------------------- 1 | ''' File to parse config files 2 | ''' 3 | 4 | def parse_ini(ini_path): 5 | out = [] 6 | with open(ini_path, 'r') as ini_file: 7 | section = '[nil]' 8 | for line in ini_file: 9 | line = line.strip() 10 | if line.startswith('['): 11 | section = line 12 | elif section == '[weight]' and line != '': 13 | if line.startswith('UnknownWordPenalty0= '): 14 | out.append('UnknownWordPenalty0 UNTUNEABLE') 15 | else: 16 | out.append(line) 17 | return out 18 | -------------------------------------------------------------------------------- /scripts/remove_spac_pkunlp_segment.sh: -------------------------------------------------------------------------------- 1 | NLPCC2018_DIR=/home/renhongkai/projects/mlconvgec2018/nlpcc2018/ 2 | DIR=$NLPCC2018_DIR//seq2seq+bpe+embed/outputs/mlconv_embed/model1/model_best 3 | # Remove the spaces (the word-segmentation information) 4 | sed 's/ //g' $DIR/output.tok.txt > $DIR/output.tok.txt.remove.spac 5 | # Segment with pkunlp; produces the file $DIR/output.tok.txt.remove.spac.seg 6 | python pkunlp_segment.py --corpus $DIR/output.tok.txt.remove.spac --segsuffix seg 7 | # Compute the score with the M2 scorer 8 | $NLPCC2018_DIR/m2scorer/m2scorer $DIR/output.tok.txt.remove.spac.seg ~/projects/mlconvgec2018/nlpcc2018/gold.01 9 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from .beamable_mm import * 10 | from .linearized_convolution import * 11 | from .conv_tbc import ConvTBC 12 | 13 | __all__ = [ 14 | 'BeamableMM', 'LinearizedConvolution', 'ConvTBC', 15 | ] 16 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/training/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | source ../paths.sh 7 | FAIRSEQPY=$SOFTWARE_DIR/fairseq-py 8 | 9 | SEED=1 10 | DATA_BIN_DIR=processed/bin 11 | 12 | OUT_DIR=models/mlconv/model$SEED/ 13 | mkdir -p $OUT_DIR 14 | 15 | PYTHONPATH=$FAIRSEQPY:$PYTHONPATH CUDA_VISIBLE_DEVICES="0" python $FAIRSEQPY/train.py --save-dir $OUT_DIR --encoder-embed-dim 500 --decoder-embed-dim 500 --decoder-out-embed-dim 500 --dropout 0.2 --clip-norm 0.1 --lr 0.25 --min-lr 1e-4 --encoder-layers '[(1024,3)] * 7' --decoder-layers '[(1024,3)] * 7' --momentum 0.99 --max-epoch 100 --batch-size 32 --no-progress-bar --seed $SEED $DATA_BIN_DIR 16 | 17 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory.
7 | # 8 | 9 | from .cross_entropy import CrossEntropyCriterion 10 | from .fairseq_criterion import FairseqCriterion 11 | from .label_smoothed_cross_entropy import LabelSmoothedCrossEntropyCriterion 12 | 13 | __all__ = [ 14 | 'CrossEntropyCriterion', 15 | 'LabelSmoothedCrossEntropyCriterion', 16 | ] 17 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/clib/temporal_convolution_tbc/temporal_convolution_tbc.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | void TemporalConvolutionTBC_forward( 10 | const char* dtype, 11 | void* input, 12 | void* output, 13 | void* weight, 14 | void* bias); 15 | 16 | void TemporalConvolutionTBC_backward( 17 | const char* dtype, 18 | void* _dOutput, 19 | void* _dInput, 20 | void* _dWeight, 21 | void* _dBias, 22 | void* _input, 23 | void* _weight); 24 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from . import fconv 10 | 11 | 12 | __all__ = ['fconv'] 13 | 14 | arch_model_map = {} 15 | for model in __all__: 16 | archs = locals()[model].get_archs() 17 | for arch in archs: 18 | assert arch not in arch_model_map, 'Duplicate model architecture detected: {}'.format(arch) 19 | arch_model_map[arch] = model 20 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/training/train_embed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | source ../paths.sh 7 | FAIRSEQPY=$SOFTWARE_DIR/fairseq-py 8 | 9 | # download embeddings if necessary 10 | EMBED_PATH=$DATA_DIR/embeddings/chinesegigawordv5.jian.jieba.seg.bpe.skipngram.500d.txt 11 | 12 | SEED=1 13 | DATA_BIN_DIR=processed/bin 14 | OUT_DIR=models/mlconv_embed/model$SEED/ 15 | mkdir -p $OUT_DIR 16 | 17 | PYTHONPATH=$FAIRSEQPY:$PYTHONPATH CUDA_VISIBLE_DEVICES="0" python $FAIRSEQPY/train.py --save-dir $OUT_DIR --encoder-embed-dim 500 --encoder-embed-path $EMBED_PATH --decoder-embed-dim 500 --decoder-embed-path $EMBED_PATH --decoder-out-embed-dim 500 --dropout 0.2 --clip-norm 0.1 --lr 0.25 --min-lr 1e-4 --encoder-layers '[(1024,3)] * 7' --decoder-layers '[(1024,3)] * 7' --momentum 0.99 --max-epoch 100 --batch-size 32 --no-progress-bar --seed $SEED $DATA_BIN_DIR 18 | 19 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/scripts/nbest_reformat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('-i', '--input-file', help='path to input file (output of fairseq)') 7 | parser.add_argument('--debpe', action='store_true', help='enable the flag to post-process and remove BPE segmentation.') 8 | 9 | 
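# The loop below converts fairseq-py generation output into a MOSES-style n-best
# list for the reranker: an 'S' line marks the next source sentence (advancing the
# sentence counter), and each 'H' line carries a hypothesis and its model score,
# emitted as "<sent-id> ||| <hypothesis> ||| F0= <score> ||| <score>".
# With --debpe, the BPE joiners ("@@ ") are stripped from the hypothesis first.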
args = parser.parse_args() 10 | 11 | 12 | scount = -1 13 | with open(args.input_file) as f: 14 | for line in f: 15 | line = line.strip() 16 | pieces = line.split('\t') 17 | if pieces[0] == 'S': 18 | scount += 1 19 | if pieces[0] == 'H': 20 | hyp = pieces[2] 21 | if args.debpe: 22 | hyp = hyp.replace('@@ ','') 23 | score = pieces[1] 24 | print("%d ||| %s ||| F0= %s ||| %s" % (scount, hyp, score, score) ) 25 | 26 | 27 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/clib/libbleu/module.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include 10 | 11 | 12 | static PyMethodDef method_def[] = { 13 | {NULL, NULL, 0, NULL} 14 | }; 15 | 16 | static struct PyModuleDef module_def = { 17 | PyModuleDef_HEAD_INIT, 18 | "libbleu", /* name of module */ 19 | NULL, /* module documentation, may be NULL */ 20 | -1, /* size of per-interpreter state of the module, 21 | or -1 if the module keeps state in global variables. */ 22 | method_def 23 | }; 24 | 25 | 26 | #if PY_MAJOR_VERSION == 2 27 | PyMODINIT_FUNC init_libbleu() 28 | #else 29 | PyMODINIT_FUNC PyInit_libbleu() 30 | #endif 31 | { 32 | PyObject *m = PyModule_Create(&module_def); 33 | if (!m) { 34 | return NULL; 35 | } 36 | return m; 37 | } 38 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/scripts/convert_dictionary.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (c) 2017-present, Facebook, Inc. 2 | -- All rights reserved. 3 | -- 4 | -- This source code is licensed under the license found in the LICENSE file in 5 | -- the root directory of this source tree. An additional grant of patent rights 6 | -- can be found in the PATENTS file in the same directory. 7 | -- 8 | -- Usage: convert_dictionary.lua 9 | require 'fairseq' 10 | require 'torch' 11 | require 'paths' 12 | 13 | if #arg < 1 then 14 | print('usage: convert_dictionary.lua ') 15 | os.exit(1) 16 | end 17 | if not paths.filep(arg[1]) then 18 | print('error: file does not exit: ' .. arg[1]) 19 | os.exit(1) 20 | end 21 | 22 | dict = torch.load(arg[1]) 23 | dst = paths.basename(arg[1]):gsub('.th7', '.txt') 24 | assert(dst:match('.txt$')) 25 | 26 | f = io.open(dst, 'w') 27 | for idx, symbol in ipairs(dict.index_to_symbol) do 28 | if idx > dict.cutoff then 29 | break 30 | end 31 | f:write(symbol) 32 | f:write(' ') 33 | f:write(dict.index_to_freq[idx]) 34 | f:write('\n') 35 | end 36 | f:close() 37 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/criterions/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import math 10 | import torch.nn.functional as F 11 | 12 | from .fairseq_criterion import FairseqCriterion 13 | 14 | 15 | class CrossEntropyCriterion(FairseqCriterion): 16 | 17 | def __init__(self, padding_idx): 18 | super().__init__() 19 | self.padding_idx = padding_idx 20 | 21 | def prepare(self, samples): 22 | self.denom = sum(s['ntokens'] if s else 0 for s in samples) 23 | 24 | def forward(self, net_output, sample): 25 | input = net_output.view(-1, net_output.size(-1)) 26 | target = sample['target'].view(-1) 27 | loss = F.cross_entropy(input, target, size_average=False, ignore_index=self.padding_idx) 28 | return loss / self.denom 29 | 30 | def aggregate(self, losses): 31 | return sum(losses) / math.log(2) 32 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/criterions/fairseq_criterion.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from torch.nn.modules.loss import _Loss 10 | 11 | 12 | class FairseqCriterion(_Loss): 13 | 14 | def __init__(self, *args, **kwargs): 15 | super().__init__(*args, **kwargs) 16 | 17 | def prepare(self, samples): 18 | """Prepare criterion for DataParallel training.""" 19 | raise NotImplementedError 20 | 21 | def forward(self, net_output, sample): 22 | """Compute the loss for the given sample and network output.""" 23 | raise NotImplementedError 24 | 25 | def aggregate(self, losses): 26 | """Aggregate losses from DataParallel training. 27 | 28 | Takes a list of losses as input (as returned by forward) and 29 | aggregates them into the total loss for the mini-batch. 30 | """ 31 | raise NotImplementedError 32 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 University of Edinburgh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
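The `FairseqCriterion` interface above (`prepare`/`forward`/`aggregate`, see `fairseq_criterion.py` and `cross_entropy.py`) has each DataParallel replica normalize its summed token loss by the token count of the whole mini-batch before the per-replica losses are added up. Below is a minimal stand-alone sketch of that flow in plain PyTorch, not the fairseq code itself: the vocabulary size, tensor shapes, and padding index are invented, and `reduction='sum'` stands in for the older `size_average=False` used above.

```python
import math
import torch
import torch.nn.functional as F

PAD = 1                                            # hypothetical padding index
# two shards of one mini-batch, as DataParallel replicas would see them
samples = [{'target': torch.randint(2, 10, (4, 7)), 'ntokens': 28},
           {'target': torch.randint(2, 10, (4, 7)), 'ntokens': 28}]

denom = sum(s['ntokens'] for s in samples)         # prepare(): shared normalizer

losses = []
for s in samples:
    net_output = torch.randn(4, 7, 10)             # fake decoder logits (B x T x V)
    logits = net_output.view(-1, net_output.size(-1))
    target = s['target'].view(-1)
    # forward(): summed token-level loss, padding ignored, scaled by the global count
    loss = F.cross_entropy(logits, target, reduction='sum', ignore_index=PAD)
    losses.append(loss / denom)

# aggregate(): total per-token loss, divided by log(2) as in the criterions above
print((sum(losses) / math.log(2)).item())
```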
-------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/multiprocessing_pdb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import multiprocessing 10 | import os 11 | import pdb 12 | import sys 13 | 14 | 15 | class MultiprocessingPdb(pdb.Pdb): 16 | """A Pdb wrapper that works in a multiprocessing environment. 17 | 18 | Usage: `from fairseq import pdb; pdb.set_trace()` 19 | """ 20 | 21 | _stdin_fd = sys.stdin.fileno() 22 | _stdin = None 23 | _stdin_lock = multiprocessing.Lock() 24 | 25 | def __init__(self): 26 | pdb.Pdb.__init__(self, nosigint=True) 27 | 28 | def _cmdloop(self): 29 | stdin_bak = sys.stdin 30 | with self._stdin_lock: 31 | try: 32 | if not self._stdin: 33 | self._stdin = os.fdopen(self._stdin_fd) 34 | sys.stdin = self._stdin 35 | self.cmdloop() 36 | finally: 37 | sys.stdin = stdin_bak 38 | 39 | 40 | pdb = MultiprocessingPdb() 41 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/download.sh: -------------------------------------------------------------------------------- 1 | echo "Downloading Fairseq from https://github.com/shamilcm/fairseq-py (rev:90c31cd92055124c427689c00624b1eb84c5688a)" 2 | wget https://github.com/shamilcm/fairseq-py/archive/90c31cd92055124c427689c00624b1eb84c5688a.zip 3 | unzip 90c31cd92055124c427689c00624b1eb84c5688a.zip 4 | rm 90c31cd92055124c427689c00624b1eb84c5688a.zip 5 | mv fairseq-py-90c31cd92055124c427689c00624b1eb84c5688a fairseq-py 6 | 7 | echo "Downloading n-best reranker from https://github.com/nusnlp/nbest-reranker (rev: 454c4adc90d0469ef7b2c71ff8cf849ea8cb67f)" 8 | wget https://github.com/nusnlp/nbest-reranker/archive/454c4adc90d0469ef7b2c71ff8cf849ea8cb67f6.zip 9 | unzip 454c4adc90d0469ef7b2c71ff8cf849ea8cb67f6.zip 10 | rm 454c4adc90d0469ef7b2c71ff8cf849ea8cb67f6.zip 11 | mv nbest-reranker-454c4adc90d0469ef7b2c71ff8cf849ea8cb67f6 nbest-reranker 12 | #git clone https://github.com/nusnlp/nbest-reranker/ 13 | 14 | echo "Downloading Subword NMT from https://github.com/rsennrich/subword-nmt (rev: ec5c7b009c409e72b5ef65a77c1a846546f14847)" 15 | wget https://github.com/rsennrich/subword-nmt/archive/ec5c7b009c409e72b5ef65a77c1a846546f14847.zip 16 | unzip ec5c7b009c409e72b5ef65a77c1a846546f14847.zip 17 | rm ec5c7b009c409e72b5ef65a77c1a846546f14847.zip 18 | mv subword-nmt-ec5c7b009c409e72b5ef65a77c1a846546f14847 subword-nmt 19 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to FAIR Sequence-to-Sequence Toolkit (PyTorch) 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. 
If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | ## Coding Style 26 | We try to follow the PEP style guidelines and encourage you to as well. 27 | 28 | ## License 29 | By contributing to FAIR Sequence-to-Sequence Toolkit, you agree that your contributions will be licensed 30 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/README.md: -------------------------------------------------------------------------------- 1 | Subword Neural Machine Translation 2 | ================================== 3 | 4 | This repository contains preprocessing scripts to segment text into subword 5 | units. The primary purpose is to facilitate the reproduction of our experiments 6 | on Neural Machine Translation with subword units (see below for reference). 7 | 8 | USAGE INSTRUCTIONS 9 | ------------------ 10 | 11 | Check the individual files for usage instructions. 12 | 13 | To apply byte pair encoding to word segmentation, invoke these commands: 14 | 15 | ./learn_bpe.py -s {num_operations} < {train_file} > {codes_file} 16 | ./apply_bpe.py -c {codes_file} < {test_file} 17 | 18 | To segment rare words into character n-grams, do the following: 19 | 20 | ./get_vocab.py < {train_file} > {vocab_file} 21 | ./segment-char-ngrams.py --vocab {vocab_file} -n {order} --shortlist {size} < {test_file} 22 | 23 | The original segmentation can be restored with a simple replacement: 24 | 25 | sed "s/@@ //g" 26 | 27 | PUBLICATIONS 28 | ------------ 29 | 30 | The segmentation methods are described in: 31 | 32 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016): 33 | Neural Machine Translation of Rare Words with Subword Units 34 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/tests/test_label_smoothing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import torch 10 | import unittest 11 | from fairseq.criterions.label_smoothed_cross_entropy import LabelSmoothedCrossEntropy 12 | from torch.autograd import Variable, gradcheck 13 | 14 | 15 | torch.set_default_tensor_type('torch.DoubleTensor') 16 | 17 | 18 | class TestLabelSmoothing(unittest.TestCase): 19 | 20 | def test_label_smoothing(self): 21 | input = Variable(torch.randn(3, 5), requires_grad=True) 22 | idx = torch.rand(3) * 4 23 | target = Variable(idx.long()) 24 | criterion = LabelSmoothedCrossEntropy() 25 | self.assertTrue(gradcheck( 26 | lambda x, y: criterion.apply(x, y, 0.1, 2, None), (input, target) 27 | )) 28 | weights = torch.ones(5) 29 | weights[2] = 0 30 | self.assertTrue(gradcheck(lambda x, y: criterion.apply(x, y, 0.1, None, weights), (input, target))) 31 | self.assertTrue(gradcheck(lambda x, y: criterion.apply(x, y, 0.1, None, None), (input, target))) 32 | 33 | 34 | if __name__ == '__main__': 35 | unittest.main() 36 | -------------------------------------------------------------------------------- /scripts/pkunlp_segment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # encoding: utf-8 3 | 4 | from __future__ import unicode_literals, print_function 5 | from pkunlp import Segmentor, NERTagger, POSTagger 6 | 7 | import argparse 8 | import time 9 | import codecs, json, re, sys, time 10 | import multiprocessing 11 | 12 | # usage: 13 | # python pkunlp_segmenter.py --corpus data.train.src --segsuffix seg 14 | 15 | def parseargs(): 16 | parser = argparse.ArgumentParser(description="segment corpus") 17 | 18 | parser.add_argument("--corpus", required=True, 19 | help="input corpora") 20 | parser.add_argument("--segsuffix", type=str, default="seg", 21 | help="Suffix of output files") 22 | return parser.parse_args() 23 | 24 | 25 | if __name__ == "__main__": 26 | print("Start processing") 27 | start_time = time.time() 28 | parsed_args = parseargs() 29 | 30 | segmentor = Segmentor("feature/segment.feat", "feature/segment.dic") 31 | 32 | with open(parsed_args.corpus ,'r',encoding='utf-8') as corpus_f,\ 33 | open(parsed_args.corpus + "." + parsed_args.segsuffix,'w',encoding='utf-8',errors='ignore') as seg_output_f: 34 | for line in corpus_f: 35 | if len(line) <= 1500 and len(line) != 0: 36 | segments = segmentor.seg_string(line.strip()) 37 | segments_str = " ".join(segments) 38 | seg_output_f.write(segments_str + "\n") 39 | print("Done in", time.time()-start_time, "seconds") -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/kenlm_python/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | 5 | 6 | 7 | LM = os.path.join(os.path.dirname(__file__), '..', 'lm', 'test.arpa') 8 | model = kenlm.Model(LM) 9 | print('{0}-gram model'.format(model.order)) 10 | 11 | sentence = 'language modeling is fun .' 
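# NOTE: the kenlm.Model call above and the kenlm.State calls below rely on the
# kenlm Python module, so this script also needs an `import kenlm` at the top
# (only `os` is imported here). The lm/test.arpa path is inherited from the
# upstream KenLM example and may need adjusting to point at a real ARPA model.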
12 | print(sentence) 13 | print(model.score(sentence)) 14 | 15 | # Check that total full score = direct score 16 | def score(s): 17 | return sum(prob for prob, _, _ in model.full_scores(s)) 18 | 19 | assert (abs(score(sentence) - model.score(sentence)) < 1e-3) 20 | 21 | # Show scores and n-gram matches 22 | words = [''] + sentence.split() + [''] 23 | for i, (prob, length, oov) in enumerate(model.full_scores(sentence)): 24 | print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2]))) 25 | if oov: 26 | print('\t"{0}" is an OOV'.format(words[i+1])) 27 | 28 | # Find out-of-vocabulary words 29 | for w in words: 30 | if not w in model: 31 | print('"{0}" is an OOV'.format(w)) 32 | 33 | #Stateful query 34 | state = kenlm.State() 35 | state2 = kenlm.State() 36 | #Use as context. If you don't want , use model.NullContextWrite(state). 37 | model.BeginSentenceWrite(state) 38 | accum = 0.0 39 | accum += model.BaseScore(state, "a", state2) 40 | accum += model.BaseScore(state2, "sentence", state) 41 | #score defaults to bos = True and eos = True. Here we'll check without the end 42 | #of sentence marker. 43 | assert (abs(accum - model.score("a sentence", eos = False)) < 1e-3) 44 | accum += model.BaseScore(state, "", state2) 45 | assert (abs(accum - model.score("a sentence")) < 1e-3) 46 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For fairseq software 4 | 5 | Copyright (c) 2017-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # Checkpoints 29 | checkpoints 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/progress_bar.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | """ 10 | Progress bar wrapper around tqdm which handles non-tty outputs 11 | """ 12 | 13 | import sys 14 | 15 | from tqdm import tqdm 16 | 17 | 18 | class progress_bar(tqdm): 19 | enabled = sys.stderr.isatty() 20 | print_interval = 1000 21 | 22 | def __new__(cls, *args, **kwargs): 23 | if cls.enabled: 24 | return tqdm(*args, **kwargs) 25 | else: 26 | return simple_progress_bar(cls.print_interval, *args, **kwargs) 27 | 28 | 29 | class simple_progress_bar(tqdm): 30 | 31 | def __init__(self, print_interval, *args, **kwargs): 32 | super(simple_progress_bar, self).__init__(*args, **kwargs) 33 | self.print_interval = print_interval 34 | 35 | def __iter__(self): 36 | size = len(self.iterable) 37 | for i, obj in enumerate(self.iterable): 38 | yield obj 39 | if i > 0 and i % self.print_interval == 0: 40 | msg = '{} {:5d} / {:d} {}\n'.format(self.desc, i, size, self.postfix) 41 | sys.stdout.write(msg) 42 | sys.stdout.flush() 43 | 44 | @classmethod 45 | def write(cls, s, file=None, end="\n"): 46 | fp = file if file is not None else sys.stdout 47 | fp.write(s) 48 | fp.write(end) 49 | fp.flush() 50 | 51 | @staticmethod 52 | def status_printer(file): 53 | def print_status(s): 54 | pass 55 | return print_status 56 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/kenlm_python/_kenlm.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "lm/word_index.hh" namespace "lm": 2 | ctypedef unsigned WordIndex 3 | 4 | cdef extern from "lm/return.hh" namespace "lm": 5 | cdef struct FullScoreReturn: 6 | float prob 7 | unsigned char ngram_length 8 | 9 | cdef extern from "lm/state.hh" namespace "lm::ngram": 10 | cdef cppclass State : 11 | int Compare(const State &other) const 12 | 13 | int hash_value(const State &state) 14 | 15 | cdef extern from "lm/virtual_interface.hh" namespace "lm::base": 16 | cdef cppclass Vocabulary: 17 | WordIndex Index(char*) 18 | WordIndex BeginSentence() 19 | WordIndex EndSentence() 20 | WordIndex NotFound() 21 | 22 | ctypedef Vocabulary const_Vocabulary "const lm::base::Vocabulary" 23 | 24 | cdef cppclass Model: 25 | void BeginSentenceWrite(void *) 26 | void NullContextWrite(void *) 27 | unsigned int Order() 28 | const_Vocabulary& BaseVocabulary() 29 | float BaseScore(void *in_state, WordIndex new_word, void *out_state) 30 | FullScoreReturn BaseFullScore(void *in_state, WordIndex new_word, void *out_state) 31 | 32 | cdef extern from "util/mmap.hh" namespace "util": 33 | cdef enum LoadMethod: 34 | LAZY 35 | POPULATE_OR_LAZY 36 | POPULATE_OR_READ 37 | READ 38 | PARALLEL_READ 39 | 40 | cdef extern from "lm/config.hh" namespace "lm::ngram": 41 | cdef cppclass Config: 42 | Config() 43 | float probing_multiplier 44 | LoadMethod load_method 45 | 46 | cdef extern from "lm/model.hh" namespace "lm::ngram": 47 | cdef Model *LoadVirtual(char *, Config &config) except + 48 | #default constructor 49 | cdef Model *LoadVirtual(char *) except + 50 | 51 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/scripts/convert_m2_to_parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import re 5 | 6 | if len(sys.argv) != 4: 7 | print "[USAGE] %s nucle_m2_file output_src output_tgt" % sys.argv[0] 8 | sys.exit() 9 | 10 | input_path = sys.argv[1] 11 | output_src_path = sys.argv[2] 12 | output_tgt_path = sys.argv[3] 13 | 14 | words = [] 15 | 
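# The input is the M2 format used by the M2 scorer: an "S <tokenized sentence>"
# line followed by zero or more annotation lines of the form
# "A <start> <end>|||<error type>|||<correction>|||...", with a blank line between
# sentences. The loop below applies each sentence's corrections to its tokens and
# writes the resulting parallel source/target files.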
corrected = [] 16 | sid = eid = 0 17 | prev_sid = prev_eid = -1 18 | pos = 0 19 | 20 | 21 | with open(input_path) as input_file, open(output_src_path, 'w') as output_src_file, open(output_tgt_path, 'w') as output_tgt_file: 22 | for line in input_file: 23 | line = line.strip() 24 | if line.startswith('S'): 25 | line = line[2:] 26 | words = line.split() 27 | corrected = [''] + words[:] 28 | output_src_file.write(line + '\n') 29 | elif line.startswith('A'): 30 | line = line[2:] 31 | info = line.split("|||") 32 | sid, eid = info[0].split() 33 | sid = int(sid) + 1; eid = int(eid) + 1; 34 | error_type = info[1] 35 | if error_type == "Um": 36 | continue 37 | for idx in range(sid, eid): 38 | corrected[idx] = "" 39 | if sid == eid: 40 | if sid == 0: continue # Originally index was -1, indicating no op 41 | if sid != prev_sid or eid != prev_eid: 42 | pos = len(corrected[sid-1].split()) 43 | cur_words = corrected[sid-1].split() 44 | cur_words.insert(pos, info[2]) 45 | pos += len(info[2].split()) 46 | corrected[sid-1] = " ".join(cur_words) 47 | else: 48 | corrected[sid] = info[2] 49 | pos = 0 50 | prev_sid = sid 51 | prev_eid = eid 52 | else: 53 | target_sentence = ' '.join([word for word in corrected if word != ""]) 54 | assert target_sentence.startswith(''), '(' + target_sentence + ')' 55 | target_sentence = target_sentence[4:] 56 | output_tgt_file.write(target_sentence + '\n') 57 | prev_sid = -1 58 | prev_eid = -1 59 | pos = 0 60 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/bpe_toy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary 7 | of a text to a configurable number of symbols, with only a small increase in the number of tokens. 8 | This is an (inefficient) toy implementation that shows the algorithm. For processing large datasets, 9 | indexing and incremental updates can be used to speed up the implementation (see learn_bpe.py). 10 | 11 | Reference: 12 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 13 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 14 | """ 15 | 16 | 17 | import re 18 | import sys 19 | import collections 20 | 21 | def get_stats(vocab): 22 | pairs = collections.defaultdict(int) 23 | for word, freq in vocab.items(): 24 | symbols = word.split() 25 | for i in range(len(symbols)-1): 26 | pairs[symbols[i],symbols[i+1]] += freq 27 | return pairs 28 | 29 | def merge_vocab(pair, v_in): 30 | v_out = {} 31 | bigram_pattern = re.escape(' '.join(pair)) 32 | p = re.compile(r'(?' : 5, 'l o w e r ' : 2, 39 | 'n e w e s t ' : 6, 'w i d e s t ' : 3} 40 | num_merges = 15 41 | for i in range(num_merges): 42 | pairs = get_stats(vocab) 43 | best = max(pairs, key=pairs.get) 44 | if pairs[best] < 2: 45 | sys.stderr.write('no pair has frequency > 1. 
Stopping\n') 46 | break 47 | vocab = merge_vocab(best, vocab) 48 | print(best) 49 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/nag.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from torch.optim.optimizer import Optimizer, required 10 | 11 | 12 | class NAG(Optimizer): 13 | def __init__(self, params, lr=required, momentum=0, weight_decay=0): 14 | defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay) 15 | super(NAG, self).__init__(params, defaults) 16 | 17 | def step(self, closure=None): 18 | """Performs a single optimization step. 19 | 20 | Arguments: 21 | closure (callable, optional): A closure that reevaluates the model 22 | and returns the loss. 23 | """ 24 | loss = None 25 | if closure is not None: 26 | loss = closure() 27 | 28 | for group in self.param_groups: 29 | weight_decay = group['weight_decay'] 30 | momentum = group['momentum'] 31 | lr = group['lr'] 32 | 33 | for p in group['params']: 34 | if p.grad is None: 35 | continue 36 | 37 | d_p = p.grad.data 38 | if weight_decay != 0: 39 | d_p.add_(weight_decay, p.data) 40 | 41 | param_state = self.state[p] 42 | if 'momentum_buffer' not in param_state: 43 | param_state['momentum_buffer'] = d_p.clone().zero_() 44 | 45 | buf = param_state['momentum_buffer'] 46 | 47 | p.data.add_(momentum * momentum, buf) 48 | p.data.add_(-(1 + momentum) * lr, d_p) 49 | 50 | buf.mul_(momentum).add_(-lr, d_p) 51 | 52 | return loss 53 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/training/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | source ../paths.sh 6 | 7 | ## paths to training and development datasets 8 | src_ext=src 9 | trg_ext=trg 10 | train_data_prefix=$DATA_DIR/train.tok 11 | dev_data_prefix=$DATA_DIR/dev.tok 12 | #dev_data_m2=$DATA_DIR/dev.m2 13 | 14 | # path to subword nmt 15 | SUBWORD_NMT=$SOFTWARE_DIR/subword-nmt 16 | # path to Fairseq-Py 17 | FAIRSEQPY=$SOFTWARE_DIR/fairseq-py 18 | 19 | ###################### 20 | # subword segmentation 21 | mkdir -p models/bpe_model 22 | bpe_operations=30000 23 | cat $train_data_prefix.$trg_ext | $SUBWORD_NMT/learn_bpe.py -s $bpe_operations > models/bpe_model/train.bpe.model 24 | mkdir -p processed/ 25 | $SCRIPTS_DIR/apply_bpe.py -c models/bpe_model/train.bpe.model < $train_data_prefix.$src_ext > processed/train.all.src 26 | $SCRIPTS_DIR/apply_bpe.py -c models/bpe_model/train.bpe.model < $train_data_prefix.$trg_ext > processed/train.all.trg 27 | $SCRIPTS_DIR/apply_bpe.py -c models/bpe_model/train.bpe.model < $dev_data_prefix.$src_ext > processed/dev.src 28 | $SCRIPTS_DIR/apply_bpe.py -c models/bpe_model/train.bpe.model < $dev_data_prefix.$trg_ext > processed/dev.trg 29 | #cp $dev_data_m2 processed/dev.m2 30 | cp $dev_data_prefix.$src_ext processed/dev.input.txt 31 | 32 | ########################## 33 | # getting annotated sentence pairs only 34 | #python $SCRIPTS_DIR/get_diff.py processed/train.all src trg > processed/train.annotated.src-trg 35 | #cut -f1 processed/train.annotated.src-trg > processed/train.src 36 | #cut -f2 
processed/train.annotated.src-trg > processed/train.trg 37 | less processed/train.all.src > processed/train.src 38 | less processed/train.all.trg > processed/train.trg 39 | 40 | ######################### 41 | # preprocessing 42 | python $FAIRSEQPY/preprocess.py --source-lang src --target-lang trg --trainpref processed/train --validpref processed/dev --testpref processed/dev --nwordssrc 37000 --nwordstgt 37000 --destdir processed/bin 43 | 44 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/modules/beamable_mm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | 13 | class BeamableMM(nn.Module): 14 | """This module provides an optimized MM for beam decoding with attention. 15 | 16 | It leverage the fact that the source-side of the input is replicated beam 17 | times and the target-side of the input is of width one. This layer speeds up 18 | inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)} 19 | with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}. 20 | """ 21 | def __init__(self, beam_size): 22 | super(BeamableMM, self).__init__() 23 | self.beam_size = beam_size 24 | 25 | def forward(self, input1, input2): 26 | if ( 27 | not self.training and # test mode 28 | self.beam_size > 0 and # beam size is set 29 | input1.dim() == 3 and # only support batched input 30 | input1.size(1) == 1 # single time step update 31 | ): 32 | bsz, beam = input1.size(0), self.beam_size 33 | 34 | # bsz x 1 x nhu --> bsz/beam x beam x nhu 35 | input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1) 36 | 37 | # bsz x sz2 x nhu --> bsz/beam x sz2 x nhu 38 | input2 = input2.unfold(0, beam, beam)[:, :, :, 0] 39 | 40 | # use non batched operation if bsz = beam 41 | if input1.size(0) == 1: 42 | output = torch.mm(input1[0, :, :], input2[0, :, :]) 43 | else: 44 | output = input1.bmm(input2) 45 | return output.view(bsz, 1, -1) 46 | else: 47 | return input1.bmm(input2) 48 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/meters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import time 10 | 11 | 12 | class AverageMeter(object): 13 | """Computes and stores the average and current value""" 14 | def __init__(self): 15 | self.reset() 16 | 17 | def reset(self): 18 | self.val = 0 19 | self.avg = 0 20 | self.sum = 0 21 | self.count = 0 22 | 23 | def update(self, val, n=1): 24 | self.val = val 25 | self.sum += val * n 26 | self.count += n 27 | self.avg = self.sum / self.count 28 | 29 | 30 | class TimeMeter(object): 31 | """Computes the average occurence of some event per second""" 32 | def __init__(self): 33 | self.reset() 34 | 35 | def reset(self): 36 | self.start = time.time() 37 | self.n = 0 38 | 39 | def update(self, val=1): 40 | self.n += val 41 | 42 | @property 43 | def avg(self): 44 | delta = time.time() - self.start 45 | return self.n / delta 46 | 47 | @property 48 | def elapsed_time(self): 49 | return time.time() - self.start 50 | 51 | 52 | class StopwatchMeter(object): 53 | """Computes the sum/avg duration of some event in seconds""" 54 | def __init__(self): 55 | self.reset() 56 | 57 | def start(self): 58 | self.start_time = time.time() 59 | 60 | def stop(self, n=1): 61 | if self.start_time is not None: 62 | delta = time.time() - self.start_time 63 | self.sum += delta 64 | self.n += n 65 | self.start_time = None 66 | 67 | def reset(self): 68 | self.sum = 0 69 | self.n = 0 70 | self.start_time = None 71 | 72 | @property 73 | def avg(self): 74 | return self.sum / self.n 75 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/PATENTS: -------------------------------------------------------------------------------- 1 | Additional Grant of Patent Rights Version 2 2 | 3 | "Software" means the fairseq software distributed by Facebook, Inc. 4 | 5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software 6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable 7 | (subject to the termination provision below) license under any Necessary 8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise 9 | transfer the Software. For avoidance of doubt, no license is granted under 10 | Facebook’s rights in any patent claims that are infringed by (i) modifications 11 | to the Software made by you or any third party or (ii) the Software in 12 | combination with any software or other technology. 13 | 14 | The license granted hereunder will terminate, automatically and without notice, 15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate 16 | directly or indirectly, or take a direct financial interest in, any Patent 17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate 18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or 19 | in part from any software, technology, product or service of Facebook or any of 20 | its subsidiaries or corporate affiliates, or (iii) against any party relating 21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its 22 | subsidiaries or corporate affiliates files a lawsuit alleging patent 23 | infringement against you in the first instance, and you respond by filing a 24 | patent infringement counterclaim in that lawsuit against that party that is 25 | unrelated to the Software, the license granted hereunder will not terminate 26 | under section (i) of this paragraph due to such counterclaim. 
27 | 28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is 29 | necessarily infringed by the Software standing alone. 30 | 31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, 32 | or contributory infringement or inducement to infringe any patent, including a 33 | cross-claim or counterclaim. 34 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from setuptools import setup, find_packages, Extension 10 | from setuptools.command.build_py import build_py 11 | import sys 12 | from torch.utils.ffi import create_extension 13 | 14 | 15 | if sys.version_info < (3,): 16 | sys.exit('Sorry, Python3 is required for fairseq.') 17 | 18 | with open('README.md') as f: 19 | readme = f.read() 20 | 21 | with open('LICENSE') as f: 22 | license = f.read() 23 | 24 | with open('requirements.txt') as f: 25 | reqs = f.read() 26 | 27 | bleu = Extension( 28 | 'fairseq.libbleu', 29 | sources=[ 30 | 'fairseq/clib/libbleu/libbleu.cpp', 31 | 'fairseq/clib/libbleu/module.cpp', 32 | ], 33 | extra_compile_args=['-std=c++11'], 34 | ) 35 | 36 | conv_tbc = create_extension( 37 | 'fairseq.temporal_convolution_tbc', 38 | relative_to='fairseq', 39 | headers=['fairseq/clib/temporal_convolution_tbc/temporal_convolution_tbc.h'], 40 | sources=['fairseq/clib/temporal_convolution_tbc/temporal_convolution_tbc.cpp'], 41 | define_macros=[('WITH_CUDA', None)], 42 | with_cuda=True, 43 | extra_compile_args=['-std=c++11'], 44 | ) 45 | 46 | 47 | class build_py_hook(build_py): 48 | def run(self): 49 | conv_tbc.build() 50 | build_py.run(self) 51 | 52 | 53 | setup( 54 | name='fairseq', 55 | version='0.1.0', 56 | description='Facebook AI Research Sequence-to-Sequence Toolkit', 57 | long_description=readme, 58 | license=license, 59 | install_requires=reqs.strip().split('\n'), 60 | packages=find_packages(), 61 | ext_modules=[bleu], 62 | 63 | # build and install PyTorch extensions 64 | package_data={ 65 | 'fairseq': ['temporal_convolution_tbc/*.so'], 66 | }, 67 | include_package_data=True, 68 | cmdclass={ 69 | 'build_py': build_py_hook, 70 | }, 71 | ) 72 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/score.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import argparse 10 | import os 11 | import sys 12 | 13 | from fairseq import bleu, dictionary, tokenizer 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser(description='Command-line script for BLEU scoring.') 18 | parser.add_argument('-s', '--sys', default='-', help='system output') 19 | parser.add_argument('-r', '--ref', required=True, help='references') 20 | parser.add_argument('-o', '--order', default=4, metavar='N', 21 | type=int, help='consider ngrams up to this order') 22 | parser.add_argument('--ignore-case', action='store_true', 23 | help='case-insensitive scoring') 24 | 25 | args = parser.parse_args() 26 | print(args) 27 | 28 | assert args.sys == '-' or os.path.exists(args.sys), \ 29 | "System output file {} does not exist".format(args.sys) 30 | assert os.path.exists(args.ref), \ 31 | "Reference file {} does not exist".format(args.ref) 32 | 33 | dict = dictionary.Dictionary() 34 | 35 | def readlines(fd): 36 | for line in fd.readlines(): 37 | if args.ignore_case: 38 | yield line.lower() 39 | yield line 40 | 41 | def score(fdsys): 42 | with open(args.ref) as fdref: 43 | scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk()) 44 | for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)): 45 | sys_tok = tokenizer.Tokenizer.tokenize(sys_tok, dict) 46 | ref_tok = tokenizer.Tokenizer.tokenize(ref_tok, dict) 47 | scorer.add(ref_tok, sys_tok) 48 | print(scorer.result_string(args.order)) 49 | 50 | if args.sys == '-': 51 | score(sys.stdin) 52 | else: 53 | with open(args.sys, 'r') as f: 54 | score(f) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/token_offsets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 
16 | 17 | # file: token_offsets.py 18 | # convert character to token offsets, tokenize sentence 19 | # 20 | # usage: %prog < input > output 21 | # 22 | 23 | 24 | import sys 25 | import re 26 | import os 27 | from util import * 28 | from Tokenizer import PTBTokenizer 29 | 30 | 31 | assert len(sys.argv) == 1 32 | 33 | 34 | # main 35 | # loop over sentences cum annotation 36 | tokenizer = PTBTokenizer() 37 | sentence = '' 38 | for line in sys.stdin: 39 | line = line.decode("utf8").strip() 40 | if line.startswith("S "): 41 | sentence = line[2:] 42 | sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence)) 43 | print sentence_tok.encode("utf8") 44 | elif line.startswith("A "): 45 | fields = line[2:].split('|||') 46 | start_end = fields[0] 47 | char_start, char_end = [int(a) for a in start_end.split()] 48 | # calculate token offsets 49 | prefix = sentence[:char_start] 50 | tok_start = len(tokenizer.tokenize(prefix)) 51 | postfix = sentence[:char_end] 52 | tok_end = len(tokenizer.tokenize(postfix)) 53 | start_end = str(tok_start) + " " + str(tok_end) 54 | fields[0] = start_end 55 | # tokenize corrections, remove trailing whitespace 56 | corrections = [(' '.join(tokenizer.tokenize(c))).strip() for c in fields[2].split('||')] 57 | fields[2] = '||'.join(corrections) 58 | annotation = "A " + '|||'.join(fields) 59 | print annotation.encode("utf8") 60 | else: 61 | print line.encode("utf8") 62 | 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLPCC 2018 Shared Task 2: Grammatical Error Correction 2 | This is the code of our team (Zlbnlp) for the NLPCC 2018 Shared Task 2 on Grammatical Error Correction. 3 | 4 | ## Usage 5 | ### Prerequisites 6 | * Python 3.6 7 | * PyTorch 0.2.0 (use the following commands to install it from source) 8 | 9 | ```bash 10 | export CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" # [anaconda root directory] 11 | conda install numpy pyyaml mkl mkl-include setuptools cmake cffi typing 12 | conda install -c mingfeima mkldnn 13 | conda install -c pytorch magma-cuda80 14 | 15 | git clone https://github.com/pytorch/pytorch.git 16 | cd pytorch 17 | git reset --hard a03e5cb40938b6b3f3e6dbddf9cff8afdff72d1b 18 | git submodule update --init 19 | pip install -r requirements.txt 20 | python setup.py install 21 | ``` 22 | 23 | * [M2 scorer scripts](http://www.comp.nus.edu.sg/~nlp/sw/m2scorer.tar.gz) (used to compute the evaluation metric) 24 | * [libgrass-ui toolkit](http://www.icst.pku.edu.cn/lcwm/pkunlp/downloads/libgrass-ui.tar.gz) (word segmentation toolkit) 25 | * fairseq-py (depends on PyTorch; use the following commands to install it) 26 | 27 | ```bash 28 | cd CS2S+BPE+Emb/software/fairseq-py 29 | pip install -r requirements.txt 30 | python setup.py build 31 | python setup.py develop 32 | ``` 33 | 34 | 35 | ### Data 36 | The data and embeddings can be found in [Zlbnlp_data](https://pan.baidu.com/s/18JXm1KGmRu3Pe45jt2sYBQ). 37 | You need to manually split the whole dataset into two parts (one possible split is sketched after this list): 38 | * training set: contains 1,215,876 sentence pairs; the file paths are CS2S+BPE+Emb/data/train.tok.src and CS2S+BPE+Emb/data/train.tok.trg 39 | * development set: contains 5k sentence pairs; the file paths are CS2S+BPE+Emb/data/dev.tok.src and CS2S+BPE+Emb/data/dev.tok.trg 40 | * the test data is source.txt.jieba.seg, segmented with the jieba toolkit.
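The repository does not ship a splitting script; the sketch below shows one possible way to hold out the last 5,000 pairs for development, assuming the downloaded parallel corpus is available as `data.tok.src`/`data.tok.trg` (hypothetical names) and GNU `head`/`tail` are installed:

```bash
# hold out the last 5k sentence pairs as the development set (one possible split)
head -n -5000 data.tok.src > CS2S+BPE+Emb/data/train.tok.src
head -n -5000 data.tok.trg > CS2S+BPE+Emb/data/train.tok.trg
tail -n 5000 data.tok.src > CS2S+BPE+Emb/data/dev.tok.src
tail -n 5000 data.tok.trg > CS2S+BPE+Emb/data/dev.tok.trg
```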
41 | 42 | ### Data processing 43 | 44 | ```bash 45 | cd CS2S+BPE+Emb/training/ 46 | chmod +x preprocess.sh 47 | ./preprocess.sh 48 | ``` 49 | 50 | ### Training 51 | 52 | * Training command 53 | 54 | The command below is what we used to train an model on the NLPCC-2018 Task 2 dataset. 55 | ``` 56 | ./train_embed.sh 57 | ``` 58 | 59 | ### Decoding 60 | The following is the command used to generate outputs and F0.5 score: 61 | ``` 62 | cd CS2S+BPE+Emb/ 63 | ./run.sh ./data/source.txt.jieba.seg ./output/CS2S+BPE+Emb/ 0 ./training/models/mlconv_embed/model1 64 | cd libgrass-ui/ 65 | ./remove_spac_pkunlp_segment.sh 66 | ```` 67 | 68 | ## Contact 69 | If you have questions, suggestions and bug reports, please email renhongkai27@gmail.com. 70 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/criterions/label_smoothed_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import math 10 | import torch 11 | from torch.autograd.variable import Variable 12 | import torch.nn.functional as F 13 | 14 | from .fairseq_criterion import FairseqCriterion 15 | 16 | 17 | class LabelSmoothedCrossEntropy(torch.autograd.Function): 18 | 19 | @staticmethod 20 | def forward(ctx, input, target, eps, padding_idx, weights): 21 | grad_input = input.new(input.size()).zero_() 22 | target = target.view(target.size(0), 1) 23 | grad_input = grad_input.scatter_(grad_input.dim() - 1, target, eps - 1) 24 | 25 | norm = grad_input.size(-1) 26 | if weights is not None: 27 | norm = weights.sum() 28 | grad_input.mul(weights.view(1, weights.size(0)).expand_as(grad_input)) 29 | 30 | if padding_idx is not None: 31 | norm -= 1 if weights is None else weights[padding_idx] 32 | grad_input.select(grad_input.dim() - 1, padding_idx).fill_(0) 33 | 34 | grad_input = grad_input.add(-eps / norm) 35 | 36 | ctx.grad_input = grad_input 37 | return input.new([grad_input.view(-1).dot(input.view(-1))]) 38 | 39 | @staticmethod 40 | def backward(ctx, grad): 41 | return Variable(ctx.grad_input, volatile=True) * grad, None, None, None, None 42 | 43 | 44 | class LabelSmoothedCrossEntropyCriterion(FairseqCriterion): 45 | 46 | def __init__(self, eps, padding_idx=None, weights=None): 47 | super().__init__() 48 | self.eps = eps 49 | self.padding_idx = padding_idx 50 | self.weights = weights 51 | 52 | def prepare(self, samples): 53 | self.denom = sum(s['ntokens'] if s else 0 for s in samples) 54 | 55 | def forward(self, net_output, sample): 56 | input = F.log_softmax(net_output.view(-1, net_output.size(-1))) 57 | target = sample['target'].view(-1) 58 | loss = LabelSmoothedCrossEntropy.apply(input, target, self.eps, self.padding_idx, self.weights) 59 | return loss / self.denom 60 | 61 | def aggregate(self, losses): 62 | return sum(losses) / math.log(2) 63 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/segment-char-ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | from __future__ import unicode_literals, division 6 | 7 | import sys 8 | 
import codecs 9 | import argparse 10 | 11 | # hack for python2/3 compatibility 12 | from io import open 13 | argparse.open = open 14 | 15 | # python 2/3 compatibility 16 | if sys.version_info < (3, 0): 17 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 18 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 19 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 20 | 21 | def create_parser(): 22 | parser = argparse.ArgumentParser( 23 | formatter_class=argparse.RawDescriptionHelpFormatter, 24 | description="segment rare words into character n-grams") 25 | 26 | parser.add_argument( 27 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 28 | metavar='PATH', 29 | help="Input file (default: standard input).") 30 | parser.add_argument( 31 | '--vocab', type=argparse.FileType('r'), metavar='PATH', 32 | required=True, 33 | help="Vocabulary file.") 34 | parser.add_argument( 35 | '--shortlist', type=int, metavar='INT', default=0, 36 | help="do not segment INT most frequent words in vocabulary (default: '%(default)s')).") 37 | parser.add_argument( 38 | '-n', type=int, metavar='INT', default=2, 39 | help="segment rare words into character n-grams of size INT (default: '%(default)s')).") 40 | parser.add_argument( 41 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 42 | metavar='PATH', 43 | help="Output file (default: standard output)") 44 | parser.add_argument( 45 | '--separator', '-s', type=str, default='@@', metavar='STR', 46 | help="Separator between non-final subword units (default: '%(default)s'))") 47 | 48 | return parser 49 | 50 | 51 | if __name__ == '__main__': 52 | 53 | parser = create_parser() 54 | args = parser.parse_args() 55 | 56 | vocab = [line.split()[0] for line in args.vocab if len(line.split()) == 2] 57 | vocab = dict((y,x) for (x,y) in enumerate(vocab)) 58 | 59 | for line in args.input: 60 | for word in line.split(): 61 | if word not in vocab or vocab[word] > args.shortlist: 62 | i = 0 63 | while i*args.n < len(word): 64 | args.output.write(word[i*args.n:i*args.n+args.n]) 65 | i += 1 66 | if i*args.n < len(word): 67 | args.output.write(args.separator) 68 | args.output.write(' ') 69 | else: 70 | args.output.write(word + ' ') 71 | args.output.write('\n') 72 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/augmenter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import time 5 | import numpy as np 6 | import codecs 7 | import argparse 8 | 9 | # Initializing the logging module 10 | import logging 11 | import log_utils as L 12 | logger = logging.getLogger(__name__) 13 | 14 | from candidatesreader import NBestList 15 | from features import * 16 | 17 | def augment(features, source_path, input_nbest_path, output_nbest_path): 18 | ''' Function to augment the n-best list with a feature function 19 | :param feature: The feature function object 20 | :param source_path: Path to the original source sentences (maybe required for the feature function) 21 | :param input_nbest_path: Path to the n-best file 22 | :param output_nbest_path: Path to the output n-best file 23 | ''' 24 | # Initialize NBestList objects 25 | logger.info('Initializing Nbest lists') 26 | input_nbest = NBestList(input_nbest_path, mode='r') 27 | output_nbest = NBestList(output_nbest_path, mode='w') 28 | 29 | # Load the source sentences 30 | logger.info('Loading source sentences') 31 | src_sents = codecs.open(source_path, mode='r', 
encoding='UTF-8') 32 | 33 | # For each item in the n-best list, append the feature 34 | sent_count = 0 35 | for group, src_sent in zip(input_nbest, src_sents): 36 | candidate_count = 0 37 | for item in group: 38 | for feature in features: 39 | item.append_feature(feature.name, feature.get_score(src_sent, item.hyp, (sent_count, candidate_count))) 40 | output_nbest.write(item) 41 | candidate_count += 1 42 | sent_count += 1 43 | if (sent_count % 100 == 0): 44 | logger.info('Augmented ' + L.b_yellow(str(sent_count)) + ' sentences.') 45 | output_nbest.close() 46 | 47 | 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("-s", "--source-sentence-file", dest="source_path", required=True, help="Path to the file containing source sentences.") 50 | parser.add_argument("-i", "--input-nbest", dest="input_nbest_path", required=True, help="Input n-best file") 51 | parser.add_argument("-o", "--output-nbest", dest="output_nbest_path", required=True, help="Output n-best file") 52 | parser.add_argument("-f", "--feature", dest="feature_string", required=True, help="feature initializer, e.g. LM('LM0','/path/to/lm_file', normalize=True)") 53 | args = parser.parse_args() 54 | 55 | L.set_logger(os.path.abspath(os.path.dirname(args.output_nbest_path)),'augment_log.txt') 56 | L.print_args(args) 57 | features = eval('['+args.feature_string+']') 58 | augment(features, args.source_path, args.input_nbest_path, args.output_nbest_path) 59 | logger.info(L.green('Augmenting done.')) 60 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/run.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | ## This script runs the complete GEC system on any given test set 3 | 4 | set -e 5 | set -x 6 | 7 | source paths.sh 8 | 9 | if [ $# -ge 4 ]; then 10 | input_file=$1 11 | output_dir=$2 12 | device=$3 13 | model_path=$4 14 | if [ $# -eq 6 ]; then 15 | reranker_weights=$5 16 | reranker_feats=$6 17 | fi 18 | else 19 | echo "Please specify the paths to the input_file and output directory" 20 | echo "Usage: `basename $0` <input_file> <output_dir> <device> <model_path> [optional args: <reranker_weights> <reranker_feats>]" >&2 21 | fi 22 | 23 | if [ -d $model_path ]; then 24 | models=`ls $model_path/*pt | tr '\n' ' ' | sed "s| \([^$]\)| --path \1|g"` 25 | echo $models 26 | elif [ -f $model_path ]; then 27 | models=$model_path 28 | elif [ ! 
-e $model_path ]; then 29 | echo "Model path not found: $model_path" 30 | fi 31 | 32 | 33 | FAIRSEQPY=$SOFTWARE_DIR/fairseq-py 34 | NBEST_RERANKER=$SOFTWARE_DIR/nbest-reranker 35 | 36 | 37 | beam=12 38 | nbest=$beam 39 | threads=12 40 | 41 | mkdir -p $output_dir 42 | $SCRIPTS_DIR/apply_bpe.py -c $TRAINING_DIR/models/bpe_model/train.bpe.model < $input_file > $output_dir/input.bpe.txt 43 | 44 | # running fairseq on the test data 45 | CUDA_VISIBLE_DEVICES=$device python $FAIRSEQPY/generate.py --no-progress-bar --path $models --beam $beam --nbest $beam --interactive --workers $threads $TRAINING_DIR/processed/bin/ < $output_dir/input.bpe.txt > $output_dir/output.bpe.nbest.txt 46 | 47 | # getting best hypotheses 48 | cat $output_dir/output.bpe.nbest.txt | grep "^H" | python -c "import sys; x = sys.stdin.readlines(); x = ' '.join([ x[i] for i in range(len(x)) if(i%$nbest == 0) ]); print(x)" | cut -f3 > $output_dir/output.bpe.txt 49 | 50 | # debpe 51 | cat $output_dir/output.bpe.txt | sed 's|@@ ||g' | sed '$ d' > $output_dir/output.tok.txt 52 | 53 | # additionally re-rank outputs 54 | if [ $# -eq 6 ]; then 55 | if [ $reranker_feats == "eo" ]; then 56 | featstring="EditOps(name='EditOps0')" 57 | elif [ $reranker_feats == "eolm" ]; then 58 | featstring="EditOps(name='EditOps0'), LM('LM0', '$MODEL_DIR/lm/94Bcclm.trie', normalize=False), WordPenalty(name='WordPenalty0')" 59 | fi 60 | $SCRIPTS_DIR/nbest_reformat.py -i $output_dir/output.bpe.nbest.txt --debpe > $output_dir/output.tok.nbest.reformat.txt 61 | $NBEST_RERANKER/augmenter.py -s $input_file -i $output_dir/output.tok.nbest.reformat.txt -o $output_dir/output.tok.nbest.reformat.augmented.txt -f "$featstring" 62 | $NBEST_RERANKER/rerank.py -i $output_dir/output.tok.nbest.reformat.augmented.txt -w $reranker_weights -o $output_dir --clean-up 63 | mv $output_dir/output.tok.nbest.reformat.augmented.txt.reranked.1best $output_dir/output.reranked.tok.txt 64 | fi 65 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/data/prepare-iwslt14.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh 4 | 5 | echo 'Cloning Moses github repository (for tokenization scripts)...' 6 | git clone https://github.com/moses-smt/mosesdecoder.git 7 | 8 | SCRIPTS=mosesdecoder/scripts 9 | TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl 10 | LC=$SCRIPTS/tokenizer/lowercase.perl 11 | CLEAN=$SCRIPTS/training/clean-corpus-n.perl 12 | 13 | URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz" 14 | GZ=de-en.tgz 15 | 16 | if [ ! -d "$SCRIPTS" ]; then 17 | echo "Please set SCRIPTS variable correctly to point to Moses scripts." 18 | exit 19 | fi 20 | 21 | src=de 22 | tgt=en 23 | lang=de-en 24 | prep=iwslt14.tokenized.de-en 25 | tmp=$prep/tmp 26 | orig=orig 27 | 28 | mkdir -p $orig $tmp $prep 29 | 30 | echo "Downloading data from ${URL}..." 31 | cd $orig 32 | wget "$URL" 33 | 34 | if [ -f $GZ ]; then 35 | echo "Data successfully downloaded." 36 | else 37 | echo "Data not successfully downloaded." 38 | exit 39 | fi 40 | 41 | tar zxvf $GZ 42 | cd .. 43 | 44 | echo "pre-processing train data..." 
45 | for l in $src $tgt; do 46 | f=train.tags.$lang.$l 47 | tok=train.tags.$lang.tok.$l 48 | 49 | cat $orig/$lang/$f | \ 50 | grep -v '' | \ 51 | grep -v '' | \ 52 | grep -v '' | \ 53 | sed -e 's///g' | \ 54 | sed -e 's/<\/title>//g' | \ 55 | sed -e 's/<description>//g' | \ 56 | sed -e 's/<\/description>//g' | \ 57 | perl $TOKENIZER -threads 8 -l $l > $tmp/$tok 58 | echo "" 59 | done 60 | perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train.tags.$lang.clean 1 175 61 | for l in $src $tgt; do 62 | perl $LC < $tmp/train.tags.$lang.clean.$l > $tmp/train.tags.$lang.$l 63 | done 64 | 65 | echo "pre-processing valid/test data..." 66 | for l in $src $tgt; do 67 | for o in `ls $orig/$lang/IWSLT14.TED*.$l.xml`; do 68 | fname=${o##*/} 69 | f=$tmp/${fname%.*} 70 | echo $o $f 71 | grep '<seg id' $o | \ 72 | sed -e 's/<seg id="[0-9]*">\s*//g' | \ 73 | sed -e 's/\s*<\/seg>\s*//g' | \ 74 | sed -e "s/\’/\'/g" | \ 75 | perl $TOKENIZER -threads 8 -l $l | \ 76 | perl $LC > $f 77 | echo "" 78 | done 79 | done 80 | 81 | 82 | echo "creating train, valid, test..." 83 | for l in $src $tgt; do 84 | awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/valid.$l 85 | awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/train.$l 86 | 87 | cat $tmp/IWSLT14.TED.dev2010.de-en.$l \ 88 | $tmp/IWSLT14.TEDX.dev2012.de-en.$l \ 89 | $tmp/IWSLT14.TED.tst2010.de-en.$l \ 90 | $tmp/IWSLT14.TED.tst2011.de-en.$l \ 91 | $tmp/IWSLT14.TED.tst2012.de-en.$l \ 92 | > $prep/test.$l 93 | done 94 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import re 10 | import torch 11 | 12 | from fairseq import dictionary 13 | 14 | 15 | def tokenize_line(line): 16 | line = re.sub(r"\t", "", line) 17 | line = re.sub(r"^\s+", "", line) 18 | line = re.sub(r"\s+$", "", line) 19 | line = re.sub(r"\s+", " ", line) 20 | return line.split() 21 | 22 | 23 | class Tokenizer: 24 | 25 | @staticmethod 26 | def build_dictionary(filename, tokenize=tokenize_line): 27 | dict = dictionary.Dictionary() 28 | Tokenizer.add_file_to_dictionary(filename, dict, tokenize) 29 | dict.finalize() 30 | return dict 31 | 32 | @staticmethod 33 | def add_file_to_dictionary(filename, dict, tokenize): 34 | with open(filename, 'r') as f: 35 | for line in f.readlines(): 36 | for word in tokenize(line): 37 | dict.add_symbol(word) 38 | dict.add_symbol(dict.eos_word) 39 | 40 | @staticmethod 41 | def binarize(filename, dict, consumer, tokenize=tokenize_line): 42 | nseq, ntok, nunk = 0, 0, 0 43 | replaced = {} 44 | with open(filename, 'r') as f: 45 | for line in f.readlines(): 46 | words = tokenize(line) 47 | nwords = len(words) 48 | ids = torch.IntTensor(nwords + 1) 49 | nseq = nseq + 1 50 | for i in range(0, len(words)): 51 | word = words[i] 52 | idx = dict.index(word) 53 | if idx == dict.unk_index and word != dict.unk_word: 54 | nunk = nunk + 1 55 | if word in replaced: 56 | replaced[word] = replaced[word] + 1 57 | else: 58 | replaced[word] = 1 59 | ids[i] = idx 60 | 61 | ids[nwords] = dict.eos_index 62 | consumer(ids) 63 | ntok = ntok + len(ids) 64 | return {'nseq': nseq, 'nunk': nunk, 'ntok': ntok, 'replaced': len(replaced)} 65 | 66 | @staticmethod 67 | def tokenize(line, dict, tokenize=tokenize_line, add_if_not_exist=True): 68 | words = tokenize(line) 69 | nwords = len(words) 70 | ids = torch.IntTensor(nwords + 1) 71 | for i in range(0, len(words)): 72 | if add_if_not_exist: 73 | ids[i] = dict.add_symbol(words[i]) 74 | else: 75 | ids[i] = dict.index(words[i]) 76 | ids[nwords] = dict.eos_index 77 | return ids 78 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/rerank.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import imp 6 | import shutil 7 | 8 | import argparse 9 | 10 | # Initializing the logging module 11 | import logging 12 | import log_utils as L 13 | import configreader 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-i", "--input-nbest", dest="input_nbest", required=True, help="Input n-best file") 20 | parser.add_argument("-w", "--weights", dest="weights", required=True, help="Input weights file") 21 | parser.add_argument("-o", "--output-dir", dest="out_dir", required=True, help="Output directory") 22 | parser.add_argument("-c", "--clean-up", dest="clean_up", action='store_true', help="Temporary files will be removed") 23 | parser.add_argument("-q", "--quiet", dest="quiet", action='store_true', help="Nothing will be printed in STDERR") 24 | args = parser.parse_args() 25 | 26 | 27 | from candidatesreader import NBestList 28 | import codecs 29 | import numpy as np 30 | 31 | if not os.path.exists(args.out_dir): 32 | os.makedirs(args.out_dir) 33 | L.set_logger(os.path.abspath(args.out_dir),'train_log.txt') 34 | L.print_args(args) 35 | 36 | 37 | output_nbest_path = args.out_dir + '/augmented.nbest' 38 | shutil.copy(args.input_nbest, output_nbest_path) 39 | 40 | with open(args.weights, 'r') as input_weights: 41 | lines 
= input_weights.readlines() 42 | if len(lines) > 1: 43 | L.warning("Weights file has more than one line. I'll read the 1st and ignore the rest.") 44 | weights = np.asarray(lines[0].strip().split(" "), dtype=float) 45 | 46 | prefix = os.path.basename(args.input_nbest) 47 | input_aug_nbest = NBestList(output_nbest_path, mode='r') 48 | output_nbest = NBestList(args.out_dir + '/' + prefix + '.reranked.nbest', mode='w') 49 | output_1best = codecs.open(args.out_dir + '/' + prefix + '.reranked.1best', mode='w', encoding='UTF-8') 50 | 51 | def is_number(s): 52 | try: 53 | float(s) 54 | return True 55 | except ValueError: 56 | return False 57 | 58 | counter = 0 59 | for group in input_aug_nbest: 60 | index = 0 61 | scores = dict() 62 | for item in group: 63 | features = np.asarray([x for x in item.features.split() if is_number(x)], dtype=float) 64 | try: 65 | scores[index] = np.dot(features, weights) 66 | except ValueError: 67 | logger.error('Number of features in the nbest and the weights file are not the same') 68 | index += 1 69 | sorted_indices = sorted(scores, key=scores.get, reverse=True) 70 | for idx in sorted_indices: 71 | output_nbest.write(group[idx]) 72 | output_1best.write(group[sorted_indices[0]].hyp + "\n") 73 | counter += 1 74 | if counter % 100 == 0: 75 | logger.info(L.b_yellow(str(counter)) + " groups processed") 76 | logger.info("%i groups processed" % (counter)) 77 | logger.info("Finished processing %i groups" % (counter)) 78 | logger.info(L.green('Reranking completed.')) 79 | output_nbest.close() 80 | output_1best.close() 81 | 82 | if args.clean_up: 83 | os.remove(output_nbest_path) 84 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/clib/libbleu/libbleu.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include <map> 10 | #include <array> 11 | #include <cstring> 12 | #include <cstdio> 13 | 14 | typedef struct 15 | { 16 | size_t reflen; 17 | size_t predlen; 18 | size_t match1; 19 | size_t count1; 20 | size_t match2; 21 | size_t count2; 22 | size_t match3; 23 | size_t count3; 24 | size_t match4; 25 | size_t count4; 26 | } bleu_stat; 27 | 28 | // left trim (remove pad) 29 | void bleu_ltrim(size_t* len, int** sent, int pad) { 30 | size_t start = 0; 31 | while(start < *len) { 32 | if (*(*sent + start) != pad) { break; } 33 | start++; 34 | } 35 | *sent += start; 36 | *len -= start; 37 | } 38 | 39 | // right trim remove (eos) 40 | void bleu_rtrim(size_t* len, int** sent, int pad, int eos) { 41 | size_t end = *len - 1; 42 | while (end > 0) { 43 | if (*(*sent + end) != eos && *(*sent + end) != pad) { break; } 44 | end--; 45 | } 46 | *len = end + 1; 47 | } 48 | 49 | // left and right trim 50 | void bleu_trim(size_t* len, int** sent, int pad, int eos) { 51 | bleu_ltrim(len, sent, pad); 52 | bleu_rtrim(len, sent, pad, eos); 53 | } 54 | 55 | size_t bleu_hash(int len, int* data) { 56 | size_t h = 14695981039346656037ul; 57 | size_t prime = 0x100000001b3; 58 | char* b = (char*) data; 59 | size_t blen = sizeof(int) * len; 60 | 61 | while (blen-- > 0) { 62 | h ^= *b++; 63 | h *= prime; 64 | } 65 | 66 | return h; 67 | } 68 | 69 | void bleu_addngram( 70 | size_t *ntotal, size_t *nmatch, size_t n, 71 | size_t reflen, int* ref, size_t predlen, int* pred) { 72 | 73 | if (predlen < n) { return; } 74 | 75 | predlen = predlen - n + 1; 76 | (*ntotal) += predlen; 77 | 78 | if (reflen < n) { return; } 79 | 80 | reflen = reflen - n + 1; 81 | 82 | std::map<size_t, size_t> count; 83 | while (predlen > 0) { 84 | size_t w = bleu_hash(n, pred++); 85 | count[w]++; 86 | predlen--; 87 | } 88 | 89 | while (reflen > 0) { 90 | size_t w = bleu_hash(n, ref++); 91 | if (count[w] > 0) { 92 | (*nmatch)++; 93 | count[w] -=1; 94 | } 95 | reflen--; 96 | } 97 | } 98 | 99 | extern "C" { 100 | 101 | void bleu_zero_init(bleu_stat* stat) { 102 | std::memset(stat, 0, sizeof(bleu_stat)); 103 | } 104 | 105 | void bleu_one_init(bleu_stat* stat) { 106 | bleu_zero_init(stat); 107 | stat->count1 = 1; 108 | stat->count2 = 1; 109 | stat->count3 = 1; 110 | stat->count4 = 1; 111 | stat->match1 = 1; 112 | stat->match2 = 1; 113 | stat->match3 = 1; 114 | stat->match4 = 1; 115 | } 116 | 117 | void bleu_add( 118 | bleu_stat* stat, 119 | size_t reflen, int* ref, size_t predlen, int* pred, int pad, int eos) { 120 | 121 | bleu_trim(&reflen, &ref, pad, eos); 122 | bleu_trim(&predlen, &pred, pad, eos); 123 | stat->reflen += reflen; 124 | stat->predlen += predlen; 125 | 126 | bleu_addngram(&stat->count1, &stat->match1, 1, reflen, ref, predlen, pred); 127 | bleu_addngram(&stat->count2, &stat->match2, 2, reflen, ref, predlen, pred); 128 | bleu_addngram(&stat->count3, &stat->match3, 3, reflen, ref, predlen, pred); 129 | bleu_addngram(&stat->count4, &stat->match4, 4, reflen, ref, predlen, pred); 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/modules/linearized_convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. 
An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import torch 10 | import torch.nn.functional as F 11 | from .conv_tbc import ConvTBC 12 | 13 | 14 | class LinearizedConvolution(ConvTBC): 15 | """An optimized version of nn.Conv1d. 16 | 17 | This module replaces convolutions with linear layers as appropriate 18 | and supports optimizations for incremental inference. 19 | """ 20 | 21 | def __init__(self, in_channels, out_channels, kernel_size, **kwargs): 22 | super().__init__(in_channels, out_channels, kernel_size, **kwargs) 23 | self.clear_buffer() 24 | 25 | self._linearized_weight = None 26 | self.register_backward_hook(self._clear_linearized_weight) 27 | 28 | def remove_future_timesteps(self, x): 29 | """Remove future time steps created by padding.""" 30 | if self.kernel_size[0] > 1 and self.padding[0] > 0: 31 | x = x[:-self.padding[0], :, :] 32 | return x 33 | 34 | def incremental_forward(self, input): 35 | """Forward convolution one time step at a time. 36 | 37 | This function maintains an internal state to buffer signal and 38 | accepts a single frame as input. If the input order changes 39 | between time steps, call reorder_buffer. To apply to fresh 40 | inputs, call clear_buffer. 41 | """ 42 | if self.training: 43 | raise RuntimeError('LinearizedConvolution only supports inference') 44 | 45 | # run forward pre hooks (e.g., weight norm) 46 | for hook in self._forward_pre_hooks.values(): 47 | hook(self, input) 48 | 49 | # reshape weight 50 | weight = self._get_linearized_weight() 51 | kw = self.kernel_size[0] 52 | 53 | bsz = input.size(0) # input: bsz x len x dim 54 | if kw > 1: 55 | input = input.data 56 | if self.input_buffer is None: 57 | self.input_buffer = input.new(bsz, kw, input.size(2)) 58 | self.input_buffer.zero_() 59 | else: 60 | # shift buffer 61 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone() 62 | # append next input 63 | self.input_buffer[:, -1, :] = input[:, -1, :] 64 | input = torch.autograd.Variable(self.input_buffer, volatile=True) 65 | output = F.linear(input.view(bsz, -1), weight, self.bias) 66 | return output.view(bsz, 1, -1) 67 | 68 | def clear_buffer(self): 69 | self.input_buffer = None 70 | 71 | def reorder_buffer(self, new_order): 72 | if self.input_buffer is not None: 73 | self.input_buffer = self.input_buffer.index_select(0, new_order) 74 | 75 | def _get_linearized_weight(self): 76 | if self._linearized_weight is None: 77 | kw = self.kernel_size[0] 78 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() 79 | assert weight.size() == (self.out_channels, kw, self.in_channels) 80 | self._linearized_weight = weight.view(self.out_channels, -1) 81 | return self._linearized_weight 82 | 83 | def _clear_linearized_weight(self, *args): 84 | self._linearized_weight = None 85 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
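#
# Usage sketch for the Scorer class defined below, mirroring how score.py uses
# it ('dict' is a fairseq dictionary.Dictionary; ref_tok and sys_tok are
# torch.IntTensor sequences produced by tokenizer.Tokenizer.tokenize):
#
#   scorer = Scorer(dict.pad(), dict.eos(), dict.unk())
#   scorer.add(ref_tok, sys_tok)          # call once per sentence pair
#   print(scorer.result_string(order=4))  # corpus-level BLEU summary
#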
7 | # 8 | 9 | import ctypes 10 | import math 11 | import torch 12 | 13 | try: 14 | from fairseq import libbleu 15 | except ImportError as e: 16 | import sys 17 | sys.stderr.write('ERROR: missing libbleu.so. run `python setup.py install`\n') 18 | raise e 19 | 20 | 21 | C = ctypes.cdll.LoadLibrary(libbleu.__file__) 22 | 23 | 24 | class BleuStat(ctypes.Structure): 25 | _fields_ = [ 26 | ('reflen', ctypes.c_size_t), 27 | ('predlen', ctypes.c_size_t), 28 | ('match1', ctypes.c_size_t), 29 | ('count1', ctypes.c_size_t), 30 | ('match2', ctypes.c_size_t), 31 | ('count2', ctypes.c_size_t), 32 | ('match3', ctypes.c_size_t), 33 | ('count3', ctypes.c_size_t), 34 | ('match4', ctypes.c_size_t), 35 | ('count4', ctypes.c_size_t), 36 | ] 37 | 38 | 39 | class Scorer(object): 40 | def __init__(self, pad, eos, unk): 41 | self.stat = BleuStat() 42 | self.pad = pad 43 | self.eos = eos 44 | self.unk = unk 45 | self.reset() 46 | 47 | def reset(self, one_init=False): 48 | if one_init: 49 | C.bleu_one_init(ctypes.byref(self.stat)) 50 | else: 51 | C.bleu_zero_init(ctypes.byref(self.stat)) 52 | 53 | def add(self, ref, pred): 54 | if not isinstance(ref, torch.IntTensor): 55 | raise TypeError('ref must be a torch.IntTensor (got {})' 56 | .format(type(ref))) 57 | if not isinstance(pred, torch.IntTensor): 58 | raise TypeError('pred must be a torch.IntTensor(got {})' 59 | .format(type(pred))) 60 | 61 | assert self.unk > 0, 'unknown token index must be >0' 62 | rref = ref.clone() 63 | rref.apply_(lambda x: x if x != self.unk else -x) 64 | 65 | rref = rref.contiguous().view(-1) 66 | pred = pred.contiguous().view(-1) 67 | 68 | C.bleu_add( 69 | ctypes.byref(self.stat), 70 | ctypes.c_size_t(rref.size(0)), 71 | ctypes.c_void_p(rref.data_ptr()), 72 | ctypes.c_size_t(pred.size(0)), 73 | ctypes.c_void_p(pred.data_ptr()), 74 | ctypes.c_int(self.pad), 75 | ctypes.c_int(self.eos)) 76 | 77 | def score(self, order=4): 78 | psum = sum(math.log(p) if p > 0 else float('-Inf') 79 | for p in self.precision()[:order]) 80 | return self.brevity() * math.exp(psum / order) * 100 81 | 82 | def precision(self): 83 | def ratio(a, b): 84 | return a / b if b > 0 else 0 85 | 86 | return [ 87 | ratio(self.stat.match1, self.stat.count1), 88 | ratio(self.stat.match2, self.stat.count2), 89 | ratio(self.stat.match3, self.stat.count3), 90 | ratio(self.stat.match4, self.stat.count4), 91 | ] 92 | 93 | def brevity(self): 94 | r = self.stat.reflen / self.stat.predlen 95 | return min(1, math.exp(1 - r)) 96 | 97 | def result_string(self, order=4): 98 | assert order <= 4, "BLEU scores for order > 4 aren't supported" 99 | fmt = 'BLEU{} = {:2.2f}, {:2.1f}' 100 | for i in range(1, order): 101 | fmt += '/{:2.1f}' 102 | fmt += ' (BP={:.3f}, ratio={:.3f}, syslen={}, reflen={})' 103 | bleup = [p * 100 for p in self.precision()[:order]] 104 | return fmt.format(order, self.score(order=order), *bleup, 105 | self.brevity(), self.stat.reflen/self.stat.predlen, 106 | self.stat.predlen, self.stat.reflen) 107 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/modules/conv_tbc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
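#
# Usage sketch for the ConvTBC module defined below (a sketch only: the sizes
# are made up, the layout is time x batch x channel as described in the class
# docstring, and running it requires the compiled temporal_convolution_tbc
# extension):
#
#   conv = ConvTBC(in_channels=256, out_channels=512, kernel_size=3, padding=1)
#   x = Variable(torch.randn(20, 8, 256))   # (time, batch, in_channels)
#   y = conv(x)                             # -> (time, batch, out_channels)
#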
7 | # 8 | 9 | import torch 10 | from torch.autograd import Variable, Function 11 | from torch.nn.modules.utils import _single 12 | 13 | try: 14 | from fairseq import temporal_convolution_tbc 15 | except ImportError as e: 16 | import sys 17 | sys.stderr.write('ERROR: missing temporal_convolution_tbc, run `python setup.py install`\n') 18 | raise e 19 | 20 | 21 | class ConvTBC(torch.nn.Module): 22 | """1D convolution over an input of shape (time x batch x channel) 23 | 24 | The implementation uses gemm to perform the convolution. This implementation 25 | is faster than cuDNN for small kernel sizes. 26 | """ 27 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, 28 | padding=0): 29 | super(ConvTBC, self).__init__() 30 | self.in_channels = in_channels 31 | self.out_channels = out_channels 32 | self.kernel_size = _single(kernel_size) 33 | self.stride = _single(stride) 34 | self.padding = _single(padding) 35 | assert self.stride == (1,) 36 | 37 | self.weight = torch.nn.Parameter(torch.Tensor( 38 | self.kernel_size[0], in_channels, out_channels)) 39 | self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) 40 | 41 | def forward(self, input): 42 | return ConvTBCFunction.apply( 43 | input.contiguous(), self.weight, self.bias, self.padding[0]) 44 | 45 | def __repr__(self): 46 | s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}' 47 | ', padding={padding}') 48 | if self.bias is None: 49 | s += ', bias=False' 50 | s += ')' 51 | return s.format(name=self.__class__.__name__, **self.__dict__) 52 | 53 | 54 | class ConvTBCFunction(Function): 55 | @staticmethod 56 | def forward(ctx, input, weight, bias, pad): 57 | input_size = input.size() 58 | weight_size = weight.size() 59 | kernel_size = weight_size[0] 60 | 61 | output = input.new( 62 | input_size[0] - kernel_size + 1 + pad * 2, 63 | input_size[1], 64 | weight_size[2]) 65 | 66 | ctx.input_size = input_size 67 | ctx.weight_size = weight_size 68 | ctx.save_for_backward(input, weight) 69 | temporal_convolution_tbc.TemporalConvolutionTBC_forward( 70 | input.type().encode('utf-8'), 71 | input, 72 | output, 73 | weight, 74 | bias) 75 | 76 | return output 77 | 78 | @staticmethod 79 | def backward(ctx, grad_output): 80 | input, weight = ctx.saved_tensors 81 | 82 | grad_output = grad_output.data.contiguous() 83 | grad_input = grad_output.new(ctx.input_size).zero_() 84 | grad_weight = grad_output.new(ctx.weight_size).zero_() 85 | grad_bias = grad_output.new(ctx.weight_size[2]) 86 | 87 | temporal_convolution_tbc.TemporalConvolutionTBC_backward( 88 | input.type().encode('utf-8'), 89 | grad_output, 90 | grad_input, 91 | grad_weight, 92 | grad_bias, 93 | input, 94 | weight) 95 | 96 | grad_input = Variable(grad_input, volatile=True) 97 | grad_weight = Variable(grad_weight, volatile=True) 98 | grad_bias = Variable(grad_bias, volatile=True) 99 | 100 | return grad_input, grad_weight, grad_bias, None 101 | 102 | 103 | def conv_tbc(input, weight, bias=None, stride=1, padding=0): 104 | return ConvTBCFunction.apply( 105 | input.contiguous(), weight, bias, padding[0]) 106 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/scripts/convert_model.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (c) 2017-present, Facebook, Inc. 2 | -- All rights reserved. 3 | -- 4 | -- This source code is licensed under the license found in the LICENSE file in 5 | -- the root directory of this source tree. 
An additional grant of patent rights 6 | -- can be found in the PATENTS file in the same directory. 7 | -- 8 | -- Usage: convert_model.lua <model_epoch1.th7> 9 | require 'torch' 10 | local fairseq = require 'fairseq' 11 | 12 | model = torch.load(arg[1]) 13 | 14 | function find_weight_norm(container, module) 15 | for _, wn in ipairs(container:listModules()) do 16 | if torch.type(wn) == 'nn.WeightNorm' and wn.modules[1] == module then 17 | return wn 18 | end 19 | end 20 | end 21 | 22 | function push_state(dict, key, module) 23 | if torch.type(module) == 'nn.Linear' then 24 | local wn = find_weight_norm(model.module, module) 25 | assert(wn) 26 | dict[key .. '.weight_v'] = wn.v:float() 27 | dict[key .. '.weight_g'] = wn.g:float() 28 | elseif torch.type(module) == 'nn.TemporalConvolutionTBC' then 29 | local wn = find_weight_norm(model.module, module) 30 | assert(wn) 31 | local v = wn.v:float():view(wn.viewOut):transpose(2, 3) 32 | dict[key .. '.weight_v'] = v 33 | dict[key .. '.weight_g'] = wn.g:float():view(module.weight:size(3), 1, 1) 34 | else 35 | dict[key .. '.weight'] = module.weight:float() 36 | end 37 | if module.bias then 38 | dict[key .. '.bias'] = module.bias:float() 39 | end 40 | end 41 | 42 | encoder_dict = {} 43 | decoder_dict = {} 44 | combined_dict = {} 45 | 46 | function encoder_state(encoder) 47 | luts = encoder:findModules('nn.LookupTable') 48 | push_state(encoder_dict, 'embed_tokens', luts[1]) 49 | push_state(encoder_dict, 'embed_positions', luts[2]) 50 | 51 | fcs = encoder:findModules('nn.Linear') 52 | assert(#fcs >= 2) 53 | local nInputPlane = fcs[1].weight:size(1) 54 | push_state(encoder_dict, 'fc1', table.remove(fcs, 1)) 55 | push_state(encoder_dict, 'fc2', table.remove(fcs, #fcs)) 56 | 57 | for i, module in ipairs(encoder:findModules('nn.TemporalConvolutionTBC')) do 58 | push_state(encoder_dict, 'convolutions.' .. tostring(i - 1), module) 59 | if nInputPlane ~= module.weight:size(3) / 2 then 60 | push_state(encoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1)) 61 | end 62 | nInputPlane = module.weight:size(3) / 2 63 | end 64 | assert(#fcs == 0) 65 | end 66 | 67 | function decoder_state(decoder) 68 | luts = decoder:findModules('nn.LookupTable') 69 | push_state(decoder_dict, 'embed_tokens', luts[1]) 70 | push_state(decoder_dict, 'embed_positions', luts[2]) 71 | 72 | fcs = decoder:findModules('nn.Linear') 73 | local nInputPlane = fcs[1].weight:size(1) 74 | push_state(decoder_dict, 'fc1', table.remove(fcs, 1)) 75 | push_state(decoder_dict, 'fc2', fcs[#fcs - 1]) 76 | push_state(decoder_dict, 'fc3', fcs[#fcs]) 77 | 78 | table.remove(fcs, #fcs) 79 | table.remove(fcs, #fcs) 80 | 81 | for i, module in ipairs(decoder:findModules('nn.TemporalConvolutionTBC')) do 82 | if nInputPlane ~= module.weight:size(3) / 2 then 83 | push_state(decoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1)) 84 | end 85 | nInputPlane = module.weight:size(3) / 2 86 | 87 | local prefix = 'attention.' .. tostring(i - 1) 88 | push_state(decoder_dict, prefix .. '.in_projection', table.remove(fcs, 1)) 89 | push_state(decoder_dict, prefix .. '.out_projection', table.remove(fcs, 1)) 90 | push_state(decoder_dict, 'convolutions.' .. tostring(i - 1), module) 91 | end 92 | assert(#fcs == 0) 93 | end 94 | 95 | 96 | _encoder = model.module.modules[2] 97 | _decoder = model.module.modules[3] 98 | 99 | encoder_state(_encoder) 100 | decoder_state(_decoder) 101 | 102 | for k, v in pairs(encoder_dict) do 103 | combined_dict['encoder.' .. 
k] = v 104 | end 105 | for k, v in pairs(decoder_dict) do 106 | combined_dict['decoder.' .. k] = v 107 | end 108 | 109 | 110 | torch.save('state_dict.t7', combined_dict) 111 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/dictionary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import math 10 | import torch 11 | 12 | 13 | class Dictionary(object): 14 | """A mapping from symbols to consecutive integers""" 15 | def __init__(self, pad='<pad>', eos='</s>', unk='<unk>'): 16 | self.unk_word, self.pad_word, self.eos_word = unk, pad, eos 17 | self.symbols = [] 18 | self.count = [] 19 | self.indices = {} 20 | self.add_symbol('<Lua heritage>') 21 | self.pad_index = self.add_symbol(pad) 22 | self.eos_index = self.add_symbol(eos) 23 | self.unk_index = self.add_symbol(unk) 24 | self.nspecial = len(self.symbols) 25 | 26 | def __getitem__(self, idx): 27 | if idx < len(self.symbols): 28 | return self.symbols[idx] 29 | return self.unk_word 30 | 31 | def __len__(self): 32 | """Returns the number of symbols in the dictionary""" 33 | return len(self.symbols) 34 | 35 | def index(self, sym): 36 | """Returns the index of the specified symbol""" 37 | if sym in self.indices: 38 | return self.indices[sym] 39 | return self.unk_index 40 | 41 | def string(self, tensor): 42 | if torch.is_tensor(tensor) and tensor.dim() == 2: 43 | sentences = [self.string(line) for line in tensor] 44 | return '\n'.join(sentences) 45 | 46 | eos = self.eos() 47 | return ' '.join([self[i] for i in tensor if i != eos]) 48 | 49 | def add_symbol(self, word, n=1): 50 | """Adds a word to the dictionary""" 51 | if word in self.indices: 52 | idx = self.indices[word] 53 | self.count[idx] = self.count[idx] + n 54 | return idx 55 | else: 56 | idx = len(self.symbols) 57 | self.indices[word] = idx 58 | self.symbols.append(word) 59 | self.count.append(n) 60 | return idx 61 | 62 | def finalize(self): 63 | """Sort symbols by frequency in descending order, ignoring special ones.""" 64 | self.count, self.symbols = zip( 65 | *sorted(zip(self.count, self.symbols), 66 | key=(lambda x: math.inf if self.indices[x[1]] < self.nspecial else x[0]), 67 | reverse=True) 68 | ) 69 | 70 | def pad(self): 71 | """Helper to get index of pad symbol""" 72 | return self.pad_index 73 | 74 | def eos(self): 75 | """Helper to get index of end-of-sentence symbol""" 76 | return self.eos_index 77 | 78 | def unk(self): 79 | """Helper to get index of unk symbol""" 80 | return self.unk_index 81 | 82 | @staticmethod 83 | def load(f): 84 | """Loads the dictionary from a text file with the format: 85 | 86 | ``` 87 | <symbol0> <count0> 88 | <symbol1> <count1> 89 | ... 
90 | ``` 91 | """ 92 | 93 | if isinstance(f, str): 94 | with open(f, 'r') as fd: 95 | return Dictionary.load(fd) 96 | 97 | d = Dictionary() 98 | for line in f.readlines(): 99 | idx = line.rfind(' ') 100 | word = line[:idx] 101 | count = int(line[idx+1:]) 102 | d.indices[word] = len(d.symbols) 103 | d.symbols.append(word) 104 | d.count.append(count) 105 | return d 106 | 107 | def save(self, f, threshold=3, nwords=-1): 108 | """Stores dictionary into a text file""" 109 | if isinstance(f, str): 110 | with open(f, 'w') as fd: 111 | return self.save(fd, threshold, nwords) 112 | cnt = 0 113 | for i, t in enumerate(zip(self.symbols, self.count)): 114 | if i >= self.nspecial and t[1] >= threshold \ 115 | and (nwords < 0 or cnt < nwords): 116 | print('{} {}'.format(t[0], t[1]), file=f) 117 | cnt += 1 118 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/scripts/build_sym_alignment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | """ 10 | Use this script in order to build symmetric alignments for your translation 11 | dataset. 12 | This script depends on fast_align and mosesdecoder tools. You will need to 13 | build those before running the script. 14 | fast_align: 15 | github: http://github.com/clab/fast_align 16 | instructions: follow the instructions in README.md 17 | mosesdecoder: 18 | github: http://github.com/moses-smt/mosesdecoder 19 | instructions: http://www.statmt.org/moses/?n=Development.GetStarted 20 | The script produces the following files under --output_dir: 21 | text.joined - concatenation of lines from the source_file and the 22 | target_file. 23 | align.forward - forward pass of fast_align. 24 | align.backward - backward pass of fast_align. 25 | aligned.sym_heuristic - symmetrized alignment. 
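Example invocation (a sketch only; every path below is a placeholder for your
own fast_align build, mosesdecoder checkout and parallel data):
    python build_sym_alignment.py \
        --fast_align_dir /path/to/fast_align/build \
        --mosesdecoder_dir /path/to/mosesdecoder \
        --source_file corpus.src --target_file corpus.trg \
        --output_dir alignments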
26 | """ 27 | 28 | import argparse 29 | import os 30 | from itertools import zip_longest 31 | 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser(description='symmetric alignment builer') 35 | parser.add_argument('--fast_align_dir', 36 | help='path to fast_align build directory') 37 | parser.add_argument('--mosesdecoder_dir', 38 | help='path to mosesdecoder root directory') 39 | parser.add_argument('--sym_heuristic', 40 | help='heuristic to use for symmetrization', 41 | default='grow-diag-final-and') 42 | parser.add_argument('--source_file', 43 | help='path to a file with sentences ' 44 | 'in the source language') 45 | parser.add_argument('--target_file', 46 | help='path to a file with sentences ' 47 | 'in the target language') 48 | parser.add_argument('--output_dir', 49 | help='output directory') 50 | args = parser.parse_args() 51 | 52 | fast_align_bin = os.path.join(args.fast_align_dir, 'fast_align') 53 | symal_bin = os.path.join(args.mosesdecoder_dir, 'bin', 'symal') 54 | sym_fast_align_bin = os.path.join( 55 | args.mosesdecoder_dir, 'scripts', 'ems', 56 | 'support', 'symmetrize-fast-align.perl') 57 | 58 | # create joined file 59 | joined_file = os.path.join(args.output_dir, 'text.joined') 60 | with open(args.source_file, 'r') as src, open(args.target_file, 'r') as tgt: 61 | with open(joined_file, 'w') as joined: 62 | for s, t in zip_longest(src, tgt): 63 | print('{} ||| {}'.format(s.strip(), t.strip()), file=joined) 64 | 65 | bwd_align_file = os.path.join(args.output_dir, 'align.backward') 66 | 67 | # run forward alignment 68 | fwd_align_file = os.path.join(args.output_dir, 'align.forward') 69 | fwd_fast_align_cmd = '{FASTALIGN} -i {JOINED} -d -o -v > {FWD}'.format( 70 | FASTALIGN=fast_align_bin, 71 | JOINED=joined_file, 72 | FWD=fwd_align_file) 73 | assert os.system(fwd_fast_align_cmd) == 0 74 | 75 | # run backward alignment 76 | bwd_align_file = os.path.join(args.output_dir, 'align.backward') 77 | bwd_fast_align_cmd = '{FASTALIGN} -i {JOINED} -d -o -v -r > {BWD}'.format( 78 | FASTALIGN=fast_align_bin, 79 | JOINED=joined_file, 80 | BWD=bwd_align_file) 81 | assert os.system(bwd_fast_align_cmd) == 0 82 | 83 | # run symmetrization 84 | sym_out_file = os.path.join(args.output_dir, 'aligned') 85 | sym_cmd = '{SYMFASTALIGN} {FWD} {BWD} {SRC} {TGT} {OUT} {HEURISTIC} {SYMAL}'.format( 86 | SYMFASTALIGN=sym_fast_align_bin, 87 | FWD=fwd_align_file, 88 | BWD=bwd_align_file, 89 | SRC=args.source_file, 90 | TGT=args.target_file, 91 | OUT=sym_out_file, 92 | HEURISTIC=args.sym_heuristic, 93 | SYMAL=symal_bin 94 | ) 95 | assert os.system(sym_cmd) == 0 96 | 97 | 98 | if __name__ == '__main__': 99 | main() 100 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/clib/temporal_convolution_tbc/temporal_convolution_tbc.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include <stdio.h> 10 | #include <string.h> 11 | #include <stdexcept> 12 | #include <ATen/ATen.h> 13 | 14 | 15 | using at::Tensor; 16 | extern THCState* state; 17 | 18 | at::Type& getDataType(const char* dtype) { 19 | if (strcmp(dtype, "torch.cuda.FloatTensor") == 0) { 20 | return at::getType(at::kCUDA, at::kFloat); 21 | } else if (strcmp(dtype, "torch.FloatTensor") == 0) { 22 | return at::getType(at::kCPU, at::kFloat); 23 | } else { 24 | throw std::runtime_error(std::string("Unsupported data type: ") + dtype); 25 | } 26 | } 27 | 28 | inline at::Tensor t(at::Type& type, void* i) { 29 | return type.unsafeTensorFromTH(i, true); 30 | } 31 | 32 | extern "C" void TemporalConvolutionTBC_forward( 33 | const char* dtype, 34 | void* _input, 35 | void* _output, 36 | void* _weight, 37 | void* _bias) 38 | { 39 | auto& type = getDataType(dtype); 40 | Tensor input = t(type, _input); 41 | Tensor output = t(type, _output); 42 | Tensor weight = t(type, _weight); 43 | Tensor bias = t(type, _bias); 44 | 45 | auto input_size = input.sizes(); 46 | auto output_size = output.sizes(); 47 | 48 | auto ilen = input_size[0]; 49 | auto batchSize = input_size[1]; 50 | auto inputPlanes = input_size[2]; 51 | auto outputPlanes = output_size[2]; 52 | auto olen = output_size[0]; 53 | auto kw = weight.sizes()[0]; 54 | int pad = (olen - ilen + kw - 1) / 2; 55 | 56 | // input * weights + bias -> output_features 57 | output.copy_(bias.expand(output.sizes())); 58 | for (int k = 0; k < kw; k++) { 59 | int iShift = std::max(0, k - pad); 60 | int oShift = std::max(0, pad - k); 61 | int t = std::min(ilen + pad - k, olen) - oShift; 62 | // Note: gemm assumes column-major matrices 63 | // input is l*m (row-major) 64 | // weight is m*r (row-major) 65 | // output is l*r (row-major) 66 | if (t > 0) { 67 | auto W = weight[k]; 68 | auto I = input.narrow(0, iShift, t).view({t * batchSize, inputPlanes}); 69 | auto O = output.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); 70 | at::addmm_out(1, O, 1, I, W, O); 71 | } 72 | } 73 | } 74 | 75 | extern "C" void TemporalConvolutionTBC_backward( 76 | const char* dtype, 77 | void* _dOutput, 78 | void* _dInput, 79 | void* _dWeight, 80 | void* _dBias, 81 | void* _input, 82 | void* _weight) 83 | { 84 | auto& type = getDataType(dtype); 85 | Tensor dOutput = t(type, _dOutput); 86 | Tensor dInput = t(type, _dInput); 87 | Tensor dWeight = t(type, _dWeight); 88 | Tensor dBias = t(type, _dBias); 89 | Tensor input = t(type, _input); 90 | Tensor weight = t(type, _weight); 91 | 92 | auto input_size = input.sizes(); 93 | auto output_size = dOutput.sizes(); 94 | 95 | auto ilen = input_size[0]; 96 | auto batchSize = input_size[1]; 97 | auto inputPlanes = input_size[2]; 98 | auto outputPlanes = output_size[2]; 99 | auto olen = output_size[0]; 100 | auto kw = weight.sizes()[0]; 101 | int pad = (olen - ilen + kw - 1) / 2; 102 | 103 | for (int k = 0; k < kw; k++) { 104 | int iShift = std::max(0, k - pad); 105 | int oShift = std::max(0, pad - k); 106 | int t = std::min(ilen + pad - k, olen) - oShift; 107 | // dOutput * T(weight) -> dInput 108 | if (t > 0) { 109 | auto dO = dOutput.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); 110 | auto dI = dInput.narrow(0, iShift, t).view({t * batchSize, inputPlanes}); 111 | at::addmm_out(1, dI, 1, dO, weight[k].t(), dI); 112 | } 113 | } 114 | 115 | for (int k = 0; k < kw; k++) { 116 | int iShift = std::max(0, k - pad); 117 | int oShift = std::max(0, pad - k); 118 | int t = std::min(ilen + pad - k, olen) - oShift; 119 | // T(input) * dOutput -> 
dWeight 120 | if (t > 0) { 121 | auto dW = dWeight[k]; 122 | auto dO = dOutput.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); 123 | auto I = input.narrow(0, iShift, t).view({t * batchSize, inputPlanes}).t(); 124 | at::addmm_out(1, dW, 1, I, dO, dW); 125 | } 126 | } 127 | 128 | auto tmp = dOutput.sum(0, false); 129 | at::sum_out(tmp, 0, dBias); 130 | } 131 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/nuclesgmlparser.py: -------------------------------------------------------------------------------- 1 | # nuclesgmlparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | from sgmllib import SGMLParser 14 | from nucle_doc import nucle_doc 15 | 16 | 17 | class nuclesgmlparser(SGMLParser): 18 | def __init__(self): 19 | SGMLParser.__init__(self) 20 | self.docs = [] 21 | 22 | def reset(self): 23 | self.docs = [] 24 | self.data = [] 25 | SGMLParser.reset(self) 26 | 27 | def unknow_starttag(self, tag, attrs): 28 | pass 29 | 30 | def unknow_endtag(self): 31 | pass 32 | 33 | def start_doc(self, attrs): 34 | self.docs.append(nucle_doc()) 35 | self.docs[-1].docattrs = attrs 36 | 37 | def end_doc(self): 38 | pass 39 | 40 | def start_matric(self, attrs): 41 | pass 42 | 43 | def end_matric(self): 44 | self.docs[-1].matric = ''.join(self.data) 45 | self.data = [] 46 | pass 47 | 48 | def start_email(self, attrs): 49 | pass 50 | 51 | def end_email(self): 52 | self.docs[-1].email = ''.join(self.data) 53 | self.data = [] 54 | pass 55 | 56 | def start_nationality(self, attrs): 57 | pass 58 | 59 | def end_nationality(self): 60 | self.docs[-1].nationality = ''.join(self.data) 61 | self.data = [] 62 | pass 63 | 64 | def start_first_language(self, attrs): 65 | pass 66 | 67 | def end_first_language(self): 68 | self.docs[-1].firstLanguage = ''.join(self.data) 69 | self.data = [] 70 | pass 71 | 72 | def start_school_language(self, attrs): 73 | pass 74 | 75 | def end_school_language(self): 76 | self.docs[-1].schoolLanguage = ''.join(self.data) 77 | self.data = [] 78 | pass 79 | 80 | def start_english_tests(self, attrs): 81 | pass 82 | 83 | def end_english_tests(self): 84 | self.docs[-1].englishTests = ''.join(self.data) 85 | self.data = [] 86 | pass 87 | 88 | 89 | def start_text(self, attrs): 90 | pass 91 | 92 | def end_text(self): 93 | pass 94 | 95 | def start_title(self, attrs): 96 | pass 97 | 98 | def end_title(self): 99 | self.docs[-1].paragraphs.append(''.join(self.data)) 100 | self.data = [] 101 | pass 102 | 103 | 104 | def start_p(self, attrs): 105 | pass 106 | 107 | def end_p(self): 108 | self.docs[-1].paragraphs.append(''.join(self.data)) 109 | self.data = [] 110 | pass 111 | 112 | 113 | def start_annotation(self, attrs): 114 | self.docs[-1].annotation.append(attrs) 115 | 116 | def end_annotation(self): 117 | pass 118 | 119 | def start_mistake(self, attrs): 120 | d = {} 121 | for t in attrs: 122 | d[t[0]] = int(t[1]) 123 | self.docs[-1].mistakes.append(d) 124 | pass 125 | 126 | def end_mistake(self): 127 | pass 128 | 129 | def start_type(self, attrs): 130 | pass 131 | 132 | def end_type(self): 133 | self.docs[-1].mistakes[-1]['type'] = ''.join(self.data) 134 | self.data = [] 135 | 136 | def start_correction(self, attrs): 137 | pass 138 | 139 | def 
end_correction(self): 140 | self.docs[-1].mistakes[-1]['correction'] = ''.join(self.data) 141 | self.data = [] 142 | 143 | def start_comment(self, attrs): 144 | pass 145 | 146 | def end_comment(self): 147 | self.docs[-1].mistakes[-1]['comment'] = ''.join( self.data) 148 | self.data = [] 149 | 150 | 151 | def handle_charref(self, ref): 152 | self.data.append('&' + ref) 153 | 154 | def handle_entityref(self, ref): 155 | self.data.append('&' + ref) 156 | 157 | def handle_data(self, text): 158 | if text.strip() == '': 159 | self.data.append('') 160 | return 161 | else: 162 | if text.startswith('\n'): 163 | text = text[1:] 164 | if text.endswith('\n'): 165 | text = text[:-1] 166 | self.data.append(text) 167 | 168 | 169 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import shutil 6 | import imp 7 | 8 | import argparse 9 | 10 | # Initializing the logging module 11 | import logging 12 | import log_utils as L 13 | import configreader 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-i", "--input-nbest", dest="input_nbest", required=True, help="Input n-best file") 19 | parser.add_argument("-r", "--reference-files", dest="ref_paths", required=True, help="A comma-seperated list of reference files") 20 | parser.add_argument("-c", "--config", dest="input_config", required=True, help="Input config (ini) file, e.g similar to moses with [weight] section") 21 | parser.add_argument("-o", "--output-dir", dest="out_dir", required=True, help="Output directory") 22 | parser.add_argument("-t", "--threads", dest="threads", default = 14, type=int, help="Number of MERT threads") 23 | parser.add_argument("--no-add-weight", dest="no_add_weight", action="store_true", help="Flag to be true if config file already contains initial weights for augmented feature(s). Useful for adding multiple features.") 24 | parser.add_argument("-iv", "--init-value", dest="init_value", default = '0.05', help="The initial value of the feature") 25 | parser.add_argument("-a", "--tuning-algorithm", dest="alg", default = 'mert', help="Tuning Algorithm (mert|pro|wpro)") 26 | parser.add_argument("-m", "--tuning-metric", dest="metric", default = 'bleu', help="Tuning Algorithm (bleu|m2)") 27 | parser.add_argument("-s", "--predictable-seed", dest="pred_seed", action='store_true', help="Tune with predictable seed to avoid randomness") 28 | parser.add_argument("--moses-dir", dest="moses_dir", required=True, help="Path to Moses. 
Required for tuning scripts") 29 | args = parser.parse_args() 30 | 31 | fscore_arg = "" 32 | if args.metric == 'm2': 33 | fscore_arg = " --sctype M2SCORER --scconfig ignore_whitespace_casing:true " 34 | logger.info("Using M2 Tuning") 35 | logger.info(L.b_yellow('Arguments: ') + fscore_arg) 36 | 37 | 38 | if not os.path.exists(args.out_dir): 39 | os.makedirs(args.out_dir) 40 | 41 | L.set_logger(os.path.abspath(args.out_dir),'train_log.txt') 42 | L.print_args(args) 43 | 44 | logger.info("Reading weights from config file") 45 | features = configreader.parse_ini(args.input_config) 46 | logger.info("Feature weights: " + str(features)) 47 | 48 | output_nbest_path = args.out_dir + '/augmented.nbest' 49 | shutil.copy(args.input_nbest, output_nbest_path) 50 | 51 | logger.info('Extracting stats and features') 52 | logger.warning('The optional arguments of extractor are not used yet') 53 | cmd = args.moses_dir + '/bin/extractor -r ' + args.ref_paths + ' -n ' + output_nbest_path + ' --scfile ' + args.out_dir + '/statscore.data --ffile ' + args.out_dir + '/features.data' 54 | if args.metric == 'm2': 55 | cmd = args.moses_dir + '/bin/extractor --sctype M2SCORER --scconfig ignore_whitespace_casing:true -r ' + args.ref_paths + ' -n ' + output_nbest_path + ' --scfile ' + args.out_dir + '/statscore.data --ffile ' + args.out_dir + '/features.data' 56 | logger.info('Executing command: ' + cmd ) 57 | os.system(cmd) 58 | 59 | 60 | #create the list of features 61 | 62 | with open(args.out_dir + '/init.opt', 'w') as init_opt: 63 | init_list = [] 64 | for line in features: 65 | tokens = line.split(" ") 66 | try: 67 | float(tokens[1]) 68 | init_list += tokens[1:] 69 | except ValueError: 70 | pass 71 | if args.no_add_weight == False: 72 | init_list.append(args.init_value) 73 | dim = len(init_list) 74 | init_opt.write(' '.join(init_list) + '\n') 75 | init_opt.write(' '.join(['0' for i in range(dim)]) + '\n') 76 | init_opt.write(' '.join(['1' for i in range(dim)]) + '\n') 77 | 78 | seed_arg = '' 79 | if args.pred_seed: 80 | seed_arg = ' -r 1 ' 81 | #seed_arg = ' -r 1500 ' 82 | 83 | 84 | if (args.alg == 'mert'): 85 | logger.info('Running MERT') 86 | cmd = args.moses_dir + '/bin/mert -d ' + str(dim) + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data --ifile ' + args.out_dir + '/init.opt --threads ' + str(args.threads) + seed_arg + fscore_arg# + "-m 50 -n 20" 87 | logger.info("Command: " + cmd) 88 | os.system(cmd) 89 | else: 90 | logger.error('Invalid tuning algorithm: ' + args.alg) 91 | 92 | logger.info(L.green("Optimization complete.")) 93 | assert os.path.isfile('weights.txt') 94 | shutil.move('weights.txt', args.out_dir + '/weights.txt') 95 | 96 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/nccl.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | """ 10 | A modified version of torch.cuda.nccl.all_reduce for launching kernels on each 11 | GPU separately. 
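Typical flow (a sketch, not a prescribed call sequence): rank 0 obtains a
shared identifier with get_unique_id(), every worker process calls
initialize(num_devices, uid, rank), and tensors are then summed in place
with all_reduce(tensor).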
12 | """ 13 | 14 | import ctypes 15 | import warnings 16 | 17 | lib = None 18 | _uid = None 19 | _rank = None 20 | _num_devices = None 21 | _comm = None 22 | 23 | __all__ = ['all_reduce', 'initialize', 'get_unique_id'] 24 | 25 | 26 | def _libnccl(): 27 | global lib 28 | if lib is None: 29 | lib = ctypes.cdll.LoadLibrary(None) 30 | if hasattr(lib, 'ncclCommDestroy'): 31 | lib.ncclCommDestroy.restype = None 32 | lib.ncclGetErrorString.restype = ctypes.c_char_p 33 | else: 34 | lib = None 35 | return lib 36 | 37 | 38 | def is_available(tensors): 39 | devices = set() 40 | for tensor in tensors: 41 | if not tensor.is_contiguous(): 42 | return False 43 | if not tensor.is_cuda: 44 | return False 45 | device = tensor.get_device() 46 | if device in devices: 47 | return False 48 | devices.add(device) 49 | 50 | if _libnccl() is None: 51 | warnings.warn('NCCL library not found. Check your LD_LIBRARY_PATH') 52 | return False 53 | 54 | return True 55 | 56 | 57 | _communicators = {} 58 | 59 | # ncclDataType_t 60 | ncclChar = 0 61 | ncclInt = 1 62 | ncclHalf = 2 63 | ncclFloat = 3 64 | ncclDouble = 4 65 | ncclInt64 = 5 66 | ncclUint64 = 6 67 | 68 | # ncclRedOp_t 69 | SUM = 0 70 | PROD = 1 71 | MAX = 2 72 | MIN = 3 73 | 74 | nccl_types = { 75 | 'torch.cuda.ByteTensor': ncclChar, 76 | 'torch.cuda.CharTensor': ncclChar, 77 | 'torch.cuda.IntTensor': ncclInt, 78 | 'torch.cuda.HalfTensor': ncclHalf, 79 | 'torch.cuda.FloatTensor': ncclFloat, 80 | 'torch.cuda.DoubleTensor': ncclDouble, 81 | 'torch.cuda.LongTensor': ncclInt64, 82 | } 83 | 84 | 85 | class NcclError(RuntimeError): 86 | def __init__(self, status): 87 | self.status = status 88 | msg = '{0} ({1})'.format(lib.ncclGetErrorString(status), status) 89 | super(NcclError, self).__init__(msg) 90 | 91 | 92 | class NcclComm(ctypes.c_void_p): 93 | def __del__(self): 94 | lib.ncclCommDestroy(self) 95 | 96 | 97 | class NcclUniqueId(ctypes.Structure): 98 | _fields_ = [ 99 | ('internal', ctypes.c_uint8 * 128) 100 | ] 101 | 102 | 103 | def check_error(status): 104 | if status != 0: 105 | raise NcclError(status) 106 | 107 | 108 | _uids = [] 109 | 110 | 111 | def get_unique_id(): 112 | if _libnccl() is None: 113 | raise RuntimeError('Unable to load NCCL library') 114 | 115 | uid = NcclUniqueId() 116 | check_error(lib.ncclGetUniqueId(ctypes.byref(uid))) 117 | _uids.append(uid) # Don't allow UIDs to be collected 118 | return uid 119 | 120 | 121 | def initialize(num_devices, uid, rank): 122 | global _num_devices, _uid, _rank 123 | 124 | if _libnccl() is None: 125 | raise RuntimeError('Unable to load NCCL library') 126 | 127 | _num_devices = num_devices 128 | if rank != 0: 129 | _uid = NcclUniqueId.from_buffer_copy(uid) 130 | else: 131 | _uid = uid 132 | _rank = rank 133 | 134 | 135 | def communicator(): 136 | global _comm 137 | if _uid is None: 138 | raise RuntimeError('NCCL not initialized') 139 | if _comm is None: 140 | comm = ctypes.c_void_p() 141 | check_error(lib.ncclCommInitRank( 142 | ctypes.byref(comm), 143 | ctypes.c_int(_num_devices), 144 | _uid, 145 | ctypes.c_int(_rank))) 146 | _comm = comm 147 | return _comm 148 | 149 | 150 | def all_reduce(input, output=None, op=SUM, stream=None): 151 | comm = communicator() 152 | if output is None: 153 | output = input 154 | if stream is not None: 155 | stream = stream.cuda_stream 156 | data_type = nccl_types[input.type()] 157 | check_error(lib.ncclAllReduce( 158 | ctypes.c_void_p(input.data_ptr()), 159 | ctypes.c_void_p(output.data_ptr()), 160 | ctypes.c_size_t(input.numel()), 161 | data_type, 162 | op, 163 | comm, 164 | 
ctypes.c_void_p(stream))) 165 | return output 166 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/chrF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Compute chrF3 for machine translation evaluation 6 | 7 | Reference: 8 | Maja Popović (2015). chrF: character n-gram F-score for automatic MT evaluation. In Proceedings of the Tenth Workshop on Statistical Machine Translationn, pages 392–395, Lisbon, Portugal. 9 | """ 10 | 11 | from __future__ import print_function, unicode_literals, division 12 | import sys 13 | import codecs 14 | import io 15 | import argparse 16 | from collections import defaultdict 17 | from math import log, exp 18 | 19 | # hack for python2/3 compatibility 20 | from io import open 21 | argparse.open = open 22 | 23 | # python 2/3 compatibility 24 | if sys.version_info < (3, 0): 25 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 26 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 27 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 28 | 29 | 30 | def create_parser(): 31 | parser = argparse.ArgumentParser( 32 | formatter_class=argparse.RawDescriptionHelpFormatter, 33 | description="learn BPE-based word segmentation") 34 | 35 | parser.add_argument( 36 | '--ref', '-r', type=argparse.FileType('r'), required=True, 37 | metavar='PATH', 38 | help="Reference file") 39 | parser.add_argument( 40 | '--hyp', type=argparse.FileType('r'), metavar='PATH', 41 | default=sys.stdin, 42 | help="Hypothesis file (default: stdin).") 43 | parser.add_argument( 44 | '--beta', '-b', type=float, default=3, 45 | metavar='FLOAT', 46 | help="beta parameter (default: '%(default)s')") 47 | parser.add_argument( 48 | '--ngram', '-n', type=int, default=6, 49 | metavar='INT', 50 | help="ngram order (default: '%(default)s')") 51 | parser.add_argument( 52 | '--space', '-s', action='store_true', 53 | help="take spaces into account (default: '%(default)s')") 54 | parser.add_argument( 55 | '--precision', action='store_true', 56 | help="report precision (default: '%(default)s')") 57 | parser.add_argument( 58 | '--recall', action='store_true', 59 | help="report recall (default: '%(default)s')") 60 | 61 | return parser 62 | 63 | def extract_ngrams(words, max_length=4, spaces=False): 64 | 65 | if not spaces: 66 | words = ''.join(words.split()) 67 | else: 68 | words = words.strip() 69 | 70 | results = defaultdict(lambda: defaultdict(int)) 71 | for length in range(max_length): 72 | for start_pos in range(len(words)): 73 | end_pos = start_pos + length + 1 74 | if end_pos <= len(words): 75 | results[length][tuple(words[start_pos: end_pos])] += 1 76 | return results 77 | 78 | 79 | def get_correct(ngrams_ref, ngrams_test, correct, total): 80 | 81 | for rank in ngrams_test: 82 | for chain in ngrams_test[rank]: 83 | total[rank] += ngrams_test[rank][chain] 84 | if chain in ngrams_ref[rank]: 85 | correct[rank] += min(ngrams_test[rank][chain], ngrams_ref[rank][chain]) 86 | 87 | return correct, total 88 | 89 | 90 | def f1(correct, total_hyp, total_ref, max_length, beta=3, smooth=0): 91 | 92 | precision = 0 93 | recall = 0 94 | 95 | for i in range(max_length): 96 | if total_hyp[i] + smooth and total_ref[i] + smooth: 97 | precision += (correct[i] + smooth) / (total_hyp[i] + smooth) 98 | recall += (correct[i] + smooth) / (total_ref[i] + smooth) 99 | 100 | precision /= max_length 101 | recall /= max_length 102 | 103 | return (1 + 
beta**2) * (precision*recall) / ((beta**2 * precision) + recall), precision, recall 104 | 105 | def main(args): 106 | 107 | correct = [0]*args.ngram 108 | total = [0]*args.ngram 109 | total_ref = [0]*args.ngram 110 | for line in args.ref: 111 | line2 = args.hyp.readline() 112 | 113 | ngrams_ref = extract_ngrams(line, max_length=args.ngram, spaces=args.space) 114 | ngrams_test = extract_ngrams(line2, max_length=args.ngram, spaces=args.space) 115 | 116 | get_correct(ngrams_ref, ngrams_test, correct, total) 117 | 118 | for rank in ngrams_ref: 119 | for chain in ngrams_ref[rank]: 120 | total_ref[rank] += ngrams_ref[rank][chain] 121 | 122 | chrf, precision, recall = f1(correct, total, total_ref, args.ngram, args.beta) 123 | 124 | print('chrF3: {0:.4f}'.format(chrf)) 125 | if args.precision: 126 | print('chrPrec: {0:.4f}'.format(precision)) 127 | if args.recall: 128 | print('chrRec: {0:.4f}'.format(recall)) 129 | 130 | if __name__ == '__main__': 131 | 132 | parser = create_parser() 133 | args = parser.parse_args() 134 | 135 | main(args) 136 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/indexed_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import numpy as np 10 | import os 11 | import struct 12 | import torch 13 | 14 | 15 | def read_longs(f, n): 16 | a = np.empty(n, dtype=np.int64) 17 | f.readinto(a) 18 | return a 19 | 20 | 21 | def write_longs(f, a): 22 | f.write(np.array(a, dtype=np.int64)) 23 | 24 | 25 | dtypes = { 26 | 1: np.uint8, 27 | 2: np.int8, 28 | 3: np.int16, 29 | 4: np.int32, 30 | 5: np.int64, 31 | 6: np.float, 32 | 7: np.double, 33 | } 34 | 35 | 36 | def code(dtype): 37 | for k in dtypes.keys(): 38 | if dtypes[k] == dtype: 39 | return k 40 | 41 | 42 | class IndexedDataset(object): 43 | """Loader for TorchNet IndexedDataset""" 44 | 45 | def __init__(self, path): 46 | with open(path + '.idx', 'rb') as f: 47 | magic = f.read(8) 48 | assert magic == b'TNTIDX\x00\x00' 49 | version = f.read(8) 50 | assert struct.unpack('<Q', version) == (1,) 51 | code, self.element_size = struct.unpack('<QQ', f.read(16)) 52 | self.dtype = dtypes[code] 53 | self.size, self.s = struct.unpack('<QQ', f.read(16)) 54 | self.dim_offsets = read_longs(f, self.size + 1) 55 | self.data_offsets = read_longs(f, self.size + 1) 56 | self.sizes = read_longs(f, self.s) 57 | self.read_data(path) 58 | 59 | def read_data(self, path): 60 | self.data_file = open(path + '.bin', 'rb', buffering=0) 61 | 62 | def __del__(self): 63 | self.data_file.close() 64 | 65 | def __getitem__(self, i): 66 | if i < 0 or i >= self.size: 67 | raise IndexError('index out of range') 68 | tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] 69 | a = np.empty(tensor_size, dtype=self.dtype) 70 | self.data_file.seek(self.data_offsets[i] * self.element_size) 71 | self.data_file.readinto(a) 72 | return torch.from_numpy(a) 73 | 74 | def __len__(self): 75 | return self.size 76 | 77 | @staticmethod 78 | def exists(path): 79 | return os.path.exists(path + '.idx') 80 | 81 | 82 | class IndexedInMemoryDataset(IndexedDataset): 83 | """Loader for TorchNet IndexedDataset, keeps all the data in memory""" 84 
| 85 | def read_data(self, path): 86 | self.data_file = open(path + '.bin', 'rb') 87 | self.buffer = np.empty(self.data_offsets[-1], dtype=self.dtype) 88 | self.data_file.readinto(self.buffer) 89 | self.data_file.close() 90 | 91 | def __del__(self): 92 | pass 93 | 94 | def __getitem__(self, i): 95 | if i < 0 or i >= self.size: 96 | raise IndexError('index out of range') 97 | tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] 98 | a = np.empty(tensor_size, dtype=self.dtype) 99 | np.copyto(a, self.buffer[self.data_offsets[i]:self.data_offsets[i + 1]]) 100 | return torch.from_numpy(a) 101 | 102 | 103 | class IndexedDatasetBuilder(object): 104 | 105 | element_sizes = { 106 | np.uint8: 1, 107 | np.int8: 1, 108 | np.int16: 2, 109 | np.int32: 4, 110 | np.int64: 8, 111 | np.float: 4, 112 | np.double: 8 113 | } 114 | 115 | def __init__(self, out_file, dtype=np.int32): 116 | self.out_file = open(out_file, 'wb') 117 | self.dtype = dtype 118 | self.data_offsets = [0] 119 | self.dim_offsets = [0] 120 | self.sizes = [] 121 | self.element_size = self.element_sizes[self.dtype] 122 | 123 | def add_item(self, tensor): 124 | # +1 for Lua compatibility 125 | bytes = self.out_file.write(np.array(tensor.numpy() + 1, dtype=self.dtype)) 126 | self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) 127 | for s in tensor.size(): 128 | self.sizes.append(s) 129 | self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) 130 | 131 | def finalize(self, index_file): 132 | self.out_file.close() 133 | index = open(index_file, 'wb') 134 | index.write(b'TNTIDX\x00\x00') 135 | index.write(struct.pack('<Q', 1)) 136 | index.write(struct.pack('<QQ', code(self.dtype), 137 | self.element_size)) 138 | index.write(struct.pack('<QQ', len(self.data_offsets) - 1, 139 | len(self.sizes))) 140 | write_longs(index, self.dim_offsets) 141 | write_longs(index, self.data_offsets) 142 | write_longs(index, self.sizes) 143 | index.close() 144 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/log_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | #-----------------------------------------------------------------------------------------------------------# 5 | import re 6 | 7 | class BColors: 8 | HEADER = '\033[95m' 9 | OKBLUE = '\033[94m' 10 | OKGREEN = '\033[92m' 11 | WARNING = '\033[93m' 12 | FAIL = '\033[91m' 13 | ENDC = '\033[0m' 14 | BOLD = '\033[1m' 15 | UNDERLINE = '\033[4m' 16 | WHITE = '\033[37m' 17 | YELLOW = '\033[33m' 18 | GREEN = '\033[32m' 19 | BLUE = '\033[34m' 20 | CYAN = '\033[36m' 21 | RED = '\033[31m' 22 | MAGENTA = '\033[35m' 23 | BLACK = '\033[30m' 24 | BHEADER = BOLD + '\033[95m' 25 | BOKBLUE = BOLD + '\033[94m' 26 | BOKGREEN = BOLD + '\033[92m' 27 | BWARNING = BOLD + '\033[93m' 28 | BFAIL = BOLD + '\033[91m' 29 | BUNDERLINE = BOLD + '\033[4m' 30 | BWHITE = BOLD + '\033[37m' 31 | BYELLOW = BOLD + '\033[33m' 32 | BGREEN = BOLD + '\033[32m' 33 | BBLUE = BOLD + '\033[34m' 34 | BCYAN = BOLD + '\033[36m' 35 | BRED = BOLD + '\033[31m' 36 | BMAGENTA = BOLD + '\033[35m' 37 | BBLACK = BOLD + '\033[30m' 38 | 39 | @staticmethod 40 | def cleared(s): 41 | return re.sub("\033\[[0-9][0-9]?m", "", s) 42 | 43 | def red(message): 44 | return BColors.RED + str(message) + BColors.ENDC 45 | 46 | def b_red(message): 47 | return BColors.BRED + str(message) + BColors.ENDC 48 | 49 | def blue(message): 50 | return BColors.BLUE + str(message) + 
BColors.ENDC 51 | 52 | def yellow(message): 53 | return BColors.YELLOW + str(message) + BColors.ENDC 54 | 55 | def b_yellow(message): 56 | return BColors.BYELLOW + str(message) + BColors.ENDC 57 | 58 | def white(message): 59 | return BColors.WHITE + str(message) + BColors.ENDC 60 | 61 | def green(message): 62 | return BColors.GREEN + str(message) + BColors.ENDC 63 | 64 | def b_green(message): 65 | return BColors.BGREEN + str(message) + BColors.ENDC 66 | 67 | def b_okblue(message): 68 | return BColors.OKBLUE + str(message) + BColors.ENDC 69 | 70 | def b_fail(message): 71 | return BColors.BFAIL + str(message) + BColors.ENDC 72 | 73 | def b_warning(message): 74 | return BColors.WARNING + str(message) + BColors.ENDC 75 | 76 | def print_args(args, path=None): 77 | if path: 78 | output_file = open(path, 'w') 79 | logger = logging.getLogger(__name__) 80 | logger.info("Arguments:") 81 | args.command = ' '.join(sys.argv) 82 | items = vars(args) 83 | for key in sorted(items.keys(), key=lambda s: s.lower()): 84 | value = items[key] 85 | if not value: 86 | value = "None" 87 | logger.info(" " + key + ": " + str(items[key])) 88 | if path is not None: 89 | output_file.write(" " + key + ": " + str(items[key]) + "\n") 90 | if path: 91 | output_file.close() 92 | del args.command 93 | 94 | #-----------------------------------------------------------------------------------------------------------# 95 | 96 | #-----------------------------------------------------------------------------------------------------------# 97 | 98 | def set_logger(out_dir=None, log_file="log.txt"): 99 | #console_format = BColors.OKBLUE + '[%(levelname)s]' + BColors.ENDC + ' (%(name)s) %(message)s' 100 | #console_format = b_okblue('[%(levelname)s]') + b_okblue(' [%(asctime)s] ') + ' %(message)s ' 101 | datefmt='%d-%m-%Y %H:%M:%S' 102 | logger = logging.getLogger() 103 | logger.setLevel(logging.DEBUG) 104 | console = logging.StreamHandler() 105 | console.setLevel(logging.DEBUG) 106 | console.setFormatter(ColoredFormatter(datefmt=datefmt)) 107 | logger.addHandler(console) 108 | if out_dir: 109 | #file_format = '[%(levelname)s] (%(name)s) %(message)s' 110 | file_format = '[%(levelname)s] [%(asctime)s] %(message)s' 111 | log_file = logging.FileHandler(out_dir + '/' + log_file, mode='w') 112 | log_file.setLevel(logging.DEBUG) 113 | log_file.setFormatter(logging.Formatter(file_format, datefmt=datefmt)) 114 | logger.addHandler(log_file) 115 | 116 | #-----------------------------------------------------------------------------------------------------------# 117 | 118 | class ColoredFormatter(logging.Formatter): 119 | FORMATS = {logging.DEBUG :"DBG: %(module)s: %(lineno)d: %(message)s", 120 | logging.ERROR : b_fail('[%(levelname)s]') + b_fail(' [%(asctime)s] ') + ' %(message)s ', 121 | logging.INFO : b_okblue('[%(levelname)s]') + b_okblue(' [%(asctime)s] ') + ' %(message)s ', 122 | logging.WARNING : b_warning('[%(levelname)s]') + ' %(message)s', 123 | 'DEFAULT' : b_okblue('[%(levelname)s]') + b_okblue(' [%(asctime)s] ') + ' %(message)s '} 124 | 125 | def format(self, record): 126 | self._fmt = self.FORMATS.get(record.levelno, self.FORMATS['DEFAULT']) 127 | return logging.Formatter.format(self, record) 128 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/combiner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 
4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | 17 | # file: m2scorer.py 18 | # 19 | # score a system's output against a gold reference 20 | # 21 | # Usage: m2scorer.py [OPTIONS] proposed_sentences source_gold 22 | # where 23 | # proposed_sentences - system output, sentence per line 24 | # source_gold - source sentences with gold token edits 25 | # OPTIONS 26 | # -v --verbose - print verbose output 27 | # --very_verbose - print lots of verbose output 28 | # --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2." 29 | # --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 30 | # 31 | 32 | import sys 33 | import levenshtein 34 | from getopt import getopt 35 | from util import paragraphs 36 | from util import smart_open 37 | 38 | 39 | 40 | def load_annotation(gold_file): 41 | source_sentences = [] 42 | gold_edits = [] 43 | fgold = smart_open(gold_file, 'r') 44 | puffer = fgold.read() 45 | fgold.close() 46 | puffer = puffer.decode('utf8') 47 | for item in paragraphs(puffer.splitlines(True)): 48 | item = item.splitlines(False) 49 | sentence = [line[2:].strip() for line in item if line.startswith('S ')] 50 | assert sentence != [] 51 | annotations = {} 52 | for line in item[1:]: 53 | if line.startswith('I ') or line.startswith('S '): 54 | continue 55 | assert line.startswith('A ') 56 | line = line[2:] 57 | fields = line.split('|||') 58 | start_offset = int(fields[0].split()[0]) 59 | end_offset = int(fields[0].split()[1]) 60 | etype = fields[1] 61 | if etype == 'noop': 62 | start_offset = -1 63 | end_offset = -1 64 | corrections = [c.strip() if c != '-NONE-' else '' for c in fields[2].split('||')] 65 | # NOTE: start and end are *token* offsets 66 | original = ' '.join(' '.join(sentence).split()[start_offset:end_offset]) 67 | annotator = int(fields[5]) 68 | if annotator not in annotations.keys(): 69 | annotations[annotator] = [] 70 | annotations[annotator].append((start_offset, end_offset, original, corrections)) 71 | tok_offset = 0 72 | for this_sentence in sentence: 73 | tok_offset += len(this_sentence.split()) 74 | source_sentences.append(this_sentence) 75 | this_edits = {} 76 | for annotator, annotation in annotations.iteritems(): 77 | this_edits[annotator] = [edit for edit in annotation if edit[0] <= tok_offset and edit[1] <= tok_offset and edit[0] >= 0 and edit[1] >= 0] 78 | if len(this_edits) == 0: 79 | this_edits[0] = [] 80 | gold_edits.append(this_edits) 81 | return (source_sentences, gold_edits) 82 | 83 | 84 | def print_usage(): 85 | print >> sys.stderr, "Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source" 86 | print >> sys.stderr, "where" 87 | print >> sys.stderr, " proposed_sentences - system output, sentence per line" 88 | print >> sys.stderr, " source_gold - source sentences with gold token edits" 89 | print >> sys.stderr, "OPTIONS" 90 | print >> sys.stderr, " -v 
--verbose - print verbose output" 91 | print >> sys.stderr, " --very_verbose - print lots of verbose output" 92 | print >> sys.stderr, " --max_unchanged_words N - Maximum unchanged words when extraction edit. Default 2." 93 | print >> sys.stderr, " --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 94 | 95 | 96 | 97 | max_unchanged_words=2 98 | ignore_whitespace_casing= False 99 | verbose = False 100 | very_verbose = False 101 | opts, args = getopt(sys.argv[1:], "v", ["max_unchanged_words=", "verbose", "ignore_whitespace_casing", "very_verbose"]) 102 | for o, v in opts: 103 | if o in ('-v', '--verbose'): 104 | verbose = True 105 | elif o == '--very_verbose': 106 | very_verbose = True 107 | elif o == '--max_unchanged_words': 108 | max_unchanged_words = int(v) 109 | elif o == '--ignore_whitespace_casing': 110 | ignore_whitespace_casing = True 111 | else: 112 | print >> sys.stderr, "Unknown option :", o 113 | print_usage() 114 | sys.exit(-1) 115 | 116 | 117 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/scripts/apply_bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use operations learned with learn_bpe.py to encode a new text. 6 | The text will not be smaller, but use only a fixed vocabulary, with rare words 7 | encoded as variable-length sequences of subword units. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 12 | """ 13 | 14 | from __future__ import unicode_literals, division 15 | 16 | import sys 17 | import codecs 18 | import argparse 19 | from collections import defaultdict 20 | 21 | # hack for python2/3 compatibility 22 | from io import open 23 | argparse.open = open 24 | 25 | # python 2/3 compatibility 26 | if sys.version_info < (3, 0): 27 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 28 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 29 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 30 | 31 | import codecs 32 | 33 | class BPE(object): 34 | 35 | def __init__(self, codes, separator='@@'): 36 | 37 | with codecs.open(codes.name, encoding='utf-8') as codes: 38 | self.bpe_codes = [tuple(item.split()) for item in codes] 39 | 40 | # some hacking to deal with duplicates (only consider first instance) 41 | self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))]) 42 | 43 | self.separator = separator 44 | 45 | def segment(self, sentence): 46 | """segment single sentence (whitespace-tokenized string) with BPE encoding""" 47 | 48 | output = [] 49 | for word in sentence.split(): 50 | new_word = encode(word, self.bpe_codes) 51 | 52 | for item in new_word[:-1]: 53 | output.append(item + self.separator) 54 | output.append(new_word[-1]) 55 | 56 | return ' '.join(output) 57 | 58 | def create_parser(): 59 | parser = argparse.ArgumentParser( 60 | formatter_class=argparse.RawDescriptionHelpFormatter, 61 | description="learn BPE-based word segmentation") 62 | 63 | parser.add_argument( 64 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 65 | metavar='PATH', 66 | help="Input file (default: standard input).") 67 | parser.add_argument( 68 | '--codes', '-c', type=argparse.FileType('r'), metavar='PATH', 69 | 
required=True, 70 | help="File with BPE codes (created by learn_bpe.py).") 71 | parser.add_argument( 72 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 73 | metavar='PATH', 74 | help="Output file (default: standard output)") 75 | parser.add_argument( 76 | '--separator', '-s', type=str, default='@@', metavar='STR', 77 | help="Separator between non-final subword units (default: '%(default)s'))") 78 | 79 | return parser 80 | 81 | def get_pairs(word): 82 | """Return set of symbol pairs in a word. 83 | 84 | word is represented as tuple of symbols (symbols being variable-length strings) 85 | """ 86 | pairs = set() 87 | prev_char = word[0] 88 | for char in word[1:]: 89 | pairs.add((prev_char, char)) 90 | prev_char = char 91 | return pairs 92 | 93 | def encode(orig, bpe_codes, cache={}): 94 | """Encode word based on list of BPE merge operations, which are applied consecutively 95 | """ 96 | 97 | if orig in cache: 98 | return cache[orig] 99 | 100 | word = tuple(orig) + ('</w>',) 101 | pairs = get_pairs(word) 102 | 103 | while True: 104 | bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf'))) 105 | if bigram not in bpe_codes: 106 | break 107 | first, second = bigram 108 | new_word = [] 109 | i = 0 110 | while i < len(word): 111 | try: 112 | j = word.index(first, i) 113 | new_word.extend(word[i:j]) 114 | i = j 115 | except: 116 | new_word.extend(word[i:]) 117 | break 118 | 119 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 120 | new_word.append(first+second) 121 | i += 2 122 | else: 123 | new_word.append(word[i]) 124 | i += 1 125 | new_word = tuple(new_word) 126 | word = new_word 127 | if len(word) == 1: 128 | break 129 | else: 130 | pairs = get_pairs(word) 131 | 132 | # don't print end-of-word symbols 133 | if word[-1] == '</w>': 134 | word = word[:-1] 135 | elif word[-1].endswith('</w>'): 136 | word = word[:-1] + (word[-1].replace('</w>',''),) 137 | 138 | cache[orig] = word 139 | return word 140 | 141 | 142 | if __name__ == '__main__': 143 | parser = create_parser() 144 | args = parser.parse_args() 145 | 146 | bpe = BPE(args.codes, args.separator) 147 | 148 | for line in args.input: 149 | args.output.write(bpe.segment(line).strip()) 150 | args.output.write('\n') 151 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/apply_bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use operations learned with learn_bpe.py to encode a new text. 6 | The text will not be smaller, but use only a fixed vocabulary, with rare words 7 | encoded as variable-length sequences of subword units. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 
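Typical invocation (the file names below are placeholders, not files shipped
with this code):

    python apply_bpe.py -c codes.bpe < corpus.tok > corpus.bpe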
12 | """ 13 | 14 | from __future__ import unicode_literals, division 15 | 16 | import sys 17 | import codecs 18 | import argparse 19 | from collections import defaultdict 20 | 21 | # hack for python2/3 compatibility 22 | from io import open 23 | argparse.open = open 24 | 25 | # python 2/3 compatibility 26 | if sys.version_info < (3, 0): 27 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 28 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 29 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 30 | 31 | import codecs 32 | 33 | class BPE(object): 34 | 35 | def __init__(self, codes, separator='@@'): 36 | 37 | with codecs.open(codes.name, encoding='utf-8') as codes: 38 | self.bpe_codes = [tuple(item.split()) for item in codes] 39 | 40 | # some hacking to deal with duplicates (only consider first instance) 41 | self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))]) 42 | 43 | self.separator = separator 44 | 45 | def segment(self, sentence): 46 | """segment single sentence (whitespace-tokenized string) with BPE encoding""" 47 | 48 | output = [] 49 | for word in sentence.split(): 50 | new_word = encode(word, self.bpe_codes) 51 | 52 | for item in new_word[:-1]: 53 | output.append(item + self.separator) 54 | output.append(new_word[-1]) 55 | 56 | return ' '.join(output) 57 | 58 | def create_parser(): 59 | parser = argparse.ArgumentParser( 60 | formatter_class=argparse.RawDescriptionHelpFormatter, 61 | description="learn BPE-based word segmentation") 62 | 63 | parser.add_argument( 64 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 65 | metavar='PATH', 66 | help="Input file (default: standard input).") 67 | parser.add_argument( 68 | '--codes', '-c', type=argparse.FileType('r'), metavar='PATH', 69 | required=True, 70 | help="File with BPE codes (created by learn_bpe.py).") 71 | parser.add_argument( 72 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 73 | metavar='PATH', 74 | help="Output file (default: standard output)") 75 | parser.add_argument( 76 | '--separator', '-s', type=str, default='@@', metavar='STR', 77 | help="Separator between non-final subword units (default: '%(default)s'))") 78 | 79 | return parser 80 | 81 | def get_pairs(word): 82 | """Return set of symbol pairs in a word. 
83 | 84 | word is represented as tuple of symbols (symbols being variable-length strings) 85 | """ 86 | pairs = set() 87 | prev_char = word[0] 88 | for char in word[1:]: 89 | pairs.add((prev_char, char)) 90 | prev_char = char 91 | return pairs 92 | 93 | def encode(orig, bpe_codes, cache={}): 94 | """Encode word based on list of BPE merge operations, which are applied consecutively 95 | """ 96 | 97 | if orig in cache: 98 | return cache[orig] 99 | 100 | word = tuple(orig) + ('</w>',) 101 | pairs = get_pairs(word) 102 | 103 | while True: 104 | bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf'))) 105 | if bigram not in bpe_codes: 106 | break 107 | first, second = bigram 108 | new_word = [] 109 | i = 0 110 | while i < len(word): 111 | try: 112 | j = word.index(first, i) 113 | new_word.extend(word[i:j]) 114 | i = j 115 | except: 116 | new_word.extend(word[i:]) 117 | break 118 | 119 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 120 | new_word.append(first+second) 121 | i += 2 122 | else: 123 | new_word.append(word[i]) 124 | i += 1 125 | new_word = tuple(new_word) 126 | word = new_word 127 | if len(word) == 1: 128 | break 129 | else: 130 | pairs = get_pairs(word) 131 | 132 | # don't print end-of-word symbols 133 | if word[-1] == '</w>': 134 | word = word[:-1] 135 | elif word[-1].endswith('</w>'): 136 | word = word[:-1] + (word[-1].replace('</w>',''),) 137 | 138 | cache[orig] = word 139 | return word 140 | 141 | 142 | if __name__ == '__main__': 143 | parser = create_parser() 144 | args = parser.parse_args() 145 | 146 | bpe = BPE(args.codes, args.separator) 147 | 148 | for line in args.input: 149 | args.output.write(bpe.segment(line).strip()) 150 | args.output.write('\n') 151 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import logging 10 | import os 11 | import torch 12 | import traceback 13 | import subprocess 14 | 15 | from torch.autograd import Variable 16 | from torch.serialization import default_restore_location 17 | 18 | from fairseq import criterions, data, models 19 | 20 | validation_proc = None 21 | 22 | def parse_args_and_arch(parser): 23 | args = parser.parse_args() 24 | args.model = models.arch_model_map[args.arch] 25 | args = getattr(models, args.model).parse_arch(args) 26 | return args 27 | 28 | 29 | def build_model(args, dataset): 30 | assert hasattr(models, args.model), 'Missing model type' 31 | return getattr(models, args.model).build_model(args, dataset) 32 | 33 | 34 | def build_criterion(args, dataset): 35 | padding_idx = dataset.dst_dict.pad() 36 | if args.label_smoothing > 0: 37 | return criterions.LabelSmoothedCrossEntropyCriterion(args.label_smoothing, padding_idx) 38 | else: 39 | return criterions.CrossEntropyCriterion(padding_idx) 40 | 41 | 42 | def torch_persistent_save(*args, **kwargs): 43 | for i in range(3): 44 | try: 45 | return torch.save(*args, **kwargs) 46 | except: 47 | if i == 2: 48 | logging.error(traceback.format_exc()) 49 | 50 | 51 | def save_checkpoint(args, epoch, batch_offset, model, optimizer, lr_scheduler, val_loss=None, validation_script=None): 52 | 53 | global validation_proc 54 | state_dict = { 55 | 'args': args, 56 | 'epoch': epoch, 57 | 'batch_offset': batch_offset, 58 | 'model': model.state_dict(), 59 | 'optimizer': optimizer.state_dict(), 60 | 'best_loss': lr_scheduler.best, 61 | 'val_loss': val_loss, 62 | } 63 | 64 | if batch_offset == 0: 65 | if not args.no_epoch_checkpoints: 66 | epoch_filename = os.path.join(args.save_dir, 'checkpoint{}.pt'.format(epoch)) 67 | print('| epoch {:03d} | saving checkpoint '.format(epoch, epoch_filename)) 68 | torch_persistent_save(state_dict, epoch_filename) 69 | if validation_script: 70 | if validation_proc and validation_proc.poll() is None: 71 | print('| epoch {:03d} | waiting for previous validation process to finish.'.format(epoch)) 72 | validation_proc.wait() 73 | validation_proc = subprocess.Popen(validation_script + [epoch_filename]) 74 | 75 | assert val_loss is not None 76 | if not hasattr(save_checkpoint, 'best') or val_loss < save_checkpoint.best: 77 | save_checkpoint.best = val_loss 78 | best_filename = os.path.join(args.save_dir, 'checkpoint_best.pt') 79 | print('| epoch {:03d} | saving best checkpoint'.format(epoch, best_filename)) 80 | torch_persistent_save(state_dict, best_filename) 81 | 82 | last_filename = os.path.join(args.save_dir, 'checkpoint_last.pt') 83 | print('| epoch {:03d} | saving last checkpoint'.format(epoch, last_filename)) 84 | torch_persistent_save(state_dict, last_filename) 85 | 86 | 87 | 88 | def load_checkpoint(filename, model, optimizer, lr_scheduler, cuda_device=None): 89 | if not os.path.exists(filename): 90 | return 1, 0 91 | if cuda_device is None: 92 | state = torch.load(filename) 93 | else: 94 | state = torch.load( 95 | filename, 96 | map_location=lambda s, l: default_restore_location(s, 'cuda:{}'.format(cuda_device)) 97 | ) 98 | 99 | model.load_state_dict(state['model']) 100 | optimizer.load_state_dict(state['optimizer']) 101 | lr_scheduler.best = state['best_loss'] 102 | epoch = state['epoch'] + 1 103 | batch_offset = state['batch_offset'] 104 | 105 | gpu_str = ' on GPU #{}'.format(cuda_device) if cuda_device is not None else '' 106 | print('| loaded checkpoint {} (epoch {}){}'.format(filename, epoch, gpu_str)) 107 | return epoch, batch_offset 108 | 109 | 110 | def 
load_ensemble_for_inference(filenames, data_path): 111 | # load model architectures and weights 112 | states = [] 113 | for filename in filenames: 114 | if not os.path.exists(filename): 115 | raise IOError('Model file not found: {}'.format(filename)) 116 | states.append( 117 | torch.load(filename, map_location=lambda s, l: default_restore_location(s, 'cpu')) 118 | ) 119 | 120 | # load dataset 121 | args = states[0]['args'] 122 | dataset = data.load(data_path, args.source_lang, args.target_lang) 123 | 124 | # build models 125 | ensemble = [] 126 | for state in states: 127 | model = build_model(args, dataset) 128 | model.load_state_dict(state['model']) 129 | ensemble.append(model) 130 | 131 | return ensemble, dataset 132 | 133 | 134 | def prepare_sample(sample, volatile=False, cuda_device=None): 135 | """Wrap input tensors in Variable class.""" 136 | 137 | def make_variable(tensor): 138 | if cuda_device is not None and torch.cuda.is_available(): 139 | tensor = tensor.cuda(async=True, device=cuda_device) 140 | return Variable(tensor, volatile=volatile) 141 | 142 | return { 143 | 'id': sample['id'], 144 | 'ntokens': sample['ntokens'], 145 | 'target': make_variable(sample['target']), 146 | 'net_input': { 147 | key: make_variable(sample[key]) 148 | for key in ['src_tokens', 'src_positions', 'input_tokens', 'input_positions'] 149 | }, 150 | } 151 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/m2scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | 17 | # file: m2scorer.py 18 | # 19 | # score a system's output against a gold reference 20 | # 21 | # Usage: m2scorer.py [OPTIONS] proposed_sentences source_gold 22 | # where 23 | # proposed_sentences - system output, sentence per line 24 | # source_gold - source sentences with gold token edits 25 | # OPTIONS 26 | # -v --verbose - print verbose output 27 | # --very_verbose - print lots of verbose output 28 | # --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2." 29 | # --beta B - Beta value for F-measure. Default 0.5." 30 | # --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
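#     Example (hypothetical file names):
#         python m2scorer.py --beta 0.5 system_output.txt gold_edits.m2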
31 | # 32 | 33 | import sys 34 | import levenshtein 35 | from getopt import getopt 36 | from util import paragraphs 37 | from util import smart_open 38 | 39 | 40 | 41 | def load_annotation(gold_file): 42 | source_sentences = [] 43 | gold_edits = [] 44 | fgold = smart_open(gold_file, 'r') 45 | puffer = fgold.read() 46 | fgold.close() 47 | puffer = puffer.decode('utf8') 48 | for item in paragraphs(puffer.splitlines(True)): 49 | item = item.splitlines(False) 50 | sentence = [line[2:].strip() for line in item if line.startswith('S ')] 51 | assert sentence != [] 52 | annotations = {} 53 | for line in item[1:]: 54 | if line.startswith('I ') or line.startswith('S '): 55 | continue 56 | assert line.startswith('A ') 57 | line = line[2:] 58 | fields = line.split('|||') 59 | start_offset = int(fields[0].split()[0]) 60 | end_offset = int(fields[0].split()[1]) 61 | etype = fields[1] 62 | if etype == 'noop': 63 | start_offset = -1 64 | end_offset = -1 65 | corrections = [c.strip() if c != '-NONE-' else '' for c in fields[2].split('||')] 66 | # NOTE: start and end are *token* offsets 67 | original = ' '.join(' '.join(sentence).split()[start_offset:end_offset]) 68 | annotator = int(fields[5]) 69 | if annotator not in annotations.keys(): 70 | annotations[annotator] = [] 71 | annotations[annotator].append((start_offset, end_offset, original, corrections)) 72 | tok_offset = 0 73 | for this_sentence in sentence: 74 | tok_offset += len(this_sentence.split()) 75 | source_sentences.append(this_sentence) 76 | this_edits = {} 77 | for annotator, annotation in annotations.iteritems(): 78 | this_edits[annotator] = [edit for edit in annotation if edit[0] <= tok_offset and edit[1] <= tok_offset and edit[0] >= 0 and edit[1] >= 0] 79 | if len(this_edits) == 0: 80 | this_edits[0] = [] 81 | gold_edits.append(this_edits) 82 | return (source_sentences, gold_edits) 83 | 84 | 85 | def print_usage(): 86 | print >> sys.stderr, "Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source" 87 | print >> sys.stderr, "where" 88 | print >> sys.stderr, " proposed_sentences - system output, sentence per line" 89 | print >> sys.stderr, " source_gold - source sentences with gold token edits" 90 | print >> sys.stderr, "OPTIONS" 91 | print >> sys.stderr, " -v --verbose - print verbose output" 92 | print >> sys.stderr, " --very_verbose - print lots of verbose output" 93 | print >> sys.stderr, " --max_unchanged_words N - Maximum unchanged words when extraction edit. Default 2." 94 | print >> sys.stderr, " --beta B - Beta value for F-measure. Default 0.5." 95 | print >> sys.stderr, " --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
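# The scores reported at the bottom of this script follow the standard
# F_beta definition, F_beta = (1 + beta**2) * P * R / (beta**2 * P + R);
# the default beta of 0.5 weights precision more heavily than recall.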
96 | 97 | 98 | 99 | max_unchanged_words=2 100 | beta = 0.5 101 | ignore_whitespace_casing= False 102 | verbose = False 103 | very_verbose = False 104 | opts, args = getopt(sys.argv[1:], "v", ["max_unchanged_words=", "beta=", "verbose", "ignore_whitespace_casing", "very_verbose"]) 105 | for o, v in opts: 106 | if o in ('-v', '--verbose'): 107 | verbose = True 108 | elif o == '--very_verbose': 109 | very_verbose = True 110 | elif o == '--max_unchanged_words': 111 | max_unchanged_words = int(v) 112 | elif o == '--beta': 113 | beta = float(v) 114 | elif o == '--ignore_whitespace_casing': 115 | ignore_whitespace_casing = True 116 | else: 117 | print >> sys.stderr, "Unknown option :", o 118 | print_usage() 119 | sys.exit(-1) 120 | 121 | # starting point 122 | if len(args) != 2: 123 | print_usage() 124 | sys.exit(-1) 125 | 126 | system_file = args[0] 127 | gold_file = args[1] 128 | 129 | # load source sentences and gold edits 130 | source_sentences, gold_edits = load_annotation(gold_file) 131 | 132 | # load system hypotheses 133 | fin = smart_open(system_file, 'r') 134 | system_sentences = [line.decode("utf8").strip() for line in fin.readlines()] 135 | fin.close() 136 | 137 | p, r, f1 = levenshtein.batch_multi_pre_rec_f1(system_sentences, source_sentences, gold_edits, max_unchanged_words, beta, ignore_whitespace_casing, verbose, very_verbose) 138 | 139 | print "Precision : %.4f" % p 140 | print "Recall : %.4f" % r 141 | print "F_%.1f : %.4f" % (beta, f1) 142 | 143 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/multiprocessing_event_loop.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import os 10 | import signal 11 | import threading 12 | from torch import multiprocessing 13 | 14 | 15 | class MultiprocessingEventLoop(object): 16 | """Start a multiprocessing event loop.""" 17 | 18 | def __init__(self, device_ids=None, multiprocessing_method='spawn'): 19 | super().__init__() 20 | self.device_ids = tuple(device_ids) 21 | self.num_replicas = len(device_ids) 22 | self.rank = None 23 | 24 | self._mp = multiprocessing.get_context(multiprocessing_method) 25 | 26 | self._start_error_handler() 27 | self._start_multiprocessing() 28 | 29 | def call_async(self, rank, action, **kwargs): 30 | """Asynchronously call a function in each child process. 31 | 32 | Call a function named `action` on the rank'th process and return 33 | a Future with the result. 
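        `action` must name a method defined on the subclass; it is resolved
        with getattr() inside the child's event loop, and the caller usually
        collects the return value via Future.gen().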
34 | """ 35 | 36 | def result_generator(): 37 | yield self.return_pipes[rank].recv() 38 | 39 | assert not self.return_pipes[rank].poll(), \ 40 | 'return pipe must be consumed before calling another function' 41 | self.input_pipes[rank].send((action, kwargs)) 42 | 43 | return Future(result_generator()) 44 | 45 | def stop(self, interrupt_children=False): 46 | """Stop multiprocessing.""" 47 | for rank in range(self.num_replicas): 48 | self.input_pipes[rank].close() 49 | self.return_pipes[rank].close() 50 | if interrupt_children: 51 | # send KeyboardInterrupt to children 52 | os.kill(self.procs[rank].pid, signal.SIGINT) 53 | else: 54 | self.procs[rank].join() 55 | self.error_queue.put((None, None)) # poison pill 56 | 57 | def _start_error_handler(self): 58 | """Error handler to catch exceptions in child processes.""" 59 | # create a thread to listen for errors in the child processes 60 | self.error_queue = self._mp.SimpleQueue() 61 | error_thread = threading.Thread(target=self._error_listener, 62 | daemon=True) 63 | error_thread.start() 64 | 65 | # create signal handler that executes in the main process/thread and 66 | # handles errors from child processes 67 | signal.signal(signal.SIGUSR1, self._signal_handler) 68 | 69 | def _error_listener(self): 70 | """A thread that listens for errors in the child processes. 71 | 72 | Errors are handled in a signal handler in the main thread. 73 | """ 74 | (rank, original_trace) = self.error_queue.get() 75 | if rank is None: # poison pill, return 76 | return 77 | 78 | # requeue error and switch to main thread for handling the error 79 | self.error_queue.put((rank, original_trace)) 80 | os.kill(os.getpid(), signal.SIGUSR1) 81 | 82 | def _signal_handler(self, signal, frame): 83 | """Signal handler that handles errors from child processes. 84 | 85 | This signal handler executes in the main/process thread. 86 | """ 87 | self.stop(interrupt_children=True) 88 | (rank, original_trace) = self.error_queue.get() 89 | msg = "\n\n-- Tracebacks above this line can probably be ignored --\n\n" 90 | msg += original_trace 91 | raise Exception(msg) 92 | 93 | def _start_multiprocessing(self): 94 | """Create child processes to run async event loop. 95 | 96 | Each process reads input from a Pipe, performs some computation, 97 | and returns its output to another Pipe. 98 | """ 99 | # create child processes 100 | input_pipes = [] 101 | return_pipes = [] 102 | procs = [] 103 | for rank, id in enumerate(self.device_ids): 104 | recv_input_pipe, send_input_pipe = self._mp.Pipe(duplex=False) 105 | recv_return_pipe, send_return_pipe = self._mp.Pipe(duplex=False) 106 | proc = self._mp.Process( 107 | target=self._process_event_loop, 108 | args=(rank, id, recv_input_pipe, send_return_pipe), 109 | daemon=True) 110 | proc.start() 111 | input_pipes.append(send_input_pipe) 112 | return_pipes.append(recv_return_pipe) 113 | procs.append(proc) 114 | self.input_pipes = input_pipes 115 | self.return_pipes = return_pipes 116 | self.procs = procs 117 | 118 | def _process_event_loop(self, rank, device_id, input_pipe, return_pipe): 119 | """Event loop that runs in each child process. 120 | 121 | Event loop: 122 | - take an action from the input pipe 123 | - call the corresponding function in this process 124 | - put the return value in the return pipe 125 | 126 | Any exceptions are put in the error queue. 
127 | """ 128 | self.rank = rank 129 | try: 130 | # event loop 131 | while True: 132 | action, kwargs = input_pipe.recv() 133 | action_fn = getattr(self, action) 134 | return_pipe.send(action_fn(rank, device_id, **kwargs)) 135 | except EOFError: 136 | # input pipe was closed, do nothing 137 | pass 138 | except KeyboardInterrupt: 139 | # killed by parent, do nothing 140 | pass 141 | except Exception: 142 | # propagate exception from child to parent process, keeping 143 | # original traceback 144 | import traceback 145 | self.error_queue.put((rank, traceback.format_exc())) 146 | finally: 147 | # cleanup pipes 148 | input_pipe.close() 149 | return_pipe.close() 150 | 151 | 152 | class Future(object): 153 | """A wrapper around a Python generator, with syntactic sugar.""" 154 | def __init__(self, generator): 155 | self.generator = generator 156 | 157 | def gen(self): 158 | return next(self.generator) 159 | 160 | @staticmethod 161 | def gen_list(gens): 162 | return [g.gen() for g in gens] 163 | 164 | @staticmethod 165 | def gen_tuple_list(gens): 166 | list = [g.gen() for g in gens] 167 | return zip(*list) 168 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import argparse 10 | import os 11 | from itertools import zip_longest 12 | 13 | from fairseq import dictionary, indexed_dataset 14 | from fairseq.tokenizer import Tokenizer 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser( 19 | description='Data pre-processing: Create dictionary and store data in binary format') 20 | parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', help='source language') 21 | parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', help='target language') 22 | parser.add_argument('--trainpref', metavar='FP', default='train', help='target language') 23 | parser.add_argument('--validpref', metavar='FP', default='valid', help='comma separated, valid language prefixes') 24 | parser.add_argument('--testpref', metavar='FP', default='test', help='comma separated, test language prefixes') 25 | parser.add_argument('--destdir', metavar='DIR', default='data-bin', help='destination dir') 26 | parser.add_argument('--thresholdtgt', metavar='N', default=0, type=int, 27 | help='map words appearing less than threshold times to unknown') 28 | parser.add_argument('--thresholdsrc', metavar='N', default=0, type=int, 29 | help='map words appearing less than threshold times to unknown') 30 | parser.add_argument('--nwordstgt', metavar='N', default=-1, type=int, help='number of target words to retain') 31 | parser.add_argument('--nwordssrc', metavar='N', default=-1, type=int, help='number of source words to retain') 32 | parser.add_argument('--alignfile', metavar='ALIGN', default=None, help='an alignment file (optional)') 33 | 34 | args = parser.parse_args() 35 | print(args) 36 | 37 | os.makedirs(args.destdir, exist_ok=True) 38 | 39 | src_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.source_lang)) 40 | src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)), 
41 | threshold=args.thresholdsrc, nwords=args.nwordssrc) 42 | tgt_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.target_lang)) 43 | tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)), 44 | threshold=args.thresholdtgt, nwords=args.nwordstgt) 45 | 46 | def make_dataset(input_prefix, output_prefix, lang): 47 | dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(lang))) 48 | print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1)) 49 | 50 | ds = indexed_dataset.IndexedDatasetBuilder( 51 | '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix, args.source_lang, 52 | args.target_lang, lang) 53 | ) 54 | 55 | def consumer(tensor): 56 | ds.add_item(tensor) 57 | 58 | input_file = '{}.{}'.format(input_prefix, lang) 59 | res = Tokenizer.binarize(input_file, dict, consumer) 60 | print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format( 61 | lang, input_file, res['nseq'], res['ntok'], 62 | 100 * res['nunk'] / res['ntok'], dict.unk_word)) 63 | ds.finalize('{}/{}.{}-{}.{}.idx'.format( 64 | args.destdir, output_prefix, 65 | args.source_lang, args.target_lang, lang)) 66 | 67 | make_dataset(args.trainpref, 'train', args.source_lang) 68 | make_dataset(args.trainpref, 'train', args.target_lang) 69 | for k, validpref in enumerate(args.validpref.split(',')): 70 | outprefix = 'valid{}'.format(k) if k > 0 else 'valid' 71 | make_dataset(validpref, outprefix, args.source_lang) 72 | make_dataset(validpref, outprefix, args.target_lang) 73 | for k, testpref in enumerate(args.testpref.split(',')): 74 | outprefix = 'test{}'.format(k) if k > 0 else 'test' 75 | make_dataset(testpref, outprefix, args.source_lang) 76 | make_dataset(testpref, outprefix, args.target_lang) 77 | print('| Wrote preprocessed data to {}'.format(args.destdir)) 78 | 79 | if args.alignfile: 80 | src_file_name = '{}.{}'.format(args.trainpref, args.source_lang) 81 | tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang) 82 | src_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang))) 83 | tgt_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang))) 84 | freq_map = {} 85 | with open(args.alignfile, 'r') as align_file: 86 | with open(src_file_name, 'r') as src_file: 87 | with open(tgt_file_name, 'r') as tgt_file: 88 | for a, s, t in zip_longest(align_file, src_file, tgt_file): 89 | si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) 90 | ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) 91 | ai = list(map(lambda x: tuple(x.split('-')), a.split())) 92 | for sai, tai in ai: 93 | srcidx = si[int(sai)] 94 | tgtidx = ti[int(tai)] 95 | if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk(): 96 | assert srcidx != src_dict.pad() 97 | assert srcidx != src_dict.eos() 98 | assert tgtidx != tgt_dict.pad() 99 | assert tgtidx != tgt_dict.eos() 100 | 101 | if srcidx not in freq_map: 102 | freq_map[srcidx] = {} 103 | if tgtidx not in freq_map[srcidx]: 104 | freq_map[srcidx][tgtidx] = 1 105 | else: 106 | freq_map[srcidx][tgtidx] += 1 107 | 108 | align_dict = {} 109 | for srcidx in freq_map.keys(): 110 | align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) 111 | 112 | with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format( 113 | args.source_lang, args.target_lang)), 'w') as f: 114 | for k, v in align_dict.items(): 115 | print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f) 116 | 117 | 118 | if __name__ == 
'__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/util.py: -------------------------------------------------------------------------------- 1 | # This file is part of the NUS M2 scorer. 2 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 3 | # it under the terms of the GNU General Public License as published by 4 | # the Free Software Foundation, either version 3 of the License, or 5 | # (at your option) any later version. 6 | 7 | # The NUS M2 scorer is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU General Public License for more details. 11 | 12 | # You should have received a copy of the GNU General Public License 13 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 14 | 15 | # file: util.py 16 | # 17 | 18 | import operator 19 | import random 20 | import math 21 | import re 22 | 23 | def smart_open(fname, mode = 'r'): 24 | if fname.endswith('.gz'): 25 | import gzip 26 | # Using max compression (9) by default seems to be slow. 27 | # Let's try using the fastest. 28 | return gzip.open(fname, mode, 1) 29 | else: 30 | return open(fname, mode) 31 | 32 | 33 | def randint(b, a=0): 34 | return random.randint(a,b) 35 | 36 | def uniq(seq, idfun=None): 37 | # order preserving 38 | if idfun is None: 39 | def idfun(x): return x 40 | seen = {} 41 | result = [] 42 | for item in seq: 43 | marker = idfun(item) 44 | # in old Python versions: 45 | # if seen.has_key(marker) 46 | # but in new ones: 47 | if marker in seen: continue 48 | seen[marker] = 1 49 | result.append(item) 50 | return result 51 | 52 | 53 | def sort_dict(myDict, byValue=False, reverse=False): 54 | if byValue: 55 | items = myDict.items() 56 | items.sort(key = operator.itemgetter(1), reverse=reverse) 57 | else: 58 | items = sorted(myDict.items()) 59 | return items 60 | 61 | def max_dict(myDict, byValue=False): 62 | if byValue: 63 | skey=lambda x:x[1] 64 | else: 65 | skey=lambda x:x[0] 66 | return max(myDict.items(), key=skey) 67 | 68 | 69 | def min_dict(myDict, byValue=False): 70 | if byValue: 71 | skey=lambda x:x[1] 72 | else: 73 | skey=lambda x:x[0] 74 | return min(myDict.items(), key=skey) 75 | 76 | def paragraphs(lines, is_separator=lambda x : x == '\n', joiner=''.join): 77 | paragraph = [] 78 | for line in lines: 79 | if is_separator(line): 80 | if paragraph: 81 | yield joiner(paragraph) 82 | paragraph = [] 83 | else: 84 | paragraph.append(line) 85 | if paragraph: 86 | yield joiner(paragraph) 87 | 88 | 89 | def isASCII(word): 90 | try: 91 | word = word.decode("ascii") 92 | return True 93 | except UnicodeEncodeError : 94 | return False 95 | except UnicodeDecodeError: 96 | return False 97 | 98 | 99 | def intersect(x, y): 100 | return [z for z in x if z in y] 101 | 102 | 103 | 104 | # Mapping Windows CP1252 Gremlins to Unicode 105 | # from http://effbot.org/zone/unicode-gremlins.htm 106 | cp1252 = { 107 | # from http://www.microsoft.com/typography/unicode/1252.htm 108 | u"\x80": u"\u20AC", # EURO SIGN 109 | u"\x82": u"\u201A", # SINGLE LOW-9 QUOTATION MARK 110 | u"\x83": u"\u0192", # LATIN SMALL LETTER F WITH HOOK 111 | u"\x84": u"\u201E", # DOUBLE LOW-9 QUOTATION MARK 112 | u"\x85": u"\u2026", # HORIZONTAL ELLIPSIS 113 | u"\x86": u"\u2020", # DAGGER 114 | u"\x87": u"\u2021", # DOUBLE DAGGER 115 | u"\x88": u"\u02C6", # MODIFIER 
LETTER CIRCUMFLEX ACCENT 116 | u"\x89": u"\u2030", # PER MILLE SIGN 117 | u"\x8A": u"\u0160", # LATIN CAPITAL LETTER S WITH CARON 118 | u"\x8B": u"\u2039", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK 119 | u"\x8C": u"\u0152", # LATIN CAPITAL LIGATURE OE 120 | u"\x8E": u"\u017D", # LATIN CAPITAL LETTER Z WITH CARON 121 | u"\x91": u"\u2018", # LEFT SINGLE QUOTATION MARK 122 | u"\x92": u"\u2019", # RIGHT SINGLE QUOTATION MARK 123 | u"\x93": u"\u201C", # LEFT DOUBLE QUOTATION MARK 124 | u"\x94": u"\u201D", # RIGHT DOUBLE QUOTATION MARK 125 | u"\x95": u"\u2022", # BULLET 126 | u"\x96": u"\u2013", # EN DASH 127 | u"\x97": u"\u2014", # EM DASH 128 | u"\x98": u"\u02DC", # SMALL TILDE 129 | u"\x99": u"\u2122", # TRADE MARK SIGN 130 | u"\x9A": u"\u0161", # LATIN SMALL LETTER S WITH CARON 131 | u"\x9B": u"\u203A", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 132 | u"\x9C": u"\u0153", # LATIN SMALL LIGATURE OE 133 | u"\x9E": u"\u017E", # LATIN SMALL LETTER Z WITH CARON 134 | u"\x9F": u"\u0178", # LATIN CAPITAL LETTER Y WITH DIAERESIS 135 | } 136 | 137 | def fix_cp1252codes(text): 138 | # map cp1252 gremlins to real unicode characters 139 | if re.search(u"[\x80-\x9f]", text): 140 | def fixup(m): 141 | s = m.group(0) 142 | return cp1252.get(s, s) 143 | if isinstance(text, type("")): 144 | # make sure we have a unicode string 145 | text = unicode(text, "iso-8859-1") 146 | text = re.sub(u"[\x80-\x9f]", fixup, text) 147 | return text 148 | 149 | def clean_utf8(text): 150 | return filter(lambda x : x > '\x1f' and x < '\x7f', text) 151 | 152 | def pairs(iterable, overlapping=False): 153 | iterator = iterable.__iter__() 154 | token = iterator.next() 155 | i = 0 156 | for lookahead in iterator: 157 | if overlapping or i % 2 == 0: 158 | yield (token, lookahead) 159 | token = lookahead 160 | i += 1 161 | if i % 2 == 0: 162 | yield (token, None) 163 | 164 | def frange(start, end=None, inc=None): 165 | "A range function, that does accept float increments..." 
166 | 167 | if end == None: 168 | end = start + 0.0 169 | start = 0.0 170 | 171 | if inc == None: 172 | inc = 1.0 173 | 174 | L = [] 175 | while 1: 176 | next = start + len(L) * inc 177 | if inc > 0 and next >= end: 178 | break 179 | elif inc < 0 and next <= end: 180 | break 181 | L.append(next) 182 | 183 | return L 184 | 185 | def softmax(values): 186 | a = max(values) 187 | Z = 0.0 188 | for v in values: 189 | Z += math.exp(v - a) 190 | sm = [math.exp(v-a) / Z for v in values] 191 | return sm 192 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/candidatesreader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | #import dlm.utils as U 3 | #import dlm.io.logging as L 4 | import codecs 5 | 6 | class NBestList(): 7 | def __init__(self, nbest_path, mode='r', reference_list=None): 8 | assert mode == 'r' or mode == 'w', "Invalid mode: " + mode 9 | self.mode = mode 10 | self.nbest_file = codecs.open(nbest_path, mode=mode, encoding='UTF-8') 11 | self.prev_index = -1 12 | self.curr_item = None 13 | self.curr_index = 0 14 | self.eof_flag = False 15 | self.ref_manager = None 16 | if reference_list: 17 | assert mode == 'r', "Cannot accept a reference_list in 'w' mode" 18 | self.ref_manager = RefernceManager(reference_list) 19 | 20 | 21 | def __iter__(self): 22 | assert self.mode == 'r', "Iteration can only be done in 'r' mode" 23 | return self 24 | 25 | def next_item(self): 26 | assert self.mode == 'r', "next() method can only be used in 'r' mode" 27 | try: 28 | segments = self.nbest_file.next().split("|||") 29 | except StopIteration: 30 | self.close() 31 | raise StopIteration 32 | try: 33 | index = int(segments[0]) 34 | except ValueError: 35 | print >> sys.stderr, "The first segment in an n-best list must be an integer" 36 | #L.error("The first segment in an n-best list must be an integer") 37 | hyp = segments[1].strip() 38 | features = segments[2].strip() 39 | score = None 40 | phrase_alignments = None 41 | word_alignments = None 42 | phrase_alignments = None 43 | if len(segments) > 3: 44 | score = segments[3].strip() 45 | if len(segments) > 4: 46 | phrase_alignments = segments[4].strip() 47 | if len(segments) > 5: 48 | word_alignments = segments[5].strip() 49 | return NBestItem(index, hyp, features, score, phrase_alignments, word_alignments) 50 | 51 | def next(self): # Returns a group of NBestItems with the same index 52 | if self.eof_flag == True: 53 | raise StopIteration 54 | assert self.mode == 'r', "next_group() method can only be used in 'r' mode" 55 | group = NBestGroup(self.ref_manager) 56 | group.add(self.curr_item) # add the item that was read in the last next() call 57 | try: 58 | self.curr_item = self.next_item() 59 | except StopIteration: 60 | self.eof_flag = True 61 | return group 62 | if self.curr_index != self.curr_item.index: 63 | self.curr_index = self.curr_item.index 64 | return group 65 | while self.curr_index == self.curr_item.index: 66 | group.add(self.curr_item) 67 | try: 68 | self.curr_item = self.next_item() 69 | except StopIteration: 70 | self.eof_flag = True 71 | return group 72 | self.curr_index = self.curr_item.index 73 | return group 74 | 75 | def write(self, item): 76 | assert self.mode == 'w', "write() method can only be used in 'w' mode" 77 | self.nbest_file.write(unicode(item) + "\n") 78 | 79 | def close(self): 80 | self.nbest_file.close() 81 | 82 | 83 | 84 | class NBestItem: 85 | def __init__(self, index, hyp, features, score, 
phrase_alignments, word_alignments): 86 | self.index = index 87 | self.hyp = hyp 88 | self.features = features 89 | self.score = score 90 | self.phrase_alignments = phrase_alignments 91 | self.word_alignments = word_alignments 92 | 93 | def __unicode__(self): 94 | output = ' ||| '.join([unicode(self.index), self.hyp, self.features]) 95 | if self.score: 96 | output = output + ' ||| ' + self.score 97 | if self.phrase_alignments: 98 | output = output + ' ||| ' + self.phrase_alignments 99 | if self.word_alignments: 100 | output = output + ' ||| ' + self.word_alignments 101 | return output 102 | 103 | def append_feature(self, feature_name, feature_value): 104 | self.features += ' ' + str(feature_name) + '= ' + str(feature_value) + ' ' 105 | 106 | 107 | class NBestGroup: 108 | def __init__(self, refrence_manager=None): 109 | self.group_index = -1 110 | self.group = [] 111 | self.ref_manager = refrence_manager 112 | 113 | def __unicode__(self): 114 | return '\n'.join([unicode(item) for item in self.group]) 115 | 116 | def __iter__(self): 117 | self.item_index = 0 118 | return self 119 | 120 | def __getitem__(self, index): 121 | return self.group[index] 122 | 123 | def add(self, item): 124 | if item is None: 125 | return 126 | if self.group_index == -1: 127 | self.group_index = item.index 128 | if self.ref_manager: 129 | self.refs = self.ref_manager.get_all_refs(self.group_index) 130 | else: 131 | assert item.index == self.group_index, "Cannot add an nbest item with an incompatible index" 132 | self.group.append(item) 133 | 134 | def next(self): 135 | #if self.item_index < len(self.group): 136 | try: 137 | item = self.group[self.item_index] 138 | self.item_index += 1 139 | return item 140 | #else: 141 | except IndexError: 142 | raise StopIteration 143 | 144 | def size(self): 145 | return len(self.group) 146 | 147 | def append_features(self, features_list): 148 | assert len(features_list) == len(self.group), 'Number of features and number of items in this group do not match' 149 | for i in range(len(self.group)): 150 | self.group[i].append_feature(features_list[i]) 151 | 152 | 153 | 154 | class RefernceManager: 155 | def __init__(self, paths_list): 156 | assert type(paths_list) is list, "The input to a RefernceManager class must be a list" 157 | self.ref_list = [] 158 | self.num_lines = -1 159 | self.num_refs = 0 160 | for path in paths_list: 161 | with codecs.open(path, mode='r', encoding='UTF-8') as f: 162 | self.num_refs += 1 163 | sentences = f.readlines() 164 | if self.num_lines == -1: 165 | self.num_lines = len(sentences) 166 | else: 167 | assert self.num_lines == len(sentences), "Reference files must have the same number of lines" 168 | self.ref_list.append(sentences) 169 | 170 | def get_all_refs(self, index): 171 | assert index < self.num_lines, "Index out of bound" 172 | return [self.ref_list[k][index] for k in range(self.num_refs)] 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/Tokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: iso-8859-15 -*- 3 | 4 | # This file is part of the NUS M2 scorer. 
5 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | 10 | # The NUS M2 scorer is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | 18 | # file: Tokenizer.py 19 | # 20 | # A Penn Treebank tokenizer reimplemented based on the MOSES implementation. 21 | # 22 | # usage : %prog < input > output 23 | 24 | 25 | import re 26 | import sys 27 | 28 | 29 | class DummyTokenizer(object): 30 | 31 | def tokenize(self, text): 32 | return text.split() 33 | 34 | 35 | 36 | class PTBTokenizer(object): 37 | 38 | def __init__(self, language="en"): 39 | self.language = language 40 | self.nonbreaking_prefixes = {} 41 | self.nonbreaking_prefixes_numeric = {} 42 | self.nonbreaking_prefixes["en"] = ''' A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 43 | Adj Adm Adv Asst Bart Bldg Brig Bros Capt Cmdr Col Comdr Con Corp Cpl DR Dr Drs Ens 44 | Gen Gov Hon Hr Hosp Insp Lt MM MR MRS MS Maj Messrs Mlle Mme Mr Mrs Ms Msgr Op Ord 45 | Pfc Ph Prof Pvt Rep Reps Res Rev Rt Sen Sens Sfc Sgt Sr St Supt Surg 46 | v vs i.e rev e.g Nos Nr'''.split() 47 | self.nonbreaking_prefixes_numeric["en"] = '''No Art pp'''.split() 48 | self.special_chars = re.compile(r"([^\w\s\.\'\`\,\-\"\|\/])", flags=re.UNICODE) 49 | 50 | def tokenize(self, text, ptb=False): 51 | text = text.strip() 52 | text = " " + text + " " 53 | 54 | # Separate all "other" punctuation 55 | 56 | text = re.sub(self.special_chars, r' \1 ', text) 57 | text = re.sub(r";", r' ; ', text) 58 | text = re.sub(r":", r' : ', text) 59 | 60 | # replace the pipe character 61 | text = re.sub(r"\|", r' -PIPE- ', text) 62 | 63 | # split internal slash, keep others 64 | text = re.sub(r"(\S)/(\S)", r'\1 / \2', text) 65 | 66 | # PTB tokenization 67 | if ptb: 68 | text = re.sub(r"\(", r' -LRB- ', text) 69 | text = re.sub(r"\)", r' -RRB- ', text) 70 | text = re.sub(r"\[", r' -LSB- ', text) 71 | text = re.sub(r"\]", r' -RSB- ', text) 72 | text = re.sub(r"\{", r' -LCB- ', text) 73 | text = re.sub(r"\}", r' -RCB- ', text) 74 | 75 | text = re.sub(r"\"\s*$", r" '' ", text) 76 | text = re.sub(r"^\s*\"", r' `` ', text) 77 | text = re.sub(r"(\S)\"\s", r"\1 '' ", text) 78 | text = re.sub(r"\s\"(\S)", r" `` \1", text) 79 | text = re.sub(r"(\S)\"", r"\1 '' ", text) 80 | text = re.sub(r"\"(\S)", r" `` \1", text) 81 | text = re.sub(r"'\s*$", r" ' ", text) 82 | text = re.sub(r"^\s*'", r" ` ", text) 83 | text = re.sub(r"(\S)'\s", r"\1 ' ", text) 84 | text = re.sub(r"\s'(\S)", r" ` \1", text) 85 | 86 | text = re.sub(r"'ll", r" -CONTRACT-ll", text) 87 | text = re.sub(r"'re", r" -CONTRACT-re", text) 88 | text = re.sub(r"'ve", r" -CONTRACT-ve", text) 89 | text = re.sub(r"n't", r" n-CONTRACT-t", text) 90 | text = re.sub(r"'LL", r" -CONTRACT-LL", text) 91 | text = re.sub(r"'RE", r" -CONTRACT-RE", text) 92 | text = re.sub(r"'VE", r" -CONTRACT-VE", text) 93 | text = re.sub(r"N'T", r" N-CONTRACT-T", text) 94 | text = re.sub(r"cannot", r"can not", text) 95 | text = re.sub(r"Cannot", r"Can not", text) 96 | 97 | # multidots stay together 98 | text = 
re.sub(r"\.([\.]+)", r" DOTMULTI\1", text) 99 | while re.search("DOTMULTI\.", text): 100 | text = re.sub(r"DOTMULTI\.([^\.])", r"DOTDOTMULTI \1", text) 101 | text = re.sub(r"DOTMULTI\.", r"DOTDOTMULTI", text) 102 | 103 | # multidashes stay together 104 | text = re.sub(r"\-([\-]+)", r" DASHMULTI\1", text) 105 | while re.search("DASHMULTI\-", text): 106 | text = re.sub(r"DASHMULTI\-([^\-])", r"DASHDASHMULTI \1", text) 107 | text = re.sub(r"DASHMULTI\-", r"DASHDASHMULTI", text) 108 | 109 | # Separate ',' except if within number. 110 | text = re.sub(r"(\D),(\D)", r'\1 , \2', text) 111 | # Separate ',' pre and post number. 112 | text = re.sub(r"(\d),(\D)", r'\1 , \2', text) 113 | text = re.sub(r"(\D),(\d)", r'\1 , \2', text) 114 | 115 | if self.language == "en": 116 | text = re.sub(r"([^a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 117 | text = re.sub(r"(\W)'([a-zA-Z])", r"\1 ' \2", text) 118 | text = re.sub(r"([a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 119 | text = re.sub(r"([a-zA-Z])'([a-zA-Z])", r"\1 '\2", text) 120 | text = re.sub(r"(\d)'(s)", r"\1 '\2", text) 121 | text = re.sub(r" '\s+s ", r" 's ", text) 122 | text = re.sub(r" '\s+s ", r" 's ", text) 123 | elif self.language == "fr": 124 | text = re.sub(r"([^a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 125 | text = re.sub(r"([^a-zA-Z])'([a-zA-Z])", r"\1 ' \2", text) 126 | text = re.sub(r"([a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 127 | text = re.sub(r"([a-zA-Z])'([a-zA-Z])", r"\1' \2", text) 128 | else: 129 | text = re.sub(r"'", r" ' ") 130 | 131 | # re-combine single quotes 132 | text = re.sub(r"' '", r"''", text) 133 | 134 | words = text.split() 135 | text = '' 136 | for i, word in enumerate(words): 137 | m = re.match("^(\S+)\.$", word) 138 | if m: 139 | pre = m.group(1) 140 | if ((re.search("\.", pre) and re.search("[a-zA-Z]", pre)) or \ 141 | (pre in self.nonbreaking_prefixes[self.language]) or \ 142 | ((i < len(words)-1) and re.match("^\d+", words[i+1]))): 143 | pass # do nothing 144 | elif ((pre in self.nonbreaking_prefixes_numeric[self.language] ) and \ 145 | (i < len(words)-1) and re.match("\d+", words[i+1])): 146 | pass # do nothing 147 | else: 148 | word = pre + " ." 149 | 150 | text += word + " " 151 | text = re.sub(r"'\s+'", r"''", text) 152 | 153 | # restore multidots 154 | while re.search("DOTDOTMULTI", text): 155 | text = re.sub(r"DOTDOTMULTI", r"DOTMULTI.", text) 156 | text = re.sub(r"DOTMULTI", r".", text) 157 | 158 | # restore multidashes 159 | while re.search("DASHDASHMULTI", text): 160 | text = re.sub(r"DASHDASHMULTI", r"DASHMULTI-", text) 161 | text = re.sub(r"DASHMULTI", r"-", text) 162 | text = re.sub(r"-CONTRACT-", r"'", text) 163 | 164 | return text.split() 165 | 166 | 167 | def tokenize_all(self,sentences, ptb=False): 168 | return [self.tokenize(t, ptb) for t in sentences] 169 | 170 | # starting point 171 | if __name__ == "__main__": 172 | tokenizer = PTBTokenizer() 173 | for line in sys.stdin: 174 | line = line.decode("utf8") 175 | tokens = tokenizer.tokenize(line.strip()) 176 | out = ' '.join(tokens) 177 | print out.encode("utf8") 178 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/generate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. 
An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import sys 10 | import torch 11 | from torch.autograd import Variable 12 | 13 | from fairseq import bleu, options, utils, tokenizer 14 | from fairseq.meters import StopwatchMeter, TimeMeter 15 | from fairseq.progress_bar import progress_bar 16 | from fairseq.sequence_generator import SequenceGenerator 17 | 18 | 19 | def main(): 20 | parser = options.get_parser('Generation') 21 | parser.add_argument('--path', metavar='FILE', required=True, action='append', 22 | help='path(s) to model file(s)') 23 | dataset_args = options.add_dataset_args(parser) 24 | dataset_args.add_argument('-i', '--interactive', action='store_true', 25 | help='generate translations in interactive mode') 26 | dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N', 27 | help='batch size') 28 | dataset_args.add_argument('--gen-subset', default='test', metavar='SPLIT', 29 | help='data subset to generate (train, valid, test)') 30 | options.add_generation_args(parser) 31 | 32 | args = parser.parse_args() 33 | print(args) 34 | 35 | if args.no_progress_bar: 36 | progress_bar.enabled = False 37 | use_cuda = torch.cuda.is_available() and not args.cpu 38 | 39 | # Load model and dataset 40 | print('| loading model(s) from {}'.format(', '.join(args.path))) 41 | models, dataset = utils.load_ensemble_for_inference(args.path, args.data) 42 | 43 | print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict))) 44 | print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict))) 45 | if not args.interactive: 46 | print('| {} {} {} examples'.format(args.data, args.gen_subset, len(dataset.splits[args.gen_subset]))) 47 | 48 | # Optimize model for generation 49 | for model in models: 50 | model.make_generation_fast_(args.beam, not args.no_beamable_mm) 51 | 52 | # Initialize generator 53 | translator = SequenceGenerator(models, dataset.dst_dict, beam_size=args.beam, 54 | stop_early=(not args.no_early_stop), 55 | normalize_scores=(not args.unnormalized), 56 | len_penalty=args.lenpen) 57 | align_dict = {} 58 | if args.unk_replace_dict != '': 59 | assert args.interactive, "Unknown word replacement requires access to the original source and is only " \ 60 | "supported in interactive mode" 61 | with open(args.unk_replace_dict, 'r') as f: 62 | for line in f: 63 | l = line.split() 64 | align_dict[l[0]] = l[1] 65 | 66 | def replace_unk(hypo_str, align_str, src, unk): 67 | hypo_tokens = hypo_str.split() 68 | src_tokens = tokenizer.tokenize_line(src) 69 | align_idx = [int(i) for i in align_str.split()] 70 | for i, ht in enumerate(hypo_tokens): 71 | if ht == unk: 72 | src_token = src_tokens[align_idx[i]] 73 | if src_token in align_dict: 74 | hypo_tokens[i] = align_dict[src_token] 75 | else: 76 | hypo_tokens[i] = src_token 77 | return ' '.join(hypo_tokens) 78 | 79 | if use_cuda: 80 | translator.cuda() 81 | 82 | bpe_symbol = '@@ ' if args.remove_bpe else None 83 | def display_hypotheses(id, src, orig, ref, hypos): 84 | id_str = '' if id is None else '-{}'.format(id) 85 | src_str = to_sentence(dataset.src_dict, src, bpe_symbol) 86 | print('S{}\t{}'.format(id_str, src_str)) 87 | if orig is not None: 88 | print('O{}\t{}'.format(id_str, orig.strip())) 89 | if ref is not None: 90 | print('T{}\t{}'.format(id_str, to_sentence(dataset.dst_dict, ref, bpe_symbol, ref_unk=True))) 91 | for hypo in hypos: 92 | hypo_str = to_sentence(dataset.dst_dict, hypo['tokens'], bpe_symbol) 93 | align_str = ' '.join(map(str,
hypo['alignment'])) 94 | if args.unk_replace_dict != '': 95 | hypo_str = replace_unk(hypo_str, align_str, orig, unk_symbol(dataset.dst_dict)) 96 | print('H{}\t{}\t{}'.format( 97 | id_str, hypo['score'], hypo_str)) 98 | print('A{}\t{}'.format(id_str, align_str)) 99 | 100 | if args.interactive: 101 | for line in sys.stdin: 102 | tokens = tokenizer.Tokenizer.tokenize(line, dataset.src_dict, add_if_not_exist=False).long() 103 | start = dataset.src_dict.pad() + 1 104 | positions = torch.arange(start, start + len(tokens)).type_as(tokens) 105 | if use_cuda: 106 | positions = positions.cuda() 107 | tokens = tokens.cuda() 108 | translations = translator.generate(Variable(tokens.view(1, -1)), Variable(positions.view(1, -1))) 109 | hypos = translations[0] 110 | display_hypotheses(None, tokens, line, None, hypos[:min(len(hypos), args.nbest)]) 111 | 112 | else: 113 | def maybe_remove_bpe(tokens): 114 | """Helper for removing BPE symbols from a hypothesis.""" 115 | if not args.remove_bpe: 116 | return tokens 117 | assert (tokens == dataset.dst_dict.pad()).sum() == 0 118 | hypo_minus_bpe = to_sentence(dataset.dst_dict, tokens, bpe_symbol) 119 | return tokenizer.Tokenizer.tokenize(hypo_minus_bpe, dataset.dst_dict, add_if_not_exist=True) 120 | 121 | # Generate and compute BLEU score 122 | scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos(), dataset.dst_dict.unk()) 123 | itr = dataset.dataloader(args.gen_subset, batch_size=args.batch_size, max_positions=args.max_positions) 124 | num_sentences = 0 125 | with progress_bar(itr, smoothing=0, leave=False) as t: 126 | wps_meter = TimeMeter() 127 | gen_timer = StopwatchMeter() 128 | translations = translator.generate_batched_itr( 129 | t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b, 130 | cuda_device=0 if use_cuda else None, timer=gen_timer) 131 | for id, src, ref, hypos in translations: 132 | ref = ref.int().cpu() 133 | top_hypo = hypos[0]['tokens'].int().cpu() 134 | scorer.add(maybe_remove_bpe(ref), maybe_remove_bpe(top_hypo)) 135 | display_hypotheses(id, src, None, ref, hypos[:min(len(hypos), args.nbest)]) 136 | 137 | wps_meter.update(src.size(0)) 138 | t.set_postfix(wps='{:5d}'.format(round(wps_meter.avg))) 139 | num_sentences += 1 140 | 141 | print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.format( 142 | num_sentences, gen_timer.n, gen_timer.sum, 1. 
/ gen_timer.avg)) 143 | print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string())) 144 | 145 | 146 | def to_token(dict, i, runk): 147 | return runk if i == dict.unk() else dict[i] 148 | 149 | 150 | def unk_symbol(dict, ref_unk=False): 151 | return '<{}>'.format(dict.unk_word) if ref_unk else dict.unk_word 152 | 153 | 154 | def to_sentence(dict, tokens, bpe_symbol=None, ref_unk=False): 155 | if torch.is_tensor(tokens) and tokens.dim() == 2: 156 | sentences = [to_sentence(dict, token) for token in tokens] 157 | return '\n'.join(sentences) 158 | eos = dict.eos() 159 | runk = unk_symbol(dict, ref_unk=ref_unk) 160 | sent = ' '.join([to_token(dict, i, runk) for i in tokens if i != eos]) 161 | if bpe_symbol is not None: 162 | sent = sent.replace(bpe_symbol, '') 163 | return sent 164 | 165 | 166 | if __name__ == '__main__': 167 | main() 168 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/learn_bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary 7 | of a text to a configurable number of symbols, with only a small increase in the number of tokens. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 12 | """ 13 | 14 | from __future__ import unicode_literals 15 | 16 | import sys 17 | import codecs 18 | import re 19 | import copy 20 | import argparse 21 | from collections import defaultdict, Counter 22 | 23 | # hack for python2/3 compatibility 24 | from io import open 25 | argparse.open = open 26 | 27 | # python 2/3 compatibility 28 | if sys.version_info < (3, 0): 29 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 30 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 31 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 32 | 33 | def create_parser(): 34 | parser = argparse.ArgumentParser( 35 | formatter_class=argparse.RawDescriptionHelpFormatter, 36 | description="learn BPE-based word segmentation") 37 | 38 | parser.add_argument( 39 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 40 | metavar='PATH', 41 | help="Input text (default: standard input).") 42 | parser.add_argument( 43 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 44 | metavar='PATH', 45 | help="Output file for BPE codes (default: standard output)") 46 | parser.add_argument( 47 | '--symbols', '-s', type=int, default=10000, 48 | help="Create this many new symbols (each representing a character n-gram) (default: %(default)s))") 49 | parser.add_argument( 50 | '--verbose', '-v', action="store_true", 51 | help="verbose mode.") 52 | 53 | return parser 54 | 55 | def get_vocabulary(fobj): 56 | """Read text and return dictionary that encodes vocabulary 57 | """ 58 | vocab = Counter() 59 | for line in fobj: 60 | for word in line.split(): 61 | vocab[word] += 1 62 | return vocab 63 | 64 | def update_pair_statistics(pair, changed, stats, indices): 65 | """Minimally update the indices and frequency of symbol pairs 66 | 67 | if we merge a pair of symbols, only pairs that 
overlap with occurrences 68 | of this pair are affected, and need to be updated. 69 | """ 70 | stats[pair] = 0 71 | indices[pair] = defaultdict(int) 72 | first, second = pair 73 | new_pair = first+second 74 | for j, word, old_word, freq in changed: 75 | 76 | # find all instances of pair, and update frequency/indices around it 77 | i = 0 78 | while True: 79 | try: 80 | i = old_word.index(first, i) 81 | except ValueError: 82 | break 83 | if i < len(old_word)-1 and old_word[i+1] == second: 84 | if i: 85 | prev = old_word[i-1:i+1] 86 | stats[prev] -= freq 87 | indices[prev][j] -= 1 88 | if i < len(old_word)-2: 89 | # don't double-count consecutive pairs 90 | if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second: 91 | nex = old_word[i+1:i+3] 92 | stats[nex] -= freq 93 | indices[nex][j] -= 1 94 | i += 2 95 | else: 96 | i += 1 97 | 98 | i = 0 99 | while True: 100 | try: 101 | i = word.index(new_pair, i) 102 | except ValueError: 103 | break 104 | if i: 105 | prev = word[i-1:i+1] 106 | stats[prev] += freq 107 | indices[prev][j] += 1 108 | # don't double-count consecutive pairs 109 | if i < len(word)-1 and word[i+1] != new_pair: 110 | nex = word[i:i+2] 111 | stats[nex] += freq 112 | indices[nex][j] += 1 113 | i += 1 114 | 115 | 116 | def get_pair_statistics(vocab): 117 | """Count frequency of all symbol pairs, and create index""" 118 | 119 | # data structure of pair frequencies 120 | stats = defaultdict(int) 121 | 122 | #index from pairs to words 123 | indices = defaultdict(lambda: defaultdict(int)) 124 | 125 | for i, (word, freq) in enumerate(vocab): 126 | prev_char = word[0] 127 | for char in word[1:]: 128 | stats[prev_char, char] += freq 129 | indices[prev_char, char][i] += 1 130 | prev_char = char 131 | 132 | return stats, indices 133 | 134 | 135 | def replace_pair(pair, vocab, indices): 136 | """Replace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'""" 137 | first, second = pair 138 | pair_str = ''.join(pair) 139 | pair_str = pair_str.replace('\\','\\\\') 140 | changes = [] 141 | pattern = re.compile(r'(?<!\S)' + re.escape(first + ' ' + second) + r'(?!\S)') 142 | if sys.version_info < (3, 0): 143 | iterator = indices[pair].iteritems() 144 | else: 145 | iterator = indices[pair].items() 146 | for j, freq in iterator: 147 | if freq < 1: 148 | continue 149 | word, freq = vocab[j] 150 | new_word = ' '.join(word) 151 | new_word = pattern.sub(pair_str, new_word) 152 | new_word = tuple(new_word.split()) 153 | 154 | vocab[j] = (new_word, freq) 155 | changes.append((j, new_word, word, freq)) 156 | 157 | return changes 158 | 159 | def prune_stats(stats, big_stats, threshold): 160 | """Prune statistics dict for efficiency of max() 161 | 162 | The frequency of a symbol pair never increases, so pruning is generally safe 163 | (until we the most frequent pair is less frequent than a pair we previously pruned) 164 | big_stats keeps full statistics for when we need to access pruned items 165 | """ 166 | for item,freq in list(stats.items()): 167 | if freq < threshold: 168 | del stats[item] 169 | if freq < 0: 170 | big_stats[item] += freq 171 | else: 172 | big_stats[item] = freq 173 | 174 | if __name__ == '__main__': 175 | 176 | parser = create_parser() 177 | args = parser.parse_args() 178 | 179 | vocab = get_vocabulary(args.input) 180 | vocab = dict([(tuple(x)+('</w>',) ,y) for (x,y) in vocab.items()]) 181 | sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True) 182 | 183 | stats, indices = get_pair_statistics(sorted_vocab) 184 | big_stats = 
copy.deepcopy(stats) 185 | # threshold is inspired by Zipfian assumption, but should only affect speed 186 | threshold = max(stats.values()) / 10 187 | for i in range(args.symbols): 188 | if stats: 189 | most_frequent = max(stats, key=stats.get) 190 | 191 | # we probably missed the best pair because of pruning; go back to full statistics 192 | if not stats or (i and stats[most_frequent] < threshold): 193 | prune_stats(stats, big_stats, threshold) 194 | stats = copy.deepcopy(big_stats) 195 | most_frequent = max(stats, key=stats.get) 196 | # threshold is inspired by Zipfian assumption, but should only affect speed 197 | threshold = stats[most_frequent] * i/(i+10000.0) 198 | prune_stats(stats, big_stats, threshold) 199 | 200 | if stats[most_frequent] < 2: 201 | sys.stderr.write('no pair has frequency > 1. Stopping\n') 202 | break 203 | 204 | if args.verbose: 205 | sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent])) 206 | args.output.write('{0} {1}\n'.format(*most_frequent)) 207 | changes = replace_pair(most_frequent, sorted_vocab, indices) 208 | update_pair_statistics(most_frequent, changes, stats, indices) 209 | stats[most_frequent] = 0 210 | if not i % 100: 211 | prune_stats(stats, big_stats, threshold) 212 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/options.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import argparse 10 | 11 | from fairseq import models 12 | 13 | 14 | def get_parser(desc): 15 | parser = argparse.ArgumentParser( 16 | description='Facebook AI Research Sequence-to-Sequence Toolkit -- ' + desc) 17 | parser.add_argument('--no-progress-bar', action='store_true', help='disable progress bar') 18 | parser.add_argument('--log-interval', type=int, default=1000, metavar='N', 19 | help='log progress every N updates (when progress bar is disabled)') 20 | parser.add_argument('--seed', default=1, type=int, metavar='N', 21 | help='pseudo random number generator seed') 22 | return parser 23 | 24 | 25 | def add_dataset_args(parser): 26 | group = parser.add_argument_group('Dataset and data loading') 27 | group.add_argument('data', metavar='DIR', 28 | help='path to data directory') 29 | group.add_argument('-s', '--source-lang', default=None, metavar='SRC', 30 | help='source language') 31 | group.add_argument('-t', '--target-lang', default=None, metavar='TARGET', 32 | help='target language') 33 | group.add_argument('-j', '--workers', default=1, type=int, metavar='N', 34 | help='number of data loading workers (default: 1)') 35 | group.add_argument('--max-positions', default=1024, type=int, metavar='N', 36 | help='max number of tokens in the sequence') 37 | return group 38 | 39 | 40 | def add_optimization_args(parser): 41 | group = parser.add_argument_group('Optimization') 42 | group.add_argument('--lr', '--learning-rate', default=0.25, type=float, metavar='LR', 43 | help='initial learning rate') 44 | group.add_argument('--min-lr', metavar='LR', default=1e-5, type=float, 45 | help='minimum learning rate') 46 | group.add_argument('--force-anneal', '--fa', default=0, type=int, metavar='N', 47 | help='force annealing at specified epoch') 48 | group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N', 49 | help='force stop training at specified epoch') 50 | group.add_argument('--lrshrink', default=0.1, type=float, metavar='LS', 51 | help='learning rate shrink factor for annealing, lr_new = (lr * lrshrink)') 52 | group.add_argument('--momentum', default=0.99, type=float, metavar='M', 53 | help='momentum factor') 54 | group.add_argument('--clip-norm', default=25, type=float, metavar='NORM', 55 | help='clip threshold of gradients') 56 | group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', 57 | help='weight decay') 58 | group.add_argument('--sample-without-replacement', default=0, type=int, metavar='N', 59 | help='If bigger than 0, use that number of mini-batches for each epoch,' 60 | ' where each sample is drawn randomly with replacement from the' 61 | ' dataset') 62 | return group 63 | 64 | 65 | def add_checkpoint_args(parser): 66 | group = parser.add_argument_group('Checkpointing') 67 | group.add_argument('--save-dir', metavar='DIR', default='checkpoints', 68 | help='path to save checkpoints') 69 | group.add_argument('--restore-file', default='checkpoint_last.pt', 70 | help='filename in save-dir from which to load checkpoint') 71 | group.add_argument('--save-interval', type=int, default=-1, 72 | help='checkpoint every this many batches') 73 | group.add_argument('--no-save', action='store_true', 74 | help='don\'t save models and checkpoints') 75 | group.add_argument('--no-epoch-checkpoints', action='store_true', 76 | help='only store last and best checkpoints') 77 | return group 78 | 79 | 80 | def add_generation_args(parser): 81 | group = parser.add_argument_group('Generation') 82 | group.add_argument('--beam', default=5, type=int, 
metavar='N', 83 | help='beam size') 84 | group.add_argument('--nbest', default=1, type=int, metavar='N', 85 | help='number of hypotheses to output') 86 | group.add_argument('--max-len-a', default=0, type=int, metavar='N', 87 | help=('generate sequence of maximum length ax + b, ' 88 | 'where x is the source length')) 89 | group.add_argument('--max-len-b', default=200, type=int, metavar='N', 90 | help=('generate sequence of maximum length ax + b, ' 91 | 'where x is the source length')) 92 | group.add_argument('--remove-bpe', action='store_true', 93 | help='remove BPE tokens before scoring') 94 | group.add_argument('--no-early-stop', action='store_true', 95 | help=('continue searching even after finalizing k=beam ' 96 | 'hypotheses; this is more correct, but increases ' 97 | 'generation time by 50%%')) 98 | group.add_argument('--unnormalized', action='store_true', 99 | help='compare unnormalized hypothesis scores') 100 | group.add_argument('--cpu', action='store_true', help='generate on CPU') 101 | group.add_argument('--no-beamable-mm', action='store_true', 102 | help='don\'t use BeamableMM in attention layers') 103 | group.add_argument('--lenpen', default=1, type=float, 104 | help='length penalty: <1.0 favors shorter, >1.0 favors longer sentences') 105 | group.add_argument('--unk-replace-dict', default='', type=str, 106 | help='performs unk word replacement') 107 | 108 | return group 109 | 110 | 111 | def add_model_args(parser): 112 | group = parser.add_argument_group( 113 | 'Model configuration', 114 | # Only include attributes which are explicitly given as command-line 115 | # arguments or which have model-independent default values. 116 | argument_default=argparse.SUPPRESS, 117 | ) 118 | 119 | # The model architecture can be specified in several ways. 120 | # In increasing order of priority: 121 | # 1) model defaults (lowest priority) 122 | # 2) --arch argument 123 | # 3) --encoder/decoder-* arguments (highest priority) 124 | # Note: --arch cannot be combined with --encoder/decoder-* arguments. 
125 | group.add_argument('--arch', '-a', default='fconv', metavar='ARCH', choices=models.arch_model_map.keys(), 126 | help='model architecture ({})'.format(', '.join(models.arch_model_map.keys()))) 127 | group.add_argument('--encoder-embed-dim', type=int, metavar='N', 128 | help='encoder embedding dimension') 129 | group.add_argument('--encoder-layers', type=str, metavar='EXPR', 130 | help='encoder layers [(dim, kernel_size), ...]') 131 | group.add_argument('--decoder-embed-dim', type=int, metavar='N', 132 | help='decoder embedding dimension') 133 | group.add_argument('--decoder-layers', type=str, metavar='EXPR', 134 | help='decoder layers [(dim, kernel_size), ...]') 135 | group.add_argument('--decoder-out-embed-dim', type=int, metavar='N', 136 | help='decoder output embedding dimension') 137 | group.add_argument('--decoder-attention', type=str, metavar='EXPR', 138 | help='decoder attention [True, ...]') 139 | group.add_argument('--encoder-embed-path', default=None, type=str, metavar='STR', 140 | help='path to pre-trained encoder embeddings') 141 | group.add_argument('--decoder-embed-path', default=None, type=str, metavar='STR', 142 | help='path to pre-trained decoder embeddings') 143 | # These arguments have default values independent of the model: 144 | group.add_argument('--dropout', default=0.1, type=float, metavar='D', 145 | help='dropout probability') 146 | group.add_argument('--label-smoothing', default=0, type=float, metavar='D', 147 | help='epsilon for label smoothing, 0 means no label smoothing') 148 | return group 149 | --------------------------------------------------------------------------------
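The comment block in `add_model_args` above spells out the precedence between model-specific defaults, the `--arch` preset, and explicit `--encoder-*`/`--decoder-*` flags. Below is a minimal sketch of how these option groups compose into a single parser, mirroring the pattern used in `generate.py`; it assumes the fairseq-py package in this repository (and its torch dependency) is importable, and the `'Training'` description plus the example argv values are illustrative assumptions, not commands taken from this repository.

```python
# Sketch only: composing the argument groups defined in fairseq/options.py,
# in the same style as generate.py. The argv values below are hypothetical.
from fairseq import options

parser = options.get_parser('Training')  # shared flags: --no-progress-bar, --log-interval, --seed
options.add_dataset_args(parser)         # positional data dir, -s/-t, -j, --max-positions
options.add_optimization_args(parser)    # --lr, --momentum, --clip-norm, --weight-decay, ...
options.add_checkpoint_args(parser)      # --save-dir, --restore-file, --save-interval, ...
options.add_model_args(parser)           # --arch preset plus per-component overrides

args = parser.parse_args([
    'data-bin',                    # preprocessed data directory (positional 'data' argument)
    '-s', 'src', '-t', 'trg',      # language suffixes
    '--arch', 'fconv',             # architecture preset (middle priority)
    '--encoder-embed-dim', '500',  # explicit flag: highest priority
])

# add_model_args() sets argument_default=argparse.SUPPRESS, so any --encoder-*/
# --decoder-* flag that was not passed is simply absent from `args`; the model
# defaults and the --arch preset fill those in later, which is how the priority
# order described in the comment above is realized.
print(hasattr(args, 'decoder_embed_dim'))  # False: left to model/--arch defaults
print(args.encoder_embed_dim)              # 500: explicit command-line value
```

The same composition appears in `generate.py` earlier in this listing, which pairs `get_parser('Generation')` with `add_dataset_args` and `add_generation_args` instead.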