├── CS2S+BPE+Emb ├── software │ ├── nbest-reranker │ │ ├── lib │ │ │ ├── __init__.py │ │ │ ├── m2scorer │ │ │ │ ├── __init__.py │ │ │ │ ├── token_offsets.py │ │ │ │ ├── nuclesgmlparser.py │ │ │ │ ├── combiner.py │ │ │ │ ├── m2scorer.py │ │ │ │ ├── util.py │ │ │ │ └── Tokenizer.py │ │ │ └── kenlm_python │ │ │ │ ├── __init__.py │ │ │ │ ├── example.py │ │ │ │ └── _kenlm.pxd │ │ ├── .gitignore │ │ ├── README.md │ │ ├── configreader.py │ │ ├── augmenter.py │ │ ├── rerank.py │ │ ├── train.py │ │ ├── log_utils.py │ │ └── candidatesreader.py │ ├── fairseq-py │ │ ├── requirements.txt │ │ ├── fairseq.gif │ │ ├── fairseq │ │ │ ├── __init__.py │ │ │ ├── temporal_convolution_tbc │ │ │ │ └── __init__.py │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ ├── beamable_mm.py │ │ │ │ ├── linearized_convolution.py │ │ │ │ └── conv_tbc.py │ │ │ ├── criterions │ │ │ │ ├── __init__.py │ │ │ │ ├── cross_entropy.py │ │ │ │ ├── fairseq_criterion.py │ │ │ │ └── label_smoothed_cross_entropy.py │ │ │ ├── clib │ │ │ │ ├── temporal_convolution_tbc │ │ │ │ │ ├── temporal_convolution_tbc.h │ │ │ │ │ └── temporal_convolution_tbc.cpp │ │ │ │ └── libbleu │ │ │ │ │ ├── module.cpp │ │ │ │ │ └── libbleu.cpp │ │ │ ├── models │ │ │ │ └── __init__.py │ │ │ ├── multiprocessing_pdb.py │ │ │ ├── progress_bar.py │ │ │ ├── nag.py │ │ │ ├── meters.py │ │ │ ├── tokenizer.py │ │ │ ├── bleu.py │ │ │ ├── dictionary.py │ │ │ ├── nccl.py │ │ │ ├── indexed_dataset.py │ │ │ ├── utils.py │ │ │ ├── multiprocessing_event_loop.py │ │ │ └── options.py │ │ ├── scripts │ │ │ ├── convert_dictionary.lua │ │ │ ├── convert_model.lua │ │ │ └── build_sym_alignment.py │ │ ├── CONTRIBUTING.md │ │ ├── tests │ │ │ └── test_label_smoothing.py │ │ ├── LICENSE │ │ ├── .gitignore │ │ ├── PATENTS │ │ ├── setup.py │ │ ├── score.py │ │ ├── data │ │ │ └── prepare-iwslt14.sh │ │ ├── preprocess.py │ │ └── generate.py │ ├── subword-nmt │ │ ├── get_vocab.py │ │ ├── LICENSE │ │ ├── README.md │ │ ├── bpe_toy.py │ │ ├── segment-char-ngrams.py │ │ ├── chrF.py │ │ ├── apply_bpe.py │ │ └── learn_bpe.py │ └── download.sh ├── paths.sh ├── scripts │ ├── get_diff.py │ ├── nbest_reformat.py │ ├── convert_m2_to_parallel.py │ └── apply_bpe.py ├── training │ ├── train.sh │ ├── train_embed.sh │ └── preprocess.sh └── run.sh ├── scripts ├── remove_spac_pkunlp_segment.sh └── pkunlp_segment.py └── README.md /CS2S+BPE+Emb/software/nbest-reranker/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/kenlm_python/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | torch 3 | tqdm 4 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YingyWang/NLPCC_2018_TASK2_GEC/HEAD/CS2S+BPE+Emb/software/fairseq-py/fairseq.gif -------------------------------------------------------------------------------- /CS2S+BPE+Emb/paths.sh: -------------------------------------------------------------------------------- 1 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 2 | DATA_DIR=$BASE_DIR/data 3 | MODEL_DIR=$BASE_DIR/models 4 | SCRIPTS_DIR=$BASE_DIR/scripts 5 | SOFTWARE_DIR=$BASE_DIR/software 6 | TRAINING_DIR=$BASE_DIR/training 7 | 8 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/get_vocab.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | from collections import Counter 5 | 6 | c = Counter() 7 | 8 | for line in sys.stdin: 9 | for word in line.split(): 10 | c[word] += 1 11 | 12 | for key,f in sorted(c.items(), key=lambda x: x[1], reverse=True): 13 | print key, f 14 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/scripts/get_diff.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | prefix = sys.argv[1] 5 | src = sys.argv[2] 6 | trg = sys.argv[3] 7 | 8 | with open(prefix + '.' + src) as f_src, open(prefix + '.' + trg) as f_trg: 9 | for sline, tline in zip(f_src, f_trg): 10 | sline = sline.strip() 11 | tline = tline.strip() 12 | if sline != tline: 13 | print sline+'\t'+tline 14 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from .multiprocessing_pdb import pdb 10 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/temporal_convolution_tbc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._temporal_convolution_tbc import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | locals[symbol] = _wrap_function(fn, _ffi) 10 | __all__.append(symbol) 11 | 12 | _import_symbols(locals()) 13 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/README.md: -------------------------------------------------------------------------------- 1 | # N-best Reranker 2 | 3 | Re-ranking N-best lists (MOSES format) using features like language models, edit operations etc. It is also easy to implement custom features. 4 | 5 | Currently, tuning with BLEU and M2Scorer with MERT are supported 6 | 7 | ## Running the re-ranker 8 | 9 | 1. First augment the new feature using augment.py script 10 | 11 | 2. Then train the re-ranker using train.py script 12 | 13 | 3. 
Then rerank using rerank.py script 14 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/configreader.py: -------------------------------------------------------------------------------- 1 | ''' File to parse config files 2 | ''' 3 | 4 | def parse_ini(ini_path): 5 | out = [] 6 | with open(ini_path, 'r') as ini_file: 7 | section = '[nil]' 8 | for line in ini_file: 9 | line = line.strip() 10 | if line.startswith('['): 11 | section = line 12 | elif section == '[weight]' and line != '': 13 | if line.startswith('UnknownWordPenalty0= '): 14 | out.append('UnknownWordPenalty0 UNTUNEABLE') 15 | else: 16 | out.append(line) 17 | return out 18 | -------------------------------------------------------------------------------- /scripts/remove_spac_pkunlp_segment.sh: -------------------------------------------------------------------------------- 1 | NLPCC2018_DIR=/home/renhongkai/projects/mlconvgec2018/nlpcc2018/ 2 | DIR=$NLPCC2018_DIR//seq2seq+bpe+embed/outputs/mlconv_embed/model1/model_best 3 | # Remove the spaces (the word-segmentation information) 4 | sed 's/ //g' $DIR/output.tok.txt > $DIR/output.tok.txt.remove.spac 5 | # Segment with pkunlp; produces the file $DIR/output.tok.txt.remove.spac.seg 6 | python pkunlp_segment.py --corpus $DIR/output.tok.txt.remove.spac --segsuffix seg 7 | # Compute the score with the M2 scorer 8 | $NLPCC2018_DIR/m2scorer/m2scorer $DIR/output.tok.txt.remove.spac.seg ~/projects/mlconvgec2018/nlpcc2018/gold.01 9 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from .beamable_mm import * 10 | from .linearized_convolution import * 11 | from .conv_tbc import ConvTBC 12 | 13 | __all__ = [ 14 | 'BeamableMM', 'LinearizedConvolution', 'ConvTBC', 15 | ] 16 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/training/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | source ../paths.sh 7 | FAIRSEQPY=$SOFTWARE_DIR/fairseq-py 8 | 9 | SEED=1 10 | DATA_BIN_DIR=processed/bin 11 | 12 | OUT_DIR=models/mlconv/model$SEED/ 13 | mkdir -p $OUT_DIR 14 | 15 | PYTHONPATH=$FAIRSEQPY:$PYTHONPATH CUDA_VISIBLE_DEVICES="0" python $FAIRSEQPY/train.py --save-dir $OUT_DIR --encoder-embed-dim 500 --decoder-embed-dim 500 --decoder-out-embed-dim 500 --dropout 0.2 --clip-norm 0.1 --lr 0.25 --min-lr 1e-4 --encoder-layers '[(1024,3)] * 7' --decoder-layers '[(1024,3)] * 7' --momentum 0.99 --max-epoch 100 --batch-size 32 --no-progress-bar --seed $SEED $DATA_BIN_DIR 16 | 17 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory.
7 | # 8 | 9 | from .cross_entropy import CrossEntropyCriterion 10 | from .fairseq_criterion import FairseqCriterion 11 | from .label_smoothed_cross_entropy import LabelSmoothedCrossEntropyCriterion 12 | 13 | __all__ = [ 14 | 'CrossEntropyCriterion', 15 | 'LabelSmoothedCrossEntropyCriterion', 16 | ] 17 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/clib/temporal_convolution_tbc/temporal_convolution_tbc.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | void TemporalConvolutionTBC_forward( 10 | const char* dtype, 11 | void* input, 12 | void* output, 13 | void* weight, 14 | void* bias); 15 | 16 | void TemporalConvolutionTBC_backward( 17 | const char* dtype, 18 | void* _dOutput, 19 | void* _dInput, 20 | void* _dWeight, 21 | void* _dBias, 22 | void* _input, 23 | void* _weight); 24 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from . import fconv 10 | 11 | 12 | __all__ = ['fconv'] 13 | 14 | arch_model_map = {} 15 | for model in __all__: 16 | archs = locals()[model].get_archs() 17 | for arch in archs: 18 | assert arch not in arch_model_map, 'Duplicate model architecture detected: {}'.format(arch) 19 | arch_model_map[arch] = model 20 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/training/train_embed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | source ../paths.sh 7 | FAIRSEQPY=$SOFTWARE_DIR/fairseq-py 8 | 9 | # download embeddings if necessary 10 | EMBED_PATH=$DATA_DIR/embeddings/chinesegigawordv5.jian.jieba.seg.bpe.skipngram.500d.txt 11 | 12 | SEED=1 13 | DATA_BIN_DIR=processed/bin 14 | OUT_DIR=models/mlconv_embed/model$SEED/ 15 | mkdir -p $OUT_DIR 16 | 17 | PYTHONPATH=$FAIRSEQPY:$PYTHONPATH CUDA_VISIBLE_DEVICES="0" python $FAIRSEQPY/train.py --save-dir $OUT_DIR --encoder-embed-dim 500 --encoder-embed-path $EMBED_PATH --decoder-embed-dim 500 --decoder-embed-path $EMBED_PATH --decoder-out-embed-dim 500 --dropout 0.2 --clip-norm 0.1 --lr 0.25 --min-lr 1e-4 --encoder-layers '[(1024,3)] * 7' --decoder-layers '[(1024,3)] * 7' --momentum 0.99 --max-epoch 100 --batch-size 32 --no-progress-bar --seed $SEED $DATA_BIN_DIR 18 | 19 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/scripts/nbest_reformat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('-i', '--input-file', help='path to input file (output of fairseq)') 7 | parser.add_argument('--debpe', action='store_true', help='enable the flag to post-process and remove BPE segmentation.') 8 | 9 | 
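# The loop below converts fairseq-py generation output into a MOSES-style n-best
# list for the reranker: an 'S' line marks the next source sentence (advancing the
# sentence counter), and each 'H' line carries a hypothesis and its model score,
# emitted as "<sent-id> ||| <hypothesis> ||| F0= <score> ||| <score>".
# With --debpe, the BPE joiners ("@@ ") are stripped from the hypothesis first.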
args = parser.parse_args() 10 | 11 | 12 | scount = -1 13 | with open(args.input_file) as f: 14 | for line in f: 15 | line = line.strip() 16 | pieces = line.split('\t') 17 | if pieces[0] == 'S': 18 | scount += 1 19 | if pieces[0] == 'H': 20 | hyp = pieces[2] 21 | if args.debpe: 22 | hyp = hyp.replace('@@ ','') 23 | score = pieces[1] 24 | print("%d ||| %s ||| F0= %s ||| %s" % (scount, hyp, score, score) ) 25 | 26 | 27 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/clib/libbleu/module.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include 10 | 11 | 12 | static PyMethodDef method_def[] = { 13 | {NULL, NULL, 0, NULL} 14 | }; 15 | 16 | static struct PyModuleDef module_def = { 17 | PyModuleDef_HEAD_INIT, 18 | "libbleu", /* name of module */ 19 | NULL, /* module documentation, may be NULL */ 20 | -1, /* size of per-interpreter state of the module, 21 | or -1 if the module keeps state in global variables. */ 22 | method_def 23 | }; 24 | 25 | 26 | #if PY_MAJOR_VERSION == 2 27 | PyMODINIT_FUNC init_libbleu() 28 | #else 29 | PyMODINIT_FUNC PyInit_libbleu() 30 | #endif 31 | { 32 | PyObject *m = PyModule_Create(&module_def); 33 | if (!m) { 34 | return NULL; 35 | } 36 | return m; 37 | } 38 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/scripts/convert_dictionary.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (c) 2017-present, Facebook, Inc. 2 | -- All rights reserved. 3 | -- 4 | -- This source code is licensed under the license found in the LICENSE file in 5 | -- the root directory of this source tree. An additional grant of patent rights 6 | -- can be found in the PATENTS file in the same directory. 7 | -- 8 | -- Usage: convert_dictionary.lua 9 | require 'fairseq' 10 | require 'torch' 11 | require 'paths' 12 | 13 | if #arg < 1 then 14 | print('usage: convert_dictionary.lua ') 15 | os.exit(1) 16 | end 17 | if not paths.filep(arg[1]) then 18 | print('error: file does not exit: ' .. arg[1]) 19 | os.exit(1) 20 | end 21 | 22 | dict = torch.load(arg[1]) 23 | dst = paths.basename(arg[1]):gsub('.th7', '.txt') 24 | assert(dst:match('.txt$')) 25 | 26 | f = io.open(dst, 'w') 27 | for idx, symbol in ipairs(dict.index_to_symbol) do 28 | if idx > dict.cutoff then 29 | break 30 | end 31 | f:write(symbol) 32 | f:write(' ') 33 | f:write(dict.index_to_freq[idx]) 34 | f:write('\n') 35 | end 36 | f:close() 37 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/criterions/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import math 10 | import torch.nn.functional as F 11 | 12 | from .fairseq_criterion import FairseqCriterion 13 | 14 | 15 | class CrossEntropyCriterion(FairseqCriterion): 16 | 17 | def __init__(self, padding_idx): 18 | super().__init__() 19 | self.padding_idx = padding_idx 20 | 21 | def prepare(self, samples): 22 | self.denom = sum(s['ntokens'] if s else 0 for s in samples) 23 | 24 | def forward(self, net_output, sample): 25 | input = net_output.view(-1, net_output.size(-1)) 26 | target = sample['target'].view(-1) 27 | loss = F.cross_entropy(input, target, size_average=False, ignore_index=self.padding_idx) 28 | return loss / self.denom 29 | 30 | def aggregate(self, losses): 31 | return sum(losses) / math.log(2) 32 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/criterions/fairseq_criterion.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from torch.nn.modules.loss import _Loss 10 | 11 | 12 | class FairseqCriterion(_Loss): 13 | 14 | def __init__(self, *args, **kwargs): 15 | super().__init__(*args, **kwargs) 16 | 17 | def prepare(self, samples): 18 | """Prepare criterion for DataParallel training.""" 19 | raise NotImplementedError 20 | 21 | def forward(self, net_output, sample): 22 | """Compute the loss for the given sample and network output.""" 23 | raise NotImplementedError 24 | 25 | def aggregate(self, losses): 26 | """Aggregate losses from DataParallel training. 27 | 28 | Takes a list of losses as input (as returned by forward) and 29 | aggregates them into the total loss for the mini-batch. 30 | """ 31 | raise NotImplementedError 32 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 University of Edinburgh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
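The `FairseqCriterion` interface above (`prepare`/`forward`/`aggregate`, see `fairseq_criterion.py` and `cross_entropy.py`) has each DataParallel replica normalize its summed token loss by the token count of the whole mini-batch before the per-replica losses are added up. Below is a minimal stand-alone sketch of that flow in plain PyTorch, not the fairseq code itself: the vocabulary size, tensor shapes, and padding index are invented, and `reduction='sum'` stands in for the older `size_average=False` used above.

```python
import math
import torch
import torch.nn.functional as F

PAD = 1                                            # hypothetical padding index
# two shards of one mini-batch, as DataParallel replicas would see them
samples = [{'target': torch.randint(2, 10, (4, 7)), 'ntokens': 28},
           {'target': torch.randint(2, 10, (4, 7)), 'ntokens': 28}]

denom = sum(s['ntokens'] for s in samples)         # prepare(): shared normalizer

losses = []
for s in samples:
    net_output = torch.randn(4, 7, 10)             # fake decoder logits (B x T x V)
    logits = net_output.view(-1, net_output.size(-1))
    target = s['target'].view(-1)
    # forward(): summed token-level loss, padding ignored, scaled by the global count
    loss = F.cross_entropy(logits, target, reduction='sum', ignore_index=PAD)
    losses.append(loss / denom)

# aggregate(): total per-token loss, divided by log(2) as in the criterions above
print((sum(losses) / math.log(2)).item())
```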
-------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/multiprocessing_pdb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import multiprocessing 10 | import os 11 | import pdb 12 | import sys 13 | 14 | 15 | class MultiprocessingPdb(pdb.Pdb): 16 | """A Pdb wrapper that works in a multiprocessing environment. 17 | 18 | Usage: `from fairseq import pdb; pdb.set_trace()` 19 | """ 20 | 21 | _stdin_fd = sys.stdin.fileno() 22 | _stdin = None 23 | _stdin_lock = multiprocessing.Lock() 24 | 25 | def __init__(self): 26 | pdb.Pdb.__init__(self, nosigint=True) 27 | 28 | def _cmdloop(self): 29 | stdin_bak = sys.stdin 30 | with self._stdin_lock: 31 | try: 32 | if not self._stdin: 33 | self._stdin = os.fdopen(self._stdin_fd) 34 | sys.stdin = self._stdin 35 | self.cmdloop() 36 | finally: 37 | sys.stdin = stdin_bak 38 | 39 | 40 | pdb = MultiprocessingPdb() 41 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/download.sh: -------------------------------------------------------------------------------- 1 | echo "Downloading Fairseq from https://github.com/shamilcm/fairseq-py (rev:90c31cd92055124c427689c00624b1eb84c5688a)" 2 | wget https://github.com/shamilcm/fairseq-py/archive/90c31cd92055124c427689c00624b1eb84c5688a.zip 3 | unzip 90c31cd92055124c427689c00624b1eb84c5688a.zip 4 | rm 90c31cd92055124c427689c00624b1eb84c5688a.zip 5 | mv fairseq-py-90c31cd92055124c427689c00624b1eb84c5688a fairseq-py 6 | 7 | echo "Downloading n-best reranker from https://github.com/nusnlp/nbest-reranker (rev: 454c4adc90d0469ef7b2c71ff8cf849ea8cb67f)" 8 | wget https://github.com/nusnlp/nbest-reranker/archive/454c4adc90d0469ef7b2c71ff8cf849ea8cb67f6.zip 9 | unzip 454c4adc90d0469ef7b2c71ff8cf849ea8cb67f6.zip 10 | rm 454c4adc90d0469ef7b2c71ff8cf849ea8cb67f6.zip 11 | mv nbest-reranker-454c4adc90d0469ef7b2c71ff8cf849ea8cb67f6 nbest-reranker 12 | #git clone https://github.com/nusnlp/nbest-reranker/ 13 | 14 | echo "Downloading Subword NMT from https://github.com/rsennrich/subword-nmt (rev: ec5c7b009c409e72b5ef65a77c1a846546f14847)" 15 | wget https://github.com/rsennrich/subword-nmt/archive/ec5c7b009c409e72b5ef65a77c1a846546f14847.zip 16 | unzip ec5c7b009c409e72b5ef65a77c1a846546f14847.zip 17 | rm ec5c7b009c409e72b5ef65a77c1a846546f14847.zip 18 | mv subword-nmt-ec5c7b009c409e72b5ef65a77c1a846546f14847 subword-nmt 19 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to FAIR Sequence-to-Sequence Toolkit (PyTorch) 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. 
If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | ## Coding Style 26 | We try to follow the PEP style guidelines and encourage you to as well. 27 | 28 | ## License 29 | By contributing to FAIR Sequence-to-Sequence Toolkit, you agree that your contributions will be licensed 30 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/README.md: -------------------------------------------------------------------------------- 1 | Subword Neural Machine Translation 2 | ================================== 3 | 4 | This repository contains preprocessing scripts to segment text into subword 5 | units. The primary purpose is to facilitate the reproduction of our experiments 6 | on Neural Machine Translation with subword units (see below for reference). 7 | 8 | USAGE INSTRUCTIONS 9 | ------------------ 10 | 11 | Check the individual files for usage instructions. 12 | 13 | To apply byte pair encoding to word segmentation, invoke these commands: 14 | 15 | ./learn_bpe.py -s {num_operations} < {train_file} > {codes_file} 16 | ./apply_bpe.py -c {codes_file} < {test_file} 17 | 18 | To segment rare words into character n-grams, do the following: 19 | 20 | ./get_vocab.py < {train_file} > {vocab_file} 21 | ./segment-char-ngrams.py --vocab {vocab_file} -n {order} --shortlist {size} < {test_file} 22 | 23 | The original segmentation can be restored with a simple replacement: 24 | 25 | sed "s/@@ //g" 26 | 27 | PUBLICATIONS 28 | ------------ 29 | 30 | The segmentation methods are described in: 31 | 32 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016): 33 | Neural Machine Translation of Rare Words with Subword Units 34 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/tests/test_label_smoothing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import torch 10 | import unittest 11 | from fairseq.criterions.label_smoothed_cross_entropy import LabelSmoothedCrossEntropy 12 | from torch.autograd import Variable, gradcheck 13 | 14 | 15 | torch.set_default_tensor_type('torch.DoubleTensor') 16 | 17 | 18 | class TestLabelSmoothing(unittest.TestCase): 19 | 20 | def test_label_smoothing(self): 21 | input = Variable(torch.randn(3, 5), requires_grad=True) 22 | idx = torch.rand(3) * 4 23 | target = Variable(idx.long()) 24 | criterion = LabelSmoothedCrossEntropy() 25 | self.assertTrue(gradcheck( 26 | lambda x, y: criterion.apply(x, y, 0.1, 2, None), (input, target) 27 | )) 28 | weights = torch.ones(5) 29 | weights[2] = 0 30 | self.assertTrue(gradcheck(lambda x, y: criterion.apply(x, y, 0.1, None, weights), (input, target))) 31 | self.assertTrue(gradcheck(lambda x, y: criterion.apply(x, y, 0.1, None, None), (input, target))) 32 | 33 | 34 | if __name__ == '__main__': 35 | unittest.main() 36 | -------------------------------------------------------------------------------- /scripts/pkunlp_segment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # encoding: utf-8 3 | 4 | from __future__ import unicode_literals, print_function 5 | from pkunlp import Segmentor, NERTagger, POSTagger 6 | 7 | import argparse 8 | import time 9 | import codecs, json, re, sys, time 10 | import multiprocessing 11 | 12 | # usage: 13 | # python pkunlp_segmenter.py --corpus data.train.src --segsuffix seg 14 | 15 | def parseargs(): 16 | parser = argparse.ArgumentParser(description="segment corpus") 17 | 18 | parser.add_argument("--corpus", required=True, 19 | help="input corpora") 20 | parser.add_argument("--segsuffix", type=str, default="seg", 21 | help="Suffix of output files") 22 | return parser.parse_args() 23 | 24 | 25 | if __name__ == "__main__": 26 | print("Start processing") 27 | start_time = time.time() 28 | parsed_args = parseargs() 29 | 30 | segmentor = Segmentor("feature/segment.feat", "feature/segment.dic") 31 | 32 | with open(parsed_args.corpus ,'r',encoding='utf-8') as corpus_f,\ 33 | open(parsed_args.corpus + "." + parsed_args.segsuffix,'w',encoding='utf-8',errors='ignore') as seg_output_f: 34 | for line in corpus_f: 35 | if len(line) <= 1500 and len(line) != 0: 36 | segments = segmentor.seg_string(line.strip()) 37 | segments_str = " ".join(segments) 38 | seg_output_f.write(segments_str + "\n") 39 | print("Done in", time.time()-start_time, "seconds") -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/kenlm_python/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | 5 | 6 | 7 | LM = os.path.join(os.path.dirname(__file__), '..', 'lm', 'test.arpa') 8 | model = kenlm.Model(LM) 9 | print('{0}-gram model'.format(model.order)) 10 | 11 | sentence = 'language modeling is fun .' 
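# NOTE: the kenlm.Model call above and the kenlm.State calls below rely on the
# kenlm Python module, so this script also needs an `import kenlm` at the top
# (only `os` is imported here). The lm/test.arpa path is inherited from the
# upstream KenLM example and may need adjusting to point at a real ARPA model.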
12 | print(sentence) 13 | print(model.score(sentence)) 14 | 15 | # Check that total full score = direct score 16 | def score(s): 17 | return sum(prob for prob, _, _ in model.full_scores(s)) 18 | 19 | assert (abs(score(sentence) - model.score(sentence)) < 1e-3) 20 | 21 | # Show scores and n-gram matches 22 | words = [''] + sentence.split() + [''] 23 | for i, (prob, length, oov) in enumerate(model.full_scores(sentence)): 24 | print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2]))) 25 | if oov: 26 | print('\t"{0}" is an OOV'.format(words[i+1])) 27 | 28 | # Find out-of-vocabulary words 29 | for w in words: 30 | if not w in model: 31 | print('"{0}" is an OOV'.format(w)) 32 | 33 | #Stateful query 34 | state = kenlm.State() 35 | state2 = kenlm.State() 36 | #Use as context. If you don't want , use model.NullContextWrite(state). 37 | model.BeginSentenceWrite(state) 38 | accum = 0.0 39 | accum += model.BaseScore(state, "a", state2) 40 | accum += model.BaseScore(state2, "sentence", state) 41 | #score defaults to bos = True and eos = True. Here we'll check without the end 42 | #of sentence marker. 43 | assert (abs(accum - model.score("a sentence", eos = False)) < 1e-3) 44 | accum += model.BaseScore(state, "", state2) 45 | assert (abs(accum - model.score("a sentence")) < 1e-3) 46 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For fairseq software 4 | 5 | Copyright (c) 2017-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # Checkpoints 29 | checkpoints 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/progress_bar.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | """ 10 | Progress bar wrapper around tqdm which handles non-tty outputs 11 | """ 12 | 13 | import sys 14 | 15 | from tqdm import tqdm 16 | 17 | 18 | class progress_bar(tqdm): 19 | enabled = sys.stderr.isatty() 20 | print_interval = 1000 21 | 22 | def __new__(cls, *args, **kwargs): 23 | if cls.enabled: 24 | return tqdm(*args, **kwargs) 25 | else: 26 | return simple_progress_bar(cls.print_interval, *args, **kwargs) 27 | 28 | 29 | class simple_progress_bar(tqdm): 30 | 31 | def __init__(self, print_interval, *args, **kwargs): 32 | super(simple_progress_bar, self).__init__(*args, **kwargs) 33 | self.print_interval = print_interval 34 | 35 | def __iter__(self): 36 | size = len(self.iterable) 37 | for i, obj in enumerate(self.iterable): 38 | yield obj 39 | if i > 0 and i % self.print_interval == 0: 40 | msg = '{} {:5d} / {:d} {}\n'.format(self.desc, i, size, self.postfix) 41 | sys.stdout.write(msg) 42 | sys.stdout.flush() 43 | 44 | @classmethod 45 | def write(cls, s, file=None, end="\n"): 46 | fp = file if file is not None else sys.stdout 47 | fp.write(s) 48 | fp.write(end) 49 | fp.flush() 50 | 51 | @staticmethod 52 | def status_printer(file): 53 | def print_status(s): 54 | pass 55 | return print_status 56 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/kenlm_python/_kenlm.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "lm/word_index.hh" namespace "lm": 2 | ctypedef unsigned WordIndex 3 | 4 | cdef extern from "lm/return.hh" namespace "lm": 5 | cdef struct FullScoreReturn: 6 | float prob 7 | unsigned char ngram_length 8 | 9 | cdef extern from "lm/state.hh" namespace "lm::ngram": 10 | cdef cppclass State : 11 | int Compare(const State &other) const 12 | 13 | int hash_value(const State &state) 14 | 15 | cdef extern from "lm/virtual_interface.hh" namespace "lm::base": 16 | cdef cppclass Vocabulary: 17 | WordIndex Index(char*) 18 | WordIndex BeginSentence() 19 | WordIndex EndSentence() 20 | WordIndex NotFound() 21 | 22 | ctypedef Vocabulary const_Vocabulary "const lm::base::Vocabulary" 23 | 24 | cdef cppclass Model: 25 | void BeginSentenceWrite(void *) 26 | void NullContextWrite(void *) 27 | unsigned int Order() 28 | const_Vocabulary& BaseVocabulary() 29 | float BaseScore(void *in_state, WordIndex new_word, void *out_state) 30 | FullScoreReturn BaseFullScore(void *in_state, WordIndex new_word, void *out_state) 31 | 32 | cdef extern from "util/mmap.hh" namespace "util": 33 | cdef enum LoadMethod: 34 | LAZY 35 | POPULATE_OR_LAZY 36 | POPULATE_OR_READ 37 | READ 38 | PARALLEL_READ 39 | 40 | cdef extern from "lm/config.hh" namespace "lm::ngram": 41 | cdef cppclass Config: 42 | Config() 43 | float probing_multiplier 44 | LoadMethod load_method 45 | 46 | cdef extern from "lm/model.hh" namespace "lm::ngram": 47 | cdef Model *LoadVirtual(char *, Config &config) except + 48 | #default constructor 49 | cdef Model *LoadVirtual(char *) except + 50 | 51 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/scripts/convert_m2_to_parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import re 5 | 6 | if len(sys.argv) != 4: 7 | print "[USAGE] %s nucle_m2_file output_src output_tgt" % sys.argv[0] 8 | sys.exit() 9 | 10 | input_path = sys.argv[1] 11 | output_src_path = sys.argv[2] 12 | output_tgt_path = sys.argv[3] 13 | 14 | words = [] 15 | 
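# The input is the M2 format used by the M2 scorer: an "S <tokenized sentence>"
# line followed by zero or more annotation lines of the form
# "A <start> <end>|||<error type>|||<correction>|||...", with a blank line between
# sentences. The loop below applies each sentence's corrections to its tokens and
# writes the resulting parallel source/target files.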
corrected = [] 16 | sid = eid = 0 17 | prev_sid = prev_eid = -1 18 | pos = 0 19 | 20 | 21 | with open(input_path) as input_file, open(output_src_path, 'w') as output_src_file, open(output_tgt_path, 'w') as output_tgt_file: 22 | for line in input_file: 23 | line = line.strip() 24 | if line.startswith('S'): 25 | line = line[2:] 26 | words = line.split() 27 | corrected = [''] + words[:] 28 | output_src_file.write(line + '\n') 29 | elif line.startswith('A'): 30 | line = line[2:] 31 | info = line.split("|||") 32 | sid, eid = info[0].split() 33 | sid = int(sid) + 1; eid = int(eid) + 1; 34 | error_type = info[1] 35 | if error_type == "Um": 36 | continue 37 | for idx in range(sid, eid): 38 | corrected[idx] = "" 39 | if sid == eid: 40 | if sid == 0: continue # Originally index was -1, indicating no op 41 | if sid != prev_sid or eid != prev_eid: 42 | pos = len(corrected[sid-1].split()) 43 | cur_words = corrected[sid-1].split() 44 | cur_words.insert(pos, info[2]) 45 | pos += len(info[2].split()) 46 | corrected[sid-1] = " ".join(cur_words) 47 | else: 48 | corrected[sid] = info[2] 49 | pos = 0 50 | prev_sid = sid 51 | prev_eid = eid 52 | else: 53 | target_sentence = ' '.join([word for word in corrected if word != ""]) 54 | assert target_sentence.startswith(''), '(' + target_sentence + ')' 55 | target_sentence = target_sentence[4:] 56 | output_tgt_file.write(target_sentence + '\n') 57 | prev_sid = -1 58 | prev_eid = -1 59 | pos = 0 60 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/bpe_toy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary 7 | of a text to a configurable number of symbols, with only a small increase in the number of tokens. 8 | This is an (inefficient) toy implementation that shows the algorithm. For processing large datasets, 9 | indexing and incremental updates can be used to speed up the implementation (see learn_bpe.py). 10 | 11 | Reference: 12 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 13 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 14 | """ 15 | 16 | 17 | import re 18 | import sys 19 | import collections 20 | 21 | def get_stats(vocab): 22 | pairs = collections.defaultdict(int) 23 | for word, freq in vocab.items(): 24 | symbols = word.split() 25 | for i in range(len(symbols)-1): 26 | pairs[symbols[i],symbols[i+1]] += freq 27 | return pairs 28 | 29 | def merge_vocab(pair, v_in): 30 | v_out = {} 31 | bigram_pattern = re.escape(' '.join(pair)) 32 | p = re.compile(r'(?' : 5, 'l o w e r ' : 2, 39 | 'n e w e s t ' : 6, 'w i d e s t ' : 3} 40 | num_merges = 15 41 | for i in range(num_merges): 42 | pairs = get_stats(vocab) 43 | best = max(pairs, key=pairs.get) 44 | if pairs[best] < 2: 45 | sys.stderr.write('no pair has frequency > 1. 
Stopping\n') 46 | break 47 | vocab = merge_vocab(best, vocab) 48 | print(best) 49 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/nag.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from torch.optim.optimizer import Optimizer, required 10 | 11 | 12 | class NAG(Optimizer): 13 | def __init__(self, params, lr=required, momentum=0, weight_decay=0): 14 | defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay) 15 | super(NAG, self).__init__(params, defaults) 16 | 17 | def step(self, closure=None): 18 | """Performs a single optimization step. 19 | 20 | Arguments: 21 | closure (callable, optional): A closure that reevaluates the model 22 | and returns the loss. 23 | """ 24 | loss = None 25 | if closure is not None: 26 | loss = closure() 27 | 28 | for group in self.param_groups: 29 | weight_decay = group['weight_decay'] 30 | momentum = group['momentum'] 31 | lr = group['lr'] 32 | 33 | for p in group['params']: 34 | if p.grad is None: 35 | continue 36 | 37 | d_p = p.grad.data 38 | if weight_decay != 0: 39 | d_p.add_(weight_decay, p.data) 40 | 41 | param_state = self.state[p] 42 | if 'momentum_buffer' not in param_state: 43 | param_state['momentum_buffer'] = d_p.clone().zero_() 44 | 45 | buf = param_state['momentum_buffer'] 46 | 47 | p.data.add_(momentum * momentum, buf) 48 | p.data.add_(-(1 + momentum) * lr, d_p) 49 | 50 | buf.mul_(momentum).add_(-lr, d_p) 51 | 52 | return loss 53 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/training/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | source ../paths.sh 6 | 7 | ## paths to training and development datasets 8 | src_ext=src 9 | trg_ext=trg 10 | train_data_prefix=$DATA_DIR/train.tok 11 | dev_data_prefix=$DATA_DIR/dev.tok 12 | #dev_data_m2=$DATA_DIR/dev.m2 13 | 14 | # path to subword nmt 15 | SUBWORD_NMT=$SOFTWARE_DIR/subword-nmt 16 | # path to Fairseq-Py 17 | FAIRSEQPY=$SOFTWARE_DIR/fairseq-py 18 | 19 | ###################### 20 | # subword segmentation 21 | mkdir -p models/bpe_model 22 | bpe_operations=30000 23 | cat $train_data_prefix.$trg_ext | $SUBWORD_NMT/learn_bpe.py -s $bpe_operations > models/bpe_model/train.bpe.model 24 | mkdir -p processed/ 25 | $SCRIPTS_DIR/apply_bpe.py -c models/bpe_model/train.bpe.model < $train_data_prefix.$src_ext > processed/train.all.src 26 | $SCRIPTS_DIR/apply_bpe.py -c models/bpe_model/train.bpe.model < $train_data_prefix.$trg_ext > processed/train.all.trg 27 | $SCRIPTS_DIR/apply_bpe.py -c models/bpe_model/train.bpe.model < $dev_data_prefix.$src_ext > processed/dev.src 28 | $SCRIPTS_DIR/apply_bpe.py -c models/bpe_model/train.bpe.model < $dev_data_prefix.$trg_ext > processed/dev.trg 29 | #cp $dev_data_m2 processed/dev.m2 30 | cp $dev_data_prefix.$src_ext processed/dev.input.txt 31 | 32 | ########################## 33 | # getting annotated sentence pairs only 34 | #python $SCRIPTS_DIR/get_diff.py processed/train.all src trg > processed/train.annotated.src-trg 35 | #cut -f1 processed/train.annotated.src-trg > processed/train.src 36 | #cut -f2 
processed/train.annotated.src-trg > processed/train.trg 37 | less processed/train.all.src > processed/train.src 38 | less processed/train.all.trg > processed/train.trg 39 | 40 | ######################### 41 | # preprocessing 42 | python $FAIRSEQPY/preprocess.py --source-lang src --target-lang trg --trainpref processed/train --validpref processed/dev --testpref processed/dev --nwordssrc 37000 --nwordstgt 37000 --destdir processed/bin 43 | 44 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/modules/beamable_mm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | 13 | class BeamableMM(nn.Module): 14 | """This module provides an optimized MM for beam decoding with attention. 15 | 16 | It leverage the fact that the source-side of the input is replicated beam 17 | times and the target-side of the input is of width one. This layer speeds up 18 | inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)} 19 | with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}. 20 | """ 21 | def __init__(self, beam_size): 22 | super(BeamableMM, self).__init__() 23 | self.beam_size = beam_size 24 | 25 | def forward(self, input1, input2): 26 | if ( 27 | not self.training and # test mode 28 | self.beam_size > 0 and # beam size is set 29 | input1.dim() == 3 and # only support batched input 30 | input1.size(1) == 1 # single time step update 31 | ): 32 | bsz, beam = input1.size(0), self.beam_size 33 | 34 | # bsz x 1 x nhu --> bsz/beam x beam x nhu 35 | input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1) 36 | 37 | # bsz x sz2 x nhu --> bsz/beam x sz2 x nhu 38 | input2 = input2.unfold(0, beam, beam)[:, :, :, 0] 39 | 40 | # use non batched operation if bsz = beam 41 | if input1.size(0) == 1: 42 | output = torch.mm(input1[0, :, :], input2[0, :, :]) 43 | else: 44 | output = input1.bmm(input2) 45 | return output.view(bsz, 1, -1) 46 | else: 47 | return input1.bmm(input2) 48 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/meters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import time 10 | 11 | 12 | class AverageMeter(object): 13 | """Computes and stores the average and current value""" 14 | def __init__(self): 15 | self.reset() 16 | 17 | def reset(self): 18 | self.val = 0 19 | self.avg = 0 20 | self.sum = 0 21 | self.count = 0 22 | 23 | def update(self, val, n=1): 24 | self.val = val 25 | self.sum += val * n 26 | self.count += n 27 | self.avg = self.sum / self.count 28 | 29 | 30 | class TimeMeter(object): 31 | """Computes the average occurence of some event per second""" 32 | def __init__(self): 33 | self.reset() 34 | 35 | def reset(self): 36 | self.start = time.time() 37 | self.n = 0 38 | 39 | def update(self, val=1): 40 | self.n += val 41 | 42 | @property 43 | def avg(self): 44 | delta = time.time() - self.start 45 | return self.n / delta 46 | 47 | @property 48 | def elapsed_time(self): 49 | return time.time() - self.start 50 | 51 | 52 | class StopwatchMeter(object): 53 | """Computes the sum/avg duration of some event in seconds""" 54 | def __init__(self): 55 | self.reset() 56 | 57 | def start(self): 58 | self.start_time = time.time() 59 | 60 | def stop(self, n=1): 61 | if self.start_time is not None: 62 | delta = time.time() - self.start_time 63 | self.sum += delta 64 | self.n += n 65 | self.start_time = None 66 | 67 | def reset(self): 68 | self.sum = 0 69 | self.n = 0 70 | self.start_time = None 71 | 72 | @property 73 | def avg(self): 74 | return self.sum / self.n 75 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/PATENTS: -------------------------------------------------------------------------------- 1 | Additional Grant of Patent Rights Version 2 2 | 3 | "Software" means the fairseq software distributed by Facebook, Inc. 4 | 5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software 6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable 7 | (subject to the termination provision below) license under any Necessary 8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise 9 | transfer the Software. For avoidance of doubt, no license is granted under 10 | Facebook’s rights in any patent claims that are infringed by (i) modifications 11 | to the Software made by you or any third party or (ii) the Software in 12 | combination with any software or other technology. 13 | 14 | The license granted hereunder will terminate, automatically and without notice, 15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate 16 | directly or indirectly, or take a direct financial interest in, any Patent 17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate 18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or 19 | in part from any software, technology, product or service of Facebook or any of 20 | its subsidiaries or corporate affiliates, or (iii) against any party relating 21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its 22 | subsidiaries or corporate affiliates files a lawsuit alleging patent 23 | infringement against you in the first instance, and you respond by filing a 24 | patent infringement counterclaim in that lawsuit against that party that is 25 | unrelated to the Software, the license granted hereunder will not terminate 26 | under section (i) of this paragraph due to such counterclaim. 
27 | 28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is 29 | necessarily infringed by the Software standing alone. 30 | 31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, 32 | or contributory infringement or inducement to infringe any patent, including a 33 | cross-claim or counterclaim. 34 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | from setuptools import setup, find_packages, Extension 10 | from setuptools.command.build_py import build_py 11 | import sys 12 | from torch.utils.ffi import create_extension 13 | 14 | 15 | if sys.version_info < (3,): 16 | sys.exit('Sorry, Python3 is required for fairseq.') 17 | 18 | with open('README.md') as f: 19 | readme = f.read() 20 | 21 | with open('LICENSE') as f: 22 | license = f.read() 23 | 24 | with open('requirements.txt') as f: 25 | reqs = f.read() 26 | 27 | bleu = Extension( 28 | 'fairseq.libbleu', 29 | sources=[ 30 | 'fairseq/clib/libbleu/libbleu.cpp', 31 | 'fairseq/clib/libbleu/module.cpp', 32 | ], 33 | extra_compile_args=['-std=c++11'], 34 | ) 35 | 36 | conv_tbc = create_extension( 37 | 'fairseq.temporal_convolution_tbc', 38 | relative_to='fairseq', 39 | headers=['fairseq/clib/temporal_convolution_tbc/temporal_convolution_tbc.h'], 40 | sources=['fairseq/clib/temporal_convolution_tbc/temporal_convolution_tbc.cpp'], 41 | define_macros=[('WITH_CUDA', None)], 42 | with_cuda=True, 43 | extra_compile_args=['-std=c++11'], 44 | ) 45 | 46 | 47 | class build_py_hook(build_py): 48 | def run(self): 49 | conv_tbc.build() 50 | build_py.run(self) 51 | 52 | 53 | setup( 54 | name='fairseq', 55 | version='0.1.0', 56 | description='Facebook AI Research Sequence-to-Sequence Toolkit', 57 | long_description=readme, 58 | license=license, 59 | install_requires=reqs.strip().split('\n'), 60 | packages=find_packages(), 61 | ext_modules=[bleu], 62 | 63 | # build and install PyTorch extensions 64 | package_data={ 65 | 'fairseq': ['temporal_convolution_tbc/*.so'], 66 | }, 67 | include_package_data=True, 68 | cmdclass={ 69 | 'build_py': build_py_hook, 70 | }, 71 | ) 72 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/score.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import argparse 10 | import os 11 | import sys 12 | 13 | from fairseq import bleu, dictionary, tokenizer 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser(description='Command-line script for BLEU scoring.') 18 | parser.add_argument('-s', '--sys', default='-', help='system output') 19 | parser.add_argument('-r', '--ref', required=True, help='references') 20 | parser.add_argument('-o', '--order', default=4, metavar='N', 21 | type=int, help='consider ngrams up to this order') 22 | parser.add_argument('--ignore-case', action='store_true', 23 | help='case-insensitive scoring') 24 | 25 | args = parser.parse_args() 26 | print(args) 27 | 28 | assert args.sys == '-' or os.path.exists(args.sys), \ 29 | "System output file {} does not exist".format(args.sys) 30 | assert os.path.exists(args.ref), \ 31 | "Reference file {} does not exist".format(args.ref) 32 | 33 | dict = dictionary.Dictionary() 34 | 35 | def readlines(fd): 36 | for line in fd.readlines(): 37 | if args.ignore_case: 38 | yield line.lower() 39 | yield line 40 | 41 | def score(fdsys): 42 | with open(args.ref) as fdref: 43 | scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk()) 44 | for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)): 45 | sys_tok = tokenizer.Tokenizer.tokenize(sys_tok, dict) 46 | ref_tok = tokenizer.Tokenizer.tokenize(ref_tok, dict) 47 | scorer.add(ref_tok, sys_tok) 48 | print(scorer.result_string(args.order)) 49 | 50 | if args.sys == '-': 51 | score(sys.stdin) 52 | else: 53 | with open(args.sys, 'r') as f: 54 | score(f) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/token_offsets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 
16 | 17 | # file: token_offsets.py 18 | # convert character to token offsets, tokenize sentence 19 | # 20 | # usage: %prog < input > output 21 | # 22 | 23 | 24 | import sys 25 | import re 26 | import os 27 | from util import * 28 | from Tokenizer import PTBTokenizer 29 | 30 | 31 | assert len(sys.argv) == 1 32 | 33 | 34 | # main 35 | # loop over sentences cum annotation 36 | tokenizer = PTBTokenizer() 37 | sentence = '' 38 | for line in sys.stdin: 39 | line = line.decode("utf8").strip() 40 | if line.startswith("S "): 41 | sentence = line[2:] 42 | sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence)) 43 | print sentence_tok.encode("utf8") 44 | elif line.startswith("A "): 45 | fields = line[2:].split('|||') 46 | start_end = fields[0] 47 | char_start, char_end = [int(a) for a in start_end.split()] 48 | # calculate token offsets 49 | prefix = sentence[:char_start] 50 | tok_start = len(tokenizer.tokenize(prefix)) 51 | postfix = sentence[:char_end] 52 | tok_end = len(tokenizer.tokenize(postfix)) 53 | start_end = str(tok_start) + " " + str(tok_end) 54 | fields[0] = start_end 55 | # tokenize corrections, remove trailing whitespace 56 | corrections = [(' '.join(tokenizer.tokenize(c))).strip() for c in fields[2].split('||')] 57 | fields[2] = '||'.join(corrections) 58 | annotation = "A " + '|||'.join(fields) 59 | print annotation.encode("utf8") 60 | else: 61 | print line.encode("utf8") 62 | 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLPCC 2018 Shared Task 2: Grammatical Error Correction 2 | This is the code of our team (Zlbnlp) for the NLPCC 2018 Shared Task 2 on Grammatical Error Correction. 3 | 4 | ## Usage 5 | ### Prerequisites 6 | * Python 3.6 7 | * PyTorch 0.2.0 (use the following commands to install it from source) 8 | 9 | ```bash 10 | export CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" # [anaconda root directory] 11 | conda install numpy pyyaml mkl mkl-include setuptools cmake cffi typing 12 | conda install -c mingfeima mkldnn 13 | conda install -c pytorch magma-cuda80 14 | 15 | git clone https://github.com/pytorch/pytorch.git 16 | cd pytorch 17 | git reset --hard a03e5cb40938b6b3f3e6dbddf9cff8afdff72d1b 18 | git submodule update --init 19 | pip install -r requirements.txt 20 | python setup.py install 21 | ``` 22 | 23 | * [M2 scorer scripts](http://www.comp.nus.edu.sg/~nlp/sw/m2scorer.tar.gz) (used to compute the evaluation metric) 24 | * [libgrass-ui toolkit](http://www.icst.pku.edu.cn/lcwm/pkunlp/downloads/libgrass-ui.tar.gz) (word segmentation toolkit) 25 | * fairseq-py (depends on PyTorch; use the following commands to install it) 26 | 27 | ```bash 28 | cd CS2S+BPE+Emb/software/fairseq-py 29 | pip install -r requirements.txt 30 | python setup.py build 31 | python setup.py develop 32 | ``` 33 | 34 | 35 | ### Data 36 | The data and embeddings can be found in [Zlbnlp_data](https://pan.baidu.com/s/18JXm1KGmRu3Pe45jt2sYBQ). 37 | You need to manually split the whole dataset into two parts (one possible split is sketched after this list): 38 | * training set: contains 1,215,876 sentence pairs; the file paths are CS2S+BPE+Emb/data/train.tok.src and CS2S+BPE+Emb/data/train.tok.trg 39 | * development set: contains 5k sentence pairs; the file paths are CS2S+BPE+Emb/data/dev.tok.src and CS2S+BPE+Emb/data/dev.tok.trg 40 | * the test data is source.txt.jieba.seg, segmented with the jieba toolkit.
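The repository does not ship a splitting script; the sketch below shows one possible way to hold out the last 5,000 pairs for development, assuming the downloaded parallel corpus is available as `data.tok.src`/`data.tok.trg` (hypothetical names) and GNU `head`/`tail` are installed:

```bash
# hold out the last 5k sentence pairs as the development set (one possible split)
head -n -5000 data.tok.src > CS2S+BPE+Emb/data/train.tok.src
head -n -5000 data.tok.trg > CS2S+BPE+Emb/data/train.tok.trg
tail -n 5000 data.tok.src > CS2S+BPE+Emb/data/dev.tok.src
tail -n 5000 data.tok.trg > CS2S+BPE+Emb/data/dev.tok.trg
```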
41 | 42 | ### Data processing 43 | 44 | ```bash 45 | cd CS2S+BPE+Emb/training/ 46 | chmod +x preprocess.sh 47 | ./preprocess.sh 48 | ``` 49 | 50 | ### Training 51 | 52 | * Training command 53 | 54 | The command below is what we used to train an model on the NLPCC-2018 Task 2 dataset. 55 | ``` 56 | ./train_embed.sh 57 | ``` 58 | 59 | ### Decoding 60 | The following is the command used to generate outputs and F0.5 score: 61 | ``` 62 | cd CS2S+BPE+Emb/ 63 | ./run.sh ./data/source.txt.jieba.seg ./output/CS2S+BPE+Emb/ 0 ./training/models/mlconv_embed/model1 64 | cd libgrass-ui/ 65 | ./remove_spac_pkunlp_segment.sh 66 | ```` 67 | 68 | ## Contact 69 | If you have questions, suggestions and bug reports, please email renhongkai27@gmail.com. 70 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/criterions/label_smoothed_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import math 10 | import torch 11 | from torch.autograd.variable import Variable 12 | import torch.nn.functional as F 13 | 14 | from .fairseq_criterion import FairseqCriterion 15 | 16 | 17 | class LabelSmoothedCrossEntropy(torch.autograd.Function): 18 | 19 | @staticmethod 20 | def forward(ctx, input, target, eps, padding_idx, weights): 21 | grad_input = input.new(input.size()).zero_() 22 | target = target.view(target.size(0), 1) 23 | grad_input = grad_input.scatter_(grad_input.dim() - 1, target, eps - 1) 24 | 25 | norm = grad_input.size(-1) 26 | if weights is not None: 27 | norm = weights.sum() 28 | grad_input.mul(weights.view(1, weights.size(0)).expand_as(grad_input)) 29 | 30 | if padding_idx is not None: 31 | norm -= 1 if weights is None else weights[padding_idx] 32 | grad_input.select(grad_input.dim() - 1, padding_idx).fill_(0) 33 | 34 | grad_input = grad_input.add(-eps / norm) 35 | 36 | ctx.grad_input = grad_input 37 | return input.new([grad_input.view(-1).dot(input.view(-1))]) 38 | 39 | @staticmethod 40 | def backward(ctx, grad): 41 | return Variable(ctx.grad_input, volatile=True) * grad, None, None, None, None 42 | 43 | 44 | class LabelSmoothedCrossEntropyCriterion(FairseqCriterion): 45 | 46 | def __init__(self, eps, padding_idx=None, weights=None): 47 | super().__init__() 48 | self.eps = eps 49 | self.padding_idx = padding_idx 50 | self.weights = weights 51 | 52 | def prepare(self, samples): 53 | self.denom = sum(s['ntokens'] if s else 0 for s in samples) 54 | 55 | def forward(self, net_output, sample): 56 | input = F.log_softmax(net_output.view(-1, net_output.size(-1))) 57 | target = sample['target'].view(-1) 58 | loss = LabelSmoothedCrossEntropy.apply(input, target, self.eps, self.padding_idx, self.weights) 59 | return loss / self.denom 60 | 61 | def aggregate(self, losses): 62 | return sum(losses) / math.log(2) 63 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/segment-char-ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | from __future__ import unicode_literals, division 6 | 7 | import sys 8 | 
import codecs 9 | import argparse 10 | 11 | # hack for python2/3 compatibility 12 | from io import open 13 | argparse.open = open 14 | 15 | # python 2/3 compatibility 16 | if sys.version_info < (3, 0): 17 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 18 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 19 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 20 | 21 | def create_parser(): 22 | parser = argparse.ArgumentParser( 23 | formatter_class=argparse.RawDescriptionHelpFormatter, 24 | description="segment rare words into character n-grams") 25 | 26 | parser.add_argument( 27 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 28 | metavar='PATH', 29 | help="Input file (default: standard input).") 30 | parser.add_argument( 31 | '--vocab', type=argparse.FileType('r'), metavar='PATH', 32 | required=True, 33 | help="Vocabulary file.") 34 | parser.add_argument( 35 | '--shortlist', type=int, metavar='INT', default=0, 36 | help="do not segment INT most frequent words in vocabulary (default: '%(default)s')).") 37 | parser.add_argument( 38 | '-n', type=int, metavar='INT', default=2, 39 | help="segment rare words into character n-grams of size INT (default: '%(default)s')).") 40 | parser.add_argument( 41 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 42 | metavar='PATH', 43 | help="Output file (default: standard output)") 44 | parser.add_argument( 45 | '--separator', '-s', type=str, default='@@', metavar='STR', 46 | help="Separator between non-final subword units (default: '%(default)s'))") 47 | 48 | return parser 49 | 50 | 51 | if __name__ == '__main__': 52 | 53 | parser = create_parser() 54 | args = parser.parse_args() 55 | 56 | vocab = [line.split()[0] for line in args.vocab if len(line.split()) == 2] 57 | vocab = dict((y,x) for (x,y) in enumerate(vocab)) 58 | 59 | for line in args.input: 60 | for word in line.split(): 61 | if word not in vocab or vocab[word] > args.shortlist: 62 | i = 0 63 | while i*args.n < len(word): 64 | args.output.write(word[i*args.n:i*args.n+args.n]) 65 | i += 1 66 | if i*args.n < len(word): 67 | args.output.write(args.separator) 68 | args.output.write(' ') 69 | else: 70 | args.output.write(word + ' ') 71 | args.output.write('\n') 72 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/augmenter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import time 5 | import numpy as np 6 | import codecs 7 | import argparse 8 | 9 | # Initializing the logging module 10 | import logging 11 | import log_utils as L 12 | logger = logging.getLogger(__name__) 13 | 14 | from candidatesreader import NBestList 15 | from features import * 16 | 17 | def augment(features, source_path, input_nbest_path, output_nbest_path): 18 | ''' Function to augment the n-best list with a feature function 19 | :param feature: The feature function object 20 | :param source_path: Path to the original source sentences (maybe required for the feature function) 21 | :param input_nbest_path: Path to the n-best file 22 | :param output_nbest_path: Path to the output n-best file 23 | ''' 24 | # Initialize NBestList objects 25 | logger.info('Initializing Nbest lists') 26 | input_nbest = NBestList(input_nbest_path, mode='r') 27 | output_nbest = NBestList(output_nbest_path, mode='w') 28 | 29 | # Load the source sentences 30 | logger.info('Loading source sentences') 31 | src_sents = codecs.open(source_path, mode='r', 
encoding='UTF-8') 32 | 33 | # For each item in the n-best list, append the feature 34 | sent_count = 0 35 | for group, src_sent in zip(input_nbest, src_sents): 36 | candidate_count = 0 37 | for item in group: 38 | for feature in features: 39 | item.append_feature(feature.name, feature.get_score(src_sent, item.hyp, (sent_count, candidate_count))) 40 | output_nbest.write(item) 41 | candidate_count += 1 42 | sent_count += 1 43 | if (sent_count % 100 == 0): 44 | logger.info('Augmented ' + L.b_yellow(str(sent_count)) + ' sentences.') 45 | output_nbest.close() 46 | 47 | 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("-s", "--source-sentence-file", dest="source_path", required=True, help="Path to the file containing source sentences.") 50 | parser.add_argument("-i", "--input-nbest", dest="input_nbest_path", required=True, help="Input n-best file") 51 | parser.add_argument("-o", "--output-nbest", dest="output_nbest_path", required=True, help="Output n-best file") 52 | parser.add_argument("-f", "--feature", dest="feature_string", required=True, help="feature initializer, e.g. LM('LM0','/path/to/lm_file', normalize=True)") 53 | args = parser.parse_args() 54 | 55 | L.set_logger(os.path.abspath(os.path.dirname(args.output_nbest_path)),'augment_log.txt') 56 | L.print_args(args) 57 | features = eval('['+args.feature_string+']') 58 | augment(features, args.source_path, args.input_nbest_path, args.output_nbest_path) 59 | logger.info(L.green('Augmenting done.')) 60 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/run.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | ## This script runs the complete GEC system on any given test set 3 | 4 | set -e 5 | set -x 6 | 7 | source paths.sh 8 | 9 | if [ $# -ge 4 ]; then 10 | input_file=$1 11 | output_dir=$2 12 | device=$3 13 | model_path=$4 14 | if [ $# -eq 6 ]; then 15 | reranker_weights=$5 16 | reranker_feats=$6 17 | fi 18 | else 19 | echo "Please specify the paths to the input_file and output directory" 20 | echo "Usage: `basename $0` <input_file> <output_dir> <device> <model_path> [optional args: <reranker_weights> <reranker_feats>]" >&2 21 | fi 22 | 23 | if [ -d $model_path ]; then 24 | models=`ls $model_path/*pt | tr '\n' ' ' | sed "s| \([^$]\)| --path \1|g"` 25 | echo $models 26 | elif [ -f $model_path ]; then 27 | models=$model_path 28 | elif [ ! 
-e $model_path ]; then 29 | echo "Model path not found: $model_path" 30 | fi 31 | 32 | 33 | FAIRSEQPY=$SOFTWARE_DIR/fairseq-py 34 | NBEST_RERANKER=$SOFTWARE_DIR/nbest-reranker 35 | 36 | 37 | beam=12 38 | nbest=$beam 39 | threads=12 40 | 41 | mkdir -p $output_dir 42 | $SCRIPTS_DIR/apply_bpe.py -c $TRAINING_DIR/models/bpe_model/train.bpe.model < $input_file > $output_dir/input.bpe.txt 43 | 44 | # running fairseq on the test data 45 | CUDA_VISIBLE_DEVICES=$device python $FAIRSEQPY/generate.py --no-progress-bar --path $models --beam $beam --nbest $beam --interactive --workers $threads $TRAINING_DIR/processed/bin/ < $output_dir/input.bpe.txt > $output_dir/output.bpe.nbest.txt 46 | 47 | # getting best hypotheses 48 | cat $output_dir/output.bpe.nbest.txt | grep "^H" | python -c "import sys; x = sys.stdin.readlines(); x = ' '.join([ x[i] for i in range(len(x)) if(i%$nbest == 0) ]); print(x)" | cut -f3 > $output_dir/output.bpe.txt 49 | 50 | # debpe 51 | cat $output_dir/output.bpe.txt | sed 's|@@ ||g' | sed '$ d' > $output_dir/output.tok.txt 52 | 53 | # additionally re-rank outputs 54 | if [ $# -eq 6 ]; then 55 | if [ $reranker_feats == "eo" ]; then 56 | featstring="EditOps(name='EditOps0')" 57 | elif [ $reranker_feats == "eolm" ]; then 58 | featstring="EditOps(name='EditOps0'), LM('LM0', '$MODEL_DIR/lm/94Bcclm.trie', normalize=False), WordPenalty(name='WordPenalty0')" 59 | fi 60 | $SCRIPTS_DIR/nbest_reformat.py -i $output_dir/output.bpe.nbest.txt --debpe > $output_dir/output.tok.nbest.reformat.txt 61 | $NBEST_RERANKER/augmenter.py -s $input_file -i $output_dir/output.tok.nbest.reformat.txt -o $output_dir/output.tok.nbest.reformat.augmented.txt -f "$featstring" 62 | $NBEST_RERANKER/rerank.py -i $output_dir/output.tok.nbest.reformat.augmented.txt -w $reranker_weights -o $output_dir --clean-up 63 | mv $output_dir/output.tok.nbest.reformat.augmented.txt.reranked.1best $output_dir/output.reranked.tok.txt 64 | fi 65 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/data/prepare-iwslt14.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh 4 | 5 | echo 'Cloning Moses github repository (for tokenization scripts)...' 6 | git clone https://github.com/moses-smt/mosesdecoder.git 7 | 8 | SCRIPTS=mosesdecoder/scripts 9 | TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl 10 | LC=$SCRIPTS/tokenizer/lowercase.perl 11 | CLEAN=$SCRIPTS/training/clean-corpus-n.perl 12 | 13 | URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz" 14 | GZ=de-en.tgz 15 | 16 | if [ ! -d "$SCRIPTS" ]; then 17 | echo "Please set SCRIPTS variable correctly to point to Moses scripts." 18 | exit 19 | fi 20 | 21 | src=de 22 | tgt=en 23 | lang=de-en 24 | prep=iwslt14.tokenized.de-en 25 | tmp=$prep/tmp 26 | orig=orig 27 | 28 | mkdir -p $orig $tmp $prep 29 | 30 | echo "Downloading data from ${URL}..." 31 | cd $orig 32 | wget "$URL" 33 | 34 | if [ -f $GZ ]; then 35 | echo "Data successfully downloaded." 36 | else 37 | echo "Data not successfully downloaded." 38 | exit 39 | fi 40 | 41 | tar zxvf $GZ 42 | cd .. 43 | 44 | echo "pre-processing train data..." 
45 | for l in $src $tgt; do 46 | f=train.tags.$lang.$l 47 | tok=train.tags.$lang.tok.$l 48 | 49 | cat $orig/$lang/$f | \ 50 | grep -v '' | \ 51 | grep -v '' | \ 52 | grep -v '' | \ 53 | sed -e 's///g' | \ 54 | sed -e 's/<\/title>//g' | \ 55 | sed -e 's/<description>//g' | \ 56 | sed -e 's/<\/description>//g' | \ 57 | perl $TOKENIZER -threads 8 -l $l > $tmp/$tok 58 | echo "" 59 | done 60 | perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train.tags.$lang.clean 1 175 61 | for l in $src $tgt; do 62 | perl $LC < $tmp/train.tags.$lang.clean.$l > $tmp/train.tags.$lang.$l 63 | done 64 | 65 | echo "pre-processing valid/test data..." 66 | for l in $src $tgt; do 67 | for o in `ls $orig/$lang/IWSLT14.TED*.$l.xml`; do 68 | fname=${o##*/} 69 | f=$tmp/${fname%.*} 70 | echo $o $f 71 | grep '<seg id' $o | \ 72 | sed -e 's/<seg id="[0-9]*">\s*//g' | \ 73 | sed -e 's/\s*<\/seg>\s*//g' | \ 74 | sed -e "s/\’/\'/g" | \ 75 | perl $TOKENIZER -threads 8 -l $l | \ 76 | perl $LC > $f 77 | echo "" 78 | done 79 | done 80 | 81 | 82 | echo "creating train, valid, test..." 83 | for l in $src $tgt; do 84 | awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/valid.$l 85 | awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/train.$l 86 | 87 | cat $tmp/IWSLT14.TED.dev2010.de-en.$l \ 88 | $tmp/IWSLT14.TEDX.dev2012.de-en.$l \ 89 | $tmp/IWSLT14.TED.tst2010.de-en.$l \ 90 | $tmp/IWSLT14.TED.tst2011.de-en.$l \ 91 | $tmp/IWSLT14.TED.tst2012.de-en.$l \ 92 | > $prep/test.$l 93 | done 94 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import re 10 | import torch 11 | 12 | from fairseq import dictionary 13 | 14 | 15 | def tokenize_line(line): 16 | line = re.sub(r"\t", "", line) 17 | line = re.sub(r"^\s+", "", line) 18 | line = re.sub(r"\s+$", "", line) 19 | line = re.sub(r"\s+", " ", line) 20 | return line.split() 21 | 22 | 23 | class Tokenizer: 24 | 25 | @staticmethod 26 | def build_dictionary(filename, tokenize=tokenize_line): 27 | dict = dictionary.Dictionary() 28 | Tokenizer.add_file_to_dictionary(filename, dict, tokenize) 29 | dict.finalize() 30 | return dict 31 | 32 | @staticmethod 33 | def add_file_to_dictionary(filename, dict, tokenize): 34 | with open(filename, 'r') as f: 35 | for line in f.readlines(): 36 | for word in tokenize(line): 37 | dict.add_symbol(word) 38 | dict.add_symbol(dict.eos_word) 39 | 40 | @staticmethod 41 | def binarize(filename, dict, consumer, tokenize=tokenize_line): 42 | nseq, ntok, nunk = 0, 0, 0 43 | replaced = {} 44 | with open(filename, 'r') as f: 45 | for line in f.readlines(): 46 | words = tokenize(line) 47 | nwords = len(words) 48 | ids = torch.IntTensor(nwords + 1) 49 | nseq = nseq + 1 50 | for i in range(0, len(words)): 51 | word = words[i] 52 | idx = dict.index(word) 53 | if idx == dict.unk_index and word != dict.unk_word: 54 | nunk = nunk + 1 55 | if word in replaced: 56 | replaced[word] = replaced[word] + 1 57 | else: 58 | replaced[word] = 1 59 | ids[i] = idx 60 | 61 | ids[nwords] = dict.eos_index 62 | consumer(ids) 63 | ntok = ntok + len(ids) 64 | return {'nseq': nseq, 'nunk': nunk, 'ntok': ntok, 'replaced': len(replaced)} 65 | 66 | @staticmethod 67 | def tokenize(line, dict, tokenize=tokenize_line, add_if_not_exist=True): 68 | words = tokenize(line) 69 | nwords = len(words) 70 | ids = torch.IntTensor(nwords + 1) 71 | for i in range(0, len(words)): 72 | if add_if_not_exist: 73 | ids[i] = dict.add_symbol(words[i]) 74 | else: 75 | ids[i] = dict.index(words[i]) 76 | ids[nwords] = dict.eos_index 77 | return ids 78 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/rerank.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import imp 6 | import shutil 7 | 8 | import argparse 9 | 10 | # Initializing the logging module 11 | import logging 12 | import log_utils as L 13 | import configreader 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-i", "--input-nbest", dest="input_nbest", required=True, help="Input n-best file") 20 | parser.add_argument("-w", "--weights", dest="weights", required=True, help="Input weights file") 21 | parser.add_argument("-o", "--output-dir", dest="out_dir", required=True, help="Output directory") 22 | parser.add_argument("-c", "--clean-up", dest="clean_up", action='store_true', help="Temporary files will be removed") 23 | parser.add_argument("-q", "--quiet", dest="quiet", action='store_true', help="Nothing will be printed in STDERR") 24 | args = parser.parse_args() 25 | 26 | 27 | from candidatesreader import NBestList 28 | import codecs 29 | import numpy as np 30 | 31 | if not os.path.exists(args.out_dir): 32 | os.makedirs(args.out_dir) 33 | L.set_logger(os.path.abspath(args.out_dir),'train_log.txt') 34 | L.print_args(args) 35 | 36 | 37 | output_nbest_path = args.out_dir + '/augmented.nbest' 38 | shutil.copy(args.input_nbest, output_nbest_path) 39 | 40 | with open(args.weights, 'r') as input_weights: 41 | lines 
= input_weights.readlines() 42 | if len(lines) > 1: 43 | L.warning("Weights file has more than one line. I'll read the 1st and ignore the rest.") 44 | weights = np.asarray(lines[0].strip().split(" "), dtype=float) 45 | 46 | prefix = os.path.basename(args.input_nbest) 47 | input_aug_nbest = NBestList(output_nbest_path, mode='r') 48 | output_nbest = NBestList(args.out_dir + '/' + prefix + '.reranked.nbest', mode='w') 49 | output_1best = codecs.open(args.out_dir + '/' + prefix + '.reranked.1best', mode='w', encoding='UTF-8') 50 | 51 | def is_number(s): 52 | try: 53 | float(s) 54 | return True 55 | except ValueError: 56 | return False 57 | 58 | counter = 0 59 | for group in input_aug_nbest: 60 | index = 0 61 | scores = dict() 62 | for item in group: 63 | features = np.asarray([x for x in item.features.split() if is_number(x)], dtype=float) 64 | try: 65 | scores[index] = np.dot(features, weights) 66 | except ValueError: 67 | logger.error('Number of features in the nbest and the weights file are not the same') 68 | index += 1 69 | sorted_indices = sorted(scores, key=scores.get, reverse=True) 70 | for idx in sorted_indices: 71 | output_nbest.write(group[idx]) 72 | output_1best.write(group[sorted_indices[0]].hyp + "\n") 73 | counter += 1 74 | if counter % 100 == 0: 75 | logger.info(L.b_yellow(str(counter)) + " groups processed") 76 | logger.info("%i groups processed" % (counter)) 77 | logger.info("Finished processing %i groups" % (counter)) 78 | logger.info(L.green('Reranking completed.')) 79 | output_nbest.close() 80 | output_1best.close() 81 | 82 | if args.clean_up: 83 | os.remove(output_nbest_path) 84 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/clib/libbleu/libbleu.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include <map> 10 | #include <array> 11 | #include <cstring> 12 | #include <cstdio> 13 | 14 | typedef struct 15 | { 16 | size_t reflen; 17 | size_t predlen; 18 | size_t match1; 19 | size_t count1; 20 | size_t match2; 21 | size_t count2; 22 | size_t match3; 23 | size_t count3; 24 | size_t match4; 25 | size_t count4; 26 | } bleu_stat; 27 | 28 | // left trim (remove pad) 29 | void bleu_ltrim(size_t* len, int** sent, int pad) { 30 | size_t start = 0; 31 | while(start < *len) { 32 | if (*(*sent + start) != pad) { break; } 33 | start++; 34 | } 35 | *sent += start; 36 | *len -= start; 37 | } 38 | 39 | // right trim remove (eos) 40 | void bleu_rtrim(size_t* len, int** sent, int pad, int eos) { 41 | size_t end = *len - 1; 42 | while (end > 0) { 43 | if (*(*sent + end) != eos && *(*sent + end) != pad) { break; } 44 | end--; 45 | } 46 | *len = end + 1; 47 | } 48 | 49 | // left and right trim 50 | void bleu_trim(size_t* len, int** sent, int pad, int eos) { 51 | bleu_ltrim(len, sent, pad); 52 | bleu_rtrim(len, sent, pad, eos); 53 | } 54 | 55 | size_t bleu_hash(int len, int* data) { 56 | size_t h = 14695981039346656037ul; 57 | size_t prime = 0x100000001b3; 58 | char* b = (char*) data; 59 | size_t blen = sizeof(int) * len; 60 | 61 | while (blen-- > 0) { 62 | h ^= *b++; 63 | h *= prime; 64 | } 65 | 66 | return h; 67 | } 68 | 69 | void bleu_addngram( 70 | size_t *ntotal, size_t *nmatch, size_t n, 71 | size_t reflen, int* ref, size_t predlen, int* pred) { 72 | 73 | if (predlen < n) { return; } 74 | 75 | predlen = predlen - n + 1; 76 | (*ntotal) += predlen; 77 | 78 | if (reflen < n) { return; } 79 | 80 | reflen = reflen - n + 1; 81 | 82 | std::map<size_t, size_t> count; 83 | while (predlen > 0) { 84 | size_t w = bleu_hash(n, pred++); 85 | count[w]++; 86 | predlen--; 87 | } 88 | 89 | while (reflen > 0) { 90 | size_t w = bleu_hash(n, ref++); 91 | if (count[w] > 0) { 92 | (*nmatch)++; 93 | count[w] -=1; 94 | } 95 | reflen--; 96 | } 97 | } 98 | 99 | extern "C" { 100 | 101 | void bleu_zero_init(bleu_stat* stat) { 102 | std::memset(stat, 0, sizeof(bleu_stat)); 103 | } 104 | 105 | void bleu_one_init(bleu_stat* stat) { 106 | bleu_zero_init(stat); 107 | stat->count1 = 1; 108 | stat->count2 = 1; 109 | stat->count3 = 1; 110 | stat->count4 = 1; 111 | stat->match1 = 1; 112 | stat->match2 = 1; 113 | stat->match3 = 1; 114 | stat->match4 = 1; 115 | } 116 | 117 | void bleu_add( 118 | bleu_stat* stat, 119 | size_t reflen, int* ref, size_t predlen, int* pred, int pad, int eos) { 120 | 121 | bleu_trim(&reflen, &ref, pad, eos); 122 | bleu_trim(&predlen, &pred, pad, eos); 123 | stat->reflen += reflen; 124 | stat->predlen += predlen; 125 | 126 | bleu_addngram(&stat->count1, &stat->match1, 1, reflen, ref, predlen, pred); 127 | bleu_addngram(&stat->count2, &stat->match2, 2, reflen, ref, predlen, pred); 128 | bleu_addngram(&stat->count3, &stat->match3, 3, reflen, ref, predlen, pred); 129 | bleu_addngram(&stat->count4, &stat->match4, 4, reflen, ref, predlen, pred); 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/modules/linearized_convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. 
An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import torch 10 | import torch.nn.functional as F 11 | from .conv_tbc import ConvTBC 12 | 13 | 14 | class LinearizedConvolution(ConvTBC): 15 | """An optimized version of nn.Conv1d. 16 | 17 | This module replaces convolutions with linear layers as appropriate 18 | and supports optimizations for incremental inference. 19 | """ 20 | 21 | def __init__(self, in_channels, out_channels, kernel_size, **kwargs): 22 | super().__init__(in_channels, out_channels, kernel_size, **kwargs) 23 | self.clear_buffer() 24 | 25 | self._linearized_weight = None 26 | self.register_backward_hook(self._clear_linearized_weight) 27 | 28 | def remove_future_timesteps(self, x): 29 | """Remove future time steps created by padding.""" 30 | if self.kernel_size[0] > 1 and self.padding[0] > 0: 31 | x = x[:-self.padding[0], :, :] 32 | return x 33 | 34 | def incremental_forward(self, input): 35 | """Forward convolution one time step at a time. 36 | 37 | This function maintains an internal state to buffer signal and 38 | accepts a single frame as input. If the input order changes 39 | between time steps, call reorder_buffer. To apply to fresh 40 | inputs, call clear_buffer. 41 | """ 42 | if self.training: 43 | raise RuntimeError('LinearizedConvolution only supports inference') 44 | 45 | # run forward pre hooks (e.g., weight norm) 46 | for hook in self._forward_pre_hooks.values(): 47 | hook(self, input) 48 | 49 | # reshape weight 50 | weight = self._get_linearized_weight() 51 | kw = self.kernel_size[0] 52 | 53 | bsz = input.size(0) # input: bsz x len x dim 54 | if kw > 1: 55 | input = input.data 56 | if self.input_buffer is None: 57 | self.input_buffer = input.new(bsz, kw, input.size(2)) 58 | self.input_buffer.zero_() 59 | else: 60 | # shift buffer 61 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone() 62 | # append next input 63 | self.input_buffer[:, -1, :] = input[:, -1, :] 64 | input = torch.autograd.Variable(self.input_buffer, volatile=True) 65 | output = F.linear(input.view(bsz, -1), weight, self.bias) 66 | return output.view(bsz, 1, -1) 67 | 68 | def clear_buffer(self): 69 | self.input_buffer = None 70 | 71 | def reorder_buffer(self, new_order): 72 | if self.input_buffer is not None: 73 | self.input_buffer = self.input_buffer.index_select(0, new_order) 74 | 75 | def _get_linearized_weight(self): 76 | if self._linearized_weight is None: 77 | kw = self.kernel_size[0] 78 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() 79 | assert weight.size() == (self.out_channels, kw, self.in_channels) 80 | self._linearized_weight = weight.view(self.out_channels, -1) 81 | return self._linearized_weight 82 | 83 | def _clear_linearized_weight(self, *args): 84 | self._linearized_weight = None 85 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
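#
# Usage sketch for the Scorer class defined below, mirroring how score.py uses
# it ('dict' is a fairseq dictionary.Dictionary; ref_tok and sys_tok are
# torch.IntTensor sequences produced by tokenizer.Tokenizer.tokenize):
#
#   scorer = Scorer(dict.pad(), dict.eos(), dict.unk())
#   scorer.add(ref_tok, sys_tok)          # call once per sentence pair
#   print(scorer.result_string(order=4))  # corpus-level BLEU summary
#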
7 | # 8 | 9 | import ctypes 10 | import math 11 | import torch 12 | 13 | try: 14 | from fairseq import libbleu 15 | except ImportError as e: 16 | import sys 17 | sys.stderr.write('ERROR: missing libbleu.so. run `python setup.py install`\n') 18 | raise e 19 | 20 | 21 | C = ctypes.cdll.LoadLibrary(libbleu.__file__) 22 | 23 | 24 | class BleuStat(ctypes.Structure): 25 | _fields_ = [ 26 | ('reflen', ctypes.c_size_t), 27 | ('predlen', ctypes.c_size_t), 28 | ('match1', ctypes.c_size_t), 29 | ('count1', ctypes.c_size_t), 30 | ('match2', ctypes.c_size_t), 31 | ('count2', ctypes.c_size_t), 32 | ('match3', ctypes.c_size_t), 33 | ('count3', ctypes.c_size_t), 34 | ('match4', ctypes.c_size_t), 35 | ('count4', ctypes.c_size_t), 36 | ] 37 | 38 | 39 | class Scorer(object): 40 | def __init__(self, pad, eos, unk): 41 | self.stat = BleuStat() 42 | self.pad = pad 43 | self.eos = eos 44 | self.unk = unk 45 | self.reset() 46 | 47 | def reset(self, one_init=False): 48 | if one_init: 49 | C.bleu_one_init(ctypes.byref(self.stat)) 50 | else: 51 | C.bleu_zero_init(ctypes.byref(self.stat)) 52 | 53 | def add(self, ref, pred): 54 | if not isinstance(ref, torch.IntTensor): 55 | raise TypeError('ref must be a torch.IntTensor (got {})' 56 | .format(type(ref))) 57 | if not isinstance(pred, torch.IntTensor): 58 | raise TypeError('pred must be a torch.IntTensor(got {})' 59 | .format(type(pred))) 60 | 61 | assert self.unk > 0, 'unknown token index must be >0' 62 | rref = ref.clone() 63 | rref.apply_(lambda x: x if x != self.unk else -x) 64 | 65 | rref = rref.contiguous().view(-1) 66 | pred = pred.contiguous().view(-1) 67 | 68 | C.bleu_add( 69 | ctypes.byref(self.stat), 70 | ctypes.c_size_t(rref.size(0)), 71 | ctypes.c_void_p(rref.data_ptr()), 72 | ctypes.c_size_t(pred.size(0)), 73 | ctypes.c_void_p(pred.data_ptr()), 74 | ctypes.c_int(self.pad), 75 | ctypes.c_int(self.eos)) 76 | 77 | def score(self, order=4): 78 | psum = sum(math.log(p) if p > 0 else float('-Inf') 79 | for p in self.precision()[:order]) 80 | return self.brevity() * math.exp(psum / order) * 100 81 | 82 | def precision(self): 83 | def ratio(a, b): 84 | return a / b if b > 0 else 0 85 | 86 | return [ 87 | ratio(self.stat.match1, self.stat.count1), 88 | ratio(self.stat.match2, self.stat.count2), 89 | ratio(self.stat.match3, self.stat.count3), 90 | ratio(self.stat.match4, self.stat.count4), 91 | ] 92 | 93 | def brevity(self): 94 | r = self.stat.reflen / self.stat.predlen 95 | return min(1, math.exp(1 - r)) 96 | 97 | def result_string(self, order=4): 98 | assert order <= 4, "BLEU scores for order > 4 aren't supported" 99 | fmt = 'BLEU{} = {:2.2f}, {:2.1f}' 100 | for i in range(1, order): 101 | fmt += '/{:2.1f}' 102 | fmt += ' (BP={:.3f}, ratio={:.3f}, syslen={}, reflen={})' 103 | bleup = [p * 100 for p in self.precision()[:order]] 104 | return fmt.format(order, self.score(order=order), *bleup, 105 | self.brevity(), self.stat.reflen/self.stat.predlen, 106 | self.stat.predlen, self.stat.reflen) 107 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/modules/conv_tbc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
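#
# Usage sketch for the ConvTBC module defined below (a sketch only: the sizes
# are made up, the layout is time x batch x channel as described in the class
# docstring, and running it requires the compiled temporal_convolution_tbc
# extension):
#
#   conv = ConvTBC(in_channels=256, out_channels=512, kernel_size=3, padding=1)
#   x = Variable(torch.randn(20, 8, 256))   # (time, batch, in_channels)
#   y = conv(x)                             # -> (time, batch, out_channels)
#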
7 | # 8 | 9 | import torch 10 | from torch.autograd import Variable, Function 11 | from torch.nn.modules.utils import _single 12 | 13 | try: 14 | from fairseq import temporal_convolution_tbc 15 | except ImportError as e: 16 | import sys 17 | sys.stderr.write('ERROR: missing temporal_convolution_tbc, run `python setup.py install`\n') 18 | raise e 19 | 20 | 21 | class ConvTBC(torch.nn.Module): 22 | """1D convolution over an input of shape (time x batch x channel) 23 | 24 | The implementation uses gemm to perform the convolution. This implementation 25 | is faster than cuDNN for small kernel sizes. 26 | """ 27 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, 28 | padding=0): 29 | super(ConvTBC, self).__init__() 30 | self.in_channels = in_channels 31 | self.out_channels = out_channels 32 | self.kernel_size = _single(kernel_size) 33 | self.stride = _single(stride) 34 | self.padding = _single(padding) 35 | assert self.stride == (1,) 36 | 37 | self.weight = torch.nn.Parameter(torch.Tensor( 38 | self.kernel_size[0], in_channels, out_channels)) 39 | self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) 40 | 41 | def forward(self, input): 42 | return ConvTBCFunction.apply( 43 | input.contiguous(), self.weight, self.bias, self.padding[0]) 44 | 45 | def __repr__(self): 46 | s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}' 47 | ', padding={padding}') 48 | if self.bias is None: 49 | s += ', bias=False' 50 | s += ')' 51 | return s.format(name=self.__class__.__name__, **self.__dict__) 52 | 53 | 54 | class ConvTBCFunction(Function): 55 | @staticmethod 56 | def forward(ctx, input, weight, bias, pad): 57 | input_size = input.size() 58 | weight_size = weight.size() 59 | kernel_size = weight_size[0] 60 | 61 | output = input.new( 62 | input_size[0] - kernel_size + 1 + pad * 2, 63 | input_size[1], 64 | weight_size[2]) 65 | 66 | ctx.input_size = input_size 67 | ctx.weight_size = weight_size 68 | ctx.save_for_backward(input, weight) 69 | temporal_convolution_tbc.TemporalConvolutionTBC_forward( 70 | input.type().encode('utf-8'), 71 | input, 72 | output, 73 | weight, 74 | bias) 75 | 76 | return output 77 | 78 | @staticmethod 79 | def backward(ctx, grad_output): 80 | input, weight = ctx.saved_tensors 81 | 82 | grad_output = grad_output.data.contiguous() 83 | grad_input = grad_output.new(ctx.input_size).zero_() 84 | grad_weight = grad_output.new(ctx.weight_size).zero_() 85 | grad_bias = grad_output.new(ctx.weight_size[2]) 86 | 87 | temporal_convolution_tbc.TemporalConvolutionTBC_backward( 88 | input.type().encode('utf-8'), 89 | grad_output, 90 | grad_input, 91 | grad_weight, 92 | grad_bias, 93 | input, 94 | weight) 95 | 96 | grad_input = Variable(grad_input, volatile=True) 97 | grad_weight = Variable(grad_weight, volatile=True) 98 | grad_bias = Variable(grad_bias, volatile=True) 99 | 100 | return grad_input, grad_weight, grad_bias, None 101 | 102 | 103 | def conv_tbc(input, weight, bias=None, stride=1, padding=0): 104 | return ConvTBCFunction.apply( 105 | input.contiguous(), weight, bias, padding[0]) 106 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/scripts/convert_model.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (c) 2017-present, Facebook, Inc. 2 | -- All rights reserved. 3 | -- 4 | -- This source code is licensed under the license found in the LICENSE file in 5 | -- the root directory of this source tree. 
An additional grant of patent rights 6 | -- can be found in the PATENTS file in the same directory. 7 | -- 8 | -- Usage: convert_model.lua <model_epoch1.th7> 9 | require 'torch' 10 | local fairseq = require 'fairseq' 11 | 12 | model = torch.load(arg[1]) 13 | 14 | function find_weight_norm(container, module) 15 | for _, wn in ipairs(container:listModules()) do 16 | if torch.type(wn) == 'nn.WeightNorm' and wn.modules[1] == module then 17 | return wn 18 | end 19 | end 20 | end 21 | 22 | function push_state(dict, key, module) 23 | if torch.type(module) == 'nn.Linear' then 24 | local wn = find_weight_norm(model.module, module) 25 | assert(wn) 26 | dict[key .. '.weight_v'] = wn.v:float() 27 | dict[key .. '.weight_g'] = wn.g:float() 28 | elseif torch.type(module) == 'nn.TemporalConvolutionTBC' then 29 | local wn = find_weight_norm(model.module, module) 30 | assert(wn) 31 | local v = wn.v:float():view(wn.viewOut):transpose(2, 3) 32 | dict[key .. '.weight_v'] = v 33 | dict[key .. '.weight_g'] = wn.g:float():view(module.weight:size(3), 1, 1) 34 | else 35 | dict[key .. '.weight'] = module.weight:float() 36 | end 37 | if module.bias then 38 | dict[key .. '.bias'] = module.bias:float() 39 | end 40 | end 41 | 42 | encoder_dict = {} 43 | decoder_dict = {} 44 | combined_dict = {} 45 | 46 | function encoder_state(encoder) 47 | luts = encoder:findModules('nn.LookupTable') 48 | push_state(encoder_dict, 'embed_tokens', luts[1]) 49 | push_state(encoder_dict, 'embed_positions', luts[2]) 50 | 51 | fcs = encoder:findModules('nn.Linear') 52 | assert(#fcs >= 2) 53 | local nInputPlane = fcs[1].weight:size(1) 54 | push_state(encoder_dict, 'fc1', table.remove(fcs, 1)) 55 | push_state(encoder_dict, 'fc2', table.remove(fcs, #fcs)) 56 | 57 | for i, module in ipairs(encoder:findModules('nn.TemporalConvolutionTBC')) do 58 | push_state(encoder_dict, 'convolutions.' .. tostring(i - 1), module) 59 | if nInputPlane ~= module.weight:size(3) / 2 then 60 | push_state(encoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1)) 61 | end 62 | nInputPlane = module.weight:size(3) / 2 63 | end 64 | assert(#fcs == 0) 65 | end 66 | 67 | function decoder_state(decoder) 68 | luts = decoder:findModules('nn.LookupTable') 69 | push_state(decoder_dict, 'embed_tokens', luts[1]) 70 | push_state(decoder_dict, 'embed_positions', luts[2]) 71 | 72 | fcs = decoder:findModules('nn.Linear') 73 | local nInputPlane = fcs[1].weight:size(1) 74 | push_state(decoder_dict, 'fc1', table.remove(fcs, 1)) 75 | push_state(decoder_dict, 'fc2', fcs[#fcs - 1]) 76 | push_state(decoder_dict, 'fc3', fcs[#fcs]) 77 | 78 | table.remove(fcs, #fcs) 79 | table.remove(fcs, #fcs) 80 | 81 | for i, module in ipairs(decoder:findModules('nn.TemporalConvolutionTBC')) do 82 | if nInputPlane ~= module.weight:size(3) / 2 then 83 | push_state(decoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1)) 84 | end 85 | nInputPlane = module.weight:size(3) / 2 86 | 87 | local prefix = 'attention.' .. tostring(i - 1) 88 | push_state(decoder_dict, prefix .. '.in_projection', table.remove(fcs, 1)) 89 | push_state(decoder_dict, prefix .. '.out_projection', table.remove(fcs, 1)) 90 | push_state(decoder_dict, 'convolutions.' .. tostring(i - 1), module) 91 | end 92 | assert(#fcs == 0) 93 | end 94 | 95 | 96 | _encoder = model.module.modules[2] 97 | _decoder = model.module.modules[3] 98 | 99 | encoder_state(_encoder) 100 | decoder_state(_decoder) 101 | 102 | for k, v in pairs(encoder_dict) do 103 | combined_dict['encoder.' .. 
k] = v 104 | end 105 | for k, v in pairs(decoder_dict) do 106 | combined_dict['decoder.' .. k] = v 107 | end 108 | 109 | 110 | torch.save('state_dict.t7', combined_dict) 111 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/dictionary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import math 10 | import torch 11 | 12 | 13 | class Dictionary(object): 14 | """A mapping from symbols to consecutive integers""" 15 | def __init__(self, pad='<pad>', eos='</s>', unk='<unk>'): 16 | self.unk_word, self.pad_word, self.eos_word = unk, pad, eos 17 | self.symbols = [] 18 | self.count = [] 19 | self.indices = {} 20 | self.add_symbol('<Lua heritage>') 21 | self.pad_index = self.add_symbol(pad) 22 | self.eos_index = self.add_symbol(eos) 23 | self.unk_index = self.add_symbol(unk) 24 | self.nspecial = len(self.symbols) 25 | 26 | def __getitem__(self, idx): 27 | if idx < len(self.symbols): 28 | return self.symbols[idx] 29 | return self.unk_word 30 | 31 | def __len__(self): 32 | """Returns the number of symbols in the dictionary""" 33 | return len(self.symbols) 34 | 35 | def index(self, sym): 36 | """Returns the index of the specified symbol""" 37 | if sym in self.indices: 38 | return self.indices[sym] 39 | return self.unk_index 40 | 41 | def string(self, tensor): 42 | if torch.is_tensor(tensor) and tensor.dim() == 2: 43 | sentences = [self.string(line) for line in tensor] 44 | return '\n'.join(sentences) 45 | 46 | eos = self.eos() 47 | return ' '.join([self[i] for i in tensor if i != eos]) 48 | 49 | def add_symbol(self, word, n=1): 50 | """Adds a word to the dictionary""" 51 | if word in self.indices: 52 | idx = self.indices[word] 53 | self.count[idx] = self.count[idx] + n 54 | return idx 55 | else: 56 | idx = len(self.symbols) 57 | self.indices[word] = idx 58 | self.symbols.append(word) 59 | self.count.append(n) 60 | return idx 61 | 62 | def finalize(self): 63 | """Sort symbols by frequency in descending order, ignoring special ones.""" 64 | self.count, self.symbols = zip( 65 | *sorted(zip(self.count, self.symbols), 66 | key=(lambda x: math.inf if self.indices[x[1]] < self.nspecial else x[0]), 67 | reverse=True) 68 | ) 69 | 70 | def pad(self): 71 | """Helper to get index of pad symbol""" 72 | return self.pad_index 73 | 74 | def eos(self): 75 | """Helper to get index of end-of-sentence symbol""" 76 | return self.eos_index 77 | 78 | def unk(self): 79 | """Helper to get index of unk symbol""" 80 | return self.unk_index 81 | 82 | @staticmethod 83 | def load(f): 84 | """Loads the dictionary from a text file with the format: 85 | 86 | ``` 87 | <symbol0> <count0> 88 | <symbol1> <count1> 89 | ... 
90 | ``` 91 | """ 92 | 93 | if isinstance(f, str): 94 | with open(f, 'r') as fd: 95 | return Dictionary.load(fd) 96 | 97 | d = Dictionary() 98 | for line in f.readlines(): 99 | idx = line.rfind(' ') 100 | word = line[:idx] 101 | count = int(line[idx+1:]) 102 | d.indices[word] = len(d.symbols) 103 | d.symbols.append(word) 104 | d.count.append(count) 105 | return d 106 | 107 | def save(self, f, threshold=3, nwords=-1): 108 | """Stores dictionary into a text file""" 109 | if isinstance(f, str): 110 | with open(f, 'w') as fd: 111 | return self.save(fd, threshold, nwords) 112 | cnt = 0 113 | for i, t in enumerate(zip(self.symbols, self.count)): 114 | if i >= self.nspecial and t[1] >= threshold \ 115 | and (nwords < 0 or cnt < nwords): 116 | print('{} {}'.format(t[0], t[1]), file=f) 117 | cnt += 1 118 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/scripts/build_sym_alignment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | """ 10 | Use this script in order to build symmetric alignments for your translation 11 | dataset. 12 | This script depends on fast_align and mosesdecoder tools. You will need to 13 | build those before running the script. 14 | fast_align: 15 | github: http://github.com/clab/fast_align 16 | instructions: follow the instructions in README.md 17 | mosesdecoder: 18 | github: http://github.com/moses-smt/mosesdecoder 19 | instructions: http://www.statmt.org/moses/?n=Development.GetStarted 20 | The script produces the following files under --output_dir: 21 | text.joined - concatenation of lines from the source_file and the 22 | target_file. 23 | align.forward - forward pass of fast_align. 24 | align.backward - backward pass of fast_align. 25 | aligned.sym_heuristic - symmetrized alignment. 
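Example invocation (a sketch only; every path below is a placeholder for your
own fast_align build, mosesdecoder checkout and parallel data):
    python build_sym_alignment.py \
        --fast_align_dir /path/to/fast_align/build \
        --mosesdecoder_dir /path/to/mosesdecoder \
        --source_file corpus.src --target_file corpus.trg \
        --output_dir alignments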
26 | """ 27 | 28 | import argparse 29 | import os 30 | from itertools import zip_longest 31 | 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser(description='symmetric alignment builer') 35 | parser.add_argument('--fast_align_dir', 36 | help='path to fast_align build directory') 37 | parser.add_argument('--mosesdecoder_dir', 38 | help='path to mosesdecoder root directory') 39 | parser.add_argument('--sym_heuristic', 40 | help='heuristic to use for symmetrization', 41 | default='grow-diag-final-and') 42 | parser.add_argument('--source_file', 43 | help='path to a file with sentences ' 44 | 'in the source language') 45 | parser.add_argument('--target_file', 46 | help='path to a file with sentences ' 47 | 'in the target language') 48 | parser.add_argument('--output_dir', 49 | help='output directory') 50 | args = parser.parse_args() 51 | 52 | fast_align_bin = os.path.join(args.fast_align_dir, 'fast_align') 53 | symal_bin = os.path.join(args.mosesdecoder_dir, 'bin', 'symal') 54 | sym_fast_align_bin = os.path.join( 55 | args.mosesdecoder_dir, 'scripts', 'ems', 56 | 'support', 'symmetrize-fast-align.perl') 57 | 58 | # create joined file 59 | joined_file = os.path.join(args.output_dir, 'text.joined') 60 | with open(args.source_file, 'r') as src, open(args.target_file, 'r') as tgt: 61 | with open(joined_file, 'w') as joined: 62 | for s, t in zip_longest(src, tgt): 63 | print('{} ||| {}'.format(s.strip(), t.strip()), file=joined) 64 | 65 | bwd_align_file = os.path.join(args.output_dir, 'align.backward') 66 | 67 | # run forward alignment 68 | fwd_align_file = os.path.join(args.output_dir, 'align.forward') 69 | fwd_fast_align_cmd = '{FASTALIGN} -i {JOINED} -d -o -v > {FWD}'.format( 70 | FASTALIGN=fast_align_bin, 71 | JOINED=joined_file, 72 | FWD=fwd_align_file) 73 | assert os.system(fwd_fast_align_cmd) == 0 74 | 75 | # run backward alignment 76 | bwd_align_file = os.path.join(args.output_dir, 'align.backward') 77 | bwd_fast_align_cmd = '{FASTALIGN} -i {JOINED} -d -o -v -r > {BWD}'.format( 78 | FASTALIGN=fast_align_bin, 79 | JOINED=joined_file, 80 | BWD=bwd_align_file) 81 | assert os.system(bwd_fast_align_cmd) == 0 82 | 83 | # run symmetrization 84 | sym_out_file = os.path.join(args.output_dir, 'aligned') 85 | sym_cmd = '{SYMFASTALIGN} {FWD} {BWD} {SRC} {TGT} {OUT} {HEURISTIC} {SYMAL}'.format( 86 | SYMFASTALIGN=sym_fast_align_bin, 87 | FWD=fwd_align_file, 88 | BWD=bwd_align_file, 89 | SRC=args.source_file, 90 | TGT=args.target_file, 91 | OUT=sym_out_file, 92 | HEURISTIC=args.sym_heuristic, 93 | SYMAL=symal_bin 94 | ) 95 | assert os.system(sym_cmd) == 0 96 | 97 | 98 | if __name__ == '__main__': 99 | main() 100 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/clib/temporal_convolution_tbc/temporal_convolution_tbc.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include <stdio.h> 10 | #include <string.h> 11 | #include <stdexcept> 12 | #include <ATen/ATen.h> 13 | 14 | 15 | using at::Tensor; 16 | extern THCState* state; 17 | 18 | at::Type& getDataType(const char* dtype) { 19 | if (strcmp(dtype, "torch.cuda.FloatTensor") == 0) { 20 | return at::getType(at::kCUDA, at::kFloat); 21 | } else if (strcmp(dtype, "torch.FloatTensor") == 0) { 22 | return at::getType(at::kCPU, at::kFloat); 23 | } else { 24 | throw std::runtime_error(std::string("Unsupported data type: ") + dtype); 25 | } 26 | } 27 | 28 | inline at::Tensor t(at::Type& type, void* i) { 29 | return type.unsafeTensorFromTH(i, true); 30 | } 31 | 32 | extern "C" void TemporalConvolutionTBC_forward( 33 | const char* dtype, 34 | void* _input, 35 | void* _output, 36 | void* _weight, 37 | void* _bias) 38 | { 39 | auto& type = getDataType(dtype); 40 | Tensor input = t(type, _input); 41 | Tensor output = t(type, _output); 42 | Tensor weight = t(type, _weight); 43 | Tensor bias = t(type, _bias); 44 | 45 | auto input_size = input.sizes(); 46 | auto output_size = output.sizes(); 47 | 48 | auto ilen = input_size[0]; 49 | auto batchSize = input_size[1]; 50 | auto inputPlanes = input_size[2]; 51 | auto outputPlanes = output_size[2]; 52 | auto olen = output_size[0]; 53 | auto kw = weight.sizes()[0]; 54 | int pad = (olen - ilen + kw - 1) / 2; 55 | 56 | // input * weights + bias -> output_features 57 | output.copy_(bias.expand(output.sizes())); 58 | for (int k = 0; k < kw; k++) { 59 | int iShift = std::max(0, k - pad); 60 | int oShift = std::max(0, pad - k); 61 | int t = std::min(ilen + pad - k, olen) - oShift; 62 | // Note: gemm assumes column-major matrices 63 | // input is l*m (row-major) 64 | // weight is m*r (row-major) 65 | // output is l*r (row-major) 66 | if (t > 0) { 67 | auto W = weight[k]; 68 | auto I = input.narrow(0, iShift, t).view({t * batchSize, inputPlanes}); 69 | auto O = output.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); 70 | at::addmm_out(1, O, 1, I, W, O); 71 | } 72 | } 73 | } 74 | 75 | extern "C" void TemporalConvolutionTBC_backward( 76 | const char* dtype, 77 | void* _dOutput, 78 | void* _dInput, 79 | void* _dWeight, 80 | void* _dBias, 81 | void* _input, 82 | void* _weight) 83 | { 84 | auto& type = getDataType(dtype); 85 | Tensor dOutput = t(type, _dOutput); 86 | Tensor dInput = t(type, _dInput); 87 | Tensor dWeight = t(type, _dWeight); 88 | Tensor dBias = t(type, _dBias); 89 | Tensor input = t(type, _input); 90 | Tensor weight = t(type, _weight); 91 | 92 | auto input_size = input.sizes(); 93 | auto output_size = dOutput.sizes(); 94 | 95 | auto ilen = input_size[0]; 96 | auto batchSize = input_size[1]; 97 | auto inputPlanes = input_size[2]; 98 | auto outputPlanes = output_size[2]; 99 | auto olen = output_size[0]; 100 | auto kw = weight.sizes()[0]; 101 | int pad = (olen - ilen + kw - 1) / 2; 102 | 103 | for (int k = 0; k < kw; k++) { 104 | int iShift = std::max(0, k - pad); 105 | int oShift = std::max(0, pad - k); 106 | int t = std::min(ilen + pad - k, olen) - oShift; 107 | // dOutput * T(weight) -> dInput 108 | if (t > 0) { 109 | auto dO = dOutput.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); 110 | auto dI = dInput.narrow(0, iShift, t).view({t * batchSize, inputPlanes}); 111 | at::addmm_out(1, dI, 1, dO, weight[k].t(), dI); 112 | } 113 | } 114 | 115 | for (int k = 0; k < kw; k++) { 116 | int iShift = std::max(0, k - pad); 117 | int oShift = std::max(0, pad - k); 118 | int t = std::min(ilen + pad - k, olen) - oShift; 119 | // T(input) * dOutput -> 
dWeight 120 | if (t > 0) { 121 | auto dW = dWeight[k]; 122 | auto dO = dOutput.narrow(0, oShift, t).view({t * batchSize, outputPlanes}); 123 | auto I = input.narrow(0, iShift, t).view({t * batchSize, inputPlanes}).t(); 124 | at::addmm_out(1, dW, 1, I, dO, dW); 125 | } 126 | } 127 | 128 | auto tmp = dOutput.sum(0, false); 129 | at::sum_out(tmp, 0, dBias); 130 | } 131 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/nuclesgmlparser.py: -------------------------------------------------------------------------------- 1 | # nuclesgmlparser.py 2 | # 3 | # Author: Yuanbin Wu 4 | # National University of Singapore (NUS) 5 | # Date: 12 Mar 2013 6 | # Version: 1.0 7 | # 8 | # Contact: wuyb@comp.nus.edu.sg 9 | # 10 | # This script is distributed to support the CoNLL-2013 Shared Task. 11 | # It is free for research and educational purposes. 12 | 13 | from sgmllib import SGMLParser 14 | from nucle_doc import nucle_doc 15 | 16 | 17 | class nuclesgmlparser(SGMLParser): 18 | def __init__(self): 19 | SGMLParser.__init__(self) 20 | self.docs = [] 21 | 22 | def reset(self): 23 | self.docs = [] 24 | self.data = [] 25 | SGMLParser.reset(self) 26 | 27 | def unknow_starttag(self, tag, attrs): 28 | pass 29 | 30 | def unknow_endtag(self): 31 | pass 32 | 33 | def start_doc(self, attrs): 34 | self.docs.append(nucle_doc()) 35 | self.docs[-1].docattrs = attrs 36 | 37 | def end_doc(self): 38 | pass 39 | 40 | def start_matric(self, attrs): 41 | pass 42 | 43 | def end_matric(self): 44 | self.docs[-1].matric = ''.join(self.data) 45 | self.data = [] 46 | pass 47 | 48 | def start_email(self, attrs): 49 | pass 50 | 51 | def end_email(self): 52 | self.docs[-1].email = ''.join(self.data) 53 | self.data = [] 54 | pass 55 | 56 | def start_nationality(self, attrs): 57 | pass 58 | 59 | def end_nationality(self): 60 | self.docs[-1].nationality = ''.join(self.data) 61 | self.data = [] 62 | pass 63 | 64 | def start_first_language(self, attrs): 65 | pass 66 | 67 | def end_first_language(self): 68 | self.docs[-1].firstLanguage = ''.join(self.data) 69 | self.data = [] 70 | pass 71 | 72 | def start_school_language(self, attrs): 73 | pass 74 | 75 | def end_school_language(self): 76 | self.docs[-1].schoolLanguage = ''.join(self.data) 77 | self.data = [] 78 | pass 79 | 80 | def start_english_tests(self, attrs): 81 | pass 82 | 83 | def end_english_tests(self): 84 | self.docs[-1].englishTests = ''.join(self.data) 85 | self.data = [] 86 | pass 87 | 88 | 89 | def start_text(self, attrs): 90 | pass 91 | 92 | def end_text(self): 93 | pass 94 | 95 | def start_title(self, attrs): 96 | pass 97 | 98 | def end_title(self): 99 | self.docs[-1].paragraphs.append(''.join(self.data)) 100 | self.data = [] 101 | pass 102 | 103 | 104 | def start_p(self, attrs): 105 | pass 106 | 107 | def end_p(self): 108 | self.docs[-1].paragraphs.append(''.join(self.data)) 109 | self.data = [] 110 | pass 111 | 112 | 113 | def start_annotation(self, attrs): 114 | self.docs[-1].annotation.append(attrs) 115 | 116 | def end_annotation(self): 117 | pass 118 | 119 | def start_mistake(self, attrs): 120 | d = {} 121 | for t in attrs: 122 | d[t[0]] = int(t[1]) 123 | self.docs[-1].mistakes.append(d) 124 | pass 125 | 126 | def end_mistake(self): 127 | pass 128 | 129 | def start_type(self, attrs): 130 | pass 131 | 132 | def end_type(self): 133 | self.docs[-1].mistakes[-1]['type'] = ''.join(self.data) 134 | self.data = [] 135 | 136 | def start_correction(self, attrs): 137 | pass 138 | 139 | def 
end_correction(self): 140 | self.docs[-1].mistakes[-1]['correction'] = ''.join(self.data) 141 | self.data = [] 142 | 143 | def start_comment(self, attrs): 144 | pass 145 | 146 | def end_comment(self): 147 | self.docs[-1].mistakes[-1]['comment'] = ''.join( self.data) 148 | self.data = [] 149 | 150 | 151 | def handle_charref(self, ref): 152 | self.data.append('&' + ref) 153 | 154 | def handle_entityref(self, ref): 155 | self.data.append('&' + ref) 156 | 157 | def handle_data(self, text): 158 | if text.strip() == '': 159 | self.data.append('') 160 | return 161 | else: 162 | if text.startswith('\n'): 163 | text = text[1:] 164 | if text.endswith('\n'): 165 | text = text[:-1] 166 | self.data.append(text) 167 | 168 | 169 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import shutil 6 | import imp 7 | 8 | import argparse 9 | 10 | # Initializing the logging module 11 | import logging 12 | import log_utils as L 13 | import configreader 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-i", "--input-nbest", dest="input_nbest", required=True, help="Input n-best file") 19 | parser.add_argument("-r", "--reference-files", dest="ref_paths", required=True, help="A comma-seperated list of reference files") 20 | parser.add_argument("-c", "--config", dest="input_config", required=True, help="Input config (ini) file, e.g similar to moses with [weight] section") 21 | parser.add_argument("-o", "--output-dir", dest="out_dir", required=True, help="Output directory") 22 | parser.add_argument("-t", "--threads", dest="threads", default = 14, type=int, help="Number of MERT threads") 23 | parser.add_argument("--no-add-weight", dest="no_add_weight", action="store_true", help="Flag to be true if config file already contains initial weights for augmented feature(s). Useful for adding multiple features.") 24 | parser.add_argument("-iv", "--init-value", dest="init_value", default = '0.05', help="The initial value of the feature") 25 | parser.add_argument("-a", "--tuning-algorithm", dest="alg", default = 'mert', help="Tuning Algorithm (mert|pro|wpro)") 26 | parser.add_argument("-m", "--tuning-metric", dest="metric", default = 'bleu', help="Tuning Algorithm (bleu|m2)") 27 | parser.add_argument("-s", "--predictable-seed", dest="pred_seed", action='store_true', help="Tune with predictable seed to avoid randomness") 28 | parser.add_argument("--moses-dir", dest="moses_dir", required=True, help="Path to Moses. 
Required for tuning scripts") 29 | args = parser.parse_args() 30 | 31 | fscore_arg = "" 32 | if args.metric == 'm2': 33 | fscore_arg = " --sctype M2SCORER --scconfig ignore_whitespace_casing:true " 34 | logger.info("Using M2 Tuning") 35 | logger.info(L.b_yellow('Arguments: ') + fscore_arg) 36 | 37 | 38 | if not os.path.exists(args.out_dir): 39 | os.makedirs(args.out_dir) 40 | 41 | L.set_logger(os.path.abspath(args.out_dir),'train_log.txt') 42 | L.print_args(args) 43 | 44 | logger.info("Reading weights from config file") 45 | features = configreader.parse_ini(args.input_config) 46 | logger.info("Feature weights: " + str(features)) 47 | 48 | output_nbest_path = args.out_dir + '/augmented.nbest' 49 | shutil.copy(args.input_nbest, output_nbest_path) 50 | 51 | logger.info('Extracting stats and features') 52 | logger.warning('The optional arguments of extractor are not used yet') 53 | cmd = args.moses_dir + '/bin/extractor -r ' + args.ref_paths + ' -n ' + output_nbest_path + ' --scfile ' + args.out_dir + '/statscore.data --ffile ' + args.out_dir + '/features.data' 54 | if args.metric == 'm2': 55 | cmd = args.moses_dir + '/bin/extractor --sctype M2SCORER --scconfig ignore_whitespace_casing:true -r ' + args.ref_paths + ' -n ' + output_nbest_path + ' --scfile ' + args.out_dir + '/statscore.data --ffile ' + args.out_dir + '/features.data' 56 | logger.info('Executing command: ' + cmd ) 57 | os.system(cmd) 58 | 59 | 60 | #create the list of features 61 | 62 | with open(args.out_dir + '/init.opt', 'w') as init_opt: 63 | init_list = [] 64 | for line in features: 65 | tokens = line.split(" ") 66 | try: 67 | float(tokens[1]) 68 | init_list += tokens[1:] 69 | except ValueError: 70 | pass 71 | if args.no_add_weight == False: 72 | init_list.append(args.init_value) 73 | dim = len(init_list) 74 | init_opt.write(' '.join(init_list) + '\n') 75 | init_opt.write(' '.join(['0' for i in range(dim)]) + '\n') 76 | init_opt.write(' '.join(['1' for i in range(dim)]) + '\n') 77 | 78 | seed_arg = '' 79 | if args.pred_seed: 80 | seed_arg = ' -r 1 ' 81 | #seed_arg = ' -r 1500 ' 82 | 83 | 84 | if (args.alg == 'mert'): 85 | logger.info('Running MERT') 86 | cmd = args.moses_dir + '/bin/mert -d ' + str(dim) + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data --ifile ' + args.out_dir + '/init.opt --threads ' + str(args.threads) + seed_arg + fscore_arg# + "-m 50 -n 20" 87 | logger.info("Command: " + cmd) 88 | os.system(cmd) 89 | else: 90 | logger.error('Invalid tuning algorithm: ' + args.alg) 91 | 92 | logger.info(L.green("Optimization complete.")) 93 | assert os.path.isfile('weights.txt') 94 | shutil.move('weights.txt', args.out_dir + '/weights.txt') 95 | 96 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/nccl.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | """ 10 | A modified version of torch.cuda.nccl.all_reduce for launching kernels on each 11 | GPU separately. 
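Typical flow (a sketch, not a prescribed call sequence): rank 0 obtains a
shared identifier with get_unique_id(), every worker process calls
initialize(num_devices, uid, rank), and tensors are then summed in place
with all_reduce(tensor).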
12 | """ 13 | 14 | import ctypes 15 | import warnings 16 | 17 | lib = None 18 | _uid = None 19 | _rank = None 20 | _num_devices = None 21 | _comm = None 22 | 23 | __all__ = ['all_reduce', 'initialize', 'get_unique_id'] 24 | 25 | 26 | def _libnccl(): 27 | global lib 28 | if lib is None: 29 | lib = ctypes.cdll.LoadLibrary(None) 30 | if hasattr(lib, 'ncclCommDestroy'): 31 | lib.ncclCommDestroy.restype = None 32 | lib.ncclGetErrorString.restype = ctypes.c_char_p 33 | else: 34 | lib = None 35 | return lib 36 | 37 | 38 | def is_available(tensors): 39 | devices = set() 40 | for tensor in tensors: 41 | if not tensor.is_contiguous(): 42 | return False 43 | if not tensor.is_cuda: 44 | return False 45 | device = tensor.get_device() 46 | if device in devices: 47 | return False 48 | devices.add(device) 49 | 50 | if _libnccl() is None: 51 | warnings.warn('NCCL library not found. Check your LD_LIBRARY_PATH') 52 | return False 53 | 54 | return True 55 | 56 | 57 | _communicators = {} 58 | 59 | # ncclDataType_t 60 | ncclChar = 0 61 | ncclInt = 1 62 | ncclHalf = 2 63 | ncclFloat = 3 64 | ncclDouble = 4 65 | ncclInt64 = 5 66 | ncclUint64 = 6 67 | 68 | # ncclRedOp_t 69 | SUM = 0 70 | PROD = 1 71 | MAX = 2 72 | MIN = 3 73 | 74 | nccl_types = { 75 | 'torch.cuda.ByteTensor': ncclChar, 76 | 'torch.cuda.CharTensor': ncclChar, 77 | 'torch.cuda.IntTensor': ncclInt, 78 | 'torch.cuda.HalfTensor': ncclHalf, 79 | 'torch.cuda.FloatTensor': ncclFloat, 80 | 'torch.cuda.DoubleTensor': ncclDouble, 81 | 'torch.cuda.LongTensor': ncclInt64, 82 | } 83 | 84 | 85 | class NcclError(RuntimeError): 86 | def __init__(self, status): 87 | self.status = status 88 | msg = '{0} ({1})'.format(lib.ncclGetErrorString(status), status) 89 | super(NcclError, self).__init__(msg) 90 | 91 | 92 | class NcclComm(ctypes.c_void_p): 93 | def __del__(self): 94 | lib.ncclCommDestroy(self) 95 | 96 | 97 | class NcclUniqueId(ctypes.Structure): 98 | _fields_ = [ 99 | ('internal', ctypes.c_uint8 * 128) 100 | ] 101 | 102 | 103 | def check_error(status): 104 | if status != 0: 105 | raise NcclError(status) 106 | 107 | 108 | _uids = [] 109 | 110 | 111 | def get_unique_id(): 112 | if _libnccl() is None: 113 | raise RuntimeError('Unable to load NCCL library') 114 | 115 | uid = NcclUniqueId() 116 | check_error(lib.ncclGetUniqueId(ctypes.byref(uid))) 117 | _uids.append(uid) # Don't allow UIDs to be collected 118 | return uid 119 | 120 | 121 | def initialize(num_devices, uid, rank): 122 | global _num_devices, _uid, _rank 123 | 124 | if _libnccl() is None: 125 | raise RuntimeError('Unable to load NCCL library') 126 | 127 | _num_devices = num_devices 128 | if rank != 0: 129 | _uid = NcclUniqueId.from_buffer_copy(uid) 130 | else: 131 | _uid = uid 132 | _rank = rank 133 | 134 | 135 | def communicator(): 136 | global _comm 137 | if _uid is None: 138 | raise RuntimeError('NCCL not initialized') 139 | if _comm is None: 140 | comm = ctypes.c_void_p() 141 | check_error(lib.ncclCommInitRank( 142 | ctypes.byref(comm), 143 | ctypes.c_int(_num_devices), 144 | _uid, 145 | ctypes.c_int(_rank))) 146 | _comm = comm 147 | return _comm 148 | 149 | 150 | def all_reduce(input, output=None, op=SUM, stream=None): 151 | comm = communicator() 152 | if output is None: 153 | output = input 154 | if stream is not None: 155 | stream = stream.cuda_stream 156 | data_type = nccl_types[input.type()] 157 | check_error(lib.ncclAllReduce( 158 | ctypes.c_void_p(input.data_ptr()), 159 | ctypes.c_void_p(output.data_ptr()), 160 | ctypes.c_size_t(input.numel()), 161 | data_type, 162 | op, 163 | comm, 164 | 
ctypes.c_void_p(stream))) 165 | return output 166 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/chrF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Compute chrF3 for machine translation evaluation 6 | 7 | Reference: 8 | Maja Popović (2015). chrF: character n-gram F-score for automatic MT evaluation. In Proceedings of the Tenth Workshop on Statistical Machine Translationn, pages 392–395, Lisbon, Portugal. 9 | """ 10 | 11 | from __future__ import print_function, unicode_literals, division 12 | import sys 13 | import codecs 14 | import io 15 | import argparse 16 | from collections import defaultdict 17 | from math import log, exp 18 | 19 | # hack for python2/3 compatibility 20 | from io import open 21 | argparse.open = open 22 | 23 | # python 2/3 compatibility 24 | if sys.version_info < (3, 0): 25 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 26 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 27 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 28 | 29 | 30 | def create_parser(): 31 | parser = argparse.ArgumentParser( 32 | formatter_class=argparse.RawDescriptionHelpFormatter, 33 | description="learn BPE-based word segmentation") 34 | 35 | parser.add_argument( 36 | '--ref', '-r', type=argparse.FileType('r'), required=True, 37 | metavar='PATH', 38 | help="Reference file") 39 | parser.add_argument( 40 | '--hyp', type=argparse.FileType('r'), metavar='PATH', 41 | default=sys.stdin, 42 | help="Hypothesis file (default: stdin).") 43 | parser.add_argument( 44 | '--beta', '-b', type=float, default=3, 45 | metavar='FLOAT', 46 | help="beta parameter (default: '%(default)s')") 47 | parser.add_argument( 48 | '--ngram', '-n', type=int, default=6, 49 | metavar='INT', 50 | help="ngram order (default: '%(default)s')") 51 | parser.add_argument( 52 | '--space', '-s', action='store_true', 53 | help="take spaces into account (default: '%(default)s')") 54 | parser.add_argument( 55 | '--precision', action='store_true', 56 | help="report precision (default: '%(default)s')") 57 | parser.add_argument( 58 | '--recall', action='store_true', 59 | help="report recall (default: '%(default)s')") 60 | 61 | return parser 62 | 63 | def extract_ngrams(words, max_length=4, spaces=False): 64 | 65 | if not spaces: 66 | words = ''.join(words.split()) 67 | else: 68 | words = words.strip() 69 | 70 | results = defaultdict(lambda: defaultdict(int)) 71 | for length in range(max_length): 72 | for start_pos in range(len(words)): 73 | end_pos = start_pos + length + 1 74 | if end_pos <= len(words): 75 | results[length][tuple(words[start_pos: end_pos])] += 1 76 | return results 77 | 78 | 79 | def get_correct(ngrams_ref, ngrams_test, correct, total): 80 | 81 | for rank in ngrams_test: 82 | for chain in ngrams_test[rank]: 83 | total[rank] += ngrams_test[rank][chain] 84 | if chain in ngrams_ref[rank]: 85 | correct[rank] += min(ngrams_test[rank][chain], ngrams_ref[rank][chain]) 86 | 87 | return correct, total 88 | 89 | 90 | def f1(correct, total_hyp, total_ref, max_length, beta=3, smooth=0): 91 | 92 | precision = 0 93 | recall = 0 94 | 95 | for i in range(max_length): 96 | if total_hyp[i] + smooth and total_ref[i] + smooth: 97 | precision += (correct[i] + smooth) / (total_hyp[i] + smooth) 98 | recall += (correct[i] + smooth) / (total_ref[i] + smooth) 99 | 100 | precision /= max_length 101 | recall /= max_length 102 | 103 | return (1 + 
beta**2) * (precision*recall) / ((beta**2 * precision) + recall), precision, recall 104 | 105 | def main(args): 106 | 107 | correct = [0]*args.ngram 108 | total = [0]*args.ngram 109 | total_ref = [0]*args.ngram 110 | for line in args.ref: 111 | line2 = args.hyp.readline() 112 | 113 | ngrams_ref = extract_ngrams(line, max_length=args.ngram, spaces=args.space) 114 | ngrams_test = extract_ngrams(line2, max_length=args.ngram, spaces=args.space) 115 | 116 | get_correct(ngrams_ref, ngrams_test, correct, total) 117 | 118 | for rank in ngrams_ref: 119 | for chain in ngrams_ref[rank]: 120 | total_ref[rank] += ngrams_ref[rank][chain] 121 | 122 | chrf, precision, recall = f1(correct, total, total_ref, args.ngram, args.beta) 123 | 124 | print('chrF3: {0:.4f}'.format(chrf)) 125 | if args.precision: 126 | print('chrPrec: {0:.4f}'.format(precision)) 127 | if args.recall: 128 | print('chrRec: {0:.4f}'.format(recall)) 129 | 130 | if __name__ == '__main__': 131 | 132 | parser = create_parser() 133 | args = parser.parse_args() 134 | 135 | main(args) 136 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/indexed_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import numpy as np 10 | import os 11 | import struct 12 | import torch 13 | 14 | 15 | def read_longs(f, n): 16 | a = np.empty(n, dtype=np.int64) 17 | f.readinto(a) 18 | return a 19 | 20 | 21 | def write_longs(f, a): 22 | f.write(np.array(a, dtype=np.int64)) 23 | 24 | 25 | dtypes = { 26 | 1: np.uint8, 27 | 2: np.int8, 28 | 3: np.int16, 29 | 4: np.int32, 30 | 5: np.int64, 31 | 6: np.float, 32 | 7: np.double, 33 | } 34 | 35 | 36 | def code(dtype): 37 | for k in dtypes.keys(): 38 | if dtypes[k] == dtype: 39 | return k 40 | 41 | 42 | class IndexedDataset(object): 43 | """Loader for TorchNet IndexedDataset""" 44 | 45 | def __init__(self, path): 46 | with open(path + '.idx', 'rb') as f: 47 | magic = f.read(8) 48 | assert magic == b'TNTIDX\x00\x00' 49 | version = f.read(8) 50 | assert struct.unpack('<Q', version) == (1,) 51 | code, self.element_size = struct.unpack('<QQ', f.read(16)) 52 | self.dtype = dtypes[code] 53 | self.size, self.s = struct.unpack('<QQ', f.read(16)) 54 | self.dim_offsets = read_longs(f, self.size + 1) 55 | self.data_offsets = read_longs(f, self.size + 1) 56 | self.sizes = read_longs(f, self.s) 57 | self.read_data(path) 58 | 59 | def read_data(self, path): 60 | self.data_file = open(path + '.bin', 'rb', buffering=0) 61 | 62 | def __del__(self): 63 | self.data_file.close() 64 | 65 | def __getitem__(self, i): 66 | if i < 0 or i >= self.size: 67 | raise IndexError('index out of range') 68 | tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] 69 | a = np.empty(tensor_size, dtype=self.dtype) 70 | self.data_file.seek(self.data_offsets[i] * self.element_size) 71 | self.data_file.readinto(a) 72 | return torch.from_numpy(a) 73 | 74 | def __len__(self): 75 | return self.size 76 | 77 | @staticmethod 78 | def exists(path): 79 | return os.path.exists(path + '.idx') 80 | 81 | 82 | class IndexedInMemoryDataset(IndexedDataset): 83 | """Loader for TorchNet IndexedDataset, keeps all the data in memory""" 84 
| 85 | def read_data(self, path): 86 | self.data_file = open(path + '.bin', 'rb') 87 | self.buffer = np.empty(self.data_offsets[-1], dtype=self.dtype) 88 | self.data_file.readinto(self.buffer) 89 | self.data_file.close() 90 | 91 | def __del__(self): 92 | pass 93 | 94 | def __getitem__(self, i): 95 | if i < 0 or i >= self.size: 96 | raise IndexError('index out of range') 97 | tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] 98 | a = np.empty(tensor_size, dtype=self.dtype) 99 | np.copyto(a, self.buffer[self.data_offsets[i]:self.data_offsets[i + 1]]) 100 | return torch.from_numpy(a) 101 | 102 | 103 | class IndexedDatasetBuilder(object): 104 | 105 | element_sizes = { 106 | np.uint8: 1, 107 | np.int8: 1, 108 | np.int16: 2, 109 | np.int32: 4, 110 | np.int64: 8, 111 | np.float: 4, 112 | np.double: 8 113 | } 114 | 115 | def __init__(self, out_file, dtype=np.int32): 116 | self.out_file = open(out_file, 'wb') 117 | self.dtype = dtype 118 | self.data_offsets = [0] 119 | self.dim_offsets = [0] 120 | self.sizes = [] 121 | self.element_size = self.element_sizes[self.dtype] 122 | 123 | def add_item(self, tensor): 124 | # +1 for Lua compatibility 125 | bytes = self.out_file.write(np.array(tensor.numpy() + 1, dtype=self.dtype)) 126 | self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) 127 | for s in tensor.size(): 128 | self.sizes.append(s) 129 | self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) 130 | 131 | def finalize(self, index_file): 132 | self.out_file.close() 133 | index = open(index_file, 'wb') 134 | index.write(b'TNTIDX\x00\x00') 135 | index.write(struct.pack('<Q', 1)) 136 | index.write(struct.pack('<QQ', code(self.dtype), 137 | self.element_size)) 138 | index.write(struct.pack('<QQ', len(self.data_offsets) - 1, 139 | len(self.sizes))) 140 | write_longs(index, self.dim_offsets) 141 | write_longs(index, self.data_offsets) 142 | write_longs(index, self.sizes) 143 | index.close() 144 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/log_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | #-----------------------------------------------------------------------------------------------------------# 5 | import re 6 | 7 | class BColors: 8 | HEADER = '\033[95m' 9 | OKBLUE = '\033[94m' 10 | OKGREEN = '\033[92m' 11 | WARNING = '\033[93m' 12 | FAIL = '\033[91m' 13 | ENDC = '\033[0m' 14 | BOLD = '\033[1m' 15 | UNDERLINE = '\033[4m' 16 | WHITE = '\033[37m' 17 | YELLOW = '\033[33m' 18 | GREEN = '\033[32m' 19 | BLUE = '\033[34m' 20 | CYAN = '\033[36m' 21 | RED = '\033[31m' 22 | MAGENTA = '\033[35m' 23 | BLACK = '\033[30m' 24 | BHEADER = BOLD + '\033[95m' 25 | BOKBLUE = BOLD + '\033[94m' 26 | BOKGREEN = BOLD + '\033[92m' 27 | BWARNING = BOLD + '\033[93m' 28 | BFAIL = BOLD + '\033[91m' 29 | BUNDERLINE = BOLD + '\033[4m' 30 | BWHITE = BOLD + '\033[37m' 31 | BYELLOW = BOLD + '\033[33m' 32 | BGREEN = BOLD + '\033[32m' 33 | BBLUE = BOLD + '\033[34m' 34 | BCYAN = BOLD + '\033[36m' 35 | BRED = BOLD + '\033[31m' 36 | BMAGENTA = BOLD + '\033[35m' 37 | BBLACK = BOLD + '\033[30m' 38 | 39 | @staticmethod 40 | def cleared(s): 41 | return re.sub("\033\[[0-9][0-9]?m", "", s) 42 | 43 | def red(message): 44 | return BColors.RED + str(message) + BColors.ENDC 45 | 46 | def b_red(message): 47 | return BColors.BRED + str(message) + BColors.ENDC 48 | 49 | def blue(message): 50 | return BColors.BLUE + str(message) + 
BColors.ENDC 51 | 52 | def yellow(message): 53 | return BColors.YELLOW + str(message) + BColors.ENDC 54 | 55 | def b_yellow(message): 56 | return BColors.BYELLOW + str(message) + BColors.ENDC 57 | 58 | def white(message): 59 | return BColors.WHITE + str(message) + BColors.ENDC 60 | 61 | def green(message): 62 | return BColors.GREEN + str(message) + BColors.ENDC 63 | 64 | def b_green(message): 65 | return BColors.BGREEN + str(message) + BColors.ENDC 66 | 67 | def b_okblue(message): 68 | return BColors.OKBLUE + str(message) + BColors.ENDC 69 | 70 | def b_fail(message): 71 | return BColors.BFAIL + str(message) + BColors.ENDC 72 | 73 | def b_warning(message): 74 | return BColors.WARNING + str(message) + BColors.ENDC 75 | 76 | def print_args(args, path=None): 77 | if path: 78 | output_file = open(path, 'w') 79 | logger = logging.getLogger(__name__) 80 | logger.info("Arguments:") 81 | args.command = ' '.join(sys.argv) 82 | items = vars(args) 83 | for key in sorted(items.keys(), key=lambda s: s.lower()): 84 | value = items[key] 85 | if not value: 86 | value = "None" 87 | logger.info(" " + key + ": " + str(items[key])) 88 | if path is not None: 89 | output_file.write(" " + key + ": " + str(items[key]) + "\n") 90 | if path: 91 | output_file.close() 92 | del args.command 93 | 94 | #-----------------------------------------------------------------------------------------------------------# 95 | 96 | #-----------------------------------------------------------------------------------------------------------# 97 | 98 | def set_logger(out_dir=None, log_file="log.txt"): 99 | #console_format = BColors.OKBLUE + '[%(levelname)s]' + BColors.ENDC + ' (%(name)s) %(message)s' 100 | #console_format = b_okblue('[%(levelname)s]') + b_okblue(' [%(asctime)s] ') + ' %(message)s ' 101 | datefmt='%d-%m-%Y %H:%M:%S' 102 | logger = logging.getLogger() 103 | logger.setLevel(logging.DEBUG) 104 | console = logging.StreamHandler() 105 | console.setLevel(logging.DEBUG) 106 | console.setFormatter(ColoredFormatter(datefmt=datefmt)) 107 | logger.addHandler(console) 108 | if out_dir: 109 | #file_format = '[%(levelname)s] (%(name)s) %(message)s' 110 | file_format = '[%(levelname)s] [%(asctime)s] %(message)s' 111 | log_file = logging.FileHandler(out_dir + '/' + log_file, mode='w') 112 | log_file.setLevel(logging.DEBUG) 113 | log_file.setFormatter(logging.Formatter(file_format, datefmt=datefmt)) 114 | logger.addHandler(log_file) 115 | 116 | #-----------------------------------------------------------------------------------------------------------# 117 | 118 | class ColoredFormatter(logging.Formatter): 119 | FORMATS = {logging.DEBUG :"DBG: %(module)s: %(lineno)d: %(message)s", 120 | logging.ERROR : b_fail('[%(levelname)s]') + b_fail(' [%(asctime)s] ') + ' %(message)s ', 121 | logging.INFO : b_okblue('[%(levelname)s]') + b_okblue(' [%(asctime)s] ') + ' %(message)s ', 122 | logging.WARNING : b_warning('[%(levelname)s]') + ' %(message)s', 123 | 'DEFAULT' : b_okblue('[%(levelname)s]') + b_okblue(' [%(asctime)s] ') + ' %(message)s '} 124 | 125 | def format(self, record): 126 | self._fmt = self.FORMATS.get(record.levelno, self.FORMATS['DEFAULT']) 127 | return logging.Formatter.format(self, record) 128 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/combiner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 
4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | 17 | # file: m2scorer.py 18 | # 19 | # score a system's output against a gold reference 20 | # 21 | # Usage: m2scorer.py [OPTIONS] proposed_sentences source_gold 22 | # where 23 | # proposed_sentences - system output, sentence per line 24 | # source_gold - source sentences with gold token edits 25 | # OPTIONS 26 | # -v --verbose - print verbose output 27 | # --very_verbose - print lots of verbose output 28 | # --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2." 29 | # --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 30 | # 31 | 32 | import sys 33 | import levenshtein 34 | from getopt import getopt 35 | from util import paragraphs 36 | from util import smart_open 37 | 38 | 39 | 40 | def load_annotation(gold_file): 41 | source_sentences = [] 42 | gold_edits = [] 43 | fgold = smart_open(gold_file, 'r') 44 | puffer = fgold.read() 45 | fgold.close() 46 | puffer = puffer.decode('utf8') 47 | for item in paragraphs(puffer.splitlines(True)): 48 | item = item.splitlines(False) 49 | sentence = [line[2:].strip() for line in item if line.startswith('S ')] 50 | assert sentence != [] 51 | annotations = {} 52 | for line in item[1:]: 53 | if line.startswith('I ') or line.startswith('S '): 54 | continue 55 | assert line.startswith('A ') 56 | line = line[2:] 57 | fields = line.split('|||') 58 | start_offset = int(fields[0].split()[0]) 59 | end_offset = int(fields[0].split()[1]) 60 | etype = fields[1] 61 | if etype == 'noop': 62 | start_offset = -1 63 | end_offset = -1 64 | corrections = [c.strip() if c != '-NONE-' else '' for c in fields[2].split('||')] 65 | # NOTE: start and end are *token* offsets 66 | original = ' '.join(' '.join(sentence).split()[start_offset:end_offset]) 67 | annotator = int(fields[5]) 68 | if annotator not in annotations.keys(): 69 | annotations[annotator] = [] 70 | annotations[annotator].append((start_offset, end_offset, original, corrections)) 71 | tok_offset = 0 72 | for this_sentence in sentence: 73 | tok_offset += len(this_sentence.split()) 74 | source_sentences.append(this_sentence) 75 | this_edits = {} 76 | for annotator, annotation in annotations.iteritems(): 77 | this_edits[annotator] = [edit for edit in annotation if edit[0] <= tok_offset and edit[1] <= tok_offset and edit[0] >= 0 and edit[1] >= 0] 78 | if len(this_edits) == 0: 79 | this_edits[0] = [] 80 | gold_edits.append(this_edits) 81 | return (source_sentences, gold_edits) 82 | 83 | 84 | def print_usage(): 85 | print >> sys.stderr, "Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source" 86 | print >> sys.stderr, "where" 87 | print >> sys.stderr, " proposed_sentences - system output, sentence per line" 88 | print >> sys.stderr, " source_gold - source sentences with gold token edits" 89 | print >> sys.stderr, "OPTIONS" 90 | print >> sys.stderr, " -v 
--verbose - print verbose output" 91 | print >> sys.stderr, " --very_verbose - print lots of verbose output" 92 | print >> sys.stderr, " --max_unchanged_words N - Maximum unchanged words when extraction edit. Default 2." 93 | print >> sys.stderr, " --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 94 | 95 | 96 | 97 | max_unchanged_words=2 98 | ignore_whitespace_casing= False 99 | verbose = False 100 | very_verbose = False 101 | opts, args = getopt(sys.argv[1:], "v", ["max_unchanged_words=", "verbose", "ignore_whitespace_casing", "very_verbose"]) 102 | for o, v in opts: 103 | if o in ('-v', '--verbose'): 104 | verbose = True 105 | elif o == '--very_verbose': 106 | very_verbose = True 107 | elif o == '--max_unchanged_words': 108 | max_unchanged_words = int(v) 109 | elif o == '--ignore_whitespace_casing': 110 | ignore_whitespace_casing = True 111 | else: 112 | print >> sys.stderr, "Unknown option :", o 113 | print_usage() 114 | sys.exit(-1) 115 | 116 | 117 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/scripts/apply_bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use operations learned with learn_bpe.py to encode a new text. 6 | The text will not be smaller, but use only a fixed vocabulary, with rare words 7 | encoded as variable-length sequences of subword units. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 12 | """ 13 | 14 | from __future__ import unicode_literals, division 15 | 16 | import sys 17 | import codecs 18 | import argparse 19 | from collections import defaultdict 20 | 21 | # hack for python2/3 compatibility 22 | from io import open 23 | argparse.open = open 24 | 25 | # python 2/3 compatibility 26 | if sys.version_info < (3, 0): 27 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 28 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 29 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 30 | 31 | import codecs 32 | 33 | class BPE(object): 34 | 35 | def __init__(self, codes, separator='@@'): 36 | 37 | with codecs.open(codes.name, encoding='utf-8') as codes: 38 | self.bpe_codes = [tuple(item.split()) for item in codes] 39 | 40 | # some hacking to deal with duplicates (only consider first instance) 41 | self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))]) 42 | 43 | self.separator = separator 44 | 45 | def segment(self, sentence): 46 | """segment single sentence (whitespace-tokenized string) with BPE encoding""" 47 | 48 | output = [] 49 | for word in sentence.split(): 50 | new_word = encode(word, self.bpe_codes) 51 | 52 | for item in new_word[:-1]: 53 | output.append(item + self.separator) 54 | output.append(new_word[-1]) 55 | 56 | return ' '.join(output) 57 | 58 | def create_parser(): 59 | parser = argparse.ArgumentParser( 60 | formatter_class=argparse.RawDescriptionHelpFormatter, 61 | description="learn BPE-based word segmentation") 62 | 63 | parser.add_argument( 64 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 65 | metavar='PATH', 66 | help="Input file (default: standard input).") 67 | parser.add_argument( 68 | '--codes', '-c', type=argparse.FileType('r'), metavar='PATH', 69 | 
required=True, 70 | help="File with BPE codes (created by learn_bpe.py).") 71 | parser.add_argument( 72 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 73 | metavar='PATH', 74 | help="Output file (default: standard output)") 75 | parser.add_argument( 76 | '--separator', '-s', type=str, default='@@', metavar='STR', 77 | help="Separator between non-final subword units (default: '%(default)s'))") 78 | 79 | return parser 80 | 81 | def get_pairs(word): 82 | """Return set of symbol pairs in a word. 83 | 84 | word is represented as tuple of symbols (symbols being variable-length strings) 85 | """ 86 | pairs = set() 87 | prev_char = word[0] 88 | for char in word[1:]: 89 | pairs.add((prev_char, char)) 90 | prev_char = char 91 | return pairs 92 | 93 | def encode(orig, bpe_codes, cache={}): 94 | """Encode word based on list of BPE merge operations, which are applied consecutively 95 | """ 96 | 97 | if orig in cache: 98 | return cache[orig] 99 | 100 | word = tuple(orig) + ('</w>',) 101 | pairs = get_pairs(word) 102 | 103 | while True: 104 | bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf'))) 105 | if bigram not in bpe_codes: 106 | break 107 | first, second = bigram 108 | new_word = [] 109 | i = 0 110 | while i < len(word): 111 | try: 112 | j = word.index(first, i) 113 | new_word.extend(word[i:j]) 114 | i = j 115 | except: 116 | new_word.extend(word[i:]) 117 | break 118 | 119 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 120 | new_word.append(first+second) 121 | i += 2 122 | else: 123 | new_word.append(word[i]) 124 | i += 1 125 | new_word = tuple(new_word) 126 | word = new_word 127 | if len(word) == 1: 128 | break 129 | else: 130 | pairs = get_pairs(word) 131 | 132 | # don't print end-of-word symbols 133 | if word[-1] == '</w>': 134 | word = word[:-1] 135 | elif word[-1].endswith('</w>'): 136 | word = word[:-1] + (word[-1].replace('</w>',''),) 137 | 138 | cache[orig] = word 139 | return word 140 | 141 | 142 | if __name__ == '__main__': 143 | parser = create_parser() 144 | args = parser.parse_args() 145 | 146 | bpe = BPE(args.codes, args.separator) 147 | 148 | for line in args.input: 149 | args.output.write(bpe.segment(line).strip()) 150 | args.output.write('\n') 151 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/apply_bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use operations learned with learn_bpe.py to encode a new text. 6 | The text will not be smaller, but use only a fixed vocabulary, with rare words 7 | encoded as variable-length sequences of subword units. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 
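Typical invocation (the file names below are placeholders, not files shipped
with this code):

    python apply_bpe.py -c codes.bpe < corpus.tok > corpus.bpe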
12 | """ 13 | 14 | from __future__ import unicode_literals, division 15 | 16 | import sys 17 | import codecs 18 | import argparse 19 | from collections import defaultdict 20 | 21 | # hack for python2/3 compatibility 22 | from io import open 23 | argparse.open = open 24 | 25 | # python 2/3 compatibility 26 | if sys.version_info < (3, 0): 27 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 28 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 29 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 30 | 31 | import codecs 32 | 33 | class BPE(object): 34 | 35 | def __init__(self, codes, separator='@@'): 36 | 37 | with codecs.open(codes.name, encoding='utf-8') as codes: 38 | self.bpe_codes = [tuple(item.split()) for item in codes] 39 | 40 | # some hacking to deal with duplicates (only consider first instance) 41 | self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))]) 42 | 43 | self.separator = separator 44 | 45 | def segment(self, sentence): 46 | """segment single sentence (whitespace-tokenized string) with BPE encoding""" 47 | 48 | output = [] 49 | for word in sentence.split(): 50 | new_word = encode(word, self.bpe_codes) 51 | 52 | for item in new_word[:-1]: 53 | output.append(item + self.separator) 54 | output.append(new_word[-1]) 55 | 56 | return ' '.join(output) 57 | 58 | def create_parser(): 59 | parser = argparse.ArgumentParser( 60 | formatter_class=argparse.RawDescriptionHelpFormatter, 61 | description="learn BPE-based word segmentation") 62 | 63 | parser.add_argument( 64 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 65 | metavar='PATH', 66 | help="Input file (default: standard input).") 67 | parser.add_argument( 68 | '--codes', '-c', type=argparse.FileType('r'), metavar='PATH', 69 | required=True, 70 | help="File with BPE codes (created by learn_bpe.py).") 71 | parser.add_argument( 72 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 73 | metavar='PATH', 74 | help="Output file (default: standard output)") 75 | parser.add_argument( 76 | '--separator', '-s', type=str, default='@@', metavar='STR', 77 | help="Separator between non-final subword units (default: '%(default)s'))") 78 | 79 | return parser 80 | 81 | def get_pairs(word): 82 | """Return set of symbol pairs in a word. 
83 | 84 | word is represented as tuple of symbols (symbols being variable-length strings) 85 | """ 86 | pairs = set() 87 | prev_char = word[0] 88 | for char in word[1:]: 89 | pairs.add((prev_char, char)) 90 | prev_char = char 91 | return pairs 92 | 93 | def encode(orig, bpe_codes, cache={}): 94 | """Encode word based on list of BPE merge operations, which are applied consecutively 95 | """ 96 | 97 | if orig in cache: 98 | return cache[orig] 99 | 100 | word = tuple(orig) + ('</w>',) 101 | pairs = get_pairs(word) 102 | 103 | while True: 104 | bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf'))) 105 | if bigram not in bpe_codes: 106 | break 107 | first, second = bigram 108 | new_word = [] 109 | i = 0 110 | while i < len(word): 111 | try: 112 | j = word.index(first, i) 113 | new_word.extend(word[i:j]) 114 | i = j 115 | except: 116 | new_word.extend(word[i:]) 117 | break 118 | 119 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 120 | new_word.append(first+second) 121 | i += 2 122 | else: 123 | new_word.append(word[i]) 124 | i += 1 125 | new_word = tuple(new_word) 126 | word = new_word 127 | if len(word) == 1: 128 | break 129 | else: 130 | pairs = get_pairs(word) 131 | 132 | # don't print end-of-word symbols 133 | if word[-1] == '</w>': 134 | word = word[:-1] 135 | elif word[-1].endswith('</w>'): 136 | word = word[:-1] + (word[-1].replace('</w>',''),) 137 | 138 | cache[orig] = word 139 | return word 140 | 141 | 142 | if __name__ == '__main__': 143 | parser = create_parser() 144 | args = parser.parse_args() 145 | 146 | bpe = BPE(args.codes, args.separator) 147 | 148 | for line in args.input: 149 | args.output.write(bpe.segment(line).strip()) 150 | args.output.write('\n') 151 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import logging 10 | import os 11 | import torch 12 | import traceback 13 | import subprocess 14 | 15 | from torch.autograd import Variable 16 | from torch.serialization import default_restore_location 17 | 18 | from fairseq import criterions, data, models 19 | 20 | validation_proc = None 21 | 22 | def parse_args_and_arch(parser): 23 | args = parser.parse_args() 24 | args.model = models.arch_model_map[args.arch] 25 | args = getattr(models, args.model).parse_arch(args) 26 | return args 27 | 28 | 29 | def build_model(args, dataset): 30 | assert hasattr(models, args.model), 'Missing model type' 31 | return getattr(models, args.model).build_model(args, dataset) 32 | 33 | 34 | def build_criterion(args, dataset): 35 | padding_idx = dataset.dst_dict.pad() 36 | if args.label_smoothing > 0: 37 | return criterions.LabelSmoothedCrossEntropyCriterion(args.label_smoothing, padding_idx) 38 | else: 39 | return criterions.CrossEntropyCriterion(padding_idx) 40 | 41 | 42 | def torch_persistent_save(*args, **kwargs): 43 | for i in range(3): 44 | try: 45 | return torch.save(*args, **kwargs) 46 | except: 47 | if i == 2: 48 | logging.error(traceback.format_exc()) 49 | 50 | 51 | def save_checkpoint(args, epoch, batch_offset, model, optimizer, lr_scheduler, val_loss=None, validation_script=None): 52 | 53 | global validation_proc 54 | state_dict = { 55 | 'args': args, 56 | 'epoch': epoch, 57 | 'batch_offset': batch_offset, 58 | 'model': model.state_dict(), 59 | 'optimizer': optimizer.state_dict(), 60 | 'best_loss': lr_scheduler.best, 61 | 'val_loss': val_loss, 62 | } 63 | 64 | if batch_offset == 0: 65 | if not args.no_epoch_checkpoints: 66 | epoch_filename = os.path.join(args.save_dir, 'checkpoint{}.pt'.format(epoch)) 67 | print('| epoch {:03d} | saving checkpoint '.format(epoch, epoch_filename)) 68 | torch_persistent_save(state_dict, epoch_filename) 69 | if validation_script: 70 | if validation_proc and validation_proc.poll() is None: 71 | print('| epoch {:03d} | waiting for previous validation process to finish.'.format(epoch)) 72 | validation_proc.wait() 73 | validation_proc = subprocess.Popen(validation_script + [epoch_filename]) 74 | 75 | assert val_loss is not None 76 | if not hasattr(save_checkpoint, 'best') or val_loss < save_checkpoint.best: 77 | save_checkpoint.best = val_loss 78 | best_filename = os.path.join(args.save_dir, 'checkpoint_best.pt') 79 | print('| epoch {:03d} | saving best checkpoint'.format(epoch, best_filename)) 80 | torch_persistent_save(state_dict, best_filename) 81 | 82 | last_filename = os.path.join(args.save_dir, 'checkpoint_last.pt') 83 | print('| epoch {:03d} | saving last checkpoint'.format(epoch, last_filename)) 84 | torch_persistent_save(state_dict, last_filename) 85 | 86 | 87 | 88 | def load_checkpoint(filename, model, optimizer, lr_scheduler, cuda_device=None): 89 | if not os.path.exists(filename): 90 | return 1, 0 91 | if cuda_device is None: 92 | state = torch.load(filename) 93 | else: 94 | state = torch.load( 95 | filename, 96 | map_location=lambda s, l: default_restore_location(s, 'cuda:{}'.format(cuda_device)) 97 | ) 98 | 99 | model.load_state_dict(state['model']) 100 | optimizer.load_state_dict(state['optimizer']) 101 | lr_scheduler.best = state['best_loss'] 102 | epoch = state['epoch'] + 1 103 | batch_offset = state['batch_offset'] 104 | 105 | gpu_str = ' on GPU #{}'.format(cuda_device) if cuda_device is not None else '' 106 | print('| loaded checkpoint {} (epoch {}){}'.format(filename, epoch, gpu_str)) 107 | return epoch, batch_offset 108 | 109 | 110 | def 
load_ensemble_for_inference(filenames, data_path): 111 | # load model architectures and weights 112 | states = [] 113 | for filename in filenames: 114 | if not os.path.exists(filename): 115 | raise IOError('Model file not found: {}'.format(filename)) 116 | states.append( 117 | torch.load(filename, map_location=lambda s, l: default_restore_location(s, 'cpu')) 118 | ) 119 | 120 | # load dataset 121 | args = states[0]['args'] 122 | dataset = data.load(data_path, args.source_lang, args.target_lang) 123 | 124 | # build models 125 | ensemble = [] 126 | for state in states: 127 | model = build_model(args, dataset) 128 | model.load_state_dict(state['model']) 129 | ensemble.append(model) 130 | 131 | return ensemble, dataset 132 | 133 | 134 | def prepare_sample(sample, volatile=False, cuda_device=None): 135 | """Wrap input tensors in Variable class.""" 136 | 137 | def make_variable(tensor): 138 | if cuda_device is not None and torch.cuda.is_available(): 139 | tensor = tensor.cuda(async=True, device=cuda_device) 140 | return Variable(tensor, volatile=volatile) 141 | 142 | return { 143 | 'id': sample['id'], 144 | 'ntokens': sample['ntokens'], 145 | 'target': make_variable(sample['target']), 146 | 'net_input': { 147 | key: make_variable(sample[key]) 148 | for key in ['src_tokens', 'src_positions', 'input_tokens', 'input_positions'] 149 | }, 150 | } 151 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/m2scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This file is part of the NUS M2 scorer. 4 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # The NUS M2 scorer is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | 17 | # file: m2scorer.py 18 | # 19 | # score a system's output against a gold reference 20 | # 21 | # Usage: m2scorer.py [OPTIONS] proposed_sentences source_gold 22 | # where 23 | # proposed_sentences - system output, sentence per line 24 | # source_gold - source sentences with gold token edits 25 | # OPTIONS 26 | # -v --verbose - print verbose output 27 | # --very_verbose - print lots of verbose output 28 | # --max_unchanged_words N - Maximum unchanged words when extracting edits. Default 2." 29 | # --beta B - Beta value for F-measure. Default 0.5." 30 | # --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
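#     Example (hypothetical file names):
#         python m2scorer.py --beta 0.5 system_output.txt gold_edits.m2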
31 | # 32 | 33 | import sys 34 | import levenshtein 35 | from getopt import getopt 36 | from util import paragraphs 37 | from util import smart_open 38 | 39 | 40 | 41 | def load_annotation(gold_file): 42 | source_sentences = [] 43 | gold_edits = [] 44 | fgold = smart_open(gold_file, 'r') 45 | puffer = fgold.read() 46 | fgold.close() 47 | puffer = puffer.decode('utf8') 48 | for item in paragraphs(puffer.splitlines(True)): 49 | item = item.splitlines(False) 50 | sentence = [line[2:].strip() for line in item if line.startswith('S ')] 51 | assert sentence != [] 52 | annotations = {} 53 | for line in item[1:]: 54 | if line.startswith('I ') or line.startswith('S '): 55 | continue 56 | assert line.startswith('A ') 57 | line = line[2:] 58 | fields = line.split('|||') 59 | start_offset = int(fields[0].split()[0]) 60 | end_offset = int(fields[0].split()[1]) 61 | etype = fields[1] 62 | if etype == 'noop': 63 | start_offset = -1 64 | end_offset = -1 65 | corrections = [c.strip() if c != '-NONE-' else '' for c in fields[2].split('||')] 66 | # NOTE: start and end are *token* offsets 67 | original = ' '.join(' '.join(sentence).split()[start_offset:end_offset]) 68 | annotator = int(fields[5]) 69 | if annotator not in annotations.keys(): 70 | annotations[annotator] = [] 71 | annotations[annotator].append((start_offset, end_offset, original, corrections)) 72 | tok_offset = 0 73 | for this_sentence in sentence: 74 | tok_offset += len(this_sentence.split()) 75 | source_sentences.append(this_sentence) 76 | this_edits = {} 77 | for annotator, annotation in annotations.iteritems(): 78 | this_edits[annotator] = [edit for edit in annotation if edit[0] <= tok_offset and edit[1] <= tok_offset and edit[0] >= 0 and edit[1] >= 0] 79 | if len(this_edits) == 0: 80 | this_edits[0] = [] 81 | gold_edits.append(this_edits) 82 | return (source_sentences, gold_edits) 83 | 84 | 85 | def print_usage(): 86 | print >> sys.stderr, "Usage: m2scorer.py [OPTIONS] proposed_sentences gold_source" 87 | print >> sys.stderr, "where" 88 | print >> sys.stderr, " proposed_sentences - system output, sentence per line" 89 | print >> sys.stderr, " source_gold - source sentences with gold token edits" 90 | print >> sys.stderr, "OPTIONS" 91 | print >> sys.stderr, " -v --verbose - print verbose output" 92 | print >> sys.stderr, " --very_verbose - print lots of verbose output" 93 | print >> sys.stderr, " --max_unchanged_words N - Maximum unchanged words when extraction edit. Default 2." 94 | print >> sys.stderr, " --beta B - Beta value for F-measure. Default 0.5." 95 | print >> sys.stderr, " --ignore_whitespace_casing - Ignore edits that only affect whitespace and caseing. Default no." 
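# The scores reported at the bottom of this script follow the standard
# F_beta definition, F_beta = (1 + beta**2) * P * R / (beta**2 * P + R);
# the default beta of 0.5 weights precision more heavily than recall.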
96 | 97 | 98 | 99 | max_unchanged_words=2 100 | beta = 0.5 101 | ignore_whitespace_casing= False 102 | verbose = False 103 | very_verbose = False 104 | opts, args = getopt(sys.argv[1:], "v", ["max_unchanged_words=", "beta=", "verbose", "ignore_whitespace_casing", "very_verbose"]) 105 | for o, v in opts: 106 | if o in ('-v', '--verbose'): 107 | verbose = True 108 | elif o == '--very_verbose': 109 | very_verbose = True 110 | elif o == '--max_unchanged_words': 111 | max_unchanged_words = int(v) 112 | elif o == '--beta': 113 | beta = float(v) 114 | elif o == '--ignore_whitespace_casing': 115 | ignore_whitespace_casing = True 116 | else: 117 | print >> sys.stderr, "Unknown option :", o 118 | print_usage() 119 | sys.exit(-1) 120 | 121 | # starting point 122 | if len(args) != 2: 123 | print_usage() 124 | sys.exit(-1) 125 | 126 | system_file = args[0] 127 | gold_file = args[1] 128 | 129 | # load source sentences and gold edits 130 | source_sentences, gold_edits = load_annotation(gold_file) 131 | 132 | # load system hypotheses 133 | fin = smart_open(system_file, 'r') 134 | system_sentences = [line.decode("utf8").strip() for line in fin.readlines()] 135 | fin.close() 136 | 137 | p, r, f1 = levenshtein.batch_multi_pre_rec_f1(system_sentences, source_sentences, gold_edits, max_unchanged_words, beta, ignore_whitespace_casing, verbose, very_verbose) 138 | 139 | print "Precision : %.4f" % p 140 | print "Recall : %.4f" % r 141 | print "F_%.1f : %.4f" % (beta, f1) 142 | 143 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/multiprocessing_event_loop.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import os 10 | import signal 11 | import threading 12 | from torch import multiprocessing 13 | 14 | 15 | class MultiprocessingEventLoop(object): 16 | """Start a multiprocessing event loop.""" 17 | 18 | def __init__(self, device_ids=None, multiprocessing_method='spawn'): 19 | super().__init__() 20 | self.device_ids = tuple(device_ids) 21 | self.num_replicas = len(device_ids) 22 | self.rank = None 23 | 24 | self._mp = multiprocessing.get_context(multiprocessing_method) 25 | 26 | self._start_error_handler() 27 | self._start_multiprocessing() 28 | 29 | def call_async(self, rank, action, **kwargs): 30 | """Asynchronously call a function in each child process. 31 | 32 | Call a function named `action` on the rank'th process and return 33 | a Future with the result. 
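        `action` must name a method defined on the subclass; it is resolved
        with getattr() inside the child's event loop, and the caller usually
        collects the return value via Future.gen().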
34 | """ 35 | 36 | def result_generator(): 37 | yield self.return_pipes[rank].recv() 38 | 39 | assert not self.return_pipes[rank].poll(), \ 40 | 'return pipe must be consumed before calling another function' 41 | self.input_pipes[rank].send((action, kwargs)) 42 | 43 | return Future(result_generator()) 44 | 45 | def stop(self, interrupt_children=False): 46 | """Stop multiprocessing.""" 47 | for rank in range(self.num_replicas): 48 | self.input_pipes[rank].close() 49 | self.return_pipes[rank].close() 50 | if interrupt_children: 51 | # send KeyboardInterrupt to children 52 | os.kill(self.procs[rank].pid, signal.SIGINT) 53 | else: 54 | self.procs[rank].join() 55 | self.error_queue.put((None, None)) # poison pill 56 | 57 | def _start_error_handler(self): 58 | """Error handler to catch exceptions in child processes.""" 59 | # create a thread to listen for errors in the child processes 60 | self.error_queue = self._mp.SimpleQueue() 61 | error_thread = threading.Thread(target=self._error_listener, 62 | daemon=True) 63 | error_thread.start() 64 | 65 | # create signal handler that executes in the main process/thread and 66 | # handles errors from child processes 67 | signal.signal(signal.SIGUSR1, self._signal_handler) 68 | 69 | def _error_listener(self): 70 | """A thread that listens for errors in the child processes. 71 | 72 | Errors are handled in a signal handler in the main thread. 73 | """ 74 | (rank, original_trace) = self.error_queue.get() 75 | if rank is None: # poison pill, return 76 | return 77 | 78 | # requeue error and switch to main thread for handling the error 79 | self.error_queue.put((rank, original_trace)) 80 | os.kill(os.getpid(), signal.SIGUSR1) 81 | 82 | def _signal_handler(self, signal, frame): 83 | """Signal handler that handles errors from child processes. 84 | 85 | This signal handler executes in the main/process thread. 86 | """ 87 | self.stop(interrupt_children=True) 88 | (rank, original_trace) = self.error_queue.get() 89 | msg = "\n\n-- Tracebacks above this line can probably be ignored --\n\n" 90 | msg += original_trace 91 | raise Exception(msg) 92 | 93 | def _start_multiprocessing(self): 94 | """Create child processes to run async event loop. 95 | 96 | Each process reads input from a Pipe, performs some computation, 97 | and returns its output to another Pipe. 98 | """ 99 | # create child processes 100 | input_pipes = [] 101 | return_pipes = [] 102 | procs = [] 103 | for rank, id in enumerate(self.device_ids): 104 | recv_input_pipe, send_input_pipe = self._mp.Pipe(duplex=False) 105 | recv_return_pipe, send_return_pipe = self._mp.Pipe(duplex=False) 106 | proc = self._mp.Process( 107 | target=self._process_event_loop, 108 | args=(rank, id, recv_input_pipe, send_return_pipe), 109 | daemon=True) 110 | proc.start() 111 | input_pipes.append(send_input_pipe) 112 | return_pipes.append(recv_return_pipe) 113 | procs.append(proc) 114 | self.input_pipes = input_pipes 115 | self.return_pipes = return_pipes 116 | self.procs = procs 117 | 118 | def _process_event_loop(self, rank, device_id, input_pipe, return_pipe): 119 | """Event loop that runs in each child process. 120 | 121 | Event loop: 122 | - take an action from the input pipe 123 | - call the corresponding function in this process 124 | - put the return value in the return pipe 125 | 126 | Any exceptions are put in the error queue. 
127 | """ 128 | self.rank = rank 129 | try: 130 | # event loop 131 | while True: 132 | action, kwargs = input_pipe.recv() 133 | action_fn = getattr(self, action) 134 | return_pipe.send(action_fn(rank, device_id, **kwargs)) 135 | except EOFError: 136 | # input pipe was closed, do nothing 137 | pass 138 | except KeyboardInterrupt: 139 | # killed by parent, do nothing 140 | pass 141 | except Exception: 142 | # propagate exception from child to parent process, keeping 143 | # original traceback 144 | import traceback 145 | self.error_queue.put((rank, traceback.format_exc())) 146 | finally: 147 | # cleanup pipes 148 | input_pipe.close() 149 | return_pipe.close() 150 | 151 | 152 | class Future(object): 153 | """A wrapper around a Python generator, with syntactic sugar.""" 154 | def __init__(self, generator): 155 | self.generator = generator 156 | 157 | def gen(self): 158 | return next(self.generator) 159 | 160 | @staticmethod 161 | def gen_list(gens): 162 | return [g.gen() for g in gens] 163 | 164 | @staticmethod 165 | def gen_tuple_list(gens): 166 | list = [g.gen() for g in gens] 167 | return zip(*list) 168 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import argparse 10 | import os 11 | from itertools import zip_longest 12 | 13 | from fairseq import dictionary, indexed_dataset 14 | from fairseq.tokenizer import Tokenizer 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser( 19 | description='Data pre-processing: Create dictionary and store data in binary format') 20 | parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', help='source language') 21 | parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', help='target language') 22 | parser.add_argument('--trainpref', metavar='FP', default='train', help='target language') 23 | parser.add_argument('--validpref', metavar='FP', default='valid', help='comma separated, valid language prefixes') 24 | parser.add_argument('--testpref', metavar='FP', default='test', help='comma separated, test language prefixes') 25 | parser.add_argument('--destdir', metavar='DIR', default='data-bin', help='destination dir') 26 | parser.add_argument('--thresholdtgt', metavar='N', default=0, type=int, 27 | help='map words appearing less than threshold times to unknown') 28 | parser.add_argument('--thresholdsrc', metavar='N', default=0, type=int, 29 | help='map words appearing less than threshold times to unknown') 30 | parser.add_argument('--nwordstgt', metavar='N', default=-1, type=int, help='number of target words to retain') 31 | parser.add_argument('--nwordssrc', metavar='N', default=-1, type=int, help='number of source words to retain') 32 | parser.add_argument('--alignfile', metavar='ALIGN', default=None, help='an alignment file (optional)') 33 | 34 | args = parser.parse_args() 35 | print(args) 36 | 37 | os.makedirs(args.destdir, exist_ok=True) 38 | 39 | src_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.source_lang)) 40 | src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)), 
41 | threshold=args.thresholdsrc, nwords=args.nwordssrc) 42 | tgt_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.target_lang)) 43 | tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)), 44 | threshold=args.thresholdtgt, nwords=args.nwordstgt) 45 | 46 | def make_dataset(input_prefix, output_prefix, lang): 47 | dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(lang))) 48 | print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1)) 49 | 50 | ds = indexed_dataset.IndexedDatasetBuilder( 51 | '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix, args.source_lang, 52 | args.target_lang, lang) 53 | ) 54 | 55 | def consumer(tensor): 56 | ds.add_item(tensor) 57 | 58 | input_file = '{}.{}'.format(input_prefix, lang) 59 | res = Tokenizer.binarize(input_file, dict, consumer) 60 | print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format( 61 | lang, input_file, res['nseq'], res['ntok'], 62 | 100 * res['nunk'] / res['ntok'], dict.unk_word)) 63 | ds.finalize('{}/{}.{}-{}.{}.idx'.format( 64 | args.destdir, output_prefix, 65 | args.source_lang, args.target_lang, lang)) 66 | 67 | make_dataset(args.trainpref, 'train', args.source_lang) 68 | make_dataset(args.trainpref, 'train', args.target_lang) 69 | for k, validpref in enumerate(args.validpref.split(',')): 70 | outprefix = 'valid{}'.format(k) if k > 0 else 'valid' 71 | make_dataset(validpref, outprefix, args.source_lang) 72 | make_dataset(validpref, outprefix, args.target_lang) 73 | for k, testpref in enumerate(args.testpref.split(',')): 74 | outprefix = 'test{}'.format(k) if k > 0 else 'test' 75 | make_dataset(testpref, outprefix, args.source_lang) 76 | make_dataset(testpref, outprefix, args.target_lang) 77 | print('| Wrote preprocessed data to {}'.format(args.destdir)) 78 | 79 | if args.alignfile: 80 | src_file_name = '{}.{}'.format(args.trainpref, args.source_lang) 81 | tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang) 82 | src_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang))) 83 | tgt_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang))) 84 | freq_map = {} 85 | with open(args.alignfile, 'r') as align_file: 86 | with open(src_file_name, 'r') as src_file: 87 | with open(tgt_file_name, 'r') as tgt_file: 88 | for a, s, t in zip_longest(align_file, src_file, tgt_file): 89 | si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) 90 | ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) 91 | ai = list(map(lambda x: tuple(x.split('-')), a.split())) 92 | for sai, tai in ai: 93 | srcidx = si[int(sai)] 94 | tgtidx = ti[int(tai)] 95 | if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk(): 96 | assert srcidx != src_dict.pad() 97 | assert srcidx != src_dict.eos() 98 | assert tgtidx != tgt_dict.pad() 99 | assert tgtidx != tgt_dict.eos() 100 | 101 | if srcidx not in freq_map: 102 | freq_map[srcidx] = {} 103 | if tgtidx not in freq_map[srcidx]: 104 | freq_map[srcidx][tgtidx] = 1 105 | else: 106 | freq_map[srcidx][tgtidx] += 1 107 | 108 | align_dict = {} 109 | for srcidx in freq_map.keys(): 110 | align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) 111 | 112 | with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format( 113 | args.source_lang, args.target_lang)), 'w') as f: 114 | for k, v in align_dict.items(): 115 | print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f) 116 | 117 | 118 | if __name__ == 
'__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/util.py: -------------------------------------------------------------------------------- 1 | # This file is part of the NUS M2 scorer. 2 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 3 | # it under the terms of the GNU General Public License as published by 4 | # the Free Software Foundation, either version 3 of the License, or 5 | # (at your option) any later version. 6 | 7 | # The NUS M2 scorer is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | # GNU General Public License for more details. 11 | 12 | # You should have received a copy of the GNU General Public License 13 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 14 | 15 | # file: util.py 16 | # 17 | 18 | import operator 19 | import random 20 | import math 21 | import re 22 | 23 | def smart_open(fname, mode = 'r'): 24 | if fname.endswith('.gz'): 25 | import gzip 26 | # Using max compression (9) by default seems to be slow. 27 | # Let's try using the fastest. 28 | return gzip.open(fname, mode, 1) 29 | else: 30 | return open(fname, mode) 31 | 32 | 33 | def randint(b, a=0): 34 | return random.randint(a,b) 35 | 36 | def uniq(seq, idfun=None): 37 | # order preserving 38 | if idfun is None: 39 | def idfun(x): return x 40 | seen = {} 41 | result = [] 42 | for item in seq: 43 | marker = idfun(item) 44 | # in old Python versions: 45 | # if seen.has_key(marker) 46 | # but in new ones: 47 | if marker in seen: continue 48 | seen[marker] = 1 49 | result.append(item) 50 | return result 51 | 52 | 53 | def sort_dict(myDict, byValue=False, reverse=False): 54 | if byValue: 55 | items = myDict.items() 56 | items.sort(key = operator.itemgetter(1), reverse=reverse) 57 | else: 58 | items = sorted(myDict.items()) 59 | return items 60 | 61 | def max_dict(myDict, byValue=False): 62 | if byValue: 63 | skey=lambda x:x[1] 64 | else: 65 | skey=lambda x:x[0] 66 | return max(myDict.items(), key=skey) 67 | 68 | 69 | def min_dict(myDict, byValue=False): 70 | if byValue: 71 | skey=lambda x:x[1] 72 | else: 73 | skey=lambda x:x[0] 74 | return min(myDict.items(), key=skey) 75 | 76 | def paragraphs(lines, is_separator=lambda x : x == '\n', joiner=''.join): 77 | paragraph = [] 78 | for line in lines: 79 | if is_separator(line): 80 | if paragraph: 81 | yield joiner(paragraph) 82 | paragraph = [] 83 | else: 84 | paragraph.append(line) 85 | if paragraph: 86 | yield joiner(paragraph) 87 | 88 | 89 | def isASCII(word): 90 | try: 91 | word = word.decode("ascii") 92 | return True 93 | except UnicodeEncodeError : 94 | return False 95 | except UnicodeDecodeError: 96 | return False 97 | 98 | 99 | def intersect(x, y): 100 | return [z for z in x if z in y] 101 | 102 | 103 | 104 | # Mapping Windows CP1252 Gremlins to Unicode 105 | # from http://effbot.org/zone/unicode-gremlins.htm 106 | cp1252 = { 107 | # from http://www.microsoft.com/typography/unicode/1252.htm 108 | u"\x80": u"\u20AC", # EURO SIGN 109 | u"\x82": u"\u201A", # SINGLE LOW-9 QUOTATION MARK 110 | u"\x83": u"\u0192", # LATIN SMALL LETTER F WITH HOOK 111 | u"\x84": u"\u201E", # DOUBLE LOW-9 QUOTATION MARK 112 | u"\x85": u"\u2026", # HORIZONTAL ELLIPSIS 113 | u"\x86": u"\u2020", # DAGGER 114 | u"\x87": u"\u2021", # DOUBLE DAGGER 115 | u"\x88": u"\u02C6", # MODIFIER 
LETTER CIRCUMFLEX ACCENT 116 | u"\x89": u"\u2030", # PER MILLE SIGN 117 | u"\x8A": u"\u0160", # LATIN CAPITAL LETTER S WITH CARON 118 | u"\x8B": u"\u2039", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK 119 | u"\x8C": u"\u0152", # LATIN CAPITAL LIGATURE OE 120 | u"\x8E": u"\u017D", # LATIN CAPITAL LETTER Z WITH CARON 121 | u"\x91": u"\u2018", # LEFT SINGLE QUOTATION MARK 122 | u"\x92": u"\u2019", # RIGHT SINGLE QUOTATION MARK 123 | u"\x93": u"\u201C", # LEFT DOUBLE QUOTATION MARK 124 | u"\x94": u"\u201D", # RIGHT DOUBLE QUOTATION MARK 125 | u"\x95": u"\u2022", # BULLET 126 | u"\x96": u"\u2013", # EN DASH 127 | u"\x97": u"\u2014", # EM DASH 128 | u"\x98": u"\u02DC", # SMALL TILDE 129 | u"\x99": u"\u2122", # TRADE MARK SIGN 130 | u"\x9A": u"\u0161", # LATIN SMALL LETTER S WITH CARON 131 | u"\x9B": u"\u203A", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 132 | u"\x9C": u"\u0153", # LATIN SMALL LIGATURE OE 133 | u"\x9E": u"\u017E", # LATIN SMALL LETTER Z WITH CARON 134 | u"\x9F": u"\u0178", # LATIN CAPITAL LETTER Y WITH DIAERESIS 135 | } 136 | 137 | def fix_cp1252codes(text): 138 | # map cp1252 gremlins to real unicode characters 139 | if re.search(u"[\x80-\x9f]", text): 140 | def fixup(m): 141 | s = m.group(0) 142 | return cp1252.get(s, s) 143 | if isinstance(text, type("")): 144 | # make sure we have a unicode string 145 | text = unicode(text, "iso-8859-1") 146 | text = re.sub(u"[\x80-\x9f]", fixup, text) 147 | return text 148 | 149 | def clean_utf8(text): 150 | return filter(lambda x : x > '\x1f' and x < '\x7f', text) 151 | 152 | def pairs(iterable, overlapping=False): 153 | iterator = iterable.__iter__() 154 | token = iterator.next() 155 | i = 0 156 | for lookahead in iterator: 157 | if overlapping or i % 2 == 0: 158 | yield (token, lookahead) 159 | token = lookahead 160 | i += 1 161 | if i % 2 == 0: 162 | yield (token, None) 163 | 164 | def frange(start, end=None, inc=None): 165 | "A range function, that does accept float increments..." 
166 | 167 | if end == None: 168 | end = start + 0.0 169 | start = 0.0 170 | 171 | if inc == None: 172 | inc = 1.0 173 | 174 | L = [] 175 | while 1: 176 | next = start + len(L) * inc 177 | if inc > 0 and next >= end: 178 | break 179 | elif inc < 0 and next <= end: 180 | break 181 | L.append(next) 182 | 183 | return L 184 | 185 | def softmax(values): 186 | a = max(values) 187 | Z = 0.0 188 | for v in values: 189 | Z += math.exp(v - a) 190 | sm = [math.exp(v-a) / Z for v in values] 191 | return sm 192 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/candidatesreader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | #import dlm.utils as U 3 | #import dlm.io.logging as L 4 | import codecs 5 | 6 | class NBestList(): 7 | def __init__(self, nbest_path, mode='r', reference_list=None): 8 | assert mode == 'r' or mode == 'w', "Invalid mode: " + mode 9 | self.mode = mode 10 | self.nbest_file = codecs.open(nbest_path, mode=mode, encoding='UTF-8') 11 | self.prev_index = -1 12 | self.curr_item = None 13 | self.curr_index = 0 14 | self.eof_flag = False 15 | self.ref_manager = None 16 | if reference_list: 17 | assert mode == 'r', "Cannot accept a reference_list in 'w' mode" 18 | self.ref_manager = RefernceManager(reference_list) 19 | 20 | 21 | def __iter__(self): 22 | assert self.mode == 'r', "Iteration can only be done in 'r' mode" 23 | return self 24 | 25 | def next_item(self): 26 | assert self.mode == 'r', "next() method can only be used in 'r' mode" 27 | try: 28 | segments = self.nbest_file.next().split("|||") 29 | except StopIteration: 30 | self.close() 31 | raise StopIteration 32 | try: 33 | index = int(segments[0]) 34 | except ValueError: 35 | print >> sys.stderr, "The first segment in an n-best list must be an integer" 36 | #L.error("The first segment in an n-best list must be an integer") 37 | hyp = segments[1].strip() 38 | features = segments[2].strip() 39 | score = None 40 | phrase_alignments = None 41 | word_alignments = None 42 | phrase_alignments = None 43 | if len(segments) > 3: 44 | score = segments[3].strip() 45 | if len(segments) > 4: 46 | phrase_alignments = segments[4].strip() 47 | if len(segments) > 5: 48 | word_alignments = segments[5].strip() 49 | return NBestItem(index, hyp, features, score, phrase_alignments, word_alignments) 50 | 51 | def next(self): # Returns a group of NBestItems with the same index 52 | if self.eof_flag == True: 53 | raise StopIteration 54 | assert self.mode == 'r', "next_group() method can only be used in 'r' mode" 55 | group = NBestGroup(self.ref_manager) 56 | group.add(self.curr_item) # add the item that was read in the last next() call 57 | try: 58 | self.curr_item = self.next_item() 59 | except StopIteration: 60 | self.eof_flag = True 61 | return group 62 | if self.curr_index != self.curr_item.index: 63 | self.curr_index = self.curr_item.index 64 | return group 65 | while self.curr_index == self.curr_item.index: 66 | group.add(self.curr_item) 67 | try: 68 | self.curr_item = self.next_item() 69 | except StopIteration: 70 | self.eof_flag = True 71 | return group 72 | self.curr_index = self.curr_item.index 73 | return group 74 | 75 | def write(self, item): 76 | assert self.mode == 'w', "write() method can only be used in 'w' mode" 77 | self.nbest_file.write(unicode(item) + "\n") 78 | 79 | def close(self): 80 | self.nbest_file.close() 81 | 82 | 83 | 84 | class NBestItem: 85 | def __init__(self, index, hyp, features, score, 
phrase_alignments, word_alignments): 86 | self.index = index 87 | self.hyp = hyp 88 | self.features = features 89 | self.score = score 90 | self.phrase_alignments = phrase_alignments 91 | self.word_alignments = word_alignments 92 | 93 | def __unicode__(self): 94 | output = ' ||| '.join([unicode(self.index), self.hyp, self.features]) 95 | if self.score: 96 | output = output + ' ||| ' + self.score 97 | if self.phrase_alignments: 98 | output = output + ' ||| ' + self.phrase_alignments 99 | if self.word_alignments: 100 | output = output + ' ||| ' + self.word_alignments 101 | return output 102 | 103 | def append_feature(self, feature_name, feature_value): 104 | self.features += ' ' + str(feature_name) + '= ' + str(feature_value) + ' ' 105 | 106 | 107 | class NBestGroup: 108 | def __init__(self, refrence_manager=None): 109 | self.group_index = -1 110 | self.group = [] 111 | self.ref_manager = refrence_manager 112 | 113 | def __unicode__(self): 114 | return '\n'.join([unicode(item) for item in self.group]) 115 | 116 | def __iter__(self): 117 | self.item_index = 0 118 | return self 119 | 120 | def __getitem__(self, index): 121 | return self.group[index] 122 | 123 | def add(self, item): 124 | if item is None: 125 | return 126 | if self.group_index == -1: 127 | self.group_index = item.index 128 | if self.ref_manager: 129 | self.refs = self.ref_manager.get_all_refs(self.group_index) 130 | else: 131 | assert item.index == self.group_index, "Cannot add an nbest item with an incompatible index" 132 | self.group.append(item) 133 | 134 | def next(self): 135 | #if self.item_index < len(self.group): 136 | try: 137 | item = self.group[self.item_index] 138 | self.item_index += 1 139 | return item 140 | #else: 141 | except IndexError: 142 | raise StopIteration 143 | 144 | def size(self): 145 | return len(self.group) 146 | 147 | def append_features(self, features_list): 148 | assert len(features_list) == len(self.group), 'Number of features and number of items in this group do not match' 149 | for i in range(len(self.group)): 150 | self.group[i].append_feature(features_list[i]) 151 | 152 | 153 | 154 | class RefernceManager: 155 | def __init__(self, paths_list): 156 | assert type(paths_list) is list, "The input to a RefernceManager class must be a list" 157 | self.ref_list = [] 158 | self.num_lines = -1 159 | self.num_refs = 0 160 | for path in paths_list: 161 | with codecs.open(path, mode='r', encoding='UTF-8') as f: 162 | self.num_refs += 1 163 | sentences = f.readlines() 164 | if self.num_lines == -1: 165 | self.num_lines = len(sentences) 166 | else: 167 | assert self.num_lines == len(sentences), "Reference files must have the same number of lines" 168 | self.ref_list.append(sentences) 169 | 170 | def get_all_refs(self, index): 171 | assert index < self.num_lines, "Index out of bound" 172 | return [self.ref_list[k][index] for k in range(self.num_refs)] 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/nbest-reranker/lib/m2scorer/Tokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: iso-8859-15 -*- 3 | 4 | # This file is part of the NUS M2 scorer. 
5 | # The NUS M2 scorer is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | 10 | # The NUS M2 scorer is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | 18 | # file: Tokenizer.py 19 | # 20 | # A Penn Treebank tokenizer reimplemented based on the MOSES implementation. 21 | # 22 | # usage : %prog < input > output 23 | 24 | 25 | import re 26 | import sys 27 | 28 | 29 | class DummyTokenizer(object): 30 | 31 | def tokenize(self, text): 32 | return text.split() 33 | 34 | 35 | 36 | class PTBTokenizer(object): 37 | 38 | def __init__(self, language="en"): 39 | self.language = language 40 | self.nonbreaking_prefixes = {} 41 | self.nonbreaking_prefixes_numeric = {} 42 | self.nonbreaking_prefixes["en"] = ''' A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 43 | Adj Adm Adv Asst Bart Bldg Brig Bros Capt Cmdr Col Comdr Con Corp Cpl DR Dr Drs Ens 44 | Gen Gov Hon Hr Hosp Insp Lt MM MR MRS MS Maj Messrs Mlle Mme Mr Mrs Ms Msgr Op Ord 45 | Pfc Ph Prof Pvt Rep Reps Res Rev Rt Sen Sens Sfc Sgt Sr St Supt Surg 46 | v vs i.e rev e.g Nos Nr'''.split() 47 | self.nonbreaking_prefixes_numeric["en"] = '''No Art pp'''.split() 48 | self.special_chars = re.compile(r"([^\w\s\.\'\`\,\-\"\|\/])", flags=re.UNICODE) 49 | 50 | def tokenize(self, text, ptb=False): 51 | text = text.strip() 52 | text = " " + text + " " 53 | 54 | # Separate all "other" punctuation 55 | 56 | text = re.sub(self.special_chars, r' \1 ', text) 57 | text = re.sub(r";", r' ; ', text) 58 | text = re.sub(r":", r' : ', text) 59 | 60 | # replace the pipe character 61 | text = re.sub(r"\|", r' -PIPE- ', text) 62 | 63 | # split internal slash, keep others 64 | text = re.sub(r"(\S)/(\S)", r'\1 / \2', text) 65 | 66 | # PTB tokenization 67 | if ptb: 68 | text = re.sub(r"\(", r' -LRB- ', text) 69 | text = re.sub(r"\)", r' -RRB- ', text) 70 | text = re.sub(r"\[", r' -LSB- ', text) 71 | text = re.sub(r"\]", r' -RSB- ', text) 72 | text = re.sub(r"\{", r' -LCB- ', text) 73 | text = re.sub(r"\}", r' -RCB- ', text) 74 | 75 | text = re.sub(r"\"\s*$", r" '' ", text) 76 | text = re.sub(r"^\s*\"", r' `` ', text) 77 | text = re.sub(r"(\S)\"\s", r"\1 '' ", text) 78 | text = re.sub(r"\s\"(\S)", r" `` \1", text) 79 | text = re.sub(r"(\S)\"", r"\1 '' ", text) 80 | text = re.sub(r"\"(\S)", r" `` \1", text) 81 | text = re.sub(r"'\s*$", r" ' ", text) 82 | text = re.sub(r"^\s*'", r" ` ", text) 83 | text = re.sub(r"(\S)'\s", r"\1 ' ", text) 84 | text = re.sub(r"\s'(\S)", r" ` \1", text) 85 | 86 | text = re.sub(r"'ll", r" -CONTRACT-ll", text) 87 | text = re.sub(r"'re", r" -CONTRACT-re", text) 88 | text = re.sub(r"'ve", r" -CONTRACT-ve", text) 89 | text = re.sub(r"n't", r" n-CONTRACT-t", text) 90 | text = re.sub(r"'LL", r" -CONTRACT-LL", text) 91 | text = re.sub(r"'RE", r" -CONTRACT-RE", text) 92 | text = re.sub(r"'VE", r" -CONTRACT-VE", text) 93 | text = re.sub(r"N'T", r" N-CONTRACT-T", text) 94 | text = re.sub(r"cannot", r"can not", text) 95 | text = re.sub(r"Cannot", r"Can not", text) 96 | 97 | # multidots stay together 98 | text = 
re.sub(r"\.([\.]+)", r" DOTMULTI\1", text) 99 | while re.search("DOTMULTI\.", text): 100 | text = re.sub(r"DOTMULTI\.([^\.])", r"DOTDOTMULTI \1", text) 101 | text = re.sub(r"DOTMULTI\.", r"DOTDOTMULTI", text) 102 | 103 | # multidashes stay together 104 | text = re.sub(r"\-([\-]+)", r" DASHMULTI\1", text) 105 | while re.search("DASHMULTI\-", text): 106 | text = re.sub(r"DASHMULTI\-([^\-])", r"DASHDASHMULTI \1", text) 107 | text = re.sub(r"DASHMULTI\-", r"DASHDASHMULTI", text) 108 | 109 | # Separate ',' except if within number. 110 | text = re.sub(r"(\D),(\D)", r'\1 , \2', text) 111 | # Separate ',' pre and post number. 112 | text = re.sub(r"(\d),(\D)", r'\1 , \2', text) 113 | text = re.sub(r"(\D),(\d)", r'\1 , \2', text) 114 | 115 | if self.language == "en": 116 | text = re.sub(r"([^a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 117 | text = re.sub(r"(\W)'([a-zA-Z])", r"\1 ' \2", text) 118 | text = re.sub(r"([a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 119 | text = re.sub(r"([a-zA-Z])'([a-zA-Z])", r"\1 '\2", text) 120 | text = re.sub(r"(\d)'(s)", r"\1 '\2", text) 121 | text = re.sub(r" '\s+s ", r" 's ", text) 122 | text = re.sub(r" '\s+s ", r" 's ", text) 123 | elif self.language == "fr": 124 | text = re.sub(r"([^a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 125 | text = re.sub(r"([^a-zA-Z])'([a-zA-Z])", r"\1 ' \2", text) 126 | text = re.sub(r"([a-zA-Z])'([^a-zA-Z])", r"\1 ' \2", text) 127 | text = re.sub(r"([a-zA-Z])'([a-zA-Z])", r"\1' \2", text) 128 | else: 129 | text = re.sub(r"'", r" ' ") 130 | 131 | # re-combine single quotes 132 | text = re.sub(r"' '", r"''", text) 133 | 134 | words = text.split() 135 | text = '' 136 | for i, word in enumerate(words): 137 | m = re.match("^(\S+)\.$", word) 138 | if m: 139 | pre = m.group(1) 140 | if ((re.search("\.", pre) and re.search("[a-zA-Z]", pre)) or \ 141 | (pre in self.nonbreaking_prefixes[self.language]) or \ 142 | ((i < len(words)-1) and re.match("^\d+", words[i+1]))): 143 | pass # do nothing 144 | elif ((pre in self.nonbreaking_prefixes_numeric[self.language] ) and \ 145 | (i < len(words)-1) and re.match("\d+", words[i+1])): 146 | pass # do nothing 147 | else: 148 | word = pre + " ." 149 | 150 | text += word + " " 151 | text = re.sub(r"'\s+'", r"''", text) 152 | 153 | # restore multidots 154 | while re.search("DOTDOTMULTI", text): 155 | text = re.sub(r"DOTDOTMULTI", r"DOTMULTI.", text) 156 | text = re.sub(r"DOTMULTI", r".", text) 157 | 158 | # restore multidashes 159 | while re.search("DASHDASHMULTI", text): 160 | text = re.sub(r"DASHDASHMULTI", r"DASHMULTI-", text) 161 | text = re.sub(r"DASHMULTI", r"-", text) 162 | text = re.sub(r"-CONTRACT-", r"'", text) 163 | 164 | return text.split() 165 | 166 | 167 | def tokenize_all(self,sentences, ptb=False): 168 | return [self.tokenize(t, ptb) for t in sentences] 169 | 170 | # starting point 171 | if __name__ == "__main__": 172 | tokenizer = PTBTokenizer() 173 | for line in sys.stdin: 174 | line = line.decode("utf8") 175 | tokens = tokenizer.tokenize(line.strip()) 176 | out = ' '.join(tokens) 177 | print out.encode("utf8") 178 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/generate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. 
An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import sys 10 | import torch 11 | from torch.autograd import Variable 12 | 13 | from fairseq import bleu, options, utils, tokenizer 14 | from fairseq.meters import StopwatchMeter, TimeMeter 15 | from fairseq.progress_bar import progress_bar 16 | from fairseq.sequence_generator import SequenceGenerator 17 | 18 | 19 | def main(): 20 | parser = options.get_parser('Generation') 21 | parser.add_argument('--path', metavar='FILE', required=True, action='append', 22 | help='path(s) to model file(s)') 23 | dataset_args = options.add_dataset_args(parser) 24 | dataset_args.add_argument('-i', '--interactive', action='store_true', 25 | help='generate translations in interactive mode') 26 | dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N', 27 | help='batch size') 28 | dataset_args.add_argument('--gen-subset', default='test', metavar='SPLIT', 29 | help='data subset to generate (train, valid, test)') 30 | options.add_generation_args(parser) 31 | 32 | args = parser.parse_args() 33 | print(args) 34 | 35 | if args.no_progress_bar: 36 | progress_bar.enabled = False 37 | use_cuda = torch.cuda.is_available() and not args.cpu 38 | 39 | # Load model and dataset 40 | print('| loading model(s) from {}'.format(', '.join(args.path))) 41 | models, dataset = utils.load_ensemble_for_inference(args.path, args.data) 42 | 43 | print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict))) 44 | print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict))) 45 | if not args.interactive: 46 | print('| {} {} {} examples'.format(args.data, args.gen_subset, len(dataset.splits[args.gen_subset]))) 47 | 48 | # Optimize model for generation 49 | for model in models: 50 | model.make_generation_fast_(args.beam, not args.no_beamable_mm) 51 | 52 | # Initialize generator 53 | translator = SequenceGenerator(models, dataset.dst_dict, beam_size=args.beam, 54 | stop_early=(not args.no_early_stop), 55 | normalize_scores=(not args.unnormalized), 56 | len_penalty=args.lenpen) 57 | align_dict = {} 58 | if args.unk_replace_dict != '': 59 | assert args.interactive, "Unknown word replacement requires access to the original source and is only " \ 60 | "supported in interactive mode" 61 | with open(args.unk_replace_dict, 'r') as f: 62 | for line in f: 63 | l = line.split() 64 | align_dict[l[0]] = l[1] 65 | 66 | def replace_unk(hypo_str, align_str, src, unk): 67 | hypo_tokens = hypo_str.split() 68 | src_tokens = tokenizer.tokenize_line(src) 69 | align_idx = [int(i) for i in align_str.split()] 70 | for i, ht in enumerate(hypo_tokens): 71 | if ht == unk: 72 | src_token = src_tokens[align_idx[i]] 73 | if src_token in align_dict: 74 | hypo_tokens[i] = align_dict[src_token] 75 | else: 76 | hypo_tokens[i] = src_token 77 | return ' '.join(hypo_tokens) 78 | 79 | if use_cuda: 80 | translator.cuda() 81 | 82 | bpe_symbol = '@@ ' if args.remove_bpe else None 83 | def display_hypotheses(id, src, orig, ref, hypos): 84 | id_str = '' if id is None else '-{}'.format(id) 85 | src_str = to_sentence(dataset.src_dict, src, bpe_symbol) 86 | print('S{}\t{}'.format(id_str, src_str)) 87 | if orig is not None: 88 | print('O{}\t{}'.format(id_str, orig.strip())) 89 | if ref is not None: 90 | print('T{}\t{}'.format(id_str, to_sentence(dataset.dst_dict, ref, bpe_symbol, ref_unk=True))) 91 | for hypo in hypos: 92 | hypo_str = to_sentence(dataset.dst_dict, hypo['tokens'], bpe_symbol) 93 | align_str = ' '.join(map(str,
hypo['alignment'])) 94 | if args.unk_replace_dict != '': 95 | hypo_str = replace_unk(hypo_str, align_str, orig, unk_symbol(dataset.dst_dict)) 96 | print('H{}\t{}\t{}'.format( 97 | id_str, hypo['score'], hypo_str)) 98 | print('A{}\t{}'.format(id_str, align_str)) 99 | 100 | if args.interactive: 101 | for line in sys.stdin: 102 | tokens = tokenizer.Tokenizer.tokenize(line, dataset.src_dict, add_if_not_exist=False).long() 103 | start = dataset.src_dict.pad() + 1 104 | positions = torch.arange(start, start + len(tokens)).type_as(tokens) 105 | if use_cuda: 106 | positions = positions.cuda() 107 | tokens = tokens.cuda() 108 | translations = translator.generate(Variable(tokens.view(1, -1)), Variable(positions.view(1, -1))) 109 | hypos = translations[0] 110 | display_hypotheses(None, tokens, line, None, hypos[:min(len(hypos), args.nbest)]) 111 | 112 | else: 113 | def maybe_remove_bpe(tokens): 114 | """Helper for removing BPE symbols from a hypothesis.""" 115 | if not args.remove_bpe: 116 | return tokens 117 | assert (tokens == dataset.dst_dict.pad()).sum() == 0 118 | hypo_minus_bpe = to_sentence(dataset.dst_dict, tokens, bpe_symbol) 119 | return tokenizer.Tokenizer.tokenize(hypo_minus_bpe, dataset.dst_dict, add_if_not_exist=True) 120 | 121 | # Generate and compute BLEU score 122 | scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos(), dataset.dst_dict.unk()) 123 | itr = dataset.dataloader(args.gen_subset, batch_size=args.batch_size, max_positions=args.max_positions) 124 | num_sentences = 0 125 | with progress_bar(itr, smoothing=0, leave=False) as t: 126 | wps_meter = TimeMeter() 127 | gen_timer = StopwatchMeter() 128 | translations = translator.generate_batched_itr( 129 | t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b, 130 | cuda_device=0 if use_cuda else None, timer=gen_timer) 131 | for id, src, ref, hypos in translations: 132 | ref = ref.int().cpu() 133 | top_hypo = hypos[0]['tokens'].int().cpu() 134 | scorer.add(maybe_remove_bpe(ref), maybe_remove_bpe(top_hypo)) 135 | display_hypotheses(id, src, None, ref, hypos[:min(len(hypos), args.nbest)]) 136 | 137 | wps_meter.update(src.size(0)) 138 | t.set_postfix(wps='{:5d}'.format(round(wps_meter.avg))) 139 | num_sentences += 1 140 | 141 | print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.format( 142 | num_sentences, gen_timer.n, gen_timer.sum, 1. 
/ gen_timer.avg)) 143 | print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string())) 144 | 145 | 146 | def to_token(dict, i, runk): 147 | return runk if i == dict.unk() else dict[i] 148 | 149 | 150 | def unk_symbol(dict, ref_unk=False): 151 | return '<{}>'.format(dict.unk_word) if ref_unk else dict.unk_word 152 | 153 | 154 | def to_sentence(dict, tokens, bpe_symbol=None, ref_unk=False): 155 | if torch.is_tensor(tokens) and tokens.dim() == 2: 156 | sentences = [to_sentence(dict, token) for token in tokens] 157 | return '\n'.join(sentences) 158 | eos = dict.eos() 159 | runk = unk_symbol(dict, ref_unk=ref_unk) 160 | sent = ' '.join([to_token(dict, i, runk) for i in tokens if i != eos]) 161 | if bpe_symbol is not None: 162 | sent = sent.replace(bpe_symbol, '') 163 | return sent 164 | 165 | 166 | if __name__ == '__main__': 167 | main() 168 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/subword-nmt/learn_bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary 7 | of a text to a configurable number of symbols, with only a small increase in the number of tokens. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 12 | """ 13 | 14 | from __future__ import unicode_literals 15 | 16 | import sys 17 | import codecs 18 | import re 19 | import copy 20 | import argparse 21 | from collections import defaultdict, Counter 22 | 23 | # hack for python2/3 compatibility 24 | from io import open 25 | argparse.open = open 26 | 27 | # python 2/3 compatibility 28 | if sys.version_info < (3, 0): 29 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 30 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 31 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 32 | 33 | def create_parser(): 34 | parser = argparse.ArgumentParser( 35 | formatter_class=argparse.RawDescriptionHelpFormatter, 36 | description="learn BPE-based word segmentation") 37 | 38 | parser.add_argument( 39 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 40 | metavar='PATH', 41 | help="Input text (default: standard input).") 42 | parser.add_argument( 43 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 44 | metavar='PATH', 45 | help="Output file for BPE codes (default: standard output)") 46 | parser.add_argument( 47 | '--symbols', '-s', type=int, default=10000, 48 | help="Create this many new symbols (each representing a character n-gram) (default: %(default)s))") 49 | parser.add_argument( 50 | '--verbose', '-v', action="store_true", 51 | help="verbose mode.") 52 | 53 | return parser 54 | 55 | def get_vocabulary(fobj): 56 | """Read text and return dictionary that encodes vocabulary 57 | """ 58 | vocab = Counter() 59 | for line in fobj: 60 | for word in line.split(): 61 | vocab[word] += 1 62 | return vocab 63 | 64 | def update_pair_statistics(pair, changed, stats, indices): 65 | """Minimally update the indices and frequency of symbol pairs 66 | 67 | if we merge a pair of symbols, only pairs that 
overlap with occurrences 68 | of this pair are affected, and need to be updated. 69 | """ 70 | stats[pair] = 0 71 | indices[pair] = defaultdict(int) 72 | first, second = pair 73 | new_pair = first+second 74 | for j, word, old_word, freq in changed: 75 | 76 | # find all instances of pair, and update frequency/indices around it 77 | i = 0 78 | while True: 79 | try: 80 | i = old_word.index(first, i) 81 | except ValueError: 82 | break 83 | if i < len(old_word)-1 and old_word[i+1] == second: 84 | if i: 85 | prev = old_word[i-1:i+1] 86 | stats[prev] -= freq 87 | indices[prev][j] -= 1 88 | if i < len(old_word)-2: 89 | # don't double-count consecutive pairs 90 | if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second: 91 | nex = old_word[i+1:i+3] 92 | stats[nex] -= freq 93 | indices[nex][j] -= 1 94 | i += 2 95 | else: 96 | i += 1 97 | 98 | i = 0 99 | while True: 100 | try: 101 | i = word.index(new_pair, i) 102 | except ValueError: 103 | break 104 | if i: 105 | prev = word[i-1:i+1] 106 | stats[prev] += freq 107 | indices[prev][j] += 1 108 | # don't double-count consecutive pairs 109 | if i < len(word)-1 and word[i+1] != new_pair: 110 | nex = word[i:i+2] 111 | stats[nex] += freq 112 | indices[nex][j] += 1 113 | i += 1 114 | 115 | 116 | def get_pair_statistics(vocab): 117 | """Count frequency of all symbol pairs, and create index""" 118 | 119 | # data structure of pair frequencies 120 | stats = defaultdict(int) 121 | 122 | #index from pairs to words 123 | indices = defaultdict(lambda: defaultdict(int)) 124 | 125 | for i, (word, freq) in enumerate(vocab): 126 | prev_char = word[0] 127 | for char in word[1:]: 128 | stats[prev_char, char] += freq 129 | indices[prev_char, char][i] += 1 130 | prev_char = char 131 | 132 | return stats, indices 133 | 134 | 135 | def replace_pair(pair, vocab, indices): 136 | """Replace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'""" 137 | first, second = pair 138 | pair_str = ''.join(pair) 139 | pair_str = pair_str.replace('\\','\\\\') 140 | changes = [] 141 | pattern = re.compile(r'(?<!\S)' + re.escape(first + ' ' + second) + r'(?!\S)') 142 | if sys.version_info < (3, 0): 143 | iterator = indices[pair].iteritems() 144 | else: 145 | iterator = indices[pair].items() 146 | for j, freq in iterator: 147 | if freq < 1: 148 | continue 149 | word, freq = vocab[j] 150 | new_word = ' '.join(word) 151 | new_word = pattern.sub(pair_str, new_word) 152 | new_word = tuple(new_word.split()) 153 | 154 | vocab[j] = (new_word, freq) 155 | changes.append((j, new_word, word, freq)) 156 | 157 | return changes 158 | 159 | def prune_stats(stats, big_stats, threshold): 160 | """Prune statistics dict for efficiency of max() 161 | 162 | The frequency of a symbol pair never increases, so pruning is generally safe 163 | (until we the most frequent pair is less frequent than a pair we previously pruned) 164 | big_stats keeps full statistics for when we need to access pruned items 165 | """ 166 | for item,freq in list(stats.items()): 167 | if freq < threshold: 168 | del stats[item] 169 | if freq < 0: 170 | big_stats[item] += freq 171 | else: 172 | big_stats[item] = freq 173 | 174 | if __name__ == '__main__': 175 | 176 | parser = create_parser() 177 | args = parser.parse_args() 178 | 179 | vocab = get_vocabulary(args.input) 180 | vocab = dict([(tuple(x)+('</w>',) ,y) for (x,y) in vocab.items()]) 181 | sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True) 182 | 183 | stats, indices = get_pair_statistics(sorted_vocab) 184 | big_stats = 
copy.deepcopy(stats) 185 | # threshold is inspired by Zipfian assumption, but should only affect speed 186 | threshold = max(stats.values()) / 10 187 | for i in range(args.symbols): 188 | if stats: 189 | most_frequent = max(stats, key=stats.get) 190 | 191 | # we probably missed the best pair because of pruning; go back to full statistics 192 | if not stats or (i and stats[most_frequent] < threshold): 193 | prune_stats(stats, big_stats, threshold) 194 | stats = copy.deepcopy(big_stats) 195 | most_frequent = max(stats, key=stats.get) 196 | # threshold is inspired by Zipfian assumption, but should only affect speed 197 | threshold = stats[most_frequent] * i/(i+10000.0) 198 | prune_stats(stats, big_stats, threshold) 199 | 200 | if stats[most_frequent] < 2: 201 | sys.stderr.write('no pair has frequency > 1. Stopping\n') 202 | break 203 | 204 | if args.verbose: 205 | sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent])) 206 | args.output.write('{0} {1}\n'.format(*most_frequent)) 207 | changes = replace_pair(most_frequent, sorted_vocab, indices) 208 | update_pair_statistics(most_frequent, changes, stats, indices) 209 | stats[most_frequent] = 0 210 | if not i % 100: 211 | prune_stats(stats, big_stats, threshold) 212 | -------------------------------------------------------------------------------- /CS2S+BPE+Emb/software/fairseq-py/fairseq/options.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | # 8 | 9 | import argparse 10 | 11 | from fairseq import models 12 | 13 | 14 | def get_parser(desc): 15 | parser = argparse.ArgumentParser( 16 | description='Facebook AI Research Sequence-to-Sequence Toolkit -- ' + desc) 17 | parser.add_argument('--no-progress-bar', action='store_true', help='disable progress bar') 18 | parser.add_argument('--log-interval', type=int, default=1000, metavar='N', 19 | help='log progress every N updates (when progress bar is disabled)') 20 | parser.add_argument('--seed', default=1, type=int, metavar='N', 21 | help='pseudo random number generator seed') 22 | return parser 23 | 24 | 25 | def add_dataset_args(parser): 26 | group = parser.add_argument_group('Dataset and data loading') 27 | group.add_argument('data', metavar='DIR', 28 | help='path to data directory') 29 | group.add_argument('-s', '--source-lang', default=None, metavar='SRC', 30 | help='source language') 31 | group.add_argument('-t', '--target-lang', default=None, metavar='TARGET', 32 | help='target language') 33 | group.add_argument('-j', '--workers', default=1, type=int, metavar='N', 34 | help='number of data loading workers (default: 1)') 35 | group.add_argument('--max-positions', default=1024, type=int, metavar='N', 36 | help='max number of tokens in the sequence') 37 | return group 38 | 39 | 40 | def add_optimization_args(parser): 41 | group = parser.add_argument_group('Optimization') 42 | group.add_argument('--lr', '--learning-rate', default=0.25, type=float, metavar='LR', 43 | help='initial learning rate') 44 | group.add_argument('--min-lr', metavar='LR', default=1e-5, type=float, 45 | help='minimum learning rate') 46 | group.add_argument('--force-anneal', '--fa', default=0, type=int, metavar='N', 47 | help='force annealing at specified epoch') 48 | group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N', 49 | help='force stop training at specified epoch') 50 | group.add_argument('--lrshrink', default=0.1, type=float, metavar='LS', 51 | help='learning rate shrink factor for annealing, lr_new = (lr * lrshrink)') 52 | group.add_argument('--momentum', default=0.99, type=float, metavar='M', 53 | help='momentum factor') 54 | group.add_argument('--clip-norm', default=25, type=float, metavar='NORM', 55 | help='clip threshold of gradients') 56 | group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', 57 | help='weight decay') 58 | group.add_argument('--sample-without-replacement', default=0, type=int, metavar='N', 59 | help='If bigger than 0, use that number of mini-batches for each epoch,' 60 | ' where each sample is drawn randomly with replacement from the' 61 | ' dataset') 62 | return group 63 | 64 | 65 | def add_checkpoint_args(parser): 66 | group = parser.add_argument_group('Checkpointing') 67 | group.add_argument('--save-dir', metavar='DIR', default='checkpoints', 68 | help='path to save checkpoints') 69 | group.add_argument('--restore-file', default='checkpoint_last.pt', 70 | help='filename in save-dir from which to load checkpoint') 71 | group.add_argument('--save-interval', type=int, default=-1, 72 | help='checkpoint every this many batches') 73 | group.add_argument('--no-save', action='store_true', 74 | help='don\'t save models and checkpoints') 75 | group.add_argument('--no-epoch-checkpoints', action='store_true', 76 | help='only store last and best checkpoints') 77 | return group 78 | 79 | 80 | def add_generation_args(parser): 81 | group = parser.add_argument_group('Generation') 82 | group.add_argument('--beam', default=5, type=int, 
metavar='N', 83 | help='beam size') 84 | group.add_argument('--nbest', default=1, type=int, metavar='N', 85 | help='number of hypotheses to output') 86 | group.add_argument('--max-len-a', default=0, type=int, metavar='N', 87 | help=('generate sequence of maximum length ax + b, ' 88 | 'where x is the source length')) 89 | group.add_argument('--max-len-b', default=200, type=int, metavar='N', 90 | help=('generate sequence of maximum length ax + b, ' 91 | 'where x is the source length')) 92 | group.add_argument('--remove-bpe', action='store_true', 93 | help='remove BPE tokens before scoring') 94 | group.add_argument('--no-early-stop', action='store_true', 95 | help=('continue searching even after finalizing k=beam ' 96 | 'hypotheses; this is more correct, but increases ' 97 | 'generation time by 50%%')) 98 | group.add_argument('--unnormalized', action='store_true', 99 | help='compare unnormalized hypothesis scores') 100 | group.add_argument('--cpu', action='store_true', help='generate on CPU') 101 | group.add_argument('--no-beamable-mm', action='store_true', 102 | help='don\'t use BeamableMM in attention layers') 103 | group.add_argument('--lenpen', default=1, type=float, 104 | help='length penalty: <1.0 favors shorter, >1.0 favors longer sentences') 105 | group.add_argument('--unk-replace-dict', default='', type=str, 106 | help='performs unk word replacement') 107 | 108 | return group 109 | 110 | 111 | def add_model_args(parser): 112 | group = parser.add_argument_group( 113 | 'Model configuration', 114 | # Only include attributes which are explicitly given as command-line 115 | # arguments or which have model-independent default values. 116 | argument_default=argparse.SUPPRESS, 117 | ) 118 | 119 | # The model architecture can be specified in several ways. 120 | # In increasing order of priority: 121 | # 1) model defaults (lowest priority) 122 | # 2) --arch argument 123 | # 3) --encoder/decoder-* arguments (highest priority) 124 | # Note: --arch cannot be combined with --encoder/decoder-* arguments. 
125 | group.add_argument('--arch', '-a', default='fconv', metavar='ARCH', choices=models.arch_model_map.keys(), 126 | help='model architecture ({})'.format(', '.join(models.arch_model_map.keys()))) 127 | group.add_argument('--encoder-embed-dim', type=int, metavar='N', 128 | help='encoder embedding dimension') 129 | group.add_argument('--encoder-layers', type=str, metavar='EXPR', 130 | help='encoder layers [(dim, kernel_size), ...]') 131 | group.add_argument('--decoder-embed-dim', type=int, metavar='N', 132 | help='decoder embedding dimension') 133 | group.add_argument('--decoder-layers', type=str, metavar='EXPR', 134 | help='decoder layers [(dim, kernel_size), ...]') 135 | group.add_argument('--decoder-out-embed-dim', type=int, metavar='N', 136 | help='decoder output embedding dimension') 137 | group.add_argument('--decoder-attention', type=str, metavar='EXPR', 138 | help='decoder attention [True, ...]') 139 | group.add_argument('--encoder-embed-path', default=None, type=str, metavar='STR', 140 | help='path to pre-trained encoder embeddings') 141 | group.add_argument('--decoder-embed-path', default=None, type=str, metavar='STR', 142 | help='path to pre-trained decoder embeddings') 143 | # These arguments have default values independent of the model: 144 | group.add_argument('--dropout', default=0.1, type=float, metavar='D', 145 | help='dropout probability') 146 | group.add_argument('--label-smoothing', default=0, type=float, metavar='D', 147 | help='epsilon for label smoothing, 0 means no label smoothing') 148 | return group 149 | --------------------------------------------------------------------------------
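The comment block in `add_model_args` above spells out the precedence between model-specific defaults, the `--arch` preset, and explicit `--encoder-*`/`--decoder-*` flags. Below is a minimal sketch of how these option groups compose into a single parser, mirroring the pattern used in `generate.py`; it assumes the fairseq-py package in this repository (and its torch dependency) is importable, and the `'Training'` description plus the example argv values are illustrative assumptions, not commands taken from this repository.

```python
# Sketch only: composing the argument groups defined in fairseq/options.py,
# in the same style as generate.py. The argv values below are hypothetical.
from fairseq import options

parser = options.get_parser('Training')  # shared flags: --no-progress-bar, --log-interval, --seed
options.add_dataset_args(parser)         # positional data dir, -s/-t, -j, --max-positions
options.add_optimization_args(parser)    # --lr, --momentum, --clip-norm, --weight-decay, ...
options.add_checkpoint_args(parser)      # --save-dir, --restore-file, --save-interval, ...
options.add_model_args(parser)           # --arch preset plus per-component overrides

args = parser.parse_args([
    'data-bin',                    # preprocessed data directory (positional 'data' argument)
    '-s', 'src', '-t', 'trg',      # language suffixes
    '--arch', 'fconv',             # architecture preset (middle priority)
    '--encoder-embed-dim', '500',  # explicit flag: highest priority
])

# add_model_args() sets argument_default=argparse.SUPPRESS, so any --encoder-*/
# --decoder-* flag that was not passed is simply absent from `args`; the model
# defaults and the --arch preset fill those in later, which is how the priority
# order described in the comment above is realized.
print(hasattr(args, 'decoder_embed_dim'))  # False: left to model/--arch defaults
print(args.encoder_embed_dim)              # 500: explicit command-line value
```

The same composition appears in `generate.py` earlier in this listing, which pairs `get_parser('Generation')` with `add_dataset_args` and `add_generation_args` instead.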