├── language-modeling ├── .gitignore ├── locked_dropout.py ├── getdata.sh ├── embed_regularize.py ├── utils.py ├── LICENSE ├── data.py ├── generate.py ├── weight_drop.py ├── test-model.py ├── model.py ├── pointer.py ├── g2_lstm.py ├── finetune.py └── main.py ├── machine-translation ├── libs │ ├── multiverso_ │ │ ├── theano_ext │ │ │ ├── __init__.py │ │ │ ├── lasagne_ext │ │ │ │ ├── __init__.py │ │ │ │ └── param_manager.py │ │ │ └── sharedvar.py │ │ ├── Multiverso.dll │ │ ├── libmultiverso.so │ │ ├── __init__.py │ │ ├── api.py │ │ ├── utils.py │ │ ├── tests │ │ │ └── test_multiverso.py │ │ └── tables.py │ ├── __init__.py │ ├── utility │ │ ├── __init__.py │ │ ├── basic.py │ │ ├── data_iterator.py │ │ ├── optimizers.py │ │ └── translate.py │ ├── layers │ │ ├── __init__.py │ │ ├── layers_.py │ │ └── basic.py │ ├── gpu_manager.py │ ├── models │ │ └── __init__.py │ ├── config.py │ └── constants.py ├── scripts │ ├── convert_bpe_dic.py │ ├── map_vocabs.py │ ├── get_small_train.py │ ├── build_dictionary.py │ ├── moses │ │ ├── detruecase.perl │ │ ├── truecase.perl │ │ ├── train-truecaser.perl │ │ └── multi-bleu.perl │ └── plot_cost.py ├── math │ └── math.py ├── .gitignore ├── translate_compressed.py ├── seq_translate.py ├── translate_single.py ├── translate.py ├── replace_unk.py └── train_nmt.py └── README.md /language-modeling/.gitignore: -------------------------------------------------------------------------------- 1 | *.pt 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /machine-translation/libs/multiverso_/theano_ext/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /machine-translation/libs/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | __author__ = 'fyabc' 5 | -------------------------------------------------------------------------------- /machine-translation/libs/multiverso_/theano_ext/lasagne_ext/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /machine-translation/libs/multiverso_/Multiverso.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cafe/g2-lstm/master/machine-translation/libs/multiverso_/Multiverso.dll -------------------------------------------------------------------------------- /machine-translation/libs/multiverso_/libmultiverso.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cafe/g2-lstm/master/machine-translation/libs/multiverso_/libmultiverso.so -------------------------------------------------------------------------------- /machine-translation/libs/utility/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from .basic import * 5 | 6 | __author__ = 'fyabc' 7 | -------------------------------------------------------------------------------- /machine-translation/libs/layers/__init__.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from .basic import tanh, linear, dropout_layer, attention_layer, param_init_feed_forward, feed_forward 5 | from .gru import * 6 | from .lstm import * 7 | from .layers_ import * 8 | 9 | __author__ = 'fyabc' 10 | -------------------------------------------------------------------------------- /machine-translation/libs/utility/basic.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | 6 | from ..constants import fX 7 | 8 | __author__ = 'fyabc' 9 | 10 | 11 | def floatX(value): 12 | return np.asarray(value, dtype=fX) 13 | 14 | 15 | __all__ = [ 16 | 'floatX', 17 | ] 18 | -------------------------------------------------------------------------------- /machine-translation/libs/multiverso_/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | """The multiverso library. 5 | 6 | Copied from v-yixia. 7 | """ 8 | 9 | from api import init, shutdown, barrier, workers_num, worker_id, server_id, is_master_worker 10 | from tables import ArrayTableHandler, MatrixTableHandler 11 | -------------------------------------------------------------------------------- /language-modeling/locked_dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | class LockedDropout(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def forward(self, x, dropout=0.5): 10 | if not self.training or not dropout: 11 | return x 12 | m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout) 13 | mask = Variable(m, requires_grad=False) / (1 - dropout) 14 | mask = mask.expand_as(x) 15 | return mask * x 16 | -------------------------------------------------------------------------------- /language-modeling/getdata.sh: -------------------------------------------------------------------------------- 1 | echo "=== Acquiring datasets ===" 2 | echo "---" 3 | mkdir -p data 4 | cd data 5 | 6 | echo "- Downloading Penn Treebank (PTB)" 7 | mkdir -p penn 8 | cd penn 9 | wget --quiet --continue https://github.com/pytorch/examples/raw/master/word_language_model/data/penn/train.txt 10 | wget --quiet --continue https://github.com/pytorch/examples/raw/master/word_language_model/data/penn/valid.txt 11 | wget --quiet --continue https://github.com/pytorch/examples/raw/master/word_language_model/data/penn/test.txt 12 | cd .. 13 | 14 | echo "- Downloading WikiText-2 (WT2)" 15 | wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 16 | unzip -q wikitext-2-v1.zip 17 | cd wikitext-2 18 | mv wiki.train.tokens train.txt 19 | mv wiki.valid.tokens valid.txt 20 | mv wiki.test.tokens test.txt 21 | 22 | echo "---" 23 | echo "Happy language modeling :)" 24 | -------------------------------------------------------------------------------- /machine-translation/scripts/convert_bpe_dic.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python
 2 | # -*- encoding: utf-8 -*-
 3 |
 4 | from __future__ import print_function
 5 |
 6 | import sys
 7 | from collections import OrderedDict
 8 | import cPickle as pkl
 9 |
10 | __author__ = 'fyabc'
11 |
12 |
13 | def main():
14 |     input_filename = sys.argv[1]
15 |
16 |     with open(input_filename, 'r') as f_in:
17 |         d = OrderedDict()
18 |
19 |         d['eos'] = 0
20 |         d['UNK'] = 1
21 |
22 |         i = 2
23 |
24 |         for line in f_in:
25 |             word = line.strip()
26 |             if word:
27 |                 d[word] = i
28 |                 i += 1
29 |
30 |     with open('{}.pkl'.format(input_filename), 'wb') as f_out:
31 |         pkl.dump(d, f_out)
32 |
33 |     print('Convert {} -> {}.pkl'.format(input_filename, input_filename))
34 |
35 | if __name__ == '__main__':
36 |     main()
37 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # g2-lstm
 2 | Code for "Towards Binary-Valued Gates for Robust LSTM Training".
 3 |
 4 | Language modeling code is based on [awd-lstm-lm](https://github.com/salesforce/awd-lstm-lm) and uses PyTorch.
 5 |
 6 | Machine translation code is based on Theano.
 7 |
 8 | Implementation of the Gumbel-Gate LSTM (G2-LSTM): [PyTorch version](language-modeling/g2_lstm.py), [Theano version](machine-translation/libs/layers/stochastic_lstm.py).
 9 |
10 | We also apply *dropout* to the Gumbel noise added to the gates: given a fixed probability *p*, each gate is independently perturbed by the Gumbel noise with probability *p* and left unperturbed otherwise. We find that the trained G2-LSTM performs better than the baseline LSTM regardless of the value of *p*: a small *p* gives better generalization, while a large *p* gives a smaller performance drop under compression. We fix *p = 0.2* in all experiments in the paper.
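As a rough illustration of this noise-dropout scheme (a minimal sketch, **not** the reference implementations in `g2_lstm.py` / `stochastic_lstm.py`; the helper names, the temperature value, and the evaluation-time behaviour below are assumptions made for the example), a Gumbel-perturbed gate can be written in PyTorch roughly as:

```python
import torch


def logistic_noise_like(x, eps=1e-10):
    """Difference of two independent Gumbel(0, 1) samples, shaped like `x`.

    A Gumbel(0, 1) sample is -log(-log(U)) with U ~ Uniform(0, 1).
    """
    u1, u2 = torch.rand_like(x), torch.rand_like(x)
    g1 = -torch.log(-torch.log(u1 + eps) + eps)
    g2 = -torch.log(-torch.log(u2 + eps) + eps)
    return g1 - g2


def gumbel_sigmoid(logits, temperature=0.5, noise_p=0.2, training=True):
    """Gate activation sigmoid((logits + noise) / temperature).

    During training, each gate entry is perturbed by Gumbel noise with
    probability `noise_p` and left unperturbed otherwise (the dropout on the
    noise described above); at evaluation time no noise is added.
    """
    if training:
        perturb = torch.bernoulli(torch.full_like(logits, noise_p))  # 1 -> add noise
        logits = logits + perturb * logistic_noise_like(logits)
    return torch.sigmoid(logits / temperature)


# Hypothetical usage inside a hand-written LSTM step, e.g. for the input gate:
#   i_t = gumbel_sigmoid(x_t @ W_i + h_prev @ U_i + b_i,
#                        noise_p=0.2, training=self.training)
```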
11 |
--------------------------------------------------------------------------------
/machine-translation/math/math.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 |
 3 |
 4 | def low_rank_approx(SVD=None, A=None, r=1):
 5 |     """
 6 |     Computes an r-rank approximation of a matrix
 7 |     given the components u, s, and v of its SVD
 8 |     Requires: numpy
 9 |     """
10 |     if not SVD:
11 |         SVD = np.linalg.svd(A, full_matrices=False)
12 |     u, s, v = SVD
13 |     Ar = np.zeros((len(u), len(v)))
14 |     for i in xrange(r):
15 |         Ar += s[i] * np.outer(u.T[i], v[i])
16 |     return Ar
17 |
18 | if __name__ == "__main__":
19 |     """
20 |     Quick check: print the first row of an r-rank approximation
21 |     of a random matrix for increasing values of r
22 |     Requires: numpy
23 |     """
24 |     x = np.random.rand(10,10)
25 |     u, s, v = np.linalg.svd(x, full_matrices=False)
26 |     i = 1
27 |     print x[0]
28 |     while i < 10:
29 |         y = low_rank_approx((u, s, v), r=i)
30 |         print y[0]
31 |         i += 1
--------------------------------------------------------------------------------
/machine-translation/libs/gpu_manager.py:
--------------------------------------------------------------------------------
 1 | import platform
 2 | import subprocess
 3 | import os
 4 | import re
 5 | import numpy as np
 6 |
 7 | def get_gpu_usage(ranks):
 8 |     exec_nvidia_smi = 'nvidia-smi' if platform.system() == 'Linux' else '\"C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe\"'
 9 |     pl_output = subprocess.Popen(exec_nvidia_smi, shell=True,
10 |                                  stdout=subprocess.PIPE, stderr=open(os.devnull, 'w')).stdout.read()
11 |
12 |     pattern = re.compile(r'(?P<num>[0-9]{1,5})MiB[\s]+/')
13 |     gpu_mems_usages = []
14 |     for line in pl_output.split('\n'):
15 |         result = pattern.search(line)
16 |         if result:
17 |             gpu_mems_usages.append(int(result.group("num")))
18 |     sorted_gpu_ids = np.argsort(np.array(gpu_mems_usages, dtype=np.float32))
19 |     top = min(ranks, len(gpu_mems_usages))
20 |     return (np.array(range(len(gpu_mems_usages)), dtype=np.int)[sorted_gpu_ids[:top]]).tolist()
--------------------------------------------------------------------------------
/machine-translation/libs/models/__init__.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 |
 4 | from .model import *
 5 |
 6 | __author__ = 'fyabc'
 7 |
 8 |
 9 | def build_and_init_model(model_name, options=None, build=True, model_type='NMTModel'):
10 |     import cPickle as pkl
11 |
12 |     from ..config import DefaultOptions
13 |     from ..utility.utils import load_params
14 |
15 |     if options is None:
16 |         with open('{}.pkl'.format(model_name), 'rb') as f:
17 |             options = DefaultOptions.copy()
18 |             options.update(pkl.load(f))
19 |
20 |     model = eval(model_type)(options)
21 |
22 |     # allocate model parameters
23 |     params = model.initializer.init_params()
24 |     # load model parameters and set theano shared variables
25 |     params = load_params(model_name, params)
26 |     model.init_tparams(params)
27 |
28 |     if build:
29 |         ret = model.build_model()
30 |         return model, options, ret
31 |     return model, options
32 |
--------------------------------------------------------------------------------
/machine-translation/libs/layers/layers_.py:
--------------------------------------------------------------------------------
 1 | #! 
/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from .basic import * 5 | from .gru import * 6 | from .stochastic_lstm import * 7 | 8 | __author__ = 'fyabc' 9 | 10 | # layers: 'name': ('parameter initializer', 'builder') 11 | layers = { 12 | 'ff': (param_init_feed_forward, feed_forward), 13 | 'gru': (param_init_gru, gru_layer), 14 | 'gru_cond': (param_init_gru_cond, gru_cond_layer), 15 | 'multi_gru': (param_init_gru, gru_layer), 16 | 'multi_gru_cond': (param_init_gru_cond, gru_cond_layer), 17 | 'lstm': (param_init_lstm, lstm_layer), 18 | 'lstm_cond': (param_init_lstm_cond, lstm_cond_layer), 19 | # todo: implement it 20 | 'multi_lstm': (param_init_lstm, lstm_layer), 21 | 'multi_lstm_cond': (param_init_lstm_cond, lstm_cond_layer), 22 | } 23 | 24 | 25 | def get_layer(name): 26 | fns = layers[name] 27 | return fns[0], fns[1] 28 | 29 | 30 | def get_init(name): 31 | return layers[name][0] 32 | 33 | 34 | def get_build(name): 35 | return layers[name][1] 36 | 37 | 38 | __all__ = [ 39 | 'layers', 40 | 'get_layer', 41 | 'get_build', 42 | 'get_init', 43 | ] 44 | -------------------------------------------------------------------------------- /language-modeling/embed_regularize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | def embedded_dropout(embed, words, dropout=0.1, scale=None): 7 | if dropout: 8 | mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout) 9 | mask = Variable(mask) 10 | masked_embed_weight = mask * embed.weight 11 | else: 12 | masked_embed_weight = embed.weight 13 | if scale: 14 | masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight 15 | 16 | padding_idx = embed.padding_idx 17 | if padding_idx is None: 18 | padding_idx = -1 19 | X = embed._backend.Embedding.apply(words, masked_embed_weight, 20 | padding_idx, embed.max_norm, embed.norm_type, 21 | embed.scale_grad_by_freq, embed.sparse 22 | ) 23 | return X 24 | 25 | if __name__ == '__main__': 26 | V = 50 27 | h = 4 28 | bptt = 10 29 | batch_size = 2 30 | 31 | embed = torch.nn.Embedding(V, h) 32 | 33 | words = np.random.random_integers(low=0, high=V-1, size=(batch_size, bptt)) 34 | words = torch.LongTensor(words) 35 | words = Variable(words) 36 | 37 | origX = embed(words) 38 | X = embedded_dropout(embed, words) 39 | 40 | print(origX) 41 | print(X) 42 | -------------------------------------------------------------------------------- /language-modeling/utils.py: -------------------------------------------------------------------------------- 1 | from torch.autograd import Variable 2 | 3 | def repackage_hidden(h): 4 | """Wraps hidden states in new Variables, to detach them from their history.""" 5 | if type(h) == Variable: 6 | return Variable(h.data) 7 | else: 8 | return tuple(repackage_hidden(v) for v in h) 9 | 10 | def batchify(data, bsz, args): 11 | # Work out how cleanly we can divide the dataset into bsz parts. 12 | nbatch = data.size(0) // bsz 13 | # Trim off any extra elements that wouldn't cleanly fit (remainders). 14 | data = data.narrow(0, 0, nbatch * bsz) 15 | # Evenly divide the data across the bsz batches. 
16 | data = data.view(bsz, -1).t().contiguous() 17 | if args.cuda: 18 | data = data.cuda() 19 | return data 20 | 21 | def get_batch(source, i, args, seq_len=None, evaluation=False): 22 | seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i) 23 | data = Variable(source[i:i+seq_len], volatile=evaluation) 24 | target = Variable(source[i+1:i+1+seq_len].view(-1)) 25 | return data, target 26 | 27 | import sys 28 | 29 | _log_file = None 30 | 31 | def set_log_file(name): 32 | global _log_file 33 | if name != '': 34 | _log_file = open(name, 'w') 35 | 36 | def message(msg): 37 | print(msg) 38 | sys.stdout.flush() 39 | if _log_file is not None: 40 | _log_file.write(msg + '\n') 41 | _log_file.flush() 42 | 43 | -------------------------------------------------------------------------------- /machine-translation/scripts/map_vocabs.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | #used to map new dataset vocab id to old dataset vocab id 4 | 5 | import sys 6 | import cPickle as pkl 7 | 8 | def main(): 9 | 10 | new_src_dic_file = sys.argv[1] 11 | new_tgt_dic_file = sys.argv[2] 12 | old_src_dic_file = sys.argv[3] 13 | old_tgt_dic_file = sys.argv[4] 14 | 15 | new_to_old_src_map = {} 16 | new_to_old_tgt_map = {} 17 | 18 | o_src_dic = pkl.load(open(old_src_dic_file, 'rb')) 19 | o_tgt_dic = pkl.load(open(old_tgt_dic_file, 'rb')) 20 | 21 | new_src_dic = pkl.load(open(new_src_dic_file, 'rb')) 22 | new_tgt_dic = pkl.load(open(new_tgt_dic_file, 'rb')) 23 | 24 | for (word, id) in new_src_dic.iteritems(): 25 | if word in o_src_dic: 26 | new_to_old_src_map[id] = o_src_dic[word] 27 | 28 | print 'Find %d vocabs in total %d src vocabs' % (len(new_to_old_src_map), len(new_src_dic)) 29 | 30 | for (word, id) in new_tgt_dic.iteritems(): 31 | if word in o_tgt_dic: 32 | new_to_old_tgt_map[id] = o_tgt_dic[word] 33 | 34 | print 'Find %d vocabs in total %d target vocabs' % (len(new_to_old_tgt_map), len(new_tgt_dic)) 35 | 36 | pkl.dump(new_to_old_src_map, open('../resources/enfr_large2small_src_vocab_map.pkl', 'wb')) 37 | pkl.dump(new_to_old_tgt_map, open('../resources/enfr_large2small_tgt_vocab_map.pkl', 'wb')) 38 | 39 | 40 | if __name__ == '__main__': 41 | main() -------------------------------------------------------------------------------- /machine-translation/scripts/get_small_train.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | from __future__ import print_function 5 | 6 | import sys 7 | import os 8 | import random 9 | 10 | __author__ = 'fyabc' 11 | 12 | 13 | def main(): 14 | input_filename1 = sys.argv[1] 15 | input_filename2 = sys.argv[2] 16 | 17 | if len(sys.argv) >= 4: 18 | small_size = int(sys.argv[3]) 19 | else: 20 | small_size = 10000 21 | 22 | with open(input_filename1, 'r') as f_in: 23 | lines = list(f_in) 24 | 25 | selected_indices = random.sample(range(len(lines)), small_size) 26 | 27 | head, tail = os.path.split(input_filename1) 28 | output_filename1 = '{}{}small_{}'.format(head, '/' if head else '', tail) 29 | with open(output_filename1, 'w') as f_out: 30 | for index in selected_indices: 31 | print(lines[index], end='', file=f_out) 32 | 33 | print('Extract {} -> {}'.format(input_filename1, output_filename1)) 34 | 35 | with open(input_filename2, 'r') as f_in: 36 | lines = list(f_in) 37 | 38 | head, tail = os.path.split(input_filename2) 39 | output_filename2 = '{}{}small_{}'.format(head, '/' if head else '', tail) 40 | with open(output_filename2, 'w') as f_out: 41 | for index in selected_indices: 42 | print(lines[index], end='', file=f_out) 43 | 44 | print('Extract {} -> {}'.format(input_filename2, output_filename2)) 45 | 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /language-modeling/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 |
--------------------------------------------------------------------------------
/language-modeling/data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 |
 4 | from collections import Counter
 5 |
 6 |
 7 | class Dictionary(object):
 8 |     def __init__(self):
 9 |         self.word2idx = {}
10 |         self.idx2word = []
11 |         self.counter = Counter()
12 |         self.total = 0
13 |
14 |     def add_word(self, word):
15 |         if word not in self.word2idx:
16 |             self.idx2word.append(word)
17 |             self.word2idx[word] = len(self.idx2word) - 1
18 |         token_id = self.word2idx[word]
19 |         self.counter[token_id] += 1
20 |         self.total += 1
21 |         return self.word2idx[word]
22 |
23 |     def __len__(self):
24 |         return len(self.idx2word)
25 |
26 |
27 | class Corpus(object):
28 |     def __init__(self, path):
29 |         self.dictionary = Dictionary()
30 |         self.train = self.tokenize(os.path.join(path, 'train.txt'))
31 |         self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
32 |         self.test = self.tokenize(os.path.join(path, 'test.txt'))
33 |
34 |     def tokenize(self, path):
35 |         """Tokenizes a text file."""
36 |         assert os.path.exists(path)
37 |         # Add words to the dictionary
38 |         with open(path, 'r') as f:
39 |             tokens = 0
40 |             for line in f:
41 |                 words = line.split() + ['<eos>']
42 |                 tokens += len(words)
43 |                 for word in words:
44 |                     self.dictionary.add_word(word)
45 |
46 |         # Tokenize file content
47 |         with open(path, 'r') as f:
48 |             ids = torch.LongTensor(tokens)
49 |             token = 0
50 |             for line in f:
51 |                 words = line.split() + ['<eos>']
52 |                 for word in words:
53 |                     ids[token] = self.dictionary.word2idx[word]
54 |                     token += 1
55 |
56 |         return ids
57 |
--------------------------------------------------------------------------------
/machine-translation/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 |
 6 | # C extensions
 7 | # *.so
 8 |
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Pycharm project settings 92 | .idea/ 93 | 94 | # data 95 | data/ 96 | 97 | # model 98 | model/ 99 | 100 | # log 101 | log/ 102 | 103 | # translated files 104 | translated/ 105 | 106 | # numpy saved models 107 | *.npz 108 | 109 | # theano config files 110 | .theanorc.* 111 | 112 | # Command line argument files 113 | /arguments/ 114 | -------------------------------------------------------------------------------- /machine-translation/libs/multiverso_/api.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | import ctypes 5 | 6 | import numpy as np 7 | 8 | from utils import Loader 9 | 10 | 11 | mv_lib = Loader.get_lib() 12 | 13 | 14 | def init(sync=False): 15 | """Initialize multiverso. 16 | 17 | This should be called only once before training at the beginning of the 18 | whole project. 19 | If sync is True, a sync server will be created. Otherwise an async server 20 | will be created. 21 | """ 22 | 23 | args = [""] # the first argument will be ignored. So we put a placeholder here 24 | if sync: 25 | args.append("-sync=true") 26 | n = len(args) 27 | args_type = ctypes.c_char_p * n 28 | mv_lib.MV_Init(ctypes.pointer(ctypes.c_int(n)), args_type(*[ctypes.c_char_p(arg) for arg in args])) 29 | 30 | 31 | def shutdown(): 32 | """Shutdown multiverso. 33 | 34 | This should be called only once after finishing training at the end of the 35 | whole project. 36 | """ 37 | mv_lib.MV_ShutDown() 38 | 39 | 40 | def barrier(): 41 | """Set a barrier for all workers to wait. 42 | 43 | Workers will wait until all workers reach a specific barrier. 44 | """ 45 | mv_lib.MV_Barrier() 46 | 47 | 48 | def workers_num(): 49 | """Return the total number of workers.""" 50 | return mv_lib.MV_NumWorkers() 51 | 52 | 53 | def worker_id(): 54 | """Return the id (zero-based index) for current worker.""" 55 | return mv_lib.MV_WorkerId() 56 | 57 | 58 | def server_id(): 59 | return mv_lib.MV_ServerId() 60 | 61 | 62 | def is_master_worker(): 63 | """If the worker is master worker. 64 | 65 | Some things only need one worker process, such as validation, outputting the 66 | result, initializing the parameters and so on. So we mark the worker 0 as 67 | the master worker to finish these things. 
68 | """ 69 | return worker_id() == 0 70 | 71 | 72 | __all__ = [ 73 | 'init', 74 | 'shutdown', 75 | 'barrier', 76 | 'workers_num', 77 | 'worker_id', 78 | 'server_id', 79 | 'is_master_worker', 80 | ] 81 | -------------------------------------------------------------------------------- /machine-translation/scripts/build_dictionary.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | 4 | from __future__ import print_function 5 | 6 | import numpy 7 | 8 | try: 9 | import cPickle as pkl 10 | except: 11 | import pickle as pkl 12 | 13 | import sys 14 | import os 15 | import fileinput 16 | 17 | from collections import OrderedDict 18 | import argparse 19 | 20 | __author__ = 'fyabc' 21 | 22 | 23 | def real_main(args): 24 | if args.output is None: 25 | args.output = '{}.pkl'.format(args.input[0]) 26 | 27 | tgt_filename = os.path.join('data', 'dic', args.output) 28 | 29 | word_freqs = OrderedDict() 30 | worddict = OrderedDict() 31 | worddict['eos'] = 0 32 | worddict['UNK'] = 1 33 | 34 | for filename in args.input: 35 | src_filename = os.path.join('data', 'train', filename) 36 | 37 | print('Processing', src_filename) 38 | 39 | with open(src_filename, 'r') as f: 40 | for line in f: 41 | words_in = line.strip().split(' ') 42 | for w in words_in: 43 | if w not in word_freqs: 44 | word_freqs[w] = 0 45 | word_freqs[w] += 1 46 | 47 | words = list(word_freqs.keys()) 48 | freqs = list(word_freqs.values()) 49 | 50 | sorted_idx = numpy.argsort(freqs) 51 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 52 | 53 | for ii, ww in enumerate(sorted_words): 54 | worddict[ww] = ii + 2 55 | 56 | with open(tgt_filename, 'wb') as f: 57 | print('Dump to', tgt_filename) 58 | 59 | pkl.dump(worddict, f) 60 | 61 | 62 | def main(args=None): 63 | parser = argparse.ArgumentParser(description='Build dictionary file.') 64 | 65 | parser.add_argument('input', nargs='+', 66 | help='input filenames') 67 | parser.add_argument('-o', '--output', action='store', dest='output', default=None, 68 | help='dict output file, default is first input filename + ".pkl"') 69 | 70 | args = parser.parse_args(args) 71 | 72 | real_main(args) 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /machine-translation/scripts/moses/detruecase.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | use warnings; 7 | use strict; 8 | use Getopt::Long "GetOptions"; 9 | 10 | binmode(STDIN, ":utf8"); 11 | binmode(STDOUT, ":utf8"); 12 | 13 | my ($SRC,$INFILE,$UNBUFFERED); 14 | die("detruecase.perl < in > out") 15 | unless &GetOptions('headline=s' => \$SRC, 16 | 'in=s' => \$INFILE, 17 | 'b|unbuffered' => \$UNBUFFERED); 18 | if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } 19 | 20 | my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); 21 | my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"""=>1,"'"=>1,"["=>1,"]"=>1); 22 | 23 | # lowercase even in headline 24 | my %ALWAYS_LOWER; 25 | foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; } 26 | 27 | # find out about the headlines 28 | my @HEADLINE; 29 | if (defined($SRC)) { 30 | open(SRC,$SRC); 31 | my $headline_flag = 0; 32 | while() { 33 | $headline_flag = 1 if //; 34 | $headline_flag = 0 if /<.hl>/; 35 | next unless /^) { 46 | &process($_,$sentence++); 47 | } 48 | close(IN); 49 | } 50 | else { 51 | while() { 52 | &process($_,$sentence++); 53 | } 54 | } 55 | 56 | sub process { 57 | my $line = $_[0]; 58 | chomp($line); 59 | $line =~ s/^\s+//; 60 | $line =~ s/\s+$//; 61 | my @WORD = split(/\s+/,$line); 62 | 63 | # uppercase at sentence start 64 | my $sentence_start = 1; 65 | for(my $i=0;$i