├── dlm ├── __init__.py ├── io │ ├── __init__.py │ ├── logging.py │ ├── ngramsReader.py │ ├── vocabReader.py │ ├── w2vEmbReader.py │ ├── plotting.py │ ├── textReader.py │ ├── mmapReader.py │ └── nbestReader.py ├── algorithms │ ├── asgd.py │ ├── __init__.py │ ├── lr_tuner.py │ ├── sgd.py │ └── sgd_using_inputs.py ├── models │ ├── __init__.py │ ├── components │ │ ├── __init__.py │ │ ├── operation.py │ │ ├── linear.py │ │ ├── activation.py │ │ └── lookuptable.py │ ├── classifier.py │ └── mlp.py ├── criterions │ ├── __init__.py │ ├── bce.py │ ├── nll.py │ ├── weighted_nll.py │ └── nce.py ├── reranker │ ├── __init__.py │ ├── mosesIniReader.py │ ├── augmenter.py │ ├── tools.py │ ├── rerank.py │ ├── oracle.py │ ├── train.py │ └── bleu.py ├── preprocess │ ├── convert_to_memmap.py │ ├── monolingual.py │ ├── features.py │ └── bilingual.py ├── misc │ ├── nplm_to_corelm.py │ └── corelm_to_nplm.py ├── eval.py ├── trainer.py └── utils.py ├── .gitignore ├── LICENSE.md ├── classify.py ├── test.py ├── README.md └── train.py /dlm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/algorithms/asgd.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/reranker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/models/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.class 3 | *.jar 4 | *.war 5 | *.ear 6 | 7 | *.o 8 | 9 | *.pdf 10 | *.PDF 11 | *.bin 12 | *.aux 13 | *.bbl 14 | *.blg 15 | *.log 16 | *.backup 17 | 18 | *~ 19 | .* 20 | 21 | deleted 22 | -------------------------------------------------------------------------------- /dlm/criterions/bce.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | class BinaryCrossEntropy(): 4 | 5 | def __init__(self, classifier, args): 6 | 7 | self.y = T.matrix('y') 8 | 9 | self.cost = ( 10 | classifier.mean_batch_cross_entropy(self.y) 11 | + args.L1_reg * classifier.L1 12 | + args.L2_reg * classifier.L2_sqr 13 | ) 14 | -------------------------------------------------------------------------------- /dlm/reranker/mosesIniReader.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def 
parseIni(ini_path): 4 | out = [] 5 | with open(ini_path, 'r') as ini_file: 6 | section = '[nil]' 7 | for line in ini_file: 8 | line = line.strip() 9 | if line.startswith('['): 10 | section = line 11 | elif section == '[weight]' and line != '': 12 | if line.startswith('UnknownWordPenalty0= '): 13 | out.append('UnknownWordPenalty0 UNTUNEABLE') 14 | else: 15 | out.append(line) 16 | return out 17 | -------------------------------------------------------------------------------- /dlm/algorithms/lr_tuner.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import theano.tensor as T 3 | import theano 4 | import numpy 5 | 6 | class LRTuner: 7 | def __init__(self, low, high, inc): 8 | self.low = low 9 | self.high = high 10 | self.inc = inc 11 | self.prev_ppl = numpy.inf 12 | 13 | def adapt_lr(self, curr_ppl, curr_lr): 14 | if curr_ppl >= self.prev_ppl: 15 | lr = max(curr_lr / 2, self.low) 16 | else: 17 | lr = min(curr_lr + self.inc, self.high) 18 | self.prev_ppl = curr_ppl 19 | return lr 20 | -------------------------------------------------------------------------------- /dlm/models/components/operation.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | import dlm.io.logging as L 3 | class Operation(): 4 | 5 | def __init__(self, input, op_name): 6 | self.input = input 7 | self.operate = self.get_operation(op_name) 8 | self.output = self.operate(input, axis=1) 9 | 10 | def get_operation(self, op_name): 11 | if op_name == 'sum': 12 | return T.sum 13 | elif op_name == 'mean': 14 | return T.mean 15 | elif op_name == 'max': 16 | return T.max 17 | else: 18 | L.error('Invalid operation name given: ' + op_name) 19 | -------------------------------------------------------------------------------- /dlm/criterions/nll.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | class NegLogLikelihood(): 4 | 5 | def __init__(self, classifier, args): 6 | 7 | self.y = T.ivector('y') 8 | 9 | self.cost = ( 10 | classifier.negative_log_likelihood(self.y) 11 | + args.L1_reg * classifier.L1 12 | + args.L2_reg * classifier.L2_sqr 13 | ) 14 | 15 | if args.alpha is not None and args.alpha > 0: 16 | self.cost = self.cost + args.alpha * classifier.log_Z_sqr 17 | 18 | self.test = ( 19 | T.mean(classifier.p_y_given_x(self.y)) 20 | ) 21 | -------------------------------------------------------------------------------- /dlm/criterions/weighted_nll.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | class NegLogLikelihood(): 4 | 5 | def __init__(self, classifier, args): 6 | 7 | self.y = T.ivector('y') 8 | self.w = T.vector('w') 9 | 10 | if args.instance_weights_path: 11 | self.cost = classifier.negative_log_likelihood(self.y, self.w) 12 | else: 13 | self.cost = classifier.negative_log_likelihood(self.y) 14 | 15 | if args.L1_reg > 0: 16 | self.cost = self.cost + args.L1_reg * classifier.L1 17 | 18 | if args.L2_reg > 0: 19 | self.cost = self.cost + args.L2_reg * classifier.L2_sqr 20 | 21 | if args.alpha and args.alpha > 0: 22 | self.cost = self.cost + args.alpha * classifier.log_Z_sqr 23 | 24 | self.test = ( 25 | T.mean(classifier.p_y_given_x(self.y)) 26 | ) 27 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2016 
National University of Singapore 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /dlm/io/logging.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import dlm.utils as U 3 | 4 | file_path = None 5 | quiet = False 6 | 7 | def set_file_path(path): 8 | global file_path 9 | file_path = path 10 | log_file = open(file_path, 'w') # reset the file 11 | log_file.close() 12 | info('Log file: ' + path) 13 | 14 | def error(message): 15 | stderr = U.BColors.BFAIL + "[ERROR] " + U.BColors.ENDC + message + "\n" 16 | log = "[ERROR] " + U.BColors.cleared(message) + "\n" 17 | _write(stderr, log) 18 | sys.exit() 19 | 20 | def warning(message): 21 | stderr = U.BColors.BWARNING + "[WARNING] " + U.BColors.ENDC + message + "\n" 22 | log = "[WARNING] " + U.BColors.cleared(message) + "\n" 23 | _write(stderr, log) 24 | 25 | def info(message): 26 | stderr = U.BColors.BOKBLUE + "[INFO] " + U.BColors.ENDC + message + "\n" 27 | log = "[INFO] " + U.BColors.cleared(message) + "\n" 28 | _write(stderr, log) 29 | 30 | def exception(): 31 | exc = str(sys.exc_info()[0].mro()[0].__name__) + ": " + sys.exc_info()[1].message + "\n" 32 | stderr = U.BColors.BFAIL + "[ERROR] " + U.BColors.ENDC + exc 33 | log = "[ERROR] " + exc 34 | _write(stderr, log) 35 | sys.exit() 36 | 37 | def _write(stderr, log): 38 | global quiet 39 | if not quiet: 40 | sys.stderr.write(stderr) 41 | global file_path 42 | if file_path: 43 | log_file = open(file_path, 'a') 44 | log_file.write(log) 45 | log_file.close() 46 | -------------------------------------------------------------------------------- /dlm/io/ngramsReader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | from dlm.io.vocabReader import VocabManager 5 | from dlm.io.nbestReader import NBestList 6 | import numpy as np 7 | import codecs 8 | import theano 9 | import theano.tensor as T 10 | 11 | class NgramsReader(): 12 | 13 | def __init__(self, dataset_path, ngram_size, vocab_path): 14 | 15 | L.info("Initializing dataset from: " + dataset_path) 16 | 17 | vocab = VocabManager(vocab_path) 18 | 19 | curr_index = 0 20 | self.num_sentences = 0 21 | 22 | ngrams_list = [] 23 | dataset = codecs.open(dataset_path, 'r', encoding="UTF-8") 24 | for line in dataset: 25 | tokens = line.split() 26 | ngrams = vocab.get_ids_given_word_list(tokens) 27 | ngrams_list.append(ngrams) 28 | curr_index += 
1 29 | dataset.close() 30 | 31 | data = np.asarray(ngrams_list) 32 | 33 | x = data[:,0:-1] 34 | y = data[:,-1] 35 | self.num_samples = y.shape[0] 36 | 37 | self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') 38 | self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') 39 | 40 | def get_x(self, index): 41 | return self.shared_x[ index : index+1 ] 42 | 43 | def get_y(self, index): 44 | return self.shared_y[ index : index+1 ] 45 | 46 | def get_num_batches(self): 47 | return self.num_samples 48 | 49 | def _get_num_samples(self): 50 | return self.num_samples 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /dlm/io/vocabReader.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | 5 | class VocabManager: 6 | def __init__(self, input_path): 7 | L.info("Initializing vocabulary from: " + input_path) 8 | self.word_to_id_dict = dict() 9 | self.id_to_word_dict = dict() 10 | curr_id = 0 11 | with codecs.open(input_path, 'r', encoding='UTF-8') as input_file: 12 | for line in input_file: 13 | word = line.strip() 14 | self.word_to_id_dict[word] = curr_id 15 | self.id_to_word_dict[curr_id] = word 16 | curr_id += 1 17 | try: 18 | self.unk_id = self.word_to_id_dict['<unk>'] 19 | self.padding_id = self.word_to_id_dict['<s>'] 20 | except KeyError: 21 | L.error("Given vocab file does not include <unk> or <s>") 22 | self.has_end_padding = self.word_to_id_dict.has_key('</s>') 23 | 24 | def get_word_given_id(self, id): 25 | try: 26 | return self.id_to_word_dict[id] 27 | except KeyError: 28 | raise KeyError 29 | 30 | def get_id_given_word(self, word): 31 | try: 32 | return self.word_to_id_dict[word] 33 | except KeyError: 34 | return self.unk_id 35 | 36 | def get_ids_given_word_list(self, word_list): 37 | output = [] 38 | for word in word_list: 39 | output.append(self.get_id_given_word(word)) 40 | return output 41 | 42 | def get_words_given_id_list(self, id_list): 43 | output = [] 44 | for id in id_list: 45 | output.append(self.get_word_given_id(id)) 46 | return output 47 | -------------------------------------------------------------------------------- /dlm/models/classifier.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from theano.misc.pkl_utils import pickle 3 | import gzip 4 | import dlm.io.logging as L 5 | import time 6 | import dlm.utils as U 7 | 8 | class Classifier: 9 | 10 | def __init__(self): 11 | self.params = [] 12 | 13 | def get_params(self): 14 | return self.params 15 | 16 | def set_params(self, params): 17 | U.xassert(len(self.params) == len(params), 'The given model file is not consistent with the architecture') 18 | for param, loaded_param in zip(self.params, params): 19 | param.set_value(loaded_param) 20 | 21 | def load_model(self, model_path): 22 | L.info('Loading model from ' + model_path) 23 | t0 = time.time() 24 | if model_path.endswith('.gz'): 25 | with gzip.open(model_path, 'rb') as model_file: 26 | args, params = pickle.load(model_file) 27 | else: 28 | with open(model_path, 'r') as model_file: 29 | args, params = pickle.load(model_file) 30 | L.info(' |-> took %.2f seconds' % (time.time() - t0)) 31 | return args, params 32 | 33 | def save_model(self, model_path, zipped=True, compress_level=5): 34 | L.info('Saving model to ' + model_path) 35 | t0 = time.time() 36 | if zipped: 37 | with gzip.open(model_path, 'wb', compresslevel=compress_level) as model_file: 38 | params = 
self.get_params() 39 | pickle.dump((self.args, [param.get_value() for param in params]), model_file) 40 | else: 41 | with open(model_path, 'w') as model_file: 42 | params = self.get_params() 43 | pickle.dump((self.args, [param.get_value() for param in params]), model_file) 44 | L.info(' |-> took %.2f seconds' % (time.time() - t0)) 45 | -------------------------------------------------------------------------------- /dlm/models/components/linear.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import theano 3 | import theano.tensor as T 4 | import dlm.io.logging as L 5 | import dlm.utils as U 6 | 7 | class Linear(): 8 | 9 | def __init__(self, rng, input, n_in, n_out, W_values=None, init_method=0, b_values=None, no_bias=False, suffix=None): 10 | 11 | L.info("Linear layer, #inputs: %s, #outputs: %s" % (U.red(n_in), U.red(n_out))) 12 | 13 | self.input = input 14 | 15 | if W_values is None: 16 | if init_method == 0: # Useful for Relu activation 17 | high = 0.01 18 | elif init_method == 1: # Useful for Tanh activation 19 | high = numpy.sqrt(6. / (n_in + n_out)) 20 | elif init_method == 2: # Useful for Sigmoid activation 21 | high = 4 * numpy.sqrt(6. / (n_in + n_out)) 22 | else: 23 | L.error('Invalid initialization method') 24 | W_values = numpy.asarray( 25 | rng.uniform( 26 | low=-high, 27 | high=high, 28 | size=(n_in, n_out) 29 | ), 30 | dtype=theano.config.floatX 31 | ) 32 | 33 | if b_values is None and not no_bias: 34 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 35 | 36 | W_name = 'W' 37 | if suffix is not None: 38 | W_name += '.' + str(suffix) 39 | 40 | W = theano.shared(value=W_values, name=W_name, borrow=True) 41 | self.W = W 42 | 43 | if no_bias: 44 | self.output = T.dot(input, self.W) 45 | self.params = [self.W] 46 | else: 47 | b_name = 'b' 48 | if suffix is not None: 49 | b_name += '.' 
+ str(suffix) 50 | b = theano.shared(value=b_values, name=b_name, borrow=True) 51 | self.b = b 52 | self.output = T.dot(input, self.W) + self.b 53 | self.params = [self.W, self.b] 54 | -------------------------------------------------------------------------------- /dlm/algorithms/sgd.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | import theano 3 | import dlm.io.logging as L 4 | 5 | class SGD: 6 | def __init__(self, classifier, criterion, learning_rate, trainset, clip_threshold=0): 7 | self.eta = learning_rate 8 | self.is_weighted = trainset.is_weighted 9 | 10 | if clip_threshold > 0: 11 | gparams = [T.clip(T.grad(criterion.cost, param), -clip_threshold, clip_threshold) for param in classifier.params] 12 | else: 13 | gparams = [T.grad(criterion.cost, param) for param in classifier.params] 14 | 15 | lr = T.fscalar() 16 | 17 | updates = [ 18 | (param, param - lr * gparam) 19 | for param, gparam in zip(classifier.params, gparams) 20 | ] 21 | 22 | index = T.lscalar() # index to a [mini]batch 23 | x = classifier.input 24 | y = criterion.y 25 | 26 | if self.is_weighted: 27 | w = criterion.w 28 | self.step_func = theano.function( 29 | inputs=[index, lr], 30 | outputs=[criterion.cost] + gparams, 31 | updates=updates, 32 | givens={ 33 | x: trainset.get_x(index), 34 | y: trainset.get_y(index), 35 | w: trainset.get_w(index) 36 | } 37 | ) 38 | else: 39 | self.step_func = theano.function( 40 | inputs=[index, lr], 41 | outputs=[criterion.cost] + gparams, 42 | updates=updates, 43 | givens={ 44 | x: trainset.get_x(index), 45 | y: trainset.get_y(index) 46 | } 47 | ) 48 | 49 | def step(self, minibatch_index): 50 | outputs = self.step_func(minibatch_index, self.eta) 51 | step_cost, gparams = outputs[0], outputs[1:] 52 | return step_cost, gparams 53 | 54 | def set_learning_rate(self, eta): 55 | self.eta = eta 56 | 57 | def get_learning_rate(self): 58 | return self.eta 59 | -------------------------------------------------------------------------------- /dlm/algorithms/sgd_using_inputs.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | import theano 3 | import dlm.io.logging as L 4 | 5 | class SGD: 6 | def __init__(self, classifier, criterion, learning_rate, trainset, clip_threshold=0): 7 | self.eta = learning_rate 8 | self.is_weighted = trainset.is_weighted 9 | self.trainset = trainset 10 | 11 | if clip_threshold > 0: 12 | gparams = [T.clip(T.grad(criterion.cost, param), -clip_threshold, clip_threshold) for param in classifier.params] 13 | else: 14 | gparams = [T.grad(criterion.cost, param) for param in classifier.params] 15 | 16 | lr = T.fscalar() 17 | 18 | updates = [ 19 | (param, param - lr * gparam) 20 | for param, gparam in zip(classifier.params, gparams) 21 | ] 22 | 23 | x = classifier.input 24 | y = criterion.y 25 | 26 | if self.is_weighted: 27 | w = criterion.w 28 | self.step_func = theano.function( 29 | inputs=[x, y, w, lr], 30 | outputs=[criterion.cost] + gparams, 31 | updates=updates, 32 | ) 33 | else: 34 | self.step_func = theano.function( 35 | inputs=[x, y, lr], 36 | outputs=[criterion.cost] + gparams, 37 | updates=updates, 38 | ) 39 | 40 | def step(self, minibatch_index): 41 | if self.is_weighted: 42 | outputs = self.step_func(self.trainset.get_x(minibatch_index), self.trainset.get_y(minibatch_index), self.trainset.get_w(minibatch_index), self.eta) 43 | else: 44 | outputs = self.step_func(self.trainset.get_x(minibatch_index), 
self.trainset.get_y(minibatch_index), self.eta) 45 | step_cost, gparams = outputs[0], outputs[1:] 46 | return step_cost, gparams 47 | 48 | def set_learning_rate(self, eta): 49 | self.eta = eta 50 | 51 | def get_learning_rate(self): 52 | return self.eta 53 | -------------------------------------------------------------------------------- /dlm/models/components/activation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import theano.tensor as T 3 | import dlm.io.logging as L 4 | import dlm.utils as U 5 | 6 | class Activation(): 7 | 8 | def __init__(self, input, func_name): 9 | L.info("Activation layer, function: " + U.red(func_name)) 10 | self.input = input 11 | self.func = self.get_function(func_name) 12 | self.output = self.func(input) 13 | 14 | def get_function(self, func_name): 15 | if func_name == 'tanh': 16 | return T.tanh 17 | elif func_name == 'hardtanh': 18 | L.warning('Current hardTanh implementation is slow!') 19 | return lambda x: ((abs(x) <= 1) * x) + ((1 < abs(x)) * T.sgn(x)) 20 | elif func_name == 'xtanh': 21 | return lambda x: T.tanh(x) + 0.1 * x 22 | elif func_name == 'sigmoid': 23 | return T.nnet.sigmoid 24 | elif func_name == 'fastsigmoid': 25 | L.error('T.nnet.ultra_fast_sigmoid function has some problems') 26 | elif func_name == 'hardsigmoid': 27 | return T.nnet.hard_sigmoid 28 | elif func_name == 'xsigmoid': 29 | return lambda x: T.nnet.sigmoid(x) + 0.1 * x 30 | elif func_name == 'softplus': 31 | return T.nnet.softplus 32 | elif func_name == 'relu': 33 | #return lambda x: T.maximum(x, 0) 34 | return lambda x: x * (x > 0) 35 | #return T.nnet.relu # Update theano and then use this one instead 36 | elif func_name == 'leakyrelu': 37 | return lambda x: T.maximum(x, 0.01 * x) 38 | elif func_name == 'cappedrelu': 39 | return lambda x: T.minimum(x * (x > 0), 6) 40 | elif func_name == 'softmax': 41 | return T.nnet.softmax 42 | elif func_name == 'norm1': 43 | return lambda x: x / T.nlinalg.norm(x, 1) 44 | elif func_name == 'norm2': 45 | #return lambda x: x / T.nlinalg.norm(x, 2) 46 | return lambda x: x / T.dot(x, x)**0.5 47 | else: 48 | L.error('Invalid function name given: ' + func_name) 49 | -------------------------------------------------------------------------------- /dlm/criterions/nce.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | from theano.tensor.shared_randomstreams import RandomStreams 3 | from theano import function 4 | import numpy as np 5 | import theano 6 | import math 7 | 8 | class NCELikelihood(): 9 | 10 | def __init__(self, classifier, args, noise_dist): 11 | self.y = T.ivector('y') 12 | 13 | ## Cost function 14 | # Sum over minibatch instances (log ( u(w|c) / (u(w|c) + k * p_n(w)) ) + sum over noise samples ( log ( u(x|c) / ( u(x|c) + k * p_n(x) ) ))) 15 | 16 | # Generating noise samples 17 | srng = RandomStreams(seed=1234) 18 | noise_samples = srng.choice(size=(self.y.shape[0],args.num_noise_samples), a=args.num_classes, p=noise_dist, dtype='int32') 19 | 20 | log_noise_dist = theano.shared(np.log(noise_dist.get_value()),borrow=True) 21 | #log_num_noise_samples = theano.shared(math.log(args.num_noise_samples)).astype(theano.config.floatX) 22 | log_num_noise_samples = theano.shared(np.log(args.num_noise_samples,dtype=theano.config.floatX)) 23 | # Data Part of Cost Function: log ( u(w|c) / (u(w|c) + k * p_n(w)) 24 | data_scores = classifier.output[T.arange(self.y.shape[0]),self.y] 25 | data_denom = 
self.logadd(data_scores, log_num_noise_samples + log_noise_dist[self.y] ) 26 | data_prob = data_scores - data_denom 27 | # Summation of Noise Part of Cost Function: sum over noise samples ( log ( u(x|c) / ( u(x|c) + k * p_n(x) ) )) 28 | noise_mass = log_num_noise_samples + log_noise_dist[noise_samples] # log(k) + log(p_n(x)) for all noise samples (Shape: #instances x k) 29 | noise_scores = classifier.output[T.arange(noise_samples.shape[0]).reshape((-1,1)),noise_samples] 30 | noise_denom = self.logadd(noise_scores, noise_mass) 31 | noise_prob_sum = T.sum(noise_mass - noise_denom, axis=1) 32 | 33 | self.cost = ( 34 | -T.mean(data_prob + noise_prob_sum) 35 | ) 36 | self.test = ( 37 | T.sum(data_scores) 38 | ) 39 | 40 | def logadd(self, a, b): 41 | g = T.maximum(a,b) 42 | l = T.minimum(a,b) 43 | return g + T.log(1 + T.exp(l-g)) 44 | 45 | -------------------------------------------------------------------------------- /dlm/reranker/augmenter.py: -------------------------------------------------------------------------------- 1 | import time 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | from dlm.models.mlp import MLP 5 | from dlm import eval 6 | from dlm.io.nbestReader import NBestList 7 | from dlm.io.vocabReader import VocabManager 8 | import numpy as np 9 | 10 | def augment(model_path, input_nbest_path, vocab_path, output_nbest_path): 11 | classifier = MLP(model_path=model_path) 12 | evaluator = eval.Evaluator(None, classifier) 13 | 14 | vocab = VocabManager(vocab_path) 15 | 16 | ngram_size = classifier.ngram_size 17 | 18 | def get_ngrams(tokens): 19 | for i in range(ngram_size - 1): 20 | tokens.insert(0, '<s>') 21 | if vocab.has_end_padding: 22 | tokens.append('</s>') 23 | indices = vocab.get_ids_given_word_list(tokens) 24 | return U.get_all_windows(indices, ngram_size) 25 | 26 | input_nbest = NBestList(input_nbest_path, mode='r') 27 | output_nbest = NBestList(output_nbest_path, mode='w') 28 | 29 | L.info('Augmenting: ' + input_nbest_path) 30 | 31 | start_time = time.time() 32 | 33 | counter = 0 34 | cache = dict() 35 | for group in input_nbest: 36 | ngram_list = [] 37 | for item in group: 38 | tokens = item.hyp.split() 39 | ngrams = get_ngrams(tokens) 40 | for ngram in ngrams: 41 | if not cache.has_key(str(ngram)): 42 | ngram_list.append(ngram) 43 | cache[str(ngram)] = 1000 44 | if len(ngram_list) > 0: 45 | ngram_array = np.asarray(ngram_list, dtype='int32') 46 | ngram_log_prob_list = evaluator.get_ngram_log_prob(ngram_array[:,0:-1], ngram_array[:,-1]) 47 | for i in range(len(ngram_list)): 48 | cache[str(ngram_list[i])] = ngram_log_prob_list[i] 49 | for item in group: 50 | tokens = item.hyp.split() 51 | ngrams = get_ngrams(tokens) 52 | sum_ngram_log_prob = 0 53 | for ngram in ngrams: 54 | sum_ngram_log_prob += cache[str(ngram)] 55 | item.append_feature(sum_ngram_log_prob) 56 | output_nbest.write(item) 57 | #print counter 58 | counter += 1 59 | output_nbest.close() 60 | 61 | L.info("Ran for %.2fs" % (time.time() - start_time)) 62 | -------------------------------------------------------------------------------- /dlm/io/w2vEmbReader.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | 5 | class W2VEmbReader: 6 | def __init__(self, emb_path): 7 | L.info('Loading embeddings from: ' + emb_path) 8 | has_header=False 9 | with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: 10 | tokens = emb_file.next().split() 11 | if len(tokens) == 2: 12 | try: 13 | int(tokens[0]) 14 | 
int(tokens[1]) 15 | has_header = True 16 | except ValueError: 17 | pass 18 | if has_header: 19 | with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: 20 | tokens = emb_file.next().split() 21 | U.xassert(len(tokens) == 2, 'The first line in W2V embeddings must be the pair (vocab_size, emb_dim)') 22 | self.vocab_size = int(tokens[0]) 23 | self.emb_dim = int(tokens[1]) 24 | self.embeddings = {} 25 | counter = 0 26 | for line in emb_file: 27 | tokens = line.split() 28 | U.xassert(len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info') 29 | word = tokens[0] 30 | vec = tokens[1:] 31 | self.embeddings[word] = vec 32 | counter += 1 33 | U.xassert(counter == self.vocab_size, 'Vocab size does not match the header info') 34 | else: 35 | with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: 36 | self.vocab_size = 0 37 | self.emb_dim = -1 38 | self.embeddings = {} 39 | for line in emb_file: 40 | tokens = line.split() 41 | if self.emb_dim == -1: 42 | self.emb_dim = len(tokens) - 1 43 | else: 44 | U.xassert(len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info') 45 | word = tokens[0] 46 | vec = tokens[1:] 47 | self.embeddings[word] = vec 48 | self.vocab_size += 1 49 | 50 | L.info(' #vectors: %i, #dimensions: %i' % (self.vocab_size, self.emb_dim)) 51 | 52 | def get_emb_given_word(self, word): 53 | try: 54 | return self.embeddings[word] 55 | except KeyError: 56 | return None 57 | 58 | def get_emb_dim(self): 59 | return self.emb_dim 60 | -------------------------------------------------------------------------------- /dlm/io/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Pdf') 3 | import matplotlib.pyplot as plt 4 | import sys 5 | import math 6 | import numpy as np 7 | 8 | class Plotter: 9 | 10 | def __init__(self, path, title=None, xlabel=None, ylabel=None, xspace=1, yspace=0.5): 11 | self.path = path 12 | self.title = title 13 | self.xlabel = xlabel 14 | self.ylabel = ylabel 15 | self.xspace = xspace 16 | self.yspace = yspace 17 | self.series_list = [] 18 | #self.tix_list = ['b*-', 'ro--'] 19 | self.tix_list = ['b-', 'r--'] 20 | 21 | def plot(self): 22 | plt.title(self.title, y=1.01, fontsize='medium') 23 | plt.xlabel(self.xlabel) 24 | plt.ylabel(self.ylabel) 25 | plt.grid('on') 26 | plt.margins(0.1) 27 | for i in range(len(self.series_list)): 28 | x_list = [] 29 | y_list = [] 30 | for x in sorted(self.series_list[i].keys()): 31 | x_list.append(x) 32 | y_list.append(self.series_list[i][x]) 33 | #xmin, xmax = min(self.x_list), max(self.x_list) + 1 34 | #ymin, ymax = min(self.y_list), max(self.y_list) + 1 35 | #plt.xticks(np.arange(xmin, xmax, 1.0), np.arange(xmin, xmax, 1.0), fontsize='x-small') 36 | #plt.yticks(np.arange(ymin, ymax, 0.5), np.arange(ymin, ymax, 0.5), fontsize='x-small') 37 | plt.plot(x_list, y_list, self.tix_list[i], label='S' + str(i)) 38 | plt.legend(bbox_to_anchor=(1.15, 0.5), loc='center right', borderaxespad=0.2, fontsize='x-small') 39 | plt.savefig(self.path, format='pdf', bbox_inches='tight', pad_inches=0.3) 40 | plt.cla() 41 | 42 | def add(self, series_index, x, y): 43 | assert series_index < len(self.tix_list) 44 | assert series_index <= len(self.series_list) 45 | if series_index == len(self.series_list): 46 | self.series_list.append(dict()) 47 | series = self.series_list[series_index] 48 | series[x] = y 49 | self.plot() 50 | 51 | def add_list(self, series_index, x_list, y_list): 52 | assert series_index < 
len(self.tix_list) 53 | assert series_index <= len(self.series_list) 54 | if series_index == len(self.series_list): 55 | self.series_list.append(dict()) 56 | series = self.series_list[series_index] 57 | for x, y in zip(x_list, y_list): 58 | series[x] = y 59 | self.plot() 60 | 61 | def set_tix_list(self, tix_list): 62 | self.tix_list = tix_list 63 | 64 | -------------------------------------------------------------------------------- /dlm/models/components/lookuptable.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import dlm.utils as U 6 | import dlm.io.logging as L 7 | from dlm.io.vocabReader import VocabManager 8 | from dlm.io.w2vEmbReader import W2VEmbReader 9 | 10 | class LookupTable(): 11 | 12 | def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False): 13 | 14 | L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim))) 15 | 16 | self.input = input 17 | 18 | self.emb_matrix = emb_matrix 19 | 20 | if self.emb_matrix is None: 21 | self.emb_matrix = numpy.asarray( 22 | rng.uniform( 23 | low=-0.01, #low=-1, 24 | high=0.01, #high=1, 25 | size=(vocab_size, emb_dim) 26 | ), 27 | dtype=theano.config.floatX 28 | ) 29 | 30 | if emb_path: 31 | U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.') 32 | self.initialize(emb_path, vocab_path) 33 | 34 | self.embeddings = theano.shared(value=self.emb_matrix, name='embeddings', borrow=True) 35 | 36 | if add_weights: 37 | weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX) 38 | self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True) 39 | 40 | # Check if the speed can be improved 41 | self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input] 42 | #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input] 43 | #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input] 44 | 45 | self.params = [self.embeddings, self.weights] 46 | else: 47 | self.output = self.embeddings[input] 48 | self.params = [self.embeddings] 49 | 50 | if concat: 51 | self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1])) 52 | 53 | def initialize(self, emb_path, vocab_path): 54 | L.info('Initializing lookup table') 55 | vm = VocabManager(vocab_path) 56 | w2v = W2VEmbReader(emb_path) 57 | U.xassert(w2v.get_emb_dim() == self.emb_matrix.shape[1], 'The embeddings dimension does not match with the given word embeddings') 58 | for i in range(self.emb_matrix.shape[0]): 59 | vec = w2v.get_emb_given_word(vm.get_word_given_id(i)) 60 | if vec: 61 | self.emb_matrix[i] = vec 62 | -------------------------------------------------------------------------------- /classify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import time 5 | import argparse 6 | import dlm.utils as U 7 | import dlm.io.logging as L 8 | import numpy 9 | 10 | ############### 11 | ## Arguments 12 | # 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("-i", "--input-file", dest="input_path", required=True, help="Input feature file") 16 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", required=True, help="The vocabulary file for the model") 17 | parser.add_argument("-rv", "--restricted-vocab-file", dest="restricted_vocab_path", help="Restricted vocab file to 
predict the word") 18 | parser.add_argument("-m", "--model-file", dest="model_path", required=True, help="Input CoreLM model file") 19 | parser.add_argument("-o", "--output-file",dest="output_path", required=True, help="Output File path.") 20 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu)") 21 | args = parser.parse_args() 22 | 23 | U.set_theano_device(args.device,1) 24 | 25 | from dlm.models.mlp import MLP 26 | from dlm import eval 27 | import theano 28 | import theano.tensor as T 29 | 30 | ######################### 31 | ## Loading model 32 | # 33 | 34 | classifier = MLP(model_path=args.model_path) 35 | 36 | ######################### 37 | ## Loading dataset 38 | # 39 | 40 | from dlm.io.ngramsReader import NgramsReader 41 | from dlm.io.vocabReader import VocabManager 42 | testset = NgramsReader(dataset_path=args.input_path, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path) 43 | vocab = VocabManager(args.vocab_path) 44 | 45 | ## Loading restricted vocab 46 | restricted_ids = [] 47 | restricted_vocab = [] 48 | if args.restricted_vocab_path: 49 | with open(args.restricted_vocab_path) as restricted_vocab_file: 50 | for line in restricted_vocab_file: 51 | restricted_vocab.append(line.strip()) 52 | restricted_ids = vocab.get_ids_given_word_list(restricted_vocab) 53 | 54 | 55 | ######################### 56 | ## Compiling theano function 57 | # 58 | 59 | evaluator = eval.Evaluator(testset, classifier) 60 | 61 | 62 | if args.output_path: 63 | with open(args.output_path, 'w') as output: 64 | for i in xrange(testset._get_num_samples()): 65 | out = evaluator.get_class(i, restricted_ids) 66 | output.write(vocab.get_word_given_id(out)+'\n') 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /dlm/io/textReader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | from dlm.io.vocabReader import VocabManager 5 | from dlm.io.nbestReader import NBestList 6 | import numpy as np 7 | import codecs 8 | import theano 9 | import theano.tensor as T 10 | 11 | class TextReader(): 12 | 13 | def __init__(self, dataset_path, is_nbest, ngram_size, vocab_path): 14 | 15 | L.info("Initializing dataset from: " + dataset_path) 16 | 17 | vocab = VocabManager(vocab_path) 18 | 19 | def get_ngrams(tokens): 20 | for i in range(ngram_size - 1): 21 | tokens.insert(0, '') 22 | if vocab.has_end_padding: 23 | tokens.append('') 24 | indices = vocab.get_ids_given_word_list(tokens) 25 | return U.get_all_windows(indices, ngram_size) 26 | 27 | starts_list = [] 28 | curr_index = 0 29 | curr_start_index = 0 30 | self.num_sentences = 0 31 | 32 | ngrams_list = [] 33 | if is_nbest == True: 34 | nbest = NBestList(dataset_path) 35 | for group in nbest: 36 | for item in group: 37 | tokens = item.hyp.split() 38 | starts_list.append(curr_start_index) 39 | ngrams = get_ngrams(tokens) 40 | ngrams_list += ngrams 41 | curr_start_index += len(ngrams) 42 | else: 43 | dataset = codecs.open(dataset_path, 'r', encoding="UTF-8") 44 | for line in dataset: 45 | tokens = line.split() 46 | starts_list.append(curr_start_index) 47 | ngrams = get_ngrams(tokens) 48 | ngrams_list += ngrams 49 | curr_start_index += len(ngrams) 50 | dataset.close() 51 | 52 | self.num_sentences = len(starts_list) 53 | 54 | data = np.asarray(ngrams_list) 55 | starts_list.append(curr_start_index) 56 | starts_array = np.asarray(starts_list) 57 | 58 | 
x = data[:,0:-1] 59 | y = data[:,-1] 60 | 61 | self.num_samples = y.shape[0] 62 | 63 | self.shared_starts = T.cast(theano.shared(starts_array, borrow=True), 'int64') 64 | self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') 65 | self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') 66 | 67 | def get_x(self, index): 68 | return self.shared_x[ self.shared_starts[index] : self.shared_starts[index+1] ] 69 | 70 | def get_y(self, index): 71 | return self.shared_y[ self.shared_starts[index] : self.shared_starts[index+1] ] 72 | 73 | def get_num_sentences(self): 74 | return self.num_sentences 75 | 76 | def get_num_batches(self): 77 | return self.num_sentences 78 | 79 | def _get_num_samples(self): 80 | return self.num_samples 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /dlm/io/mmapReader.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import dlm.io.logging as L 3 | import dlm.utils as U 4 | import numpy as np 5 | import theano 6 | import theano.tensor as T 7 | import math as M 8 | import sys 9 | import os 10 | 11 | class MemMapReader(): 12 | 13 | #### Constructor 14 | 15 | def __init__(self, dataset_path, batch_size=500, instance_weights_path=None): 16 | 17 | L.info("Initializing dataset from: " + os.path.abspath(dataset_path)) 18 | 19 | # Reading parameters from the mmap file 20 | fp = np.memmap(dataset_path, dtype='int32', mode='r') 21 | self.num_samples = fp[0] 22 | self.ngram = fp[1] 23 | fp = fp.reshape((self.num_samples + 3, self.ngram)) 24 | self.vocab_size = fp[1,0] 25 | self.num_classes = fp[2,0] 26 | 27 | # Setting minibatch size and number of mini batches 28 | self.batch_size = batch_size 29 | self.num_batches = int(M.ceil(self.num_samples / self.batch_size)) 30 | 31 | # Reading the matrix of samples 32 | x = fp[3:,0:self.ngram - 1] # Reading the context indices 33 | y = fp[3:,self.ngram - 1] # Reading the output word index 34 | self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') 35 | self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') 36 | 37 | self.is_weighted = False 38 | if instance_weights_path: 39 | instance_weights = np.loadtxt(instance_weights_path) 40 | U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.") 41 | self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX) 42 | self.is_weighted = True 43 | 44 | L.info(' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % ( 45 | U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches) 46 | ) 47 | ) 48 | 49 | #### Accessors 50 | 51 | def get_x(self, index): 52 | return self.shared_x[index * self.batch_size : (index+1) * self.batch_size] 53 | 54 | def get_y(self, index): 55 | return self.shared_y[index * self.batch_size : (index+1) * self.batch_size] 56 | 57 | def get_w(self, index): 58 | return self.shared_w[index * self.batch_size : (index+1) * self.batch_size] 59 | 60 | #### INFO 61 | 62 | def _get_num_samples(self): 63 | return self.num_samples 64 | 65 | def get_num_batches(self): 66 | return self.num_batches 67 | 68 | def get_ngram_size(self): 69 | return self.ngram 70 | 71 | def get_vocab_size(self): 72 | return self.vocab_size 73 | 74 | def get_num_classes(self): 75 | return self.num_classes 76 | 77 | def 
get_unigram_model(self): 78 | unigram_counts = np.bincount(self.shared_y.get_value()) 79 | unigram_counts = np.append(unigram_counts, np.zeros(self.num_classes - unigram_counts.size, dtype='int32')) 80 | sum_unigram_counts = np.sum(unigram_counts) 81 | 82 | unigram_model = unigram_counts / sum_unigram_counts 83 | unigram_model = unigram_model.astype(theano.config.floatX) 84 | return theano.shared(unigram_model,borrow=True) 85 | -------------------------------------------------------------------------------- /dlm/preprocess/convert_to_memmap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import sys, os 5 | import argparse 6 | 7 | # Parsing arguments 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("-i", "--input-idx-file", dest="idx_path", required=True, help="Path to the input text (idx) file.") 10 | parser.add_argument("-v", "--input-vocab-file", dest="vocab_path", help="Path to the vocab file.") 11 | parser.add_argument("-o", "--output-file", dest="output_path", required=True, help="Path to output file.") 12 | parser.add_argument("-n", "--no-header", dest="no_header", action='store_true', help="Use this flag to write a plain mmap file with no header information.") 13 | 14 | args = parser.parse_args() 15 | 16 | if not args.no_header: 17 | assert args.vocab_path, "[ERROR] Give a vocab file or use --no-header flag." 18 | 19 | def verify_matrix_file(matrix_path): 20 | print "Verifying the input file" 21 | nrows = 0 22 | ncols = -1 23 | with open(matrix_path, 'r') as data: 24 | for line in data: 25 | tokens = line.split() 26 | if ncols <= 0: 27 | ncols = len(tokens) 28 | else: 29 | assert ncols == len(tokens) 30 | try: 31 | map(int, tokens) 32 | except ValueError: 33 | print "[ERROR] Matrix file format invalid @ line: " + line 34 | sys.exit() 35 | nrows += 1 36 | if nrows % 10000000 == 0: 37 | print nrows 38 | assert nrows > 0 and ncols > 0 39 | return nrows, ncols 40 | 41 | 42 | 43 | if args.no_header: 44 | nrows, ncols = verify_matrix_file(args.idx_path) 45 | 46 | print "Number of rows: ", nrows 47 | print "Number of columns: ", ncols 48 | 49 | print "Creating the memory mapped file" 50 | print("Output file: " + os.path.abspath(args.output_path)) 51 | 52 | with open(args.idx_path, 'r') as data: 53 | fp = np.memmap(args.output_path, dtype='int32', mode='w+', shape=(nrows, ncols)) 54 | counter = 0 55 | for line in data: 56 | tokens = line.split() 57 | fp[counter] = tokens 58 | counter = counter + 1 59 | if counter % 10000000 == 0: 60 | print counter 61 | print counter, "samples mapped" 62 | fp.flush 63 | del fp 64 | else: 65 | print "Reading the vocab file" 66 | 67 | vocab_size = 0 68 | with open(args.vocab_path, 'r') as vocab_file: 69 | for line in vocab_file: 70 | vocab_size += 1 71 | assert vocab_size > 0 72 | 73 | num_samples, ngram_size = verify_matrix_file(args.idx_path) 74 | 75 | print "Number of samples: ", num_samples 76 | print "Ngram size: ", ngram_size 77 | print "Vocab size: ", vocab_size 78 | 79 | print "Creating the memory mapped file" 80 | print("Output file: " + os.path.abspath(args.output_path)) 81 | 82 | with open(args.idx_path, 'r') as data: 83 | fp = np.memmap(args.output_path, dtype='int32', mode='w+', shape=(num_samples + 3, ngram_size)) 84 | fp[0,0] = num_samples 85 | fp[0,1] = ngram_size 86 | fp[1,0] = vocab_size # vocab size 87 | fp[2,0] = vocab_size # number of classes 88 | counter = 3 89 | for line in data: 90 | tokens = line.split() 91 | fp[counter] = 
tokens 92 | counter = counter + 1 93 | if counter % 10000000 == 0: 94 | print counter 95 | print str(counter - 3) + " samples mapped" 96 | fp.flush 97 | del fp 98 | -------------------------------------------------------------------------------- /dlm/reranker/tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import imp 5 | try: 6 | import dlm 7 | except ImportError: 8 | print "[ERROR] dlm module not found. Add CoreLM root directory to your PYTHONPATH" 9 | sys.exit() 10 | 11 | import dlm.io.logging as L 12 | import dlm.utils as U 13 | import argparse 14 | from dlm.io.nbestReader import NBestList 15 | import codecs 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-c", "--command", dest="command", required=True, help="The command (topN|1best|featureN|correlN|augment)") 19 | parser.add_argument("-i", "--input-file", dest="input_path", required=True, help="Input n-best file") 20 | parser.add_argument("-s", "--input-scores", dest="oracle", help="Input oracle scores the n-best file") 21 | parser.add_argument("-o", "--output-file", dest="output_path", required=True, help="Output file") 22 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", help="The vocabulary file.") 23 | parser.add_argument("-m", "--model-file", dest="model_path", help="Input CoreLM model file") 24 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu)") 25 | args = parser.parse_args() 26 | 27 | input_nbest = NBestList(args.input_path, mode='r') 28 | 29 | mode = -1 30 | 31 | if args.command.startswith('top'): 32 | mode = 0 33 | N = int(args.command[3:]) # N in N-best 34 | output_nbest = NBestList(args.output_path, mode='w') 35 | elif args.command == '1best': 36 | mode = 1 37 | output_1best = codecs.open(args.output_path, mode='w', encoding='UTF-8') 38 | elif args.command.startswith('feature'): 39 | mode = 2 40 | N = int(args.command[7:]) # Nth feature 41 | output = open(args.output_path, mode='w') 42 | elif args.command.startswith('correl'): 43 | mode = 3 44 | N = int(args.command[6:]) # Nth feature 45 | U.xassert(args.oracle, "correlN command needs a file (-s) containing oracle scores") 46 | with open(args.oracle, mode='r') as oracles_file: 47 | oracles = map(float, oracles_file.read().splitlines()) 48 | #output = open(args.output_path, mode='w') 49 | elif args.command.startswith('augment'): 50 | U.set_theano_device(args.device) 51 | from dlm.reranker import augmenter 52 | augmenter.augment(args.model_path, args.input_path, args.vocab_path, args.output_path) 53 | else: 54 | L.error('Invalid command: ' + args.command) 55 | 56 | counter = 0 57 | features = [] 58 | for group in input_nbest: 59 | if mode == 0: 60 | for i in range(min(N, group.size())): 61 | output_nbest.write(group[i]) 62 | elif mode == 1: 63 | output_1best.write(group[0].hyp + "\n") 64 | elif mode == 2: 65 | for i in range(group.size()): 66 | features = group[i].features.split() 67 | output.write(features[N] + "\n") 68 | elif mode == 3: 69 | for i in range(group.size()): 70 | features.append(float(group[i].features.split()[N])) 71 | counter += 1 72 | if counter % 100 == 0: 73 | L.info("%i groups processed" % (counter)) 74 | L.info("Finished processing %i groups" % (counter)) 75 | 76 | if mode == 0: 77 | output_nbest.close() 78 | elif mode == 1: 79 | output_1best.close() 80 | elif mode == 2: 81 | output.close() 82 | elif mode == 3: 83 | import scipy.stats as S 84 | print 'PEARSON: ', 
S.pearsonr(features, oracles) 85 | print 'SPEARMAN:', S.spearmanr(features, oracles) 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /dlm/reranker/rerank.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import imp 6 | import shutil 7 | try: 8 | import dlm 9 | except ImportError: 10 | print "[ERROR] dlm module not found. Add CoreLM root directory to your PYTHONPATH" 11 | sys.exit() 12 | 13 | import dlm.utils as U 14 | import dlm.io.logging as L 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-i", "--input-nbest", dest="input_nbest", required=True, help="Input n-best file") 19 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", required=True, help="The vocabulary file that was used in training") 20 | parser.add_argument("-m", "--model-file", dest="model_path", required=True, help="Input CoreLM model file") 21 | parser.add_argument("-w", "--weights", dest="weights", required=True, help="Input weights file") 22 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu)") 23 | parser.add_argument("-o", "--output-dir", dest="out_dir", required=True, help="Output directory") 24 | parser.add_argument("-n", "--no-aug", dest="no_aug", action='store_true', help="Augmentation will be skipped, if this flag is set") 25 | parser.add_argument("-c", "--clean-up", dest="clean_up", action='store_true', help="Temporary files will be removed") 26 | parser.add_argument("-q", "--quiet", dest="quiet", action='store_true', help="Nothing will be printed in STDERR") 27 | args = parser.parse_args() 28 | 29 | if args.quiet: 30 | L.quiet = True 31 | 32 | U.set_theano_device(args.device) 33 | 34 | from dlm.io.nbestReader import NBestList 35 | import codecs 36 | import numpy as np 37 | 38 | U.mkdir_p(args.out_dir) 39 | 40 | from dlm.reranker import augmenter 41 | 42 | output_nbest_path = args.out_dir + '/augmented.nbest' 43 | 44 | if args.no_aug: 45 | shutil.copy(args.input_nbest, output_nbest_path) 46 | else: 47 | augmenter.augment(args.model_path, args.input_nbest, args.vocab_path, output_nbest_path) 48 | 49 | with open(args.weights, 'r') as input_weights: 50 | lines = input_weights.readlines() 51 | if len(lines) > 1: 52 | L.warning("Weights file has more than one line. 
I'll read the 1st and ignore the rest.") 53 | weights = np.asarray(lines[0].strip().split(" "), dtype=float) 54 | 55 | prefix = os.path.basename(args.input_nbest) 56 | input_aug_nbest = NBestList(output_nbest_path, mode='r') 57 | output_nbest = NBestList(args.out_dir + '/' + prefix + '.reranked.nbest', mode='w') 58 | output_1best = codecs.open(args.out_dir + '/' + prefix + '.reranked.1best', mode='w', encoding='UTF-8') 59 | 60 | def is_number(s): 61 | try: 62 | float(s) 63 | return True 64 | except ValueError: 65 | return False 66 | 67 | counter = 0 68 | for group in input_aug_nbest: 69 | index = 0 70 | scores = dict() 71 | for item in group: 72 | features = np.asarray([x for x in item.features.split() if is_number(x)], dtype=float) 73 | try: 74 | scores[index] = np.dot(features, weights) 75 | except ValueError: 76 | L.error('Number of features in the nbest and the weights file are not the same') 77 | index += 1 78 | sorted_indices = sorted(scores, key=scores.get, reverse=True) 79 | for idx in sorted_indices: 80 | output_nbest.write(group[idx]) 81 | output_1best.write(group[sorted_indices[0]].hyp + "\n") 82 | counter += 1 83 | if counter % 100 == 0: 84 | L.info("%i groups processed" % (counter)) 85 | L.info("Finished processing %i groups" % (counter)) 86 | 87 | output_nbest.close() 88 | output_1best.close() 89 | 90 | if args.clean_up: 91 | os.remove(output_nbest_path) 92 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import time 5 | import argparse 6 | import dlm.utils as U 7 | import dlm.io.logging as L 8 | 9 | ############### 10 | ## Arguments 11 | # 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-t", "--test-file", dest="test_path", required=True, help="The evaluation file (memory-mapped, nbest list or text file)") 15 | parser.add_argument("-f", "--format", dest="format", required=True, help="The evaluation file format (mmap|nbest|text)") 16 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", help="The vocabulary file that was used in training") 17 | parser.add_argument("-m", "--model-file", dest="model_path", required=True, help="Input CoreLM model file") 18 | parser.add_argument("-ulp", "--unnormalized-log-prob-file", dest="ulp_path", help="Output file for sentence-level UNNORMALIZED log-probabilities") 19 | parser.add_argument("-nlp", "--normalized-log-prob-file", dest="nlp_path", help="Output file for sentence-level NORMALIZED log-probabilities") 20 | parser.add_argument("-ppl", "--perplexity", action='store_true', help="Compute and output normalized perplexity") 21 | parser.add_argument("-un", "--unnormalized", action='store_true', help="Compute and output unnormalized perplexity") 22 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu)") 23 | args = parser.parse_args() 24 | 25 | U.set_theano_device(args.device, 1) 26 | 27 | from dlm.models.mlp import MLP 28 | from dlm import eval 29 | import theano 30 | import theano.tensor as T 31 | 32 | ######################### 33 | ## Loading model 34 | # 35 | 36 | classifier = MLP(model_path=args.model_path) 37 | 38 | ######################### 39 | ## Loading dataset 40 | # 41 | 42 | U.xassert(args.format == "mmap" or args.format == "nbest" or args.format == "text", "Invalid file format given: " + args.format) 43 | U.xassert(args.perplexity or args.nlp_path or args.ulp_path, "You 
should use one of (or more) -ppl, -nlp or -ulp") 44 | 45 | if args.format == "mmap": 46 | U.xassert((args.nlp_path is None) and (args.ulp_path is None), "Cannot compute log-probabilities for an mmap file") 47 | from dlm.io.mmapReader import MemMapReader 48 | testset = MemMapReader(dataset_path=args.test_path, batch_size=500) 49 | else: 50 | U.xassert(args.vocab_path, "Vocab file is required for non-mmap file formats") 51 | from dlm.io.textReader import TextReader 52 | is_nbest = False 53 | if args.format == "nbest": 54 | is_nbest = True 55 | testset = TextReader(dataset_path=args.test_path, is_nbest=is_nbest, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path) 56 | 57 | ######################### 58 | ## Compiling theano function 59 | # 60 | 61 | evaluator = eval.Evaluator(testset, classifier) 62 | 63 | ######################### 64 | ## Testing 65 | # 66 | 67 | start_time = time.time() 68 | 69 | if args.perplexity: 70 | L.info("Perplexity: %f" % (evaluator.perplexity())) 71 | if args.unnormalized: 72 | L.info("Unnormalized Perplexity: %f" % (evaluator.unnormalized_perplexity())) 73 | 74 | if args.nlp_path: 75 | with open(args.nlp_path, 'w') as output: 76 | for i in xrange(testset.get_num_sentences()): 77 | output.write(str(evaluator.get_sequence_log_prob(i)) + '\n') 78 | 79 | if args.ulp_path: 80 | with open(args.ulp_path, 'w') as output: 81 | for i in xrange(testset.get_num_sentences()): 82 | output.write(str(evaluator.get_unnormalized_sequence_log_prob(i)) + '\n') 83 | 84 | L.info("Ran for %.2fs" % (time.time() - start_time)) 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /dlm/reranker/oracle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import imp 5 | try: 6 | import dlm 7 | except ImportError: 8 | print "[ERROR] dlm module not found. 
Add CoreLM root directory to your PYTHONPATH" 9 | sys.exit() 10 | 11 | import dlm.utils as U 12 | import dlm.io.logging as L 13 | import argparse 14 | from dlm.io.nbestReader import NBestList 15 | import dlm.reranker.bleu as B 16 | import codecs 17 | from multiprocessing import Pool 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("-i", "--input-file", dest="input_path", required=True, help="Input n-best file") 21 | parser.add_argument("-r", "--reference-files", dest="ref_paths", required=True, help="A comma-seperated list of reference files") 22 | parser.add_argument("-o", "--output-nbest-file", dest="out_nbest_path", help="Output oracle n-best file") 23 | parser.add_argument("-b", "--output-1best-file", dest="out_1best_path", required=True, help="Output oracle 1-best file") 24 | parser.add_argument("-s", "--output-scores", dest="out_scores_path", help="Output oracle scores file") 25 | parser.add_argument("-m", "--smoothing-method", dest="method", required=True, help="Smoothing method (none|epsilon|lin|nist|chen)") 26 | parser.add_argument("-t", "--threads", dest="threads", type=int, default=14, help="Number of threads") 27 | parser.add_argument("-q", "--quiet", dest="quiet", action='store_true', help="Nothing will be printed in STDERR") 28 | args = parser.parse_args() 29 | 30 | if args.quiet: 31 | L.quiet = True 32 | 33 | methods = { 34 | 'none' : B.no_smoothing, 35 | 'epsilon' : B.add_epsilon_smoothing, 36 | 'lin' : B.lin_smoothing, 37 | 'nist' : B.nist_smoothing, 38 | 'chen' : B.chen_smoothing 39 | } 40 | 41 | ref_path_list = args.ref_paths.split(',') 42 | 43 | input_nbest = NBestList(args.input_path, mode='r', reference_list=ref_path_list) 44 | if args.out_nbest_path: 45 | output_nbest = NBestList(args.out_nbest_path, mode='w') 46 | if args.out_scores_path: 47 | output_scores = open(args.out_scores_path, mode='w') 48 | output_1best = codecs.open(args.out_1best_path, mode='w', encoding='UTF-8') 49 | 50 | U.xassert(methods.has_key(args.method), "Invalid smoothing method: " + args.method) 51 | scorer = methods[args.method] 52 | 53 | L.info('Processing the n-best list') 54 | 55 | def process_group(group): 56 | index = 0 57 | scores = dict() 58 | for item in group: 59 | scores[index] = scorer(item.hyp, group.refs) 60 | index += 1 61 | return scores 62 | 63 | pool = Pool(args.threads) 64 | 65 | counter = 0 66 | group_counter = 0 67 | flag = True 68 | while (flag): 69 | group_list = [] 70 | for i in range(args.threads): 71 | try: 72 | group_list.append(input_nbest.next()) 73 | except StopIteration: 74 | flag = False 75 | if len(group_list) > 0: 76 | outputs = pool.map(process_group, group_list) 77 | for i in range(len(group_list)): 78 | scores = outputs[i] 79 | group = group_list[i] 80 | sorted_indices = sorted(scores, key=scores.get, reverse=True) 81 | if args.out_scores_path: 82 | for idx in scores: 83 | output_scores.write(str(group.group_index) + ' ' + str(idx) + ' ' + str(scores[idx]) + "\n") 84 | if args.out_nbest_path: 85 | for idx in sorted_indices: 86 | output_nbest.write(group[idx]) 87 | output_1best.write(group[sorted_indices[0]].hyp + "\n") 88 | counter += 1 89 | group_counter += len(group_list) 90 | if counter % 5 == 0: 91 | L.info("%i groups processed" % (group_counter)) 92 | L.info("Finished processing %i groups" % (group_counter)) 93 | 94 | if args.out_scores_path: 95 | output_scores.close() 96 | if args.out_nbest_path: 97 | output_nbest.close() 98 | output_1best.close() 99 | -------------------------------------------------------------------------------- 
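oracle.py above reduces to one simple operation per n-best group: score every hypothesis against the group's references with the selected smoothed-BLEU function and keep the argmax. A minimal single-threaded sketch of that per-group step, assuming a scorer with the same (hypothesis, references) signature as the smoothing functions in dlm/reranker/bleu.py; the helper name oracle_best is illustrative and not part of the package:

def oracle_best(group, scorer):
	# group: an n-best group whose items expose .hyp and which carries its
	# reference translations in group.refs (as produced by NBestList above)
	scores = dict()
	for index, item in enumerate(group):
		scores[index] = scorer(item.hyp, group.refs)
	best_index = max(scores, key=scores.get)  # hypothesis with the highest sentence-level BLEU
	return best_index, scores
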
/dlm/misc/nplm_to_corelm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import argparse 5 | import os 6 | import dlm.utils as U 7 | import dlm.io.logging as L 8 | 9 | 10 | def convert_type(param): 11 | return np.float32(param) 12 | 13 | 14 | 15 | # Arguments for this script 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-m", "--nplm-model", dest="nplm_model", required=True, help="The input NPLM model file") 18 | parser.add_argument("-dir", "--directory", dest="out_dir", help="The output directory for log file, model, etc.") 19 | 20 | args = parser.parse_args() 21 | 22 | U.set_theano_device('cpu',1) 23 | from dlm.models.mlp import MLP 24 | 25 | 26 | if args.out_dir is None: 27 | args.out_dir = 'nplm_convert-' + U.curr_time() 28 | U.mkdir_p(args.out_dir) 29 | 30 | 31 | # Reading the NPLM Model 32 | args_nn = argparse.Namespace() 33 | model_dict = dict() 34 | lines = [] 35 | req_attribs = ['\config','\\vocab', '\input_vocab', '\output_vocab', '\input_embeddings', '\hidden_weights 1', '\hidden_biases 1', '\hidden_weights 2', '\hidden_biases 2', '\output_weights', '\output_biases','\end'] 36 | attrib = '' 37 | 38 | with open(args.nplm_model,'r') as f_model: 39 | for line in f_model: 40 | line = line.strip() 41 | if(line in req_attribs): 42 | if attrib != '': 43 | model_dict[attrib] = lines 44 | attrib = line 45 | lines = [] 46 | elif(line): 47 | lines.append(line) 48 | else: 49 | continue; 50 | 51 | 52 | # Storing the config parameters of the NPLM model 53 | config_dict = dict() 54 | for config_line in model_dict['\config']: 55 | config_arg,value = config_line.split() 56 | config_dict[config_arg] = value 57 | 58 | 59 | # Setting the args for the classifier 60 | args_nn.emb_dim = int(config_dict['input_embedding_dimension']) 61 | args_nn.num_hidden = config_dict['num_hidden'] + ',' + config_dict['output_embedding_dimension'] 62 | args_nn.vocab_size = int(config_dict['input_vocab_size']) 63 | args_nn.ngram_size = int(config_dict['ngram_size']) 64 | args_nn.num_classes = int(config_dict['output_vocab_size']) 65 | 66 | act_func = config_dict['activation_function'] 67 | if act_func == 'rectifier': 68 | act_func = 'relu' 69 | 70 | args_nn.activation_name = act_func 71 | 72 | # Creating the classifier with the arguments read 73 | L.info("Creating CoreLM model") 74 | classifier = MLP(args_nn) 75 | 76 | 77 | # Loading matrices 78 | embeddings = np.loadtxt(model_dict['\input_embeddings']) 79 | W1 = np.loadtxt(model_dict['\hidden_weights 1']) 80 | W1 = np.transpose(W1) 81 | b1 = np.loadtxt(model_dict['\hidden_biases 1']) 82 | W2 = np.loadtxt(model_dict['\hidden_weights 2']) 83 | W2 = np.transpose(W2) 84 | b2 = np.loadtxt(model_dict['\hidden_biases 2']) 85 | W3 = np.loadtxt(model_dict['\output_weights']) 86 | W3 = np.transpose(W3) 87 | b3 = np.loadtxt(model_dict['\output_biases']) 88 | params_nn =[embeddings, W1, b1, W2, b2, W3, b3] 89 | 90 | #Type Conversion 91 | params_nn = [convert_type(param) for param in params_nn] 92 | 93 | # Setting the classifier parameters 94 | classifier.set_params(params_nn) 95 | 96 | #Debugging 97 | #print [np.array_equal(x.get_value(),y) for x,y in zip(classifier.params,params_nn)] 98 | 99 | # Saving the vocab file 100 | vocab_file = args.out_dir + "/vocab" 101 | if '\input_vocab' in model_dict: 102 | with open(vocab_file,'w') as f_vocab: 103 | for word in model_dict['\input_vocab']: 104 | f_vocab.write(word+'\n') 105 | 106 | 107 | # Saving the CoreLM model 108 | 
model_file = args.out_dir + "/" + os.path.basename(args.nplm_model) + ".corelm" 109 | L.info("Saving CoreLM model: " + model_file) 110 | classifier.save_model(model_file) 111 | 112 | -------------------------------------------------------------------------------- /dlm/eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import sys 3 | import time 4 | from theano import * 5 | import theano.tensor as T 6 | from dlm.io.mmapReader import MemMapReader 7 | from dlm.models.mlp import MLP 8 | import dlm.utils as U 9 | import math 10 | import numpy as np 11 | 12 | class Evaluator(): 13 | 14 | def __init__(self, dataset, classifier): 15 | 16 | index = T.lscalar() 17 | x = classifier.input 18 | y = T.ivector('y') 19 | 20 | if dataset: 21 | self.dataset = dataset # Initializing the dataset 22 | self.num_batches = self.dataset.get_num_batches() # Number of minibatches in the dataset 23 | self.num_samples = self.dataset._get_num_samples() # Number of samples in the dataset 24 | 25 | self.neg_sum_batch_log_likelihood = theano.function( 26 | inputs=[index], 27 | outputs=-T.sum(T.log(classifier.p_y_given_x(y))), 28 | givens={ 29 | x: self.dataset.get_x(index), 30 | y: self.dataset.get_y(index) 31 | } 32 | ) 33 | 34 | self.unnormalized_neg_sum_batch_log_likelihood = theano.function( 35 | inputs=[index], 36 | outputs=-T.sum(classifier.unnormalized_p_y_given_x(y)), # which is: -T.sum(T.log(T.exp(classifier.unnormalized_p_y_given_x(y)))) 37 | givens={ 38 | x: self.dataset.get_x(index), 39 | y: self.dataset.get_y(index) 40 | } 41 | ) 42 | 43 | self.sum_batch_error = theano.function( 44 | inputs=[index], 45 | outputs=classifier.errors(y), 46 | givens={ 47 | x: self.dataset.get_x(index), 48 | y: self.dataset.get_y(index) 49 | } 50 | ) 51 | 52 | # x: A matrix (N * (ngram - 1)) representing the sequence of length N 53 | # y: A vector of class labels 54 | self.neg_sequence_log_prob = self.neg_sum_batch_log_likelihood 55 | 56 | self.denominator = theano.function( 57 | inputs=[index], 58 | outputs=classifier.log_Z_sqr, 59 | givens={ 60 | x: self.dataset.get_x(index) 61 | } 62 | ) 63 | 64 | self.get_p_matrix = theano.function( 65 | inputs=[index], 66 | outputs=classifier.p_y_given_x_matrix, 67 | givens={ 68 | x:self.dataset.get_x(index) 69 | } 70 | ) 71 | self.get_y_pred = theano.function( 72 | inputs=[index], 73 | outputs=classifier.y_pred, 74 | givens={ 75 | x:self.dataset.get_x(index) 76 | } 77 | ) 78 | # End of if 79 | 80 | self.ngram_log_prob = theano.function( 81 | inputs=[x, y], 82 | outputs=T.log(classifier.p_y_given_x(y)), 83 | ) 84 | 85 | 86 | def classification_error(self): 87 | return np.sum([self.sum_batch_error(i) for i in xrange(self.num_batches)]) / self.num_samples 88 | 89 | def mean_neg_log_likelihood(self): 90 | return math.fsum([self.neg_sum_batch_log_likelihood(i) for i in xrange(self.num_batches)]) / self.num_samples # np.sum() has some precision problems here 91 | 92 | def mean_unnormalized_neg_log_likelihood(self): 93 | return math.fsum([self.unnormalized_neg_sum_batch_log_likelihood(i) for i in xrange(self.num_batches)]) / self.num_samples # np.sum() has some precision problems here 94 | 95 | def perplexity(self): 96 | return math.exp(self.mean_neg_log_likelihood()) 97 | 98 | def unnormalized_perplexity(self): 99 | return math.exp(self.mean_unnormalized_neg_log_likelihood()) 100 | 101 | def get_sequence_log_prob(self, index): 102 | return - self.neg_sequence_log_prob(index) 103 | 104 | def 
get_unnormalized_sequence_log_prob(self, index): 105 | return - self.unnormalized_neg_sum_batch_log_likelihood(index) 106 | 107 | def get_ngram_log_prob(self, x, y): 108 | return self.ngram_log_prob(x, y) 109 | 110 | def get_denominator(self): 111 | return np.mean([self.denominator(i) for i in xrange(self.num_batches)]) 112 | 113 | def get_class(self, index, restricted_ids = []): 114 | if restricted_ids != []: 115 | return restricted_ids[np.argmax(self.get_p_matrix(index)[:,restricted_ids])] 116 | else: 117 | return self.get_y_pred(index)[0] 118 | -------------------------------------------------------------------------------- /dlm/trainer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import theano 3 | import theano.tensor as T 4 | from dlm import eval 5 | import dlm.utils as U 6 | import dlm.io.logging as L 7 | from dlm.algorithms.lr_tuner import LRTuner 8 | import time 9 | import numpy as np 10 | import sys 11 | import time 12 | 13 | 14 | def train(classifier, criterion, args, trainset, devset, testset=None): 15 | if args.algorithm == "sgd": 16 | from dlm.algorithms.sgd import SGD as Trainer 17 | else: 18 | L.error("Invalid training algorithm: " + args.algorithm) 19 | 20 | # Get number of minibatches from the training file 21 | num_train_batches = trainset.get_num_batches() 22 | 23 | # Initialize the trainer object 24 | trainer = Trainer(classifier, criterion, args.learning_rate, trainset, clip_threshold=args.clip_threshold) 25 | 26 | # Initialize the Learning Rate tuner, which adjusts learning rate based on the development/validation file 27 | lr_tuner = LRTuner(low=0.01*args.learning_rate, high=10*args.learning_rate, inc=0.01*args.learning_rate) 28 | validation_frequency = 5000 # minibatches 29 | 30 | # Logging and statistics 31 | total_num_iter = args.num_epochs * num_train_batches 32 | hook = Hook(classifier, devset, testset, total_num_iter, args.out_dir) 33 | L.info('Training') 34 | start_time = time.time() 35 | verbose_freq = 1000 # minibatches 36 | epoch = 0 37 | 38 | hook.evaluate(0) 39 | 40 | a = time.time() 41 | classifier.save_model(args.out_dir + '/model.epoch_0.gz', zipped=True) 42 | 43 | while (epoch < args.num_epochs): 44 | epoch = epoch + 1 45 | L.info("Epoch: " + U.red(epoch)) 46 | 47 | minibatch_avg_cost_sum = 0 48 | for minibatch_index in xrange(num_train_batches): 49 | # Makes an update of the paramters after processing the minibatch 50 | minibatch_avg_cost, gparams = trainer.step(minibatch_index) 51 | minibatch_avg_cost_sum += minibatch_avg_cost 52 | 53 | if minibatch_index % verbose_freq == 0: 54 | grad_norms = [np.linalg.norm(gparam) for gparam in gparams] 55 | L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f' 56 | % (minibatch_index, num_train_batches, minibatch_avg_cost_sum/(minibatch_index+1), trainer.get_learning_rate())) 57 | L.info('Grad Norms: [' + ', '.join(['%.6f' % gnorm for gnorm in grad_norms]) + ']') 58 | curr_iter = (epoch - 1) * num_train_batches + minibatch_index 59 | if curr_iter > 0 and curr_iter % validation_frequency == 0: 60 | hook.evaluate(curr_iter) 61 | 62 | L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f' 63 | % (num_train_batches, num_train_batches, minibatch_avg_cost_sum/num_train_batches, trainer.get_learning_rate())) 64 | dev_ppl = hook.evaluate(curr_iter) 65 | lr = trainer.get_learning_rate() 66 | if args.enable_lr_adjust: 67 | lr = lr_tuner.adapt_lr(dev_ppl, lr) 68 | trainer.set_learning_rate(lr) 69 | 
classifier.save_model(args.out_dir + '/model.epoch_' + str(epoch) + '.gz', zipped=True) 70 | 71 | end_time = time.time() 72 | hook.evaluate(total_num_iter) 73 | L.info('Optimization complete') 74 | L.info('Ran for %.2fm' % ((end_time - start_time) / 60.)) 75 | 76 | 77 | class Hook: 78 | def __init__(self, classifier, devset, testset, total_num_iter, out_dir): 79 | self.classifier = classifier 80 | self.dev_eval = eval.Evaluator(dataset=devset, classifier=classifier) 81 | self.test_eval = None 82 | if testset: 83 | self.test_eval = eval.Evaluator(dataset=testset, classifier=classifier) 84 | self.best_iter = 0 85 | self.best_dev_perplexity = np.inf 86 | self.best_test_perplexity = np.inf 87 | self.t0 = time.time() 88 | self.total_num_iter = total_num_iter 89 | self.out_dir = out_dir 90 | 91 | def evaluate(self, curr_iter): 92 | denominator = self.dev_eval.get_denominator() 93 | dev_error = self.dev_eval.classification_error() 94 | dev_perplexity = self.dev_eval.perplexity() 95 | if self.test_eval: 96 | test_error = self.test_eval.classification_error() 97 | test_perplexity = self.test_eval.perplexity() 98 | 99 | if dev_perplexity < self.best_dev_perplexity: 100 | self.best_dev_perplexity = dev_perplexity 101 | self.best_iter = curr_iter 102 | if self.test_eval: 103 | self.best_test_perplexity = test_perplexity 104 | 105 | if curr_iter > 0: 106 | t1 = time.time() 107 | rem_time = int((self.total_num_iter - curr_iter) * (t1 - self.t0) / (curr_iter * 60)) 108 | rem_time = str(rem_time) + "m" 109 | else: 110 | rem_time = "" 111 | 112 | L.info(('DEV => Error=%.2f%%, PPL=' + U.b_yellow('%.2f @ %i') + ' (' + U.b_red('%.2f @ %i') + '), Denom=%.3f, %s') 113 | % (dev_error * 100., dev_perplexity, curr_iter, self.best_dev_perplexity, self.best_iter, denominator, rem_time)) 114 | if self.test_eval: 115 | L.info(('TEST => Error=%.2f%%, PPL=' + U.b_yellow('%.2f @ %i') + ' (' + U.b_red('%.2f @ %i') + ')') 116 | % (test_error * 100., test_perplexity, curr_iter, self.best_test_perplexity, self.best_iter)) 117 | 118 | return dev_perplexity 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # CoreLM
2 | 
3 | CoreLM is a flexible and reusable feed-forward neural network that can be used to train neural language models and joint models (Devlin et al., 2014) and to interface with popular SMT systems like [MOSES](http://www.statmt.org/moses/). It is implemented in Python using [Theano](http://deeplearning.net/software/theano/), which makes it easy to use and modify.
4 | 
5 | ## Features
6 | 
7 | * Implementation of the self-normalized log-likelihood (Devlin et al., 2014) and noise contrastive estimation (NCE) loss functions, to train fast neural language models.
8 | * Decoder integration with MOSES using its NeuralLM and BilingualLM feature functions, as well as rescoring of MOSES n-best lists using neural language models.
9 | * Efficient and optimized implementation using Theano, with GPU support for faster training and decoding.
10 | * A flexible network architecture: multiple hidden layers, various activation functions, multiple sets of features with different embeddings, etc.
11 | * Flexible training, with layer-specific and adjustable learning rates, different cost functions (log-likelihood and NCE), and L1/L2 regularization.
12 | * Preprocessing scripts for monolingual and bilingual language modeling.
13 | 
14 | ## Getting Started
15 | 
16 | ### Prerequisites
17 | * Python 2.7
18 | * Theano (see the [installation instructions](http://deeplearning.net/software/theano/install_ubuntu.html)) with CUDA support (to use the GPU)
19 | 
20 | ### Installation
21 | 1. Download and unzip the CoreLM package on your local machine. Alternatively, clone it using Git:
22 | ```
23 | git clone https://github.com/nusnlp/corelm /path/to/corelm
24 | ```
25 | 
26 | 2. Add the CoreLM directory to the PYTHONPATH environment variable. For bash users, add the following line to ~/.bashrc:
27 | ```
28 | export PYTHONPATH="${PYTHONPATH}:/path/to/corelm/"
29 | ```
30 | 
31 | ## Using CoreLM
32 | 
33 | 
34 | ### Preprocessing
35 | 
36 | The preprocessing scripts can be found in the [dlm/preprocess/](dlm/preprocess) directory. The following scripts are available; for detailed help, run the desired script with the `--help` option.
37 | 
38 | * **[monolingual.py](dlm/preprocess/monolingual.py)** : Preprocesses a text file for monolingual language modeling. The text file must contain one sentence per line.
39 | 
40 | * **[bilingual.py](dlm/preprocess/bilingual.py)** : Preprocesses sentence-aligned parallel corpora for bilingual language modeling.
41 | 
42 | * **[features.py](dlm/preprocess/features.py)** : Used for sequence labeling tasks. The input text file must have one sentence per line, and one feature per word is accepted. An example is shown below:
43 | ```
44 | word1_feature1 word2_feature2 word3_feature3 ... wordN_featureN
45 | ```
46 | * **[convert_to_memmap.py](dlm/preprocess/convert_to_memmap.py)** : Converts custom input to the memory-mapped format. The input must be a text file, with each line representing a training instance. The words or features must be replaced by their indices according to the supplied vocabulary file. The format is shown below:
47 | ```
48 | word_index_11 word_index_12 ... word_index_1M output_word_index_1
49 | ...
50 | ...
51 | word_index_N1 word_index_N2 ... word_index_NM output_word_index_N
52 | ```
53 | where M is the number of input words and N is the number of training instances.
54 | 
55 | 
56 | ### Training
57 | Training the neural network is done using the [train.py](train.py) script. The script takes a memory-mapped file generated by the preprocessing scripts. Use `--help` for the detailed list of options.
58 | 
59 | 
60 | ### Testing
61 | Evaluation of the neural network is done using the [test.py](test.py) script. It prints the perplexity and log-likelihood of the model on the test set, and can optionally output the predicted labels. To predict labels for custom test instances, use the [classify.py](classify.py) script. See `--help` for each script.
62 | 
63 | ### Integration with Moses
64 | Language and joint models trained with CoreLM can be integrated with Moses in two ways: re-ranking of n-best hypotheses and decoder integration.
65 | * **Re-ranking** : To re-rank SMT n-best lists (in Moses format) using CoreLM models, the weight of the new feature must first be tuned using [dlm/reranker/train.py](dlm/reranker/train.py). This can be done with MERT or PRO, selected via command-line options. After tuning the weights, re-ranking is performed with [dlm/reranker/rerank.py](dlm/reranker/rerank.py). Refer to `--help` of these scripts for the list of options.
66 | 67 | * **Decoder Integration** : Currently, CoreLM uses the NPLM interface to Moses for integration. CoreLM models can be converted to NPLM format using [corelm_to_nplm.py](dlm/misc/corelm_to_nplm.py) script. This can be integrated using `NeuralLM` and `BilingualLM` feature functions in Moses (See [Moses documentation](http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel)). 68 | 69 | 70 | 71 | 72 | ## License 73 | This project is licensed under the MIT license - see the [LICENSE.md](LICENSE.md) file for details 74 | 75 | -------------------------------------------------------------------------------- /dlm/models/mlp.py: -------------------------------------------------------------------------------- 1 | from dlm.models.components.lookuptable import LookupTable 2 | from dlm.models.components.linear import Linear 3 | from dlm.models.components.activation import Activation 4 | from dlm.models import classifier 5 | import dlm.utils as U 6 | import dlm.io.logging as L 7 | import theano.tensor as T 8 | import theano 9 | import numpy 10 | import math 11 | 12 | class MLP(classifier.Classifier): 13 | 14 | def __init__(self, args=None, model_path=None): 15 | 16 | ###################################################################### 17 | ## Parameters 18 | # 19 | 20 | U.xassert((args or model_path) and not (args and model_path), "args or model_path are mutually exclusive") 21 | 22 | if model_path: 23 | args, loaded_params = self.load_model(model_path) 24 | 25 | emb_dim = args.emb_dim 26 | num_hidden_list = map(int, args.num_hidden.split(',')) 27 | if num_hidden_list[0] <= 0: 28 | num_hidden_list = [] 29 | 30 | vocab_size = args.vocab_size 31 | self.ngram_size = args.ngram_size 32 | num_classes = args.num_classes 33 | activation_name = args.activation_name 34 | self.args = args 35 | self.L1 = 0 36 | self.L2_sqr = 0 37 | self.params = [] 38 | 39 | emb_path, vocab = None, None 40 | try: 41 | emb_path = args.emb_path 42 | vocab = args.vocab 43 | except AttributeError: 44 | pass 45 | 46 | rng = numpy.random.RandomState(1234) 47 | self.input = T.imatrix('input') 48 | 49 | ###################################################################### 50 | ## Lookup Table Layer 51 | # 52 | 53 | lookupTableLayer = LookupTable( 54 | rng=rng, 55 | input=self.input, 56 | vocab_size=vocab_size, 57 | emb_dim=emb_dim, 58 | emb_path=emb_path, 59 | vocab_path=vocab, 60 | add_weights=args.weighted_emb 61 | ) 62 | last_layer_output = lookupTableLayer.output 63 | last_layer_output_size = (self.ngram_size - 1) * emb_dim 64 | self.params += lookupTableLayer.params 65 | 66 | ###################################################################### 67 | ## Hidden Layer(s) 68 | # 69 | 70 | for i in range(0, len(num_hidden_list)): 71 | linearLayer = Linear( 72 | rng=rng, 73 | input=last_layer_output, 74 | n_in=last_layer_output_size, 75 | n_out=num_hidden_list[i], 76 | suffix=i 77 | ) 78 | last_layer_output = linearLayer.output 79 | last_layer_output_size = num_hidden_list[i] 80 | self.params += linearLayer.params 81 | 82 | activation = Activation( 83 | input=last_layer_output, 84 | func_name=activation_name 85 | ) 86 | last_layer_output = activation.output 87 | 88 | self.L1 = self.L1 + abs(linearLayer.W).sum() 89 | self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum() 90 | 91 | ###################################################################### 92 | ## Output Linear Layer 93 | # 94 | 95 | linearLayer = Linear( 96 | rng=rng, 97 | input=last_layer_output, 98 | n_in=last_layer_output_size, 99 | 
n_out=num_classes, 100 | #b_values = numpy.zeros(num_classes) - math.log(num_classes) 101 | b_values = numpy.full(shape=(num_classes),fill_value=(-math.log(num_classes)),dtype=theano.config.floatX), 102 | suffix='out' 103 | ) 104 | last_layer_output = linearLayer.output 105 | self.params += linearLayer.params 106 | 107 | self.L1 = self.L1 + abs(linearLayer.W).sum() 108 | self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum() 109 | 110 | ###################################################################### 111 | ## Model Output 112 | # 113 | 114 | self.output = last_layer_output 115 | self.p_y_given_x_matrix = T.nnet.softmax(last_layer_output) 116 | 117 | # Log Softmax 118 | last_layer_output_shifted = last_layer_output - last_layer_output.max(axis=1, keepdims=True) 119 | self.log_p_y_given_x_matrix = last_layer_output_shifted - T.log(T.sum(T.exp(last_layer_output_shifted),axis=1,keepdims=True)) 120 | 121 | 122 | #self.log_Z_sqr = T.log(T.mean(T.sum(T.exp(last_layer_output), axis=1))) ** 2 123 | #self.log_Z_sqr = T.sum(T.log(T.sum(T.exp(last_layer_output), axis=1))) ** 2 124 | self.log_Z_sqr = T.mean(T.log(T.sum(T.exp(last_layer_output), axis=1)) ** 2) 125 | 126 | ###################################################################### 127 | ## Model Predictions 128 | 129 | self.y_pred = T.argmax(self.p_y_given_x_matrix, axis=1) 130 | 131 | ###################################################################### 132 | ## Loading parameters from file (if given) 133 | # 134 | 135 | if model_path: 136 | self.set_params(loaded_params) 137 | 138 | ###################################################################### 139 | ## Model Functions 140 | # 141 | 142 | def p_y_given_x(self, y): 143 | return self.p_y_given_x_matrix[T.arange(y.shape[0]), y] 144 | 145 | def log_p_y_given_x(self, y): 146 | return self.log_p_y_given_x_matrix[T.arange(y.shape[0]), y] 147 | 148 | def unnormalized_p_y_given_x(self, y): 149 | return self.output[T.arange(y.shape[0]), y] 150 | 151 | def negative_log_likelihood(self, y, weights=None): 152 | if weights: 153 | return -T.sum(T.log(self.p_y_given_x(y)) * weights) / T.sum(weights) 154 | else: 155 | #return -T.mean( T.log(self.p_y_given_x(y))) # Unstable : can lead to NaN 156 | return -T.mean(self.log_p_y_given_x(y)) # Stable Version 157 | 158 | def errors(self, y): 159 | if y.ndim != self.y_pred.ndim: 160 | raise TypeError('y should have the same shape as self.y_pred', ('y', y.type, 'y_pred', self.y_pred.type)) 161 | if y.dtype.startswith('int'): 162 | return T.sum(T.neq(self.y_pred, y)) 163 | else: 164 | raise NotImplementedError() 165 | 166 | -------------------------------------------------------------------------------- /dlm/misc/corelm_to_nplm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import argparse 5 | import os 6 | import dlm.utils as U 7 | import dlm.io.logging as L 8 | 9 | 10 | def write_matrix(f, matrix): 11 | for row in matrix: 12 | f.write(str(row[0])) 13 | for val in row[1:]: 14 | f.write("\t"+str(val)) 15 | f.write("\n") 16 | 17 | def write_biases(f, biases): 18 | for bias in biases: 19 | f.write(str(bias) + "\n") 20 | 21 | # Arguments for this script 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("-m", "--corelm-model", dest="corelm_model", required=True, help="The input NPLM model file") 24 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", required=True, help="The input vocabulary") 25 | parser.add_argument("-dir", 
"--directory", dest="out_dir", help="The output directory for log file, model, etc.") 26 | 27 | args = parser.parse_args() 28 | 29 | U.set_theano_device('cpu',1) 30 | from dlm.models.mlp import MLP 31 | 32 | if args.out_dir is None: 33 | args.out_dir = 'corelm_convert-' + U.curr_time() 34 | U.mkdir_p(args.out_dir) 35 | 36 | # Loading CoreLM model and creating classifier class 37 | L.info("Loading CoreLM model") 38 | classifier = MLP(model_path=args.corelm_model) 39 | args_nn = classifier.args 40 | params_nn = classifier.params 41 | U.xassert(len(params_nn)==7, "CoreLM model is not compatible with NPLM architecture. 2 hidden layers and an output linear layer is required.") 42 | 43 | embeddings = params_nn[0].get_value() 44 | W1 = params_nn[1].get_value() 45 | W1 = np.transpose(W1) 46 | b1 = params_nn[2].get_value() 47 | W2 = params_nn[3].get_value() 48 | W2 = np.transpose(W2) 49 | b2 = params_nn[4].get_value() 50 | W3 = params_nn[5].get_value() 51 | W3 = np.transpose(W3) 52 | b3 = params_nn[6].get_value() 53 | 54 | 55 | # Storing vocabulary into an array 56 | has_null = False 57 | has_sentence_end = False 58 | vocab_list = [] 59 | with open(args.vocab_path,'r') as f_vocab: 60 | for word in f_vocab: 61 | word = word.strip() 62 | vocab_list.append(word) 63 | if word == "": 64 | has_null = True 65 | if word == "": 66 | has_sentence_end = True 67 | 68 | U.xassert(has_sentence_end, "End-of-sentence marker () has to be present in CoreLM model.") 69 | 70 | # adding null if it is not present 71 | if has_null == False: 72 | vocab_list.append("") 73 | 74 | 75 | # Writing to NPLM model 76 | model_file = args.out_dir + "/" + os.path.basename(args.corelm_model) + ".nplm" 77 | L.info("Writing NPLM Model: " + model_file) 78 | with open(model_file,'w') as f_model: 79 | 80 | # Writing the config parameters for the NPLM model 81 | f_model.write("\config\n") 82 | f_model.write("version 1\n") 83 | f_model.write("ngram_size " + str(args_nn.ngram_size) + "\n") 84 | if has_null == True: 85 | f_model.write("input_vocab_size " + str(args_nn.vocab_size)+"\n") 86 | else: 87 | f_model.write("input_vocab_size " + str(args_nn.vocab_size + 1)+"\n") # +1 is used to add the token which is not in corelm 88 | if has_null == True: 89 | f_model.write("output_vocab_size " + str(args_nn.num_classes)+"\n") 90 | else: 91 | f_model.write("output_vocab_size " + str(args_nn.num_classes + 1)+"\n") 92 | f_model.write("input_embedding_dimension " + str(args_nn.emb_dim) + "\n") 93 | f_model.write("num_hidden " + args_nn.num_hidden.split(',')[0] + "\n") 94 | f_model.write("output_embedding_dimension " + args_nn.num_hidden.split(',')[1] + "\n") 95 | 96 | act_func = args_nn.activation_name 97 | U.xassert(act_func in ['relu','tanh','hardtanh'], "Invalid activation function: " + act_func + " (NPLM supports relu, tanh and hardtanh)") 98 | if act_func == "relu": 99 | act_func = "rectifier" 100 | f_model.write("activation_function " + act_func + "\n") 101 | 102 | f_model.write("\n") 103 | 104 | # Writing the input vocabulary 105 | f_model.write("\input_vocab\n") 106 | for word in vocab_list: 107 | f_model.write(word+"\n") 108 | 109 | f_model.write("\n") 110 | 111 | # Writing the output vocabulary ( Currently it is same as input vocabulary) 112 | f_model.write("\output_vocab\n") 113 | for word in vocab_list: 114 | f_model.write(word+"\n") 115 | 116 | f_model.write("\n") 117 | 118 | np.set_printoptions(precision=8, suppress=True) 119 | rng = np.random.RandomState(1234) 120 | 121 | # Writing the input embeddings 122 | 
f_model.write("\input_embeddings\n") 123 | if has_null == False: 124 | null_row = np.asarray(rng.uniform(low=-0.01, high=0.01, size=(1,embeddings.shape[1])), dtype=embeddings.dtype) 125 | embeddings = np.append(embeddings, null_row, axis=0) 126 | write_matrix(f_model, embeddings) 127 | 128 | f_model.write("\n") 129 | 130 | # Writing the hidden layer weights and biases 131 | f_model.write("\hidden_weights 1\n") 132 | write_matrix(f_model, W1) 133 | 134 | f_model.write("\n") 135 | f_model.write("\hidden_biases 1\n") 136 | write_biases(f_model, b1) 137 | 138 | f_model.write("\n") 139 | f_model.write("\hidden_weights 2\n") 140 | write_matrix(f_model, W2) 141 | 142 | f_model.write("\n") 143 | f_model.write("\hidden_biases 2\n") 144 | write_biases(f_model, b2) 145 | 146 | f_model.write("\n") 147 | 148 | # Writing the output linear layer and biases 149 | f_model.write("\output_weights\n") 150 | if has_null == False: 151 | null_row = np.asarray(rng.uniform(low=-0.01, high=0.01, size=(1,W3.shape[1])), dtype=W3.dtype) 152 | W3 = np.append(W3, null_row, axis=0) 153 | write_matrix(f_model, W3) 154 | 155 | f_model.write("\n") 156 | f_model.write("\output_biases\n") 157 | write_biases(f_model, b3) 158 | if has_null == False: 159 | f_model.write("0.0\n") 160 | f_model.write("\n") 161 | 162 | f_model.write("\end\n") 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /dlm/reranker/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import shutil 6 | import imp 7 | try: 8 | import dlm 9 | except ImportError: 10 | print "[ERROR] dlm module not found. Add CoreLM root directory to your PYTHONPATH" 11 | sys.exit() 12 | 13 | import dlm.utils as U 14 | import dlm.io.logging as L 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-i", "--input-nbest", dest="input_nbest", required=True, help="Input n-best file") 19 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", required=True, help="The vocabulary file that was used in training") 20 | parser.add_argument("-m", "--model-file", dest="model_path", required=True, help="Input CoreLM model file") 21 | parser.add_argument("-r", "--reference-files", dest="ref_paths", required=True, help="A comma-seperated list of reference files") 22 | parser.add_argument("-c", "--config", dest="input_config", required=True, help="Input moses config (ini) file") 23 | parser.add_argument("-o", "--output-dir", dest="out_dir", required=True, help="Output directory") 24 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu)") 25 | parser.add_argument("-t", "--threads", dest="threads", default = 14, type=int, help="Number of MERT threads") 26 | parser.add_argument("-iv", "--init-value", dest="init_value", default = '0.05', help="The initial value of the feature") 27 | parser.add_argument("-n", "--no-aug", dest="no_aug", action='store_true', help="Augmentation will be skipped, if this flag is set") 28 | parser.add_argument("-a", "--tuning-algorithm", dest="alg", default = 'mert', help="Tuning Algorithm (mert|pro|wpro)") 29 | parser.add_argument("-w", "--instance-weights", dest="instance_weights_path", help="Instance weights for wpro algorithm") 30 | parser.add_argument("-s", "--predictable-seed", dest="pred_seed", action='store_true', help="Tune with predictable seed to avoid randomness") 31 | args = 
parser.parse_args() 32 | 33 | U.set_theano_device(args.device) 34 | 35 | from dlm.reranker import augmenter 36 | from dlm.reranker import mosesIniReader as iniReader 37 | 38 | if os.environ.has_key('MOSES_ROOT'): 39 | moses_root = os.environ['MOSES_ROOT'] 40 | else: 41 | L.error("Set MOSES_ROOT variable to your moses root directory") 42 | 43 | U.mkdir_p(args.out_dir) 44 | 45 | #cmd = moses_root + '/bin/moses -show-weights -f ' + args.input_config + ' 2> /dev/null' 46 | #features = U.capture(cmd).strip().split('\n') 47 | features = iniReader.parseIni(args.input_config) 48 | 49 | output_nbest_path = args.out_dir + '/augmented.nbest' 50 | 51 | if args.no_aug: 52 | shutil.copy(args.input_nbest, output_nbest_path) 53 | else: 54 | augmenter.augment(args.model_path, args.input_nbest, args.vocab_path, output_nbest_path) 55 | 56 | L.info('Extracting stats and features') 57 | #L.warning('The optional arguments of extractor are not used yet') 58 | cmd = moses_root + '/bin/extractor -r ' + args.ref_paths + ' -n ' + output_nbest_path + ' --scfile ' + args.out_dir + '/statscore.data --ffile ' + args.out_dir + '/features.data' 59 | U.capture(cmd) 60 | 61 | with open(args.out_dir + '/init.opt', 'w') as init_opt: 62 | init_list = [] 63 | for line in features: 64 | tokens = line.split(" ") 65 | try: 66 | float(tokens[1]) 67 | init_list += tokens[1:] 68 | except ValueError: 69 | pass 70 | if not args.no_aug: 71 | init_list.append(args.init_value) 72 | dim = len(init_list) 73 | init_opt.write(' '.join(init_list) + '\n') 74 | init_opt.write(' '.join(['0' for i in range(dim)]) + '\n') 75 | init_opt.write(' '.join(['1' for i in range(dim)]) + '\n') 76 | 77 | seed_arg = '' 78 | if args.pred_seed: 79 | seed_arg = ' -r 1234 ' 80 | 81 | if (args.alg == 'pro' or args.alg == 'wpro'): 82 | # PRO 83 | if args.alg == 'pro': 84 | L.info("Running PRO") 85 | cmd = moses_root + '/bin/pro' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir +'/pro.data' + seed_arg 86 | else: 87 | L.info("Running WEIGHTED PRO") 88 | U.xassert(args.instance_weights_path, 'Instance weights are not given to wpro') 89 | cmd = moses_root + '/bin/proWeighted' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir +'/pro.data' + seed_arg + ' -w ' + args.instance_weights_path 90 | U.capture(cmd) 91 | cmd = moses_root + '/bin/megam_i686.opt -fvals -maxi 30 -nobias binary ' + args.out_dir + '/pro.data' 92 | pro_weights = U.capture(cmd) 93 | 94 | pro_weights_arr = pro_weights.strip().split('\n') 95 | weights_dict = dict() 96 | sum = 0.0 97 | highest_feature_index = 0 98 | 99 | for elem in pro_weights_arr: 100 | feature_index,weight = elem[1:].split() 101 | feature_index = int(feature_index) 102 | weight = float(weight) 103 | weights_dict[feature_index] = weight 104 | sum = sum + weight 105 | if feature_index >= highest_feature_index: 106 | highest_feature_index = feature_index 107 | 108 | # Write normalized weights to the file 109 | f_weights = open('weights.txt','w') 110 | for feature_index in xrange(highest_feature_index+1): 111 | weight = weights_dict[feature_index] 112 | f_weights.write(str(weight/sum) + ' '); 113 | #f_weights.write(str(weight) + ' '); 114 | elif (args.alg == 'mert'): 115 | # MERT 116 | #L.warning('The optional arguments of mert are not used yet') 117 | L.info('Running MERT') 118 | cmd = moses_root + '/bin/mert -d ' + str(dim) + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data --ifile ' + args.out_dir + 
'/init.opt --threads ' + str(args.threads) + seed_arg 119 | U.capture(cmd) 120 | else: 121 | L.error('Invalid tuning algorithm: ' + args.alg) 122 | 123 | U.xassert(os.path.isfile('weights.txt'), 'Optimization failed') 124 | 125 | shutil.move('weights.txt', args.out_dir + '/weights.txt') 126 | 127 | -------------------------------------------------------------------------------- /dlm/io/nbestReader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | import codecs 5 | 6 | class NBestList(): 7 | def __init__(self, nbest_path, mode='r', reference_list=None): 8 | U.xassert(mode == 'r' or mode == 'w', "Invalid mode: " + mode) 9 | self.mode = mode 10 | self.nbest_file = codecs.open(nbest_path, mode=mode, encoding='UTF-8') 11 | self.prev_index = -1 12 | self.curr_item = None 13 | self.curr_index = 0 14 | self.eof_flag = False 15 | self.ref_manager = None 16 | if reference_list: 17 | U.xassert(mode == 'r', "Cannot accept a reference_list in 'w' mode") 18 | self.ref_manager = RefernceManager(reference_list) 19 | 20 | def __iter__(self): 21 | U.xassert(self.mode == 'r', "Iteration can only be done in 'r' mode") 22 | return self 23 | 24 | def next_item(self): 25 | U.xassert(self.mode == 'r', "next() method can only be used in 'r' mode") 26 | try: 27 | segments = self.nbest_file.next().split("|||") 28 | except StopIteration: 29 | self.close() 30 | raise StopIteration 31 | try: 32 | index = int(segments[0]) 33 | except ValueError: 34 | L.error("The first segment in an n-best list must be an integer") 35 | hyp = segments[1].strip() 36 | features = segments[2].strip() 37 | score = segments[3].strip() 38 | phrase_alignments = None 39 | word_alignments = None 40 | if len(segments) > 4: 41 | phrase_alignments = segments[4].strip() 42 | if len(segments) > 5: 43 | word_alignments = segments[5].strip() 44 | return NBestItem(index, hyp, features, score, phrase_alignments, word_alignments) 45 | 46 | def next(self): # Returns a group of NBestItems with the same index 47 | if self.eof_flag == True: 48 | raise StopIteration 49 | U.xassert(self.mode == 'r', "next_group() method can only be used in 'r' mode") 50 | group = NBestGroup(self.ref_manager) 51 | group.add(self.curr_item) # add the item that was read in the last next() call 52 | try: 53 | self.curr_item = self.next_item() 54 | except StopIteration: 55 | self.eof_flag = True 56 | return group 57 | if self.curr_index != self.curr_item.index: 58 | self.curr_index = self.curr_item.index 59 | return group 60 | while self.curr_index == self.curr_item.index: 61 | group.add(self.curr_item) 62 | try: 63 | self.curr_item = self.next_item() 64 | except StopIteration: 65 | self.eof_flag = True 66 | return group 67 | self.curr_index = self.curr_item.index 68 | return group 69 | 70 | def write(self, item): 71 | U.xassert(self.mode == 'w', "write() method can only be used in 'w' mode") 72 | self.nbest_file.write(unicode(item) + "\n") 73 | 74 | def close(self): 75 | self.nbest_file.close() 76 | 77 | 78 | 79 | class NBestItem: 80 | def __init__(self, index, hyp, features, score, phrase_alignments, word_alignments): 81 | self.index = index 82 | self.hyp = hyp 83 | self.features = features 84 | self.score = score 85 | self.phrase_alignments = phrase_alignments 86 | self.word_alignments = word_alignments 87 | 88 | def __unicode__(self): 89 | output = ' ||| '.join([unicode(self.index), self.hyp, self.features, self.score]) 90 | if self.phrase_alignments: 91 | output = 
output + ' ||| ' + self.phrase_alignments 92 | if self.word_alignments: 93 | output = output + ' ||| ' + self.word_alignments 94 | return output 95 | 96 | def append_feature(self, feature): 97 | self.features += ' ' + str(feature) 98 | 99 | 100 | class NBestGroup: 101 | def __init__(self, refrence_manager=None): 102 | self.group_index = -1 103 | self.group = [] 104 | self.ref_manager = refrence_manager 105 | 106 | def __unicode__(self): 107 | return '\n'.join([unicode(item) for item in self.group]) 108 | 109 | def __iter__(self): 110 | self.item_index = 0 111 | return self 112 | 113 | def __getitem__(self, index): 114 | return self.group[index] 115 | 116 | def add(self, item): 117 | if item is None: 118 | return 119 | if self.group_index == -1: 120 | self.group_index = item.index 121 | if self.ref_manager: 122 | self.refs = self.ref_manager.get_all_refs(self.group_index) 123 | else: 124 | U.xassert(item.index == self.group_index, "Cannot add an nbest item with an incompatible index") 125 | self.group.append(item) 126 | 127 | def next(self): 128 | #if self.item_index < len(self.group): 129 | try: 130 | item = self.group[self.item_index] 131 | self.item_index += 1 132 | return item 133 | #else: 134 | except IndexError: 135 | raise StopIteration 136 | 137 | def size(self): 138 | return len(self.group) 139 | 140 | def append_features(self, features_list): 141 | U.xassert(len(features_list) == len(self.group), 'Number of features and number of items in this group do not match') 142 | for i in range(len(self.group)): 143 | self.group[i].append_feature(features_list[i]) 144 | 145 | 146 | 147 | class RefernceManager: 148 | def __init__(self, paths_list): 149 | U.xassert(type(paths_list) is list, "The input to a RefernceManager class must be a list") 150 | self.ref_list = [] 151 | self.num_lines = -1 152 | self.num_refs = 0 153 | for path in paths_list: 154 | with codecs.open(path, mode='r', encoding='UTF-8') as f: 155 | self.num_refs += 1 156 | sentences = f.readlines() 157 | if self.num_lines == -1: 158 | self.num_lines = len(sentences) 159 | else: 160 | U.xassert(self.num_lines == len(sentences), "Reference files must have the same number of lines") 161 | self.ref_list.append(sentences) 162 | 163 | def get_all_refs(self, index): 164 | U.xassert(index < self.num_lines, "Index out of bound") 165 | return [self.ref_list[k][index] for k in range(self.num_refs)] 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os 4 | import argparse 5 | import dlm.utils as U 6 | import dlm.io.logging as L 7 | 8 | ############### 9 | ## Arguments 10 | # 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-tr", "--train-mmap", dest="trainset", required=True, help="The memory-mapped training file") 14 | parser.add_argument("-tu", "--tune-mmap", dest="devset", required=True, help="The memory-mapped development (tune) file") 15 | parser.add_argument("-ts", "--test-mmap", dest="testset", help="The memory-mapped evaluation (test) file") 16 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu). 
Default: gpu") 17 | parser.add_argument("-E", "--emb-dim", dest="emb_dim", default=50, type=int, help="Word embeddings dimension. Default: 50") 18 | parser.add_argument("-H", "--hidden-units", dest="num_hidden", default="512", help="A comma seperated list for the number of units in each hidden layer. Default: 512") 19 | parser.add_argument("-A", "--activation", dest="activation_name", default="tanh", help="Activation function (tanh|hardtanh|sigmoid|fastsigmoid|hardsigmoid|softplus|relu|cappedrelu|softmax). Default: tanh") 20 | parser.add_argument("-a", "--training-algorithm", dest="algorithm", default="sgd", help="The training algorithm (only sgd is supported for now). Default: sgd") 21 | parser.add_argument("-b", "--batch-size", dest="batchsize", default=128, type=int, help="Minibatch size for training. Default: 128") 22 | parser.add_argument("-l", "--learning-rate", dest="learning_rate", default=0.01, type=float, help="Learning rate. Default: 0.01") 23 | parser.add_argument("-D", "--learning-rate-decay", dest="learning_rate_decay", default=0, type=float, help="Learning rate decay (e.g. 0.995) (TO DO). Default: 0") 24 | parser.add_argument("-M", "--momentum", dest="momentum", default=0, type=float, help="Momentum (TO DO). Default: 0") 25 | parser.add_argument("-lf","--loss-function", dest="loss_function", default="nll", help="Loss function (nll|nce). Default: nll (Negative Log Likelihood)") 26 | parser.add_argument("-ns","--noise-samples", dest="num_noise_samples", default=100 ,type=int, help="Number of noise samples for noise contrastive estimation. Default:100") 27 | parser.add_argument("-e", "--num-epochs", dest="num_epochs", default=50, type=int, help="Number of iterations (epochs). Default: 50") 28 | parser.add_argument("-c", "--self-norm-coef", dest="alpha", default=0, type=float, help="Self normalization coefficient (alpha). Default: 0") 29 | parser.add_argument("-L1", "--L1-regularizer", dest="L1_reg", default=0, type=float, help="L1 regularization coefficient. Default: 0") 30 | parser.add_argument("-L2", "--L2-regularizer", dest="L2_reg", default=0, type=float, help="L2 regularization coefficient. Default: 0") 31 | parser.add_argument("-dir", "--directory", dest="out_dir", help="The output directory for log file, model, etc.") 32 | parser.add_argument("-iw", "--instance-weights-path", dest="instance_weights_path", help="(optional) Instance weights file.") 33 | parser.add_argument("--clip-threshold", dest="clip_threshold", default=0, type=float, help="If threshold > 0, clips gradients to [-threshold, +threshold]. Default: 0 (disabled)") 34 | parser.add_argument("--weighted-emb", dest="weighted_emb", action='store_true', help="Use this flag to add per-word weights to embeddings.") 35 | parser.add_argument("--threads", dest="threads", default=8, type=int, help="Number of threads when device is CPU. 
Default: 8") 36 | parser.add_argument("--emb-path", dest="emb_path", help="(optional) Word embeddings file.") 37 | parser.add_argument("--vocab", dest="vocab", help="(optional) Only needed if --emb-path is used.") 38 | parser.add_argument("--quiet", dest="quiet", action='store_true', help="Use this flag to disable the logger.") 39 | parser.add_argument( "--adjust-learning-rate", dest="enable_lr_adjust", action='store_true', help="Enable learning rate adjustment") 40 | 41 | #parser.add_argument("-m","--model-file", dest="model_path", help="The file path to load the model from") 42 | 43 | args = parser.parse_args() 44 | 45 | args.cwd = os.getcwd() 46 | 47 | if args.out_dir is None: 48 | args.out_dir = 'corelm-' + U.curr_time() 49 | U.mkdir_p(args.out_dir) 50 | 51 | L.quiet = args.quiet 52 | L.set_file_path(os.path.abspath(args.out_dir) + "/log.txt") 53 | 54 | L.info('Command: ' + ' '.join(sys.argv)) 55 | 56 | curr_version = U.curr_version() 57 | if curr_version: 58 | L.info("Version: " + curr_version) 59 | 60 | if args.emb_path: 61 | U.xassert(args.vocab, 'When --emb-path is used, vocab file must be given too (using --vocab).') 62 | 63 | if args.loss_function == "nll": 64 | args.num_noise_samples = 0 65 | 66 | U.print_args(args) 67 | U.set_theano_device(args.device, args.threads) 68 | 69 | import dlm.trainer 70 | from dlm.io.mmapReader import MemMapReader 71 | from dlm.models.mlp import MLP 72 | 73 | ######################### 74 | ## Loading datasets 75 | # 76 | 77 | trainset = MemMapReader(args.trainset, batch_size=args.batchsize, instance_weights_path=args.instance_weights_path) 78 | devset = MemMapReader(args.devset) 79 | testset = None 80 | if args.testset: 81 | testset = MemMapReader(args.testset) 82 | 83 | 84 | ######################### 85 | ## Creating model 86 | # 87 | 88 | L.info('Building the model') 89 | args.vocab_size = trainset.get_vocab_size() 90 | args.ngram_size = trainset.get_ngram_size() 91 | args.num_classes = trainset.get_num_classes() 92 | 93 | classifier = MLP(args) 94 | 95 | L.info('Parameters: ' + str(classifier.params)) 96 | 97 | ######################### 98 | ## Training criterion 99 | # 100 | if args.loss_function == "nll": 101 | from dlm.criterions.weighted_nll import NegLogLikelihood 102 | criterion = NegLogLikelihood(classifier, args) 103 | elif args.loss_function == "nce": 104 | from dlm.criterions.nce import NCELikelihood 105 | noise_dist = trainset.get_unigram_model() 106 | criterion = NCELikelihood(classifier, args, noise_dist) 107 | else: 108 | L.error('Invalid loss function \'' + args.loss_function + '\'') 109 | 110 | ######################### 111 | ## Training 112 | # 113 | 114 | dlm.trainer.train(classifier, criterion, args, trainset, devset, testset) 115 | -------------------------------------------------------------------------------- /dlm/preprocess/monolingual.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import sys, os 5 | import tempfile 6 | import shutil 7 | import argparse 8 | try: 9 | import dlm 10 | except ImportError: 11 | print "[ERROR] dlm module not found. 
Add CoreLM root directory to your PYTHONPATH" 12 | sys.exit() 13 | import dlm.utils as U 14 | 15 | # Parsing arguments 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-i", "--input-file", dest="input_path", required=True, help="Path to the input text file.") 18 | parser.add_argument("-n", "--ngram-size", dest="ngram_size", required=True, type=int, help="N-gram size.") 19 | parser.add_argument("-o", "--output-dir", dest="output_dir_path", required=True, help="Path to the output directory.") 20 | parser.add_argument("--text", dest="text_output", action='store_true', help="Add this flag to produce text output.") 21 | parser.add_argument("--shuffle", dest="shuffle", action='store_true', help="Add this flag to shuffle the output.") 22 | parser.add_argument("--endp", dest="endp", action='store_true', help="Add this flag to add the sentence-end padding token.") 23 | 24 | # Mutually exclusive group of pruning arguments 25 | prune_args = parser.add_mutually_exclusive_group(required=True) 26 | prune_args.add_argument("--prune-vocab-size", dest="prune_vocab_size", type=int, help="Vocabulary size") 27 | prune_args.add_argument("--prune-threshold", dest="prune_threshold_count", type=int, help="Minimum number of occurrences for a word to be added to the vocabulary") 28 | prune_args.add_argument("--input-vocab-file", dest="input_vocab_path", help="Path to an existing vocabulary file") 29 | 30 | args = parser.parse_args() 31 | 32 | 33 | if (not os.path.exists(args.output_dir_path)): 34 | os.makedirs(args.output_dir_path) 35 | print("Output directory: " + os.path.abspath(args.output_dir_path)) 36 | 37 | prefix = args.output_dir_path + "/" + os.path.basename(args.input_path) 38 | 39 | if args.shuffle: 40 | output_path = prefix + ".idx.shuf.mmap" 41 | output_text_path = prefix + ".idx.shuf.txt" 42 | else: 43 | output_path = prefix + ".idx.mmap" 44 | output_text_path = prefix + ".idx.txt" 45 | 46 | word_to_id_dict = dict() # Word to Index Dictionary 47 | 48 | if args.input_vocab_path is None: 49 | # Counting the frequency of the words. 50 | word_to_freq_dict = dict() # Word Frequency Dictionary 51 | with open(args.input_path, 'r') as input_file: 52 | for line in input_file: 53 | line = line.strip() 54 | if len(line) == 0: 55 | continue 56 | tokens = line.split() 57 | for token in tokens: 58 | if not word_to_freq_dict.has_key(token): 59 | word_to_freq_dict[token] = 1 60 | else: 61 | word_to_freq_dict[token] += 1 62 | 63 | # Prune based on threshold 64 | if args.prune_threshold_count: 65 | for token, freq in word_to_freq_dict.items(): 66 | if freq < args.prune_threshold_count: 67 | del word_to_freq_dict[token] 68 | 69 | # Writing the vocab file and creating a word to id dictionary.
70 | vocab_path = prefix + ".vocab" 71 | word_to_id_dict[''] = 0 72 | word_to_id_dict[''] = 1 73 | word_to_id_dict[''] = 2 74 | added_tokens = '\n\n\n' 75 | if args.endp: 76 | word_to_id_dict[''] = 3 77 | added_tokens += '\n' 78 | with open(vocab_path, 'w') as f_vocab: 79 | curr_index = len(word_to_id_dict) 80 | f_vocab.write(added_tokens) 81 | tokens_freq_sorted = sorted(word_to_freq_dict, key=word_to_freq_dict.get, reverse=True) 82 | if args.prune_vocab_size is not None and args.prune_vocab_size < len(tokens_freq_sorted): 83 | tokens_freq_sorted = tokens_freq_sorted[0:args.prune_vocab_size] 84 | for token in tokens_freq_sorted: 85 | f_vocab.write(token+"\n") 86 | word_to_id_dict[token] = curr_index 87 | curr_index = curr_index + 1 88 | else: 89 | with open(args.input_vocab_path, 'r') as f_vocab: 90 | curr_index = 0 91 | for line in f_vocab: 92 | token = line.strip() 93 | if not word_to_id_dict.has_key(token): 94 | word_to_id_dict[token] = curr_index 95 | curr_index = curr_index + 1 96 | U.xassert(word_to_id_dict.has_key('') and word_to_id_dict.has_key('') and word_to_id_dict.has_key(''), "Missing or or in given vocab file") 97 | if args.endp: 98 | U.xassert(word_to_id_dict.has_key(''), "Missing in given vocab file while --endp flag is used") 99 | if word_to_id_dict.has_key(''): 100 | U.xassert(args.endp, "Given vocab file has but --endp flag is not activated") 101 | 102 | _, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.') 103 | 104 | # For shuffling only 105 | samples = [] # List of samples 106 | nsamples = 0 107 | 108 | # Reading input text file to create IDX file 109 | with open(args.input_path, 'r') as input_file, open(tmp_path, 'w') as tmp_file: 110 | next_id = 0 111 | for line in input_file: 112 | line = line.strip() 113 | if len(line) == 0: 114 | continue 115 | tokens = line.split() 116 | for i in range(args.ngram_size - 1): 117 | tokens.insert(0, '') 118 | if args.endp: 119 | tokens.append('') 120 | indices = [] 121 | for token in tokens: 122 | if not word_to_id_dict.has_key(token): 123 | token = "" 124 | indices.append(str(word_to_id_dict[token])) 125 | for i in range(args.ngram_size - 1, len(indices)): 126 | sample = ' '.join(indices[i - args.ngram_size + 1 : i + 1]) + "\n" 127 | if args.shuffle: 128 | samples.append(sample) 129 | else: 130 | tmp_file.write(sample) 131 | nsamples += 1 132 | 133 | # Shuffling the data and writing to tmp file 134 | if args.shuffle: 135 | permutation_arr = np.random.permutation(nsamples) 136 | with open(tmp_path, 'w') as tmp_file: 137 | for index in permutation_arr: 138 | tmp_file.write(samples[index]) 139 | 140 | 141 | # Creating the memory-mapped file 142 | with open(tmp_path, 'r') as data: 143 | fp = np.memmap(output_path, dtype='int32', mode='w+', shape=(nsamples + 3, args.ngram_size)) 144 | fp[0,0] = nsamples # number of samples 145 | fp[0,1] = args.ngram_size # n-gram size 146 | fp[1,0] = len(word_to_id_dict) # vocab size (MLP classes) 147 | fp[2,0] = len(word_to_id_dict) # number of word types (MLP classes) 148 | counter = 3 149 | for line in data: 150 | tokens = line.split() 151 | fp[counter] = tokens 152 | counter = counter + 1 153 | if counter % 10000000 == 0: 154 | print counter 155 | print str(counter-1) + " samples mapped" 156 | fp.flush 157 | del fp 158 | 159 | if args.text_output: 160 | shutil.move(tmp_path, output_text_path) 161 | else: 162 | os.remove(tmp_path) 163 | -------------------------------------------------------------------------------- /dlm/reranker/bleu.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import sys 3 | import math 4 | import dlm.utils as U 5 | 6 | ################################################################### 7 | ## BLEU utility functions 8 | # 9 | 10 | def get_ngram_counts(tokens): 11 | dicts = [{}, {}, {}, {}] 12 | 13 | for token in tokens: 14 | if dicts[0].has_key(token): 15 | dicts[0][token] += 1 16 | else: 17 | dicts[0][token] = 1 18 | 19 | for k in range(1,4): 20 | for i in range(len(tokens) - k): 21 | segment = ' '.join(tokens[i:i+k+1]) 22 | if dicts[k].has_key(segment): 23 | dicts[k][segment] += 1 24 | else: 25 | dicts[k][segment] = 1 26 | return dicts 27 | 28 | def get_max_ngram_counts(refs_list, hyp_len): 29 | max_counts = [{}, {}, {}, {}] 30 | closest_ref_len = 1000 31 | closest_ref_diff = 1000 32 | for ref in refs_list: 33 | ref_tokens = ref.split() 34 | abs_diff = abs(len(ref_tokens) - hyp_len) 35 | if abs_diff < closest_ref_diff: 36 | closest_ref_len = len(ref_tokens) 37 | closest_ref_diff = abs_diff 38 | dicts = get_ngram_counts(ref_tokens) 39 | for k in range(0,4): 40 | for ngram in dicts[k]: 41 | if not max_counts[k].has_key(ngram) or max_counts[k][ngram] < dicts[k][ngram]: 42 | max_counts[k][ngram] = dicts[k][ngram] 43 | return max_counts, closest_ref_len 44 | 45 | def clip_ngram_counts(hyp_dicts, ref_dicts): 46 | for k in range(0,4): 47 | for ngram in hyp_dicts[k].keys(): 48 | org_count = hyp_dicts[k][ngram] 49 | if ref_dicts[k].has_key(ngram): 50 | hyp_dicts[k][ngram] = min(org_count, ref_dicts[k][ngram]) 51 | else: 52 | hyp_dicts[k][ngram] = 0 53 | 54 | ################################################################### 55 | ## Sentence-level BLEU metrics 56 | # 57 | 58 | def no_smoothing(hyp, refs_list): 59 | l = [0, 0, 0, 0] 60 | m = [0, 0, 0, 0] 61 | log_p = [0, 0, 0, 0] 62 | 63 | hyp_tokens = hyp.split() 64 | 65 | hyp_dicts = get_ngram_counts(hyp_tokens) 66 | ref_dicts, closest_ref_len = get_max_ngram_counts(refs_list, len(hyp_tokens)) 67 | 68 | clip_ngram_counts(hyp_dicts, ref_dicts) 69 | 70 | sum_log_p = 0 71 | for k in range(0,4): 72 | l[k] = max(len(hyp_tokens) - k, 0) 73 | if l[k] == 0: # sentence length is less than 4 74 | log_p[k] = 0 75 | else: 76 | for w in hyp_dicts[k]: 77 | if ref_dicts[k].has_key(w): 78 | m[k] += hyp_dicts[k][w] 79 | if (m[k] == 0): 80 | return 0 81 | else: 82 | log_p[k] = math.log(m[k]) - math.log(l[k]) 83 | sum_log_p += log_p[k] 84 | log_brevity = min(0, 1 - closest_ref_len/len(hyp_tokens)) 85 | return math.exp(1/4 * sum_log_p + log_brevity) 86 | 87 | ################################################################### 88 | 89 | def add_epsilon_smoothing(hyp, refs_list, eps=0.01): 90 | l = [0, 0, 0, 0] 91 | m = [0, 0, 0, 0] 92 | log_p = [0, 0, 0, 0] 93 | 94 | hyp_tokens = hyp.split() 95 | 96 | hyp_dicts = get_ngram_counts(hyp_tokens) 97 | ref_dicts, closest_ref_len = get_max_ngram_counts(refs_list, len(hyp_tokens)) 98 | 99 | clip_ngram_counts(hyp_dicts, ref_dicts) 100 | 101 | sum_log_p = 0 102 | for k in range(0,4): 103 | l[k] = max(len(hyp_tokens) - k, 0) 104 | if l[k] == 0: # sentence length is less than 4 105 | log_p[k] = 0 106 | else: 107 | for w in hyp_dicts[k]: 108 | if ref_dicts[k].has_key(w): 109 | m[k] += hyp_dicts[k][w] 110 | if (m[k] == 0): 111 | log_p[k] = math.log(eps) - math.log(l[k]) 112 | else: 113 | log_p[k] = math.log(m[k]) - math.log(l[k]) 114 | sum_log_p += log_p[k] 115 | log_brevity = min(0, 1 - closest_ref_len/len(hyp_tokens)) 116 | return math.exp(1/4 * sum_log_p + 
log_brevity) 117 | 118 | ################################################################### 119 | 120 | # Lin and Och, 2004 121 | def lin_smoothing(hyp, refs_list): 122 | l = [0, 1, 1, 1] 123 | m = [0, 1, 1, 1] 124 | log_p = [0, 0, 0, 0] 125 | 126 | hyp_tokens = hyp.split() 127 | 128 | hyp_dicts = get_ngram_counts(hyp_tokens) 129 | ref_dicts, closest_ref_len = get_max_ngram_counts(refs_list, len(hyp_tokens)) 130 | 131 | clip_ngram_counts(hyp_dicts, ref_dicts) 132 | 133 | sum_log_p = 0 134 | for k in range(0,4): 135 | l[k] = max(len(hyp_tokens) - k, 0) 136 | if l[k] == 0: # sentence length is less than 4 137 | log_p[k] = 0 138 | else: 139 | for w in hyp_dicts[k]: 140 | if ref_dicts[k].has_key(w): 141 | m[k] += hyp_dicts[k][w] 142 | if (m[k] == 0): # It can happen when unigram count m[0] is zero 143 | return 0 144 | else: 145 | log_p[k] = math.log(m[k]) - math.log(l[k]) 146 | sum_log_p += log_p[k] 147 | log_brevity = min(0, 1 - closest_ref_len/len(hyp_tokens)) 148 | return math.exp(1/4 * sum_log_p + log_brevity) 149 | 150 | ################################################################### 151 | 152 | # NIST (mteval-v13a.pl) smoothing 153 | def nist_smoothing(hyp, refs_list): 154 | l = [0, 0, 0, 0] 155 | m = [0, 0, 0, 0] 156 | log_p = [0, 0, 0, 0] 157 | 158 | hyp_tokens = hyp.split() 159 | 160 | hyp_dicts = get_ngram_counts(hyp_tokens) 161 | ref_dicts, closest_ref_len = get_max_ngram_counts(refs_list, len(hyp_tokens)) 162 | 163 | clip_ngram_counts(hyp_dicts, ref_dicts) 164 | 165 | invcnt = 1 166 | sum_log_p = 0 167 | for k in range(0,4): 168 | l[k] = max(len(hyp_tokens) - k, 0) 169 | if l[k] == 0: # sentence length is less than 4 170 | log_p[k] = 0 171 | else: 172 | for w in hyp_dicts[k]: 173 | if ref_dicts[k].has_key(w): 174 | m[k] += hyp_dicts[k][w] 175 | if (m[k] == 0): 176 | invcnt *= 2 177 | log_p[k] = math.log(1/invcnt) - math.log(l[k]) 178 | else: 179 | log_p[k] = math.log(m[k]) - math.log(l[k]) 180 | sum_log_p += log_p[k] 181 | log_brevity = min(0, 1 - closest_ref_len/len(hyp_tokens)) 182 | return math.exp(1/4 * sum_log_p + log_brevity) 183 | 184 | ################################################################### 185 | 186 | # Chen and Cherry (2014) smoothing 4 187 | def chen_smoothing(hyp, refs_list, coef=5): 188 | l = [0, 0, 0, 0] 189 | m = [0, 0, 0, 0] 190 | log_p = [0, 0, 0, 0] 191 | 192 | hyp_tokens = hyp.split() 193 | 194 | hyp_dicts = get_ngram_counts(hyp_tokens) 195 | ref_dicts, closest_ref_len = get_max_ngram_counts(refs_list, len(hyp_tokens)) 196 | 197 | clip_ngram_counts(hyp_dicts, ref_dicts) 198 | 199 | invcnt = 1 200 | sum_log_p = 0 201 | for k in range(0,4): 202 | l[k] = max(len(hyp_tokens) - k, 0) 203 | if l[k] == 0: # sentence length is less than 4 204 | log_p[k] = 0 205 | else: 206 | for w in hyp_dicts[k]: 207 | if ref_dicts[k].has_key(w): 208 | m[k] += hyp_dicts[k][w] 209 | if (m[k] == 0): 210 | invcnt *= coef / math.log(len(hyp_tokens) + 1) 211 | log_p[k] = math.log(1/invcnt) - math.log(l[k]) 212 | else: 213 | log_p[k] = math.log(m[k]) - math.log(l[k]) 214 | sum_log_p += log_p[k] 215 | log_brevity = min(0, 1 - closest_ref_len/len(hyp_tokens)) 216 | return math.exp(1/4 * sum_log_p + log_brevity) 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /dlm/utils.py: -------------------------------------------------------------------------------- 1 | import subprocess as sub 2 | import sys 3 | import os, errno 4 | 5 | 
#-----------------------------------------------------------------------------------------------------------# 6 | 7 | def __shell(command): 8 | return sub.Popen(command, shell=True, stdout=sub.PIPE, stderr=sub.PIPE) 9 | 10 | # Currently the best 11 | def capture(command): 12 | out, err, code = capture_all(command) 13 | assert (code == 0), "Failed to run the command: " + command 14 | return out 15 | 16 | # Good, if more info is needed 17 | def capture_all(command): 18 | p = __shell(command) 19 | output, err = p.communicate() 20 | return output, err, p.returncode 21 | 22 | # Better to avoid 23 | def capture_no_assert(command): 24 | p = __shell(command) 25 | return p.stdout.read() 26 | 27 | # Not well-tested, but should be good 28 | def capture_output(command): 29 | try: 30 | eval("sub.check_output") 31 | except: 32 | error("subprocess check_output function is not supported in this python version:" + version()) 33 | output = sub.check_output(command, shell=True) 34 | return output 35 | 36 | #-----------------------------------------------------------------------------------------------------------# 37 | 38 | # Dummy object for holding other objects 39 | class Object(object): 40 | pass 41 | 42 | #-----------------------------------------------------------------------------------------------------------# 43 | 44 | import re 45 | 46 | class BColors: 47 | HEADER = '\033[95m' 48 | OKBLUE = '\033[94m' 49 | OKGREEN = '\033[92m' 50 | WARNING = '\033[93m' 51 | FAIL = '\033[91m' 52 | ENDC = '\033[0m' 53 | BOLD = '\033[1m' 54 | UNDERLINE = '\033[4m' 55 | WHITE = '\033[37m' 56 | YELLOW = '\033[33m' 57 | GREEN = '\033[32m' 58 | BLUE = '\033[34m' 59 | CYAN = '\033[36m' 60 | RED = '\033[31m' 61 | MAGENTA = '\033[35m' 62 | BLACK = '\033[30m' 63 | BHEADER = BOLD + '\033[95m' 64 | BOKBLUE = BOLD + '\033[94m' 65 | BOKGREEN = BOLD + '\033[92m' 66 | BWARNING = BOLD + '\033[93m' 67 | BFAIL = BOLD + '\033[91m' 68 | BUNDERLINE = BOLD + '\033[4m' 69 | BWHITE = BOLD + '\033[37m' 70 | BYELLOW = BOLD + '\033[33m' 71 | BGREEN = BOLD + '\033[32m' 72 | BBLUE = BOLD + '\033[34m' 73 | BCYAN = BOLD + '\033[36m' 74 | BRED = BOLD + '\033[31m' 75 | BMAGENTA = BOLD + '\033[35m' 76 | BBLACK = BOLD + '\033[30m' 77 | 78 | @staticmethod 79 | def cleared(s): 80 | return re.sub("\033\[[0-9][0-9]?m", "", s) 81 | 82 | def red(message): 83 | return BColors.RED + str(message) + BColors.ENDC 84 | 85 | def b_red(message): 86 | return BColors.BRED + str(message) + BColors.ENDC 87 | 88 | def blue(message): 89 | return BColors.BLUE + str(message) + BColors.ENDC 90 | 91 | def b_yellow(message): 92 | return BColors.BYELLOW + str(message) + BColors.ENDC 93 | 94 | def green(message): 95 | return BColors.GREEN + str(message) + BColors.ENDC 96 | 97 | def b_green(message): 98 | return BColors.BGREEN + str(message) + BColors.ENDC 99 | 100 | #-----------------------------------------------------------------------------------------------------------# 101 | 102 | def xassert(condition, message): 103 | if not condition: 104 | import dlm.io.logging as L 105 | L.error(message) 106 | 107 | def assert_value(value, valid_values): 108 | assert type(valid_values) == list, "valid_values must be a list, given: " + str(type(valid_values)) 109 | assert value in valid_values, "Invalid value: " + str(value) + " is not in " + str(valid_values) 110 | 111 | def version(): 112 | return '.'.join(map(str, sys.version_info)[0:3]) 113 | 114 | #-----------------------------------------------------------------------------------------------------------# 115 | 116 | def 
prepend_to_file(file_name, text): 117 | with open(file_name, "r+") as f: 118 | old = f.read() 119 | f.seek(0) 120 | f.write(text + old) 121 | 122 | def append_to_file(file_name, text): 123 | with open(file_name, "a") as f: 124 | f.write(text) 125 | 126 | def mkdir_p(path): 127 | try: 128 | os.makedirs(path) 129 | except OSError as exc: # Python >2.5 130 | if exc.errno == errno.EEXIST and os.path.isdir(path): 131 | pass 132 | else: raise 133 | 134 | def num_lines(path): 135 | return sum(1 for line in open(path)) 136 | 137 | #-----------------------------------------------------------------------------------------------------------# 138 | 139 | def get_all_windows(input_list, window_size): 140 | if window_size <= 1: 141 | return input_list 142 | output = [] 143 | for i in range(len(input_list) - window_size + 1): 144 | output.append(input_list[i:i+window_size]) 145 | return output 146 | 147 | #-----------------------------------------------------------------------------------------------------------# 148 | 149 | def is_gpu_free(gpu_id): 150 | out = capture('nvidia-smi -i ' + str(gpu_id)).strip() 151 | tokens = out.split('\n')[-2].split() 152 | return ' '.join(tokens[1:5]) == 'No running processes found' 153 | 154 | def set_theano_device(device, threads): 155 | import sys 156 | import dlm.io.logging as L 157 | xassert(device == "cpu" or device.startswith("gpu"), "The device can only be 'cpu', 'gpu' or 'gpu'") 158 | if device.startswith("gpu") and len(device) > 3: 159 | try: 160 | gpu_id = int(device[3:]) 161 | if not is_gpu_free(gpu_id): 162 | L.warning('The selected GPU (GPU' + str(gpu_id) + ') is apparently busy.') 163 | except ValueError: 164 | L.error("Unknown GPU device format: " + device) 165 | if device.startswith("gpu"): 166 | L.warning('Running on GPU yields non-deterministic results.') 167 | xassert(sys.modules.has_key('theano') == False, "dlm.utils.set_theano_device() function cannot be called after importing theano") 168 | os.environ['OMP_NUM_THREADS'] = str(threads) 169 | os.environ['THEANO_FLAGS'] = 'device=' + device 170 | os.environ['THEANO_FLAGS'] += ',force_device=True' 171 | os.environ['THEANO_FLAGS'] += ',floatX=float32' 172 | os.environ['THEANO_FLAGS'] += ',warn_float64=warn' 173 | os.environ['THEANO_FLAGS'] += ',cast_policy=numpy+floatX' 174 | #os.environ['THEANO_FLAGS'] += ',allow_gc=True' 175 | os.environ['THEANO_FLAGS'] += ',print_active_device=False' 176 | os.environ['THEANO_FLAGS'] += ',exception_verbosity=high' # Highly verbose debugging 177 | os.environ['THEANO_FLAGS'] += ',mode=FAST_RUN' 178 | os.environ['THEANO_FLAGS'] += ',nvcc.fastmath=False' # True: makes div and sqrt faster at the cost of precision, and possible bugs 179 | #os.environ['THEANO_FLAGS'] += ',optimizer_including=cudnn' # Comment out if CUDNN is not available 180 | try: 181 | import theano 182 | except EnvironmentError: 183 | L.exception() 184 | global logger 185 | if theano.config.device == "gpu": 186 | L.info( 187 | "Device: " + theano.config.device.upper() + " " 188 | + str(theano.sandbox.cuda.active_device_number()) 189 | + " (" + str(theano.sandbox.cuda.active_device_name()) + ")" 190 | ) 191 | else: 192 | L.info("Device: " + theano.config.device.upper()) 193 | 194 | #-----------------------------------------------------------------------------------------------------------# 195 | 196 | def print_args(args): 197 | import dlm.io.logging as L 198 | L.info("Arguments:") 199 | items = vars(args) 200 | for key in sorted(items.keys(), key=lambda s: s.lower()): 201 | value = items[key] 202 | if not 
value: 203 | value = "None" 204 | L.info(" " + key + ": " + BColors.MAGENTA + str(items[key]) + BColors.ENDC) 205 | 206 | def curr_time(): 207 | import time 208 | t = time.localtime() 209 | return '%i-%i-%i-%ih-%im-%is' % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) 210 | 211 | def curr_version(): 212 | import dlm.io.logging as L 213 | info_path = os.path.dirname(sys.argv[0]) + '/.git/refs/heads/master' 214 | if os.path.exists(info_path): 215 | with open(info_path, 'r') as info_file: 216 | return info_file.next().strip() 217 | L.warning('Unable to read current version.') 218 | return None 219 | -------------------------------------------------------------------------------- /dlm/preprocess/features.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import tempfile 3 | import shutil 4 | import argparse 5 | try: 6 | import dlm 7 | except ImportError: 8 | print "[ERROR] dlm module not found. Add CoreLM root directory to your PYTHONPATH" 9 | sys.exit() 10 | import dlm.utils as U 11 | import dlm.io.logging as L 12 | import numpy as np 13 | 14 | def read_vocab(vocab_path): 15 | word_to_id_dict = dict() 16 | found_sent_marker = False 17 | with open(vocab_path,'r') as f_vocab: 18 | curr_index = 0 19 | for line in f_vocab: 20 | token = line.strip().split()[0] 21 | U.xassert((not word_to_id_dict.has_key(token)), "Given vocab file has duplicate entry for '" + token + "'.") 22 | word_to_id_dict[token] = curr_index 23 | curr_index = curr_index + 1 24 | return word_to_id_dict 25 | 26 | def replace_unk(word, dict): 27 | if word in dict: 28 | return word 29 | else: 30 | return "" 31 | 32 | 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument("-i", "--input-file", dest="input_path", required=True, help="Path to the input text file, words and features separated by underscorre(_) e.g. word_feature .") 35 | parser.add_argument("-l", "--labels-file", dest="labels_path", required=True, help="Path to the labels text file") 36 | parser.add_argument("-n", "--context", dest="context_size", required=True, type=int, help="Context Size.") 37 | parser.add_argument("-o", "--output-dir", dest="output_dir_path", required=True, help="Path to output directory.") 38 | parser.add_argument("--text", dest="text_output", action='store_true', help="Add this flag to produce text output.") 39 | parser.add_argument("--input-vocab-file", dest="input_vocab_path", help="Path to an input(words) vocabulary file") 40 | parser.add_argument("--labels-vocab-file", dest="labels_vocab_path", help="Path to an labels (POS, NER etc.) vocabulary file") 41 | parser.add_argument("--features-vocab-file", dest="features_vocab_path", help="Path to an features vocabulary file") 42 | parser.add_argument("--shuffle", dest="shuffle", action='store_true', help="Add this flag to shuffle the output.") 43 | parser.add_argument("--word-output", dest="word_out", action='store_true', help="Get output in non-index format, i.e. 
as words and features") 44 | 45 | args = parser.parse_args() 46 | 47 | if (not os.path.exists(args.output_dir_path)): 48 | os.makedirs(args.output_dir_path) 49 | print("Output directory: " + os.path.abspath(args.output_dir_path)) 50 | 51 | 52 | prefix = args.output_dir_path + "/" + os.path.basename(args.input_path) 53 | 54 | if args.shuffle: 55 | output_mmap_path = prefix + ".idx.shuf.mmap" 56 | output_text_path = prefix + ".idx.shuf.txt" 57 | output_words_path = prefix + ".shuf.txt" 58 | 59 | else: 60 | output_mmap_path = prefix + ".idx.mmap" 61 | output_text_path = prefix + ".idx.txt" 62 | output_words_path = prefix + ".txt" 63 | 64 | if args.word_out: 65 | f_words = open(output_words_path, 'w') 66 | 67 | 68 | input_word_to_id = read_vocab(args.input_vocab_path) 69 | feature_to_id = read_vocab(args.features_vocab_path) 70 | label_to_id = read_vocab(args.labels_vocab_path) 71 | input_vocab_size = len(input_word_to_id) 72 | feature_vocab_size = len(feature_to_id) 73 | label_vocab_size = len(label_to_id) 74 | 75 | 76 | half_context = args.context_size/2 77 | U.xassert(input_word_to_id.has_key(""), "Sentence marker not found in input vocabulary!") 78 | U.xassert(feature_to_id.has_key(""), "Sentence marker not found in feature vocabulary!") 79 | 80 | 81 | _, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.') 82 | # For shuffling only 83 | samples = [] # List of samples 84 | samples_idx = [] 85 | nsamples = 0 86 | 87 | 88 | # Read lines and write to the mmap file 89 | line_num=0 90 | nsamples= 0 91 | 92 | with open(args.input_path, 'r') as input_file, open(args.labels_path, 'r') as labels_file, open(tmp_path, 'w') as tmp_file: 93 | next_id = 0 94 | for line,labels_line in zip(input_file,labels_file): 95 | line_num += 1 # Increment the line number 96 | 97 | line = line.strip() 98 | labels_line = labels_line.strip() # Target labels line 99 | if len(line) == 0: 100 | continue 101 | 102 | tokens = line.split() 103 | ltokens = labels_line.split() 104 | U.xassert(len(tokens) == len(ltokens), "The number of labels does not match the input sentence does not match in line " + str(line_num) ) 105 | #for i in range(num_markers): 106 | # tokens.insert(0, '_') 107 | # tokens.append('_') 108 | 109 | indices = [] 110 | f_indices = [] 111 | for token_idx in xrange(len(ltokens)): 112 | word, feature = tokens[token_idx].split('_') 113 | label = ltokens[token_idx] 114 | U.xassert(feature_to_id.has_key(feature), "Feature " + feature + " not present in feature vocab!") 115 | 116 | sample = [] 117 | sample_idx = [] 118 | 119 | 120 | #### Add words to the sample ##### 121 | # Add sentence padding for words if it is at beginning of sentence 122 | for i in xrange(max(0, half_context - token_idx )): 123 | sample.append("") 124 | sample_idx.append(input_word_to_id[""]) 125 | 126 | sample_words = [replace_unk(token.split('_')[0],input_word_to_id) for token in tokens[max(0, token_idx - half_context): token_idx + half_context + 1]] 127 | sample = sample + sample_words 128 | sample_idx = sample_idx + [input_word_to_id[word] for word in sample_words] 129 | 130 | for i in xrange(max(0, token_idx + half_context + 1 - len(tokens))): 131 | sample.append("") 132 | sample_idx.append(input_word_to_id[""]) 133 | 134 | #### Add features to the sample ##### 135 | # Add sentence padding for features it is at beginning of sentence 136 | for i in xrange(max(0, half_context - token_idx )): 137 | sample.append("") 138 | sample_idx.append(feature_to_id[""]) 139 | 140 | sample_features = [token.split('_')[1] for token in tokens[max(0, token_idx 
- half_context): token_idx + half_context + 1]]
141 | 			sample = sample + sample_features
142 | 			sample_idx = sample_idx + [feature_to_id[feature] for feature in sample_features]
143 | 
144 | 			for i in xrange(max(0, token_idx + half_context + 1 - len(tokens))):
145 | 				sample.append("")
146 | 				sample_idx.append(feature_to_id[""])
147 | 
148 | 			#### Add POS tag to the sample ####
149 | 			sample.append(label)
150 | 			sample_idx.append(label_to_id[label])
151 | 
152 | 			if args.shuffle:
153 | 				samples.append(sample)
154 | 				samples_idx.append(sample_idx)
155 | 			else:
156 | 				tmp_file.write(" ".join([str(idx) for idx in sample_idx]) + "\n")
157 | 				if args.word_out:
158 | 					f_words.write(" ".join([word for word in sample]) + "\n")
159 | 
160 | 			nsamples += 1
161 | 			if nsamples % 100000 == 0:
162 | 				L.info(str(nsamples) + " samples processed.")
163 | 
164 | 
165 | 
166 | 			#print word, feature, label
167 | 
168 | 			#if not input_word_to_id.has_key(word):
169 | 			#	word = ""
170 | 			#indices.append(str(input_word_to_id[word]))
171 | 			#f_indices.append(str(feature_to_id[feature]))
172 | 
173 | # Shuffling the data and writing to tmp file
174 | if args.shuffle:
175 | 	L.info("Shuffling data.")
176 | 	permutation_arr = np.random.permutation(nsamples)
177 | 	with open(tmp_path, 'w') as tmp_file:
178 | 		for index in permutation_arr:
179 | 			tmp_file.write(" ".join([str(idx) for idx in samples_idx[index]]) + "\n")
180 | 			if args.word_out:
181 | 				f_words.write(" ".join([word for word in samples[index]]) + "\n")
182 | 
183 | L.info("Writing to MMap")
184 | # Creating the memory-mapped file
185 | with open(tmp_path, 'r') as data:
186 | 	fp = np.memmap(output_mmap_path, dtype='int32', mode='w+', shape=(nsamples + 5, args.context_size * 2 + 1))
187 | 	fp[0,0] = nsamples			# number of samples
188 | 	fp[0,1] = args.context_size * 2 + 1	# row width: word columns + feature columns + one label column
189 | 	fp[1,0] = 3				# number of input groups (words, features, labels)
190 | 	fp[2,0] = input_vocab_size
191 | 	fp[2,1] = args.context_size		# number of word columns per sample
192 | 	fp[3,0] = feature_vocab_size
193 | 	fp[3,1] = args.context_size		# number of feature columns per sample
194 | 	fp[4,0] = label_vocab_size
195 | 	fp[4,1] = 1				# one label column
196 | 	counter = 5				# the first 5 rows are the header
197 | 	for line in data:
198 | 		tokens = line.split()
199 | 		fp[counter] = tokens
200 | 		counter = counter + 1
201 | 		if counter % 100000 == 0:
202 | 			L.info(str(counter-5) + " samples mapped")
203 | 	L.info(str(counter-5) + " samples mapped")
204 | 	fp.flush()
205 | 	del fp
206 | 
207 | 
208 | shutil.move(tmp_path, output_text_path)	# note: the index text file is written regardless of the --text flag
209 | 
210 | if args.word_out:
211 | 	f_words.close()
212 | 
--------------------------------------------------------------------------------
/dlm/preprocess/bilingual.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import numpy as np
4 | import sys, os
5 | import tempfile
6 | import shutil
7 | import argparse
8 | try:
9 | 	import dlm
10 | except ImportError:
11 | 	print "[ERROR] dlm module not found. 
Add CoreLM root directory to your PYTHONPATH" 12 | sys.exit() 13 | import dlm.utils as U 14 | import dlm.io.logging as L 15 | 16 | 17 | def process_vocab(input_path, vocab_size, vocab_path, has_null): 18 | word_to_id_dict = dict() # Word to Index Dictionary 19 | word_to_freq_dict = dict() # Word Frequency Dictionary 20 | with open(input_path, 'r') as input_file: 21 | for line in input_file: 22 | line = line.strip() 23 | if len(line) == 0: 24 | continue 25 | tokens = line.split() 26 | for token in tokens: 27 | if not word_to_freq_dict.has_key(token): 28 | word_to_freq_dict[token] = 1 29 | else: 30 | word_to_freq_dict[token] += 1 31 | 32 | # Writing the vocab file and creating a word to id dictionary. 33 | curr_index = 0 34 | word_to_id_dict[''] = curr_index 35 | added_tokens = '\n' 36 | curr_index += 1 37 | if has_null: 38 | word_to_id_dict[''] = curr_index 39 | added_tokens += '\n' 40 | curr_index += 1 41 | word_to_id_dict[''] = curr_index 42 | added_tokens += '\n' 43 | curr_index += 1 44 | 45 | if args.endp: 46 | word_to_id_dict[''] = curr_index 47 | added_tokens += '\n' 48 | curr_index += 1 49 | with open(vocab_path, 'w') as f_vocab: 50 | f_vocab.write(added_tokens) 51 | tokens_freq_sorted = sorted(word_to_freq_dict, key=word_to_freq_dict.get, reverse=True) 52 | if vocab_size < len(tokens_freq_sorted): 53 | tokens_freq_sorted = tokens_freq_sorted[0:vocab_size] 54 | for token in tokens_freq_sorted: 55 | f_vocab.write(token+"\n") 56 | word_to_id_dict[token] = curr_index 57 | curr_index = curr_index + 1 58 | return word_to_id_dict 59 | 60 | def read_vocab(vocab_path, endp, has_null): 61 | word_to_id_dict = dict() 62 | with open(vocab_path,'r') as f_vocab: 63 | curr_index = 0 64 | for line in f_vocab: 65 | token = line.strip() 66 | if not word_to_id_dict.has_key(token): 67 | word_to_id_dict[token] = curr_index 68 | curr_index = curr_index + 1 69 | U.xassert(word_to_id_dict.has_key('') and word_to_id_dict.has_key(''), "Missing or in given vocab file") 70 | if has_null: 71 | U.xassert(word_to_id_dict.has_key(''), "Missing in given target vocab file") 72 | if endp: 73 | U.xassert(word_to_id_dict.has_key(''), "Missing in given vocab file while --endp flag is used") 74 | if word_to_id_dict.has_key(''): 75 | U.xassert(args.endp, "Given vocab file has but --endp flag is not activated") 76 | return word_to_id_dict 77 | 78 | def replace_unks(tokens, word_to_id_dict): 79 | replaced_tokens = [] 80 | for token in tokens: 81 | if not word_to_id_dict.has_key(token): 82 | token = "" 83 | replaced_tokens.append(token) 84 | return replaced_tokens 85 | 86 | # Parsing arguments 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument("-is", "--input-source-text", dest="src_input_path", required=True, help="Path to the source langauge training text file") 89 | parser.add_argument("-it", "--input-target-text", dest="trg_input_path", required=True, help="Path to the target language training text file") 90 | parser.add_argument("-ia", "--alignment-file", dest="alignment_path", required=True, help="Alignment file for training text") 91 | 92 | parser.add_argument("-cs", "--source-context", dest="src_context", required=True, type=int, help="(Size of source context window - 1)/ 2") 93 | parser.add_argument("-ct", "--target-context", dest="trg_context", required=True, type=int, help="Size of target ngram (including the output)") 94 | 95 | parser.add_argument("-o", "--output-dir", dest="output_dir_path", required=True, help="Path to output directory") 96 | 97 | parser.add_argument("--shuffle", dest="shuffle", 
action='store_true', help="Add this flag to shuffle the output") 98 | parser.add_argument("--endp", dest="endp", action='store_true', help="Add this flag to add sentence end padding ") 99 | parser.add_argument("--word-output", dest="word_out", action='store_true', help="Get output in non-index format, i.e. as ngrams") 100 | 101 | src_prune_args = parser.add_mutually_exclusive_group(required=True) 102 | src_prune_args.add_argument("-vs","--prune-source-vocab", dest="src_vocab_size", type=int, help="Source vocabulary size") 103 | src_prune_args.add_argument("--source-vocab-file", dest="src_vocab_path", help="Source vocabulary file path") 104 | 105 | trg_prune_args = parser.add_mutually_exclusive_group(required=True) 106 | trg_prune_args.add_argument("-vt","--prune-target-vocab", dest="trg_vocab_size", type=int, help="Target vocabulary size") 107 | trg_prune_args.add_argument("--target-vocab-file", dest="trg_vocab_path", help="Target vocabulary file path") 108 | 109 | output_prune_args = parser.add_mutually_exclusive_group(required=True) 110 | output_prune_args.add_argument("-vo","--prune-output-vocab", dest="output_vocab_size", type=int, help="Output vocabulary size. Defaults to target vocabulary size.") 111 | output_prune_args.add_argument("--output-vocab-file", dest="output_vocab_path", help="Output vocabulary file") 112 | 113 | args = parser.parse_args() 114 | 115 | # Format of the memmap file does not support less than 5 because the first row consists of parameters for the neural network 116 | U.xassert(args.trg_context + args.src_context*2 + 1 > 3, "Total ngram size must be greater than 3. ngrams < 3 are not supported by the current memmap format.") 117 | 118 | L.info("Source Window Size: " + str(args.src_context * 2 + 1)) 119 | L.info("Target Window Size: " + str(args.trg_context - 1)) 120 | L.info("Total Sample Size: " + str(args.trg_context + args.src_context * 2 + 1)) 121 | 122 | if (args.output_vocab_size is None): 123 | args.output_vocab_size = args.trg_vocab_size 124 | 125 | # The output directory is 126 | if (not os.path.exists(args.output_dir_path)): 127 | os.makedirs(args.output_dir_path) 128 | L.info("Output directory: " + os.path.abspath(args.output_dir_path)) 129 | 130 | # Prefix of files 131 | src_prefix = args.output_dir_path + "/" + os.path.basename(args.src_input_path) 132 | trg_prefix = args.output_dir_path + "/" + os.path.basename(args.trg_input_path) 133 | 134 | prefix = os.path.basename(args.src_input_path).split('.')[0] 135 | 136 | output_prefix = args.output_dir_path + "/output" 137 | 138 | # File paths 139 | if args.shuffle: 140 | raise NotImplementedError 141 | output_mmap_path = args.output_dir_path + "/" + prefix + ".idx.shuf.mmap" 142 | output_idx_path = args.output_dir_path + "/" + prefix + ".idx.shuf.txt" 143 | output_ngrams_path = args.output_dir_path + "/" + prefix + ".shuf.txt" 144 | else: 145 | output_mmap_path = args.output_dir_path + "/" + prefix + ".idx.mmap" 146 | output_idx_path = args.output_dir_path + "/" + prefix + ".idx.txt" 147 | output_ngrams_path = args.output_dir_path + "/" + prefix + ".txt" 148 | 149 | tune_output_path = "tune.idx.mmap" 150 | 151 | if args.src_vocab_path is None: 152 | src_word_to_id = process_vocab(args.src_input_path, args.src_vocab_size, src_prefix+'.vocab', has_null=False) # Word to index dictionary of source langauge 153 | else: 154 | src_word_to_id = read_vocab(args.src_vocab_path,args.endp, has_null=False) 155 | 156 | if args.trg_vocab_path is None: 157 | trg_word_to_id = process_vocab(args.trg_input_path, 
args.trg_vocab_size, trg_prefix+'.vocab', has_null=True) # Word to index dictionary of target language
158 | else:
159 | 	trg_word_to_id = read_vocab(args.trg_vocab_path, args.endp, has_null=True)
160 | 
161 | if args.output_vocab_path is None:
162 | 	output_word_to_id = process_vocab(args.trg_input_path, args.output_vocab_size, output_prefix+'.vocab', has_null=True) # Word to index dictionary of the output vocabulary
163 | else:
164 | 	output_word_to_id = read_vocab(args.output_vocab_path, args.endp, has_null=True)
165 | 
166 | svocab = len(src_word_to_id)
167 | tvocab = len(trg_word_to_id)
168 | ovocab = len(output_word_to_id)
169 | 
170 | ## Generating the mmap file
171 | _, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.')
172 | 
173 | # Word output
174 | if args.word_out:
175 | 	f_ngrams = open(output_ngrams_path, 'w')
176 | 
177 | # For shuffling only
178 | samples = []	# List of samples
179 | nsamples = 0
180 | 
181 | sentence_count = 0
182 | 
183 | with open(args.src_input_path,'r') as src_file, open(args.trg_input_path, 'r') as trg_file, open(args.alignment_path, 'r') as align_file, open(tmp_path,'w') as tmp_file:
184 | 	for sline,tline,aline in zip(src_file,trg_file,align_file):
185 | 		stokens = sline[:-1].split()
186 | 		ttokens = tline[:-1].split()
187 | 		atokens = aline[:-1].split()
188 | 		sentence_count += 1
189 | 
190 | 		if args.endp:
191 | 			stokens.append('')
192 | 			ttokens.append('')
193 | 
194 | 		stokens = replace_unks(stokens, src_word_to_id)
195 | 		otokens = replace_unks(ttokens, output_word_to_id)
196 | 		ttokens = replace_unks(ttokens, trg_word_to_id)
197 | 
198 | 		trg_aligns = [[] for t in range(len(ttokens))]
199 | 		for atoken in atokens:
200 | 			sindex,tindex = atoken.split("-")
201 | 			sindex,tindex = int(sindex), int(tindex)
202 | 			trg_aligns[tindex].append(sindex)
203 | 		trg_aligns[-1] = [len(stokens)-1]	# Alignment for the sentence-end token
204 | 
205 | 		for tindex, sindex_list in enumerate(trg_aligns):
206 | 			if sindex_list == []:	# No alignment for this target token: look at nearby tokens, giving preference to the right
207 | 				r_tindex = tindex + 1
208 | 				l_tindex = tindex - 1
209 | 				while r_tindex < len(ttokens) or l_tindex >= 0:
210 | 					if r_tindex < len(ttokens) and trg_aligns[r_tindex]:
211 | 						sindex_list = trg_aligns[r_tindex]
212 | 						break
213 | 					if l_tindex >= 0 and trg_aligns[l_tindex]:
214 | 						sindex_list = trg_aligns[l_tindex]
215 | 						break
216 | 					r_tindex = r_tindex + 1
217 | 					l_tindex = l_tindex - 1
218 | 
219 | 			if sindex_list == []:
220 | 				L.error("No alignments in line " + str(sentence_count))
221 | 
222 | 			mid = (len(sindex_list)-1)/2	# Middle of the source alignments
223 | 			sindex_align = sorted(sindex_list)[mid]
224 | 
225 | 			src_ngrams = []
226 | 			trg_ngrams = []
227 | 
228 | 			ngram_idx = []
229 | 
230 | 			# Get source context
231 | 			for i in range(max(0, args.src_context - sindex_align)):
232 | 				src_ngrams.append("")
233 | 			src_ngrams = src_ngrams + stokens[max(0, sindex_align - args.src_context): sindex_align + args.src_context + 1]
234 | 			for i in range(max(0, sindex_align + args.src_context + 1 - len(stokens))):
235 | 				src_ngrams.append("")
236 | 
237 | 			# Get target context and predicted word
238 | 			for i in range(max(0, args.trg_context - (tindex + 1))):
239 | 				trg_ngrams.append("")
240 | 			trg_ngrams = trg_ngrams + ttokens[max(0, tindex + 1 - args.trg_context): tindex]
241 | 
242 | 			output_word = otokens[tindex]
243 | 
244 | 			sample = " ".join(src_ngrams) + " " + " ".join(trg_ngrams) + " " + output_word + "\n"
245 | 			sample_idx = " ".join([str(src_word_to_id[stoken] + tvocab) for stoken in src_ngrams])
246 | 			sample_idx += " " + " ".join([str(trg_word_to_id[ttoken]) for ttoken in trg_ngrams])
247 | 			sample_idx += " " + str(output_word_to_id[output_word]) + "\n"
248 | 
249 | 			if args.shuffle:
250 | 				samples.append(sample)
251 | 				samples_idx.append(sample_idx)
252 | 			else:
253 | 				tmp_file.write(sample_idx)
254 | 				if args.word_out:
255 | 					f_ngrams.write(sample)
256 | 
257 | 			nsamples += 1
258 | 			if nsamples % 10000000 == 0:
259 | 				L.info(str(nsamples) + " samples processed.")
260 | 
261 | # Shuffling the data and writing to tmp file
262 | if args.shuffle:
263 | 	permutation_arr = np.random.permutation(nsamples)
264 | 	with open(tmp_path, 'w') as tmp_file:
265 | 		for index in permutation_arr:
266 | 			tmp_file.write(samples_idx[index])
267 | 			if args.word_out:
268 | 				f_ngrams.write(samples[index])
269 | 
270 | ngram_size = args.trg_context + args.src_context * 2 + 1
271 | 
272 | # Creating the memory-mapped file
273 | with open(tmp_path, 'r') as data:
274 | 	fp = np.memmap(output_mmap_path, dtype='int32', mode='w+', shape=(nsamples + 3, ngram_size))
275 | 	fp[0,0] = nsamples		# number of samples
276 | 	fp[0,1] = ngram_size		# n-gram size (row width)
277 | 	fp[1,0] = svocab + tvocab	# context vocab size (source + target)
278 | 	fp[2,0] = ovocab		# output vocab size
279 | 	counter = 3			# the first 3 rows are the header
280 | 	for line in data:
281 | 		tokens = line.split()
282 | 		fp[counter] = tokens
283 | 		counter = counter + 1
284 | 		if counter % 10000000 == 0:
285 | 			L.info(str(counter-3) + " samples mapped")
286 | 	L.info(str(counter-3) + " samples mapped")
287 | 	fp.flush()
288 | 	del fp
289 | 
290 | shutil.move(tmp_path, output_idx_path)
291 | 
292 | if args.word_out:
293 | 	f_ngrams.close()
294 | 
--------------------------------------------------------------------------------
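The three preprocessing scripts above (monolingual.py, features.py and bilingual.py) all emit the same kind of int32 memory-mapped matrix: a few header rows carrying the sample count, the row width and the vocabulary sizes, followed by one row of word indices per sample. The sketch below shows how such a file could be read back with NumPy. It is illustrative only and is not the project's own reader (dlm/io/mmapReader.py presumably fills that role); the function name read_mmap_matrix and the header_rows parameter are assumptions for this example. monolingual.py and bilingual.py write 3 header rows, while features.py writes 5.

import numpy as np

def read_mmap_matrix(path, header_rows=3):
	# Hypothetical helper (not part of CoreLM): load a memmap written by the
	# preprocessing scripts above. Cell [0,0] holds the number of samples and
	# cell [0,1] the row width, so the shape can be recovered from the file itself.
	flat = np.memmap(path, dtype='int32', mode='r')
	nsamples = int(flat[0])       # fp[0,0]: number of samples
	row_width = int(flat[1])      # fp[0,1]: sample / n-gram width
	data = flat.reshape(-1, row_width)
	header = np.array(data[:header_rows])                # vocab sizes etc.
	samples = data[header_rows:header_rows + nsamples]   # one row of indices per sample
	return header, samples

For a file produced by bilingual.py, header[1,0] is the combined source-plus-target context vocabulary size and header[2,0] the output vocabulary size, matching the assignments made above.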