├── dlm ├── __init__.py ├── io │ ├── __init__.py │ ├── logging.py │ ├── ngramsReader.py │ ├── vocabReader.py │ ├── w2vEmbReader.py │ ├── plotting.py │ ├── textReader.py │ ├── mmapReader.py │ └── nbestReader.py ├── algorithms │ ├── asgd.py │ ├── __init__.py │ ├── lr_tuner.py │ ├── sgd.py │ └── sgd_using_inputs.py ├── models │ ├── __init__.py │ ├── components │ │ ├── __init__.py │ │ ├── operation.py │ │ ├── linear.py │ │ ├── activation.py │ │ └── lookuptable.py │ ├── classifier.py │ └── mlp.py ├── criterions │ ├── __init__.py │ ├── bce.py │ ├── nll.py │ ├── weighted_nll.py │ └── nce.py ├── reranker │ ├── __init__.py │ ├── mosesIniReader.py │ ├── augmenter.py │ ├── tools.py │ ├── rerank.py │ ├── oracle.py │ ├── train.py │ └── bleu.py ├── preprocess │ ├── convert_to_memmap.py │ ├── monolingual.py │ ├── features.py │ └── bilingual.py ├── misc │ ├── nplm_to_corelm.py │ └── corelm_to_nplm.py ├── eval.py ├── trainer.py └── utils.py ├── .gitignore ├── LICENSE.md ├── classify.py ├── test.py ├── README.md └── train.py /dlm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/algorithms/asgd.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/reranker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlm/models/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.class 3 | *.jar 4 | *.war 5 | *.ear 6 | 7 | *.o 8 | 9 | *.pdf 10 | *.PDF 11 | *.bin 12 | *.aux 13 | *.bbl 14 | *.blg 15 | *.log 16 | *.backup 17 | 18 | *~ 19 | .* 20 | 21 | deleted 22 | -------------------------------------------------------------------------------- /dlm/criterions/bce.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | class BinaryCrossEntropy(): 4 | 5 | def __init__(self, classifier, args): 6 | 7 | self.y = T.matrix('y') 8 | 9 | self.cost = ( 10 | classifier.mean_batch_cross_entropy(self.y) 11 | + args.L1_reg * classifier.L1 12 | + args.L2_reg * classifier.L2_sqr 13 | ) 14 | -------------------------------------------------------------------------------- /dlm/reranker/mosesIniReader.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def 
parseIni(ini_path): 4 | out = [] 5 | with open(ini_path, 'r') as ini_file: 6 | section = '[nil]' 7 | for line in ini_file: 8 | line = line.strip() 9 | if line.startswith('['): 10 | section = line 11 | elif section == '[weight]' and line != '': 12 | if line.startswith('UnknownWordPenalty0= '): 13 | out.append('UnknownWordPenalty0 UNTUNEABLE') 14 | else: 15 | out.append(line) 16 | return out 17 | -------------------------------------------------------------------------------- /dlm/algorithms/lr_tuner.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import theano.tensor as T 3 | import theano 4 | import numpy 5 | 6 | class LRTuner: 7 | def __init__(self, low, high, inc): 8 | self.low = low 9 | self.high = high 10 | self.inc = inc 11 | self.prev_ppl = numpy.inf 12 | 13 | def adapt_lr(self, curr_ppl, curr_lr): 14 | if curr_ppl >= self.prev_ppl: 15 | lr = max(curr_lr / 2, self.low) 16 | else: 17 | lr = min(curr_lr + self.inc, self.high) 18 | self.prev_ppl = curr_ppl 19 | return lr 20 | -------------------------------------------------------------------------------- /dlm/models/components/operation.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | import dlm.io.logging as L 3 | class Operation(): 4 | 5 | def __init__(self, input, op_name): 6 | self.input = input 7 | self.operate = self.get_operation(op_name) 8 | self.output = self.operate(input, axis=1) 9 | 10 | def get_operation(self, op_name): 11 | if op_name == 'sum': 12 | return T.sum 13 | elif op_name == 'mean': 14 | return T.mean 15 | elif op_name == 'max': 16 | return T.max 17 | else: 18 | L.error('Invalid operation name given: ' + op_name) 19 | -------------------------------------------------------------------------------- /dlm/criterions/nll.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | class NegLogLikelihood(): 4 | 5 | def __init__(self, classifier, args): 6 | 7 | self.y = T.ivector('y') 8 | 9 | self.cost = ( 10 | classifier.negative_log_likelihood(self.y) 11 | + args.L1_reg * classifier.L1 12 | + args.L2_reg * classifier.L2_sqr 13 | ) 14 | 15 | if args.alpha is not None and args.alpha > 0: 16 | self.cost = self.cost + args.alpha * classifier.log_Z_sqr 17 | 18 | self.test = ( 19 | T.mean(classifier.p_y_given_x(self.y)) 20 | ) 21 | -------------------------------------------------------------------------------- /dlm/criterions/weighted_nll.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | class NegLogLikelihood(): 4 | 5 | def __init__(self, classifier, args): 6 | 7 | self.y = T.ivector('y') 8 | self.w = T.vector('w') 9 | 10 | if args.instance_weights_path: 11 | self.cost = classifier.negative_log_likelihood(self.y, self.w) 12 | else: 13 | self.cost = classifier.negative_log_likelihood(self.y) 14 | 15 | if args.L1_reg > 0: 16 | self.cost = self.cost + args.L1_reg * classifier.L1 17 | 18 | if args.L2_reg > 0: 19 | self.cost = self.cost + args.L2_reg * classifier.L2_sqr 20 | 21 | if args.alpha and args.alpha > 0: 22 | self.cost = self.cost + args.alpha * classifier.log_Z_sqr 23 | 24 | self.test = ( 25 | T.mean(classifier.p_y_given_x(self.y)) 26 | ) 27 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2016 
National University of Singapore 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /dlm/io/logging.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import dlm.utils as U 3 | 4 | file_path = None 5 | quiet = False 6 | 7 | def set_file_path(path): 8 | global file_path 9 | file_path = path 10 | log_file = open(file_path, 'w') # reset the file 11 | log_file.close() 12 | info('Log file: ' + path) 13 | 14 | def error(message): 15 | stderr = U.BColors.BFAIL + "[ERROR] " + U.BColors.ENDC + message + "\n" 16 | log = "[ERROR] " + U.BColors.cleared(message) + "\n" 17 | _write(stderr, log) 18 | sys.exit() 19 | 20 | def warning(message): 21 | stderr = U.BColors.BWARNING + "[WARNING] " + U.BColors.ENDC + message + "\n" 22 | log = "[WARNING] " + U.BColors.cleared(message) + "\n" 23 | _write(stderr, log) 24 | 25 | def info(message): 26 | stderr = U.BColors.BOKBLUE + "[INFO] " + U.BColors.ENDC + message + "\n" 27 | log = "[INFO] " + U.BColors.cleared(message) + "\n" 28 | _write(stderr, log) 29 | 30 | def exception(): 31 | exc = str(sys.exc_info()[0].mro()[0].__name__) + ": " + sys.exc_info()[1].message + "\n" 32 | stderr = U.BColors.BFAIL + "[ERROR] " + U.BColors.ENDC + exc 33 | log = "[ERROR] " + exc 34 | _write(stderr, log) 35 | sys.exit() 36 | 37 | def _write(stderr, log): 38 | global quiet 39 | if not quiet: 40 | sys.stderr.write(stderr) 41 | global file_path 42 | if file_path: 43 | log_file = open(file_path, 'a') 44 | log_file.write(log) 45 | log_file.close() 46 | -------------------------------------------------------------------------------- /dlm/io/ngramsReader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | from dlm.io.vocabReader import VocabManager 5 | from dlm.io.nbestReader import NBestList 6 | import numpy as np 7 | import codecs 8 | import theano 9 | import theano.tensor as T 10 | 11 | class NgramsReader(): 12 | 13 | def __init__(self, dataset_path, ngram_size, vocab_path): 14 | 15 | L.info("Initializing dataset from: " + dataset_path) 16 | 17 | vocab = VocabManager(vocab_path) 18 | 19 | curr_index = 0 20 | self.num_sentences = 0 21 | 22 | ngrams_list = [] 23 | dataset = codecs.open(dataset_path, 'r', encoding="UTF-8") 24 | for line in dataset: 25 | tokens = line.split() 26 | ngrams = vocab.get_ids_given_word_list(tokens) 27 | ngrams_list.append(ngrams) 28 | curr_index += 
1 29 | dataset.close() 30 | 31 | data = np.asarray(ngrams_list) 32 | 33 | x = data[:,0:-1] 34 | y = data[:,-1] 35 | self.num_samples = y.shape[0] 36 | 37 | self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') 38 | self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') 39 | 40 | def get_x(self, index): 41 | return self.shared_x[ index : index+1 ] 42 | 43 | def get_y(self, index): 44 | return self.shared_y[ index : index+1 ] 45 | 46 | def get_num_batches(self): 47 | return self.num_samples 48 | 49 | def _get_num_samples(self): 50 | return self.num_samples 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /dlm/io/vocabReader.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | 5 | class VocabManager: 6 | def __init__(self, input_path): 7 | L.info("Initializing vocabulary from: " + input_path) 8 | self.word_to_id_dict = dict() 9 | self.id_to_word_dict = dict() 10 | curr_id = 0 11 | with codecs.open(input_path, 'r', encoding='UTF-8') as input_file: 12 | for line in input_file: 13 | word = line.strip() 14 | self.word_to_id_dict[word] = curr_id 15 | self.id_to_word_dict[curr_id] = word 16 | curr_id += 1 17 | try: 18 | self.unk_id = self.word_to_id_dict['<unk>'] 19 | self.padding_id = self.word_to_id_dict['<s>'] 20 | except KeyError: 21 | L.error("Given vocab file does not include <unk> or <s>") 22 | self.has_end_padding = self.word_to_id_dict.has_key('</s>') 23 | 24 | def get_word_given_id(self, id): 25 | try: 26 | return self.id_to_word_dict[id] 27 | except KeyError: 28 | raise KeyError 29 | 30 | def get_id_given_word(self, word): 31 | try: 32 | return self.word_to_id_dict[word] 33 | except KeyError: 34 | return self.unk_id 35 | 36 | def get_ids_given_word_list(self, word_list): 37 | output = [] 38 | for word in word_list: 39 | output.append(self.get_id_given_word(word)) 40 | return output 41 | 42 | def get_words_given_id_list(self, id_list): 43 | output = [] 44 | for id in id_list: 45 | output.append(self.get_word_given_id(id)) 46 | return output 47 | -------------------------------------------------------------------------------- /dlm/models/classifier.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from theano.misc.pkl_utils import pickle 3 | import gzip 4 | import dlm.io.logging as L 5 | import time 6 | import dlm.utils as U 7 | 8 | class Classifier: 9 | 10 | def __init__(self): 11 | self.params = [] 12 | 13 | def get_params(self): 14 | return self.params 15 | 16 | def set_params(self, params): 17 | U.xassert(len(self.params) == len(params), 'The given model file is not consistent with the architecture') 18 | for param, loaded_param in zip(self.params, params): 19 | param.set_value(loaded_param) 20 | 21 | def load_model(self, model_path): 22 | L.info('Loading model from ' + model_path) 23 | t0 = time.time() 24 | if model_path.endswith('.gz'): 25 | with gzip.open(model_path, 'rb') as model_file: 26 | args, params = pickle.load(model_file) 27 | else: 28 | with open(model_path, 'r') as model_file: 29 | args, params = pickle.load(model_file) 30 | L.info(' |-> took %.2f seconds' % (time.time() - t0)) 31 | return args, params 32 | 33 | def save_model(self, model_path, zipped=True, compress_level=5): 34 | L.info('Saving model to ' + model_path) 35 | t0 = time.time() 36 | if zipped: 37 | with gzip.open(model_path, 'wb', compresslevel=compress_level) as model_file: 38 | params = 
self.get_params() 39 | pickle.dump((self.args, [param.get_value() for param in params]), model_file) 40 | else: 41 | with open(model_path, 'w') as model_file: 42 | params = self.get_params() 43 | pickle.dump((self.args, [param.get_value() for param in params]), model_file) 44 | L.info(' |-> took %.2f seconds' % (time.time() - t0)) 45 | -------------------------------------------------------------------------------- /dlm/models/components/linear.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import theano 3 | import theano.tensor as T 4 | import dlm.io.logging as L 5 | import dlm.utils as U 6 | 7 | class Linear(): 8 | 9 | def __init__(self, rng, input, n_in, n_out, W_values=None, init_method=0, b_values=None, no_bias=False, suffix=None): 10 | 11 | L.info("Linear layer, #inputs: %s, #outputs: %s" % (U.red(n_in), U.red(n_out))) 12 | 13 | self.input = input 14 | 15 | if W_values is None: 16 | if init_method == 0: # Useful for Relu activation 17 | high = 0.01 18 | elif init_method == 1: # Useful for Tanh activation 19 | high = numpy.sqrt(6. / (n_in + n_out)) 20 | elif init_method == 2: # Useful for Sigmoid activation 21 | high = 4 * numpy.sqrt(6. / (n_in + n_out)) 22 | else: 23 | L.error('Invalid initialization method') 24 | W_values = numpy.asarray( 25 | rng.uniform( 26 | low=-high, 27 | high=high, 28 | size=(n_in, n_out) 29 | ), 30 | dtype=theano.config.floatX 31 | ) 32 | 33 | if b_values is None and not no_bias: 34 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 35 | 36 | W_name = 'W' 37 | if suffix is not None: 38 | W_name += '.' + str(suffix) 39 | 40 | W = theano.shared(value=W_values, name=W_name, borrow=True) 41 | self.W = W 42 | 43 | if no_bias: 44 | self.output = T.dot(input, self.W) 45 | self.params = [self.W] 46 | else: 47 | b_name = 'b' 48 | if suffix is not None: 49 | b_name += '.' 
+ str(suffix) 50 | b = theano.shared(value=b_values, name=b_name, borrow=True) 51 | self.b = b 52 | self.output = T.dot(input, self.W) + self.b 53 | self.params = [self.W, self.b] 54 | -------------------------------------------------------------------------------- /dlm/algorithms/sgd.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | import theano 3 | import dlm.io.logging as L 4 | 5 | class SGD: 6 | def __init__(self, classifier, criterion, learning_rate, trainset, clip_threshold=0): 7 | self.eta = learning_rate 8 | self.is_weighted = trainset.is_weighted 9 | 10 | if clip_threshold > 0: 11 | gparams = [T.clip(T.grad(criterion.cost, param), -clip_threshold, clip_threshold) for param in classifier.params] 12 | else: 13 | gparams = [T.grad(criterion.cost, param) for param in classifier.params] 14 | 15 | lr = T.fscalar() 16 | 17 | updates = [ 18 | (param, param - lr * gparam) 19 | for param, gparam in zip(classifier.params, gparams) 20 | ] 21 | 22 | index = T.lscalar() # index to a [mini]batch 23 | x = classifier.input 24 | y = criterion.y 25 | 26 | if self.is_weighted: 27 | w = criterion.w 28 | self.step_func = theano.function( 29 | inputs=[index, lr], 30 | outputs=[criterion.cost] + gparams, 31 | updates=updates, 32 | givens={ 33 | x: trainset.get_x(index), 34 | y: trainset.get_y(index), 35 | w: trainset.get_w(index) 36 | } 37 | ) 38 | else: 39 | self.step_func = theano.function( 40 | inputs=[index, lr], 41 | outputs=[criterion.cost] + gparams, 42 | updates=updates, 43 | givens={ 44 | x: trainset.get_x(index), 45 | y: trainset.get_y(index) 46 | } 47 | ) 48 | 49 | def step(self, minibatch_index): 50 | outputs = self.step_func(minibatch_index, self.eta) 51 | step_cost, gparams = outputs[0], outputs[1:] 52 | return step_cost, gparams 53 | 54 | def set_learning_rate(self, eta): 55 | self.eta = eta 56 | 57 | def get_learning_rate(self): 58 | return self.eta 59 | -------------------------------------------------------------------------------- /dlm/algorithms/sgd_using_inputs.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | import theano 3 | import dlm.io.logging as L 4 | 5 | class SGD: 6 | def __init__(self, classifier, criterion, learning_rate, trainset, clip_threshold=0): 7 | self.eta = learning_rate 8 | self.is_weighted = trainset.is_weighted 9 | self.trainset = trainset 10 | 11 | if clip_threshold > 0: 12 | gparams = [T.clip(T.grad(criterion.cost, param), -clip_threshold, clip_threshold) for param in classifier.params] 13 | else: 14 | gparams = [T.grad(criterion.cost, param) for param in classifier.params] 15 | 16 | lr = T.fscalar() 17 | 18 | updates = [ 19 | (param, param - lr * gparam) 20 | for param, gparam in zip(classifier.params, gparams) 21 | ] 22 | 23 | x = classifier.input 24 | y = criterion.y 25 | 26 | if self.is_weighted: 27 | w = criterion.w 28 | self.step_func = theano.function( 29 | inputs=[x, y, w, lr], 30 | outputs=[criterion.cost] + gparams, 31 | updates=updates, 32 | ) 33 | else: 34 | self.step_func = theano.function( 35 | inputs=[x, y, lr], 36 | outputs=[criterion.cost] + gparams, 37 | updates=updates, 38 | ) 39 | 40 | def step(self, minibatch_index): 41 | if self.is_weighted: 42 | outputs = self.step_func(self.trainset.get_x(minibatch_index), self.trainset.get_y(minibatch_index), self.trainset.get_w(minibatch_index), self.eta) 43 | else: 44 | outputs = self.step_func(self.trainset.get_x(minibatch_index), 
self.trainset.get_y(minibatch_index), self.eta) 45 | step_cost, gparams = outputs[0], outputs[1:] 46 | return step_cost, gparams 47 | 48 | def set_learning_rate(self, eta): 49 | self.eta = eta 50 | 51 | def get_learning_rate(self): 52 | return self.eta 53 | -------------------------------------------------------------------------------- /dlm/models/components/activation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import theano.tensor as T 3 | import dlm.io.logging as L 4 | import dlm.utils as U 5 | 6 | class Activation(): 7 | 8 | def __init__(self, input, func_name): 9 | L.info("Activation layer, function: " + U.red(func_name)) 10 | self.input = input 11 | self.func = self.get_function(func_name) 12 | self.output = self.func(input) 13 | 14 | def get_function(self, func_name): 15 | if func_name == 'tanh': 16 | return T.tanh 17 | elif func_name == 'hardtanh': 18 | L.warning('Current hardTanh implementation is slow!') 19 | return lambda x: ((abs(x) <= 1) * x) + ((1 < abs(x)) * T.sgn(x)) 20 | elif func_name == 'xtanh': 21 | return lambda x: T.tanh(x) + 0.1 * x 22 | elif func_name == 'sigmoid': 23 | return T.nnet.sigmoid 24 | elif func_name == 'fastsigmoid': 25 | L.error('T.nnet.ultra_fast_sigmoid function has some problems') 26 | elif func_name == 'hardsigmoid': 27 | return T.nnet.hard_sigmoid 28 | elif func_name == 'xsigmoid': 29 | return lambda x: T.nnet.sigmoid(x) + 0.1 * x 30 | elif func_name == 'softplus': 31 | return T.nnet.softplus 32 | elif func_name == 'relu': 33 | #return lambda x: T.maximum(x, 0) 34 | return lambda x: x * (x > 0) 35 | #return T.nnet.relu # Update theano and then use this one instead 36 | elif func_name == 'leakyrelu': 37 | return lambda x: T.maximum(x, 0.01 * x) 38 | elif func_name == 'cappedrelu': 39 | return lambda x: T.minimum(x * (x > 0), 6) 40 | elif func_name == 'softmax': 41 | return T.nnet.softmax 42 | elif func_name == 'norm1': 43 | return lambda x: x / T.nlinalg.norm(x, 1) 44 | elif func_name == 'norm2': 45 | #return lambda x: x / T.nlinalg.norm(x, 2) 46 | return lambda x: x / T.dot(x, x)**0.5 47 | else: 48 | L.error('Invalid function name given: ' + func_name) 49 | -------------------------------------------------------------------------------- /dlm/criterions/nce.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | from theano.tensor.shared_randomstreams import RandomStreams 3 | from theano import function 4 | import numpy as np 5 | import theano 6 | import math 7 | 8 | class NCELikelihood(): 9 | 10 | def __init__(self, classifier, args, noise_dist): 11 | self.y = T.ivector('y') 12 | 13 | ## Cost function 14 | # Sum over minibatch instances (log ( u(w|c) / (u(w|c) + k * p_n(w)) ) + sum over noise samples ( log ( u(x|c) / ( u(x|c) + k * p_n(x) ) ))) 15 | 16 | # Generating noise samples 17 | srng = RandomStreams(seed=1234) 18 | noise_samples = srng.choice(size=(self.y.shape[0],args.num_noise_samples), a=args.num_classes, p=noise_dist, dtype='int32') 19 | 20 | log_noise_dist = theano.shared(np.log(noise_dist.get_value()),borrow=True) 21 | #log_num_noise_samples = theano.shared(math.log(args.num_noise_samples)).astype(theano.config.floatX) 22 | log_num_noise_samples = theano.shared(np.log(args.num_noise_samples,dtype=theano.config.floatX)) 23 | # Data Part of Cost Function: log ( u(w|c) / (u(w|c) + k * p_n(w)) 24 | data_scores = classifier.output[T.arange(self.y.shape[0]),self.y] 25 | data_denom = 
self.logadd(data_scores, log_num_noise_samples + log_noise_dist[self.y] ) 26 | data_prob = data_scores - data_denom 27 | # Summation of Noise Part of Cost Function: sum over noise samples ( log ( u(x|c) / ( u(x|c) + k * p_n(x) ) )) 28 | noise_mass = log_num_noise_samples + log_noise_dist[noise_samples] # log(k) + log(p_n(x)) for all noise samples (Shape: #instances x k) 29 | noise_scores = classifier.output[T.arange(noise_samples.shape[0]).reshape((-1,1)),noise_samples] 30 | noise_denom = self.logadd(noise_scores, noise_mass) 31 | noise_prob_sum = T.sum(noise_mass - noise_denom, axis=1) 32 | 33 | self.cost = ( 34 | -T.mean(data_prob + noise_prob_sum) 35 | ) 36 | self.test = ( 37 | T.sum(data_scores) 38 | ) 39 | 40 | def logadd(self, a, b): 41 | g = T.maximum(a,b) 42 | l = T.minimum(a,b) 43 | return g + T.log(1 + T.exp(l-g)) 44 | 45 | -------------------------------------------------------------------------------- /dlm/reranker/augmenter.py: -------------------------------------------------------------------------------- 1 | import time 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | from dlm.models.mlp import MLP 5 | from dlm import eval 6 | from dlm.io.nbestReader import NBestList 7 | from dlm.io.vocabReader import VocabManager 8 | import numpy as np 9 | 10 | def augment(model_path, input_nbest_path, vocab_path, output_nbest_path): 11 | classifier = MLP(model_path=model_path) 12 | evaluator = eval.Evaluator(None, classifier) 13 | 14 | vocab = VocabManager(vocab_path) 15 | 16 | ngram_size = classifier.ngram_size 17 | 18 | def get_ngrams(tokens): 19 | for i in range(ngram_size - 1): 20 | tokens.insert(0, '<s>') 21 | if vocab.has_end_padding: 22 | tokens.append('</s>') 23 | indices = vocab.get_ids_given_word_list(tokens) 24 | return U.get_all_windows(indices, ngram_size) 25 | 26 | input_nbest = NBestList(input_nbest_path, mode='r') 27 | output_nbest = NBestList(output_nbest_path, mode='w') 28 | 29 | L.info('Augmenting: ' + input_nbest_path) 30 | 31 | start_time = time.time() 32 | 33 | counter = 0 34 | cache = dict() 35 | for group in input_nbest: 36 | ngram_list = [] 37 | for item in group: 38 | tokens = item.hyp.split() 39 | ngrams = get_ngrams(tokens) 40 | for ngram in ngrams: 41 | if not cache.has_key(str(ngram)): 42 | ngram_list.append(ngram) 43 | cache[str(ngram)] = 1000 44 | if len(ngram_list) > 0: 45 | ngram_array = np.asarray(ngram_list, dtype='int32') 46 | ngram_log_prob_list = evaluator.get_ngram_log_prob(ngram_array[:,0:-1], ngram_array[:,-1]) 47 | for i in range(len(ngram_list)): 48 | cache[str(ngram_list[i])] = ngram_log_prob_list[i] 49 | for item in group: 50 | tokens = item.hyp.split() 51 | ngrams = get_ngrams(tokens) 52 | sum_ngram_log_prob = 0 53 | for ngram in ngrams: 54 | sum_ngram_log_prob += cache[str(ngram)] 55 | item.append_feature(sum_ngram_log_prob) 56 | output_nbest.write(item) 57 | #print counter 58 | counter += 1 59 | output_nbest.close() 60 | 61 | L.info("Ran for %.2fs" % (time.time() - start_time)) 62 | -------------------------------------------------------------------------------- /dlm/io/w2vEmbReader.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | 5 | class W2VEmbReader: 6 | def __init__(self, emb_path): 7 | L.info('Loading embeddings from: ' + emb_path) 8 | has_header=False 9 | with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: 10 | tokens = emb_file.next().split() 11 | if len(tokens) == 2: 12 | try: 13 | int(tokens[0]) 14 | 
int(tokens[1]) 15 | has_header = True 16 | except ValueError: 17 | pass 18 | if has_header: 19 | with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: 20 | tokens = emb_file.next().split() 21 | U.xassert(len(tokens) == 2, 'The first line in W2V embeddings must be the pair (vocab_size, emb_dim)') 22 | self.vocab_size = int(tokens[0]) 23 | self.emb_dim = int(tokens[1]) 24 | self.embeddings = {} 25 | counter = 0 26 | for line in emb_file: 27 | tokens = line.split() 28 | U.xassert(len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info') 29 | word = tokens[0] 30 | vec = tokens[1:] 31 | self.embeddings[word] = vec 32 | counter += 1 33 | U.xassert(counter == self.vocab_size, 'Vocab size does not match the header info') 34 | else: 35 | with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: 36 | self.vocab_size = 0 37 | self.emb_dim = -1 38 | self.embeddings = {} 39 | for line in emb_file: 40 | tokens = line.split() 41 | if self.emb_dim == -1: 42 | self.emb_dim = len(tokens) - 1 43 | else: 44 | U.xassert(len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info') 45 | word = tokens[0] 46 | vec = tokens[1:] 47 | self.embeddings[word] = vec 48 | self.vocab_size += 1 49 | 50 | L.info(' #vectors: %i, #dimensions: %i' % (self.vocab_size, self.emb_dim)) 51 | 52 | def get_emb_given_word(self, word): 53 | try: 54 | return self.embeddings[word] 55 | except KeyError: 56 | return None 57 | 58 | def get_emb_dim(self): 59 | return self.emb_dim 60 | -------------------------------------------------------------------------------- /dlm/io/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Pdf') 3 | import matplotlib.pyplot as plt 4 | import sys 5 | import math 6 | import numpy as np 7 | 8 | class Plotter: 9 | 10 | def __init__(self, path, title=None, xlabel=None, ylabel=None, xspace=1, yspace=0.5): 11 | self.path = path 12 | self.title = title 13 | self.xlabel = xlabel 14 | self.ylabel = ylabel 15 | self.xspace = xspace 16 | self.yspace = yspace 17 | self.series_list = [] 18 | #self.tix_list = ['b*-', 'ro--'] 19 | self.tix_list = ['b-', 'r--'] 20 | 21 | def plot(self): 22 | plt.title(self.title, y=1.01, fontsize='medium') 23 | plt.xlabel(self.xlabel) 24 | plt.ylabel(self.ylabel) 25 | plt.grid('on') 26 | plt.margins(0.1) 27 | for i in range(len(self.series_list)): 28 | x_list = [] 29 | y_list = [] 30 | for x in sorted(self.series_list[i].keys()): 31 | x_list.append(x) 32 | y_list.append(self.series_list[i][x]) 33 | #xmin, xmax = min(self.x_list), max(self.x_list) + 1 34 | #ymin, ymax = min(self.y_list), max(self.y_list) + 1 35 | #plt.xticks(np.arange(xmin, xmax, 1.0), np.arange(xmin, xmax, 1.0), fontsize='x-small') 36 | #plt.yticks(np.arange(ymin, ymax, 0.5), np.arange(ymin, ymax, 0.5), fontsize='x-small') 37 | plt.plot(x_list, y_list, self.tix_list[i], label='S' + str(i)) 38 | plt.legend(bbox_to_anchor=(1.15, 0.5), loc='center right', borderaxespad=0.2, fontsize='x-small') 39 | plt.savefig(self.path, format='pdf', bbox_inches='tight', pad_inches=0.3) 40 | plt.cla() 41 | 42 | def add(self, series_index, x, y): 43 | assert series_index < len(self.tix_list) 44 | assert series_index <= len(self.series_list) 45 | if series_index == len(self.series_list): 46 | self.series_list.append(dict()) 47 | series = self.series_list[series_index] 48 | series[x] = y 49 | self.plot() 50 | 51 | def add_list(self, series_index, x_list, y_list): 52 | assert series_index < 
len(self.tix_list) 53 | assert series_index <= len(self.series_list) 54 | if series_index == len(self.series_list): 55 | self.series_list.append(dict()) 56 | series = self.series_list[series_index] 57 | for x, y in zip(x_list, y_list): 58 | series[x] = y 59 | self.plot() 60 | 61 | def set_tix_list(self, tix_list): 62 | self.tix_list = tix_list 63 | 64 | -------------------------------------------------------------------------------- /dlm/models/components/lookuptable.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import dlm.utils as U 6 | import dlm.io.logging as L 7 | from dlm.io.vocabReader import VocabManager 8 | from dlm.io.w2vEmbReader import W2VEmbReader 9 | 10 | class LookupTable(): 11 | 12 | def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False): 13 | 14 | L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim))) 15 | 16 | self.input = input 17 | 18 | self.emb_matrix = emb_matrix 19 | 20 | if self.emb_matrix is None: 21 | self.emb_matrix = numpy.asarray( 22 | rng.uniform( 23 | low=-0.01, #low=-1, 24 | high=0.01, #high=1, 25 | size=(vocab_size, emb_dim) 26 | ), 27 | dtype=theano.config.floatX 28 | ) 29 | 30 | if emb_path: 31 | U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.') 32 | self.initialize(emb_path, vocab_path) 33 | 34 | self.embeddings = theano.shared(value=self.emb_matrix, name='embeddings', borrow=True) 35 | 36 | if add_weights: 37 | weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX) 38 | self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True) 39 | 40 | # Check if the speed can be improved 41 | self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input] 42 | #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input] 43 | #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input] 44 | 45 | self.params = [self.embeddings, self.weights] 46 | else: 47 | self.output = self.embeddings[input] 48 | self.params = [self.embeddings] 49 | 50 | if concat: 51 | self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1])) 52 | 53 | def initialize(self, emb_path, vocab_path): 54 | L.info('Initializing lookup table') 55 | vm = VocabManager(vocab_path) 56 | w2v = W2VEmbReader(emb_path) 57 | U.xassert(w2v.get_emb_dim() == self.emb_matrix.shape[1], 'The embeddings dimension does not match with the given word embeddings') 58 | for i in range(self.emb_matrix.shape[0]): 59 | vec = w2v.get_emb_given_word(vm.get_word_given_id(i)) 60 | if vec: 61 | self.emb_matrix[i] = vec 62 | -------------------------------------------------------------------------------- /classify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import time 5 | import argparse 6 | import dlm.utils as U 7 | import dlm.io.logging as L 8 | import numpy 9 | 10 | ############### 11 | ## Arguments 12 | # 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("-i", "--input-file", dest="input_path", required=True, help="Input feature file") 16 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", required=True, help="The vocabulary file for the model") 17 | parser.add_argument("-rv", "--restricted-vocab-file", dest="restricted_vocab_path", help="Restricted vocab file to 
predict the word") 18 | parser.add_argument("-m", "--model-file", dest="model_path", required=True, help="Input CoreLM model file") 19 | parser.add_argument("-o", "--output-file",dest="output_path", required=True, help="Output File path.") 20 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu)") 21 | args = parser.parse_args() 22 | 23 | U.set_theano_device(args.device,1) 24 | 25 | from dlm.models.mlp import MLP 26 | from dlm import eval 27 | import theano 28 | import theano.tensor as T 29 | 30 | ######################### 31 | ## Loading model 32 | # 33 | 34 | classifier = MLP(model_path=args.model_path) 35 | 36 | ######################### 37 | ## Loading dataset 38 | # 39 | 40 | from dlm.io.ngramsReader import NgramsReader 41 | from dlm.io.vocabReader import VocabManager 42 | testset = NgramsReader(dataset_path=args.input_path, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path) 43 | vocab = VocabManager(args.vocab_path) 44 | 45 | ## Loading restricted vocab 46 | restricted_ids = [] 47 | restricted_vocab = [] 48 | if args.restricted_vocab_path: 49 | with open(args.restricted_vocab_path) as restricted_vocab_file: 50 | for line in restricted_vocab_file: 51 | restricted_vocab.append(line.strip()) 52 | restricted_ids = vocab.get_ids_given_word_list(restricted_vocab) 53 | 54 | 55 | ######################### 56 | ## Compiling theano function 57 | # 58 | 59 | evaluator = eval.Evaluator(testset, classifier) 60 | 61 | 62 | if args.output_path: 63 | with open(args.output_path, 'w') as output: 64 | for i in xrange(testset._get_num_samples()): 65 | out = evaluator.get_class(i, restricted_ids) 66 | output.write(vocab.get_word_given_id(out)+'\n') 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /dlm/io/textReader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | from dlm.io.vocabReader import VocabManager 5 | from dlm.io.nbestReader import NBestList 6 | import numpy as np 7 | import codecs 8 | import theano 9 | import theano.tensor as T 10 | 11 | class TextReader(): 12 | 13 | def __init__(self, dataset_path, is_nbest, ngram_size, vocab_path): 14 | 15 | L.info("Initializing dataset from: " + dataset_path) 16 | 17 | vocab = VocabManager(vocab_path) 18 | 19 | def get_ngrams(tokens): 20 | for i in range(ngram_size - 1): 21 | tokens.insert(0, '') 22 | if vocab.has_end_padding: 23 | tokens.append('') 24 | indices = vocab.get_ids_given_word_list(tokens) 25 | return U.get_all_windows(indices, ngram_size) 26 | 27 | starts_list = [] 28 | curr_index = 0 29 | curr_start_index = 0 30 | self.num_sentences = 0 31 | 32 | ngrams_list = [] 33 | if is_nbest == True: 34 | nbest = NBestList(dataset_path) 35 | for group in nbest: 36 | for item in group: 37 | tokens = item.hyp.split() 38 | starts_list.append(curr_start_index) 39 | ngrams = get_ngrams(tokens) 40 | ngrams_list += ngrams 41 | curr_start_index += len(ngrams) 42 | else: 43 | dataset = codecs.open(dataset_path, 'r', encoding="UTF-8") 44 | for line in dataset: 45 | tokens = line.split() 46 | starts_list.append(curr_start_index) 47 | ngrams = get_ngrams(tokens) 48 | ngrams_list += ngrams 49 | curr_start_index += len(ngrams) 50 | dataset.close() 51 | 52 | self.num_sentences = len(starts_list) 53 | 54 | data = np.asarray(ngrams_list) 55 | starts_list.append(curr_start_index) 56 | starts_array = np.asarray(starts_list) 57 | 58 | 
x = data[:,0:-1] 59 | y = data[:,-1] 60 | 61 | self.num_samples = y.shape[0] 62 | 63 | self.shared_starts = T.cast(theano.shared(starts_array, borrow=True), 'int64') 64 | self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') 65 | self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') 66 | 67 | def get_x(self, index): 68 | return self.shared_x[ self.shared_starts[index] : self.shared_starts[index+1] ] 69 | 70 | def get_y(self, index): 71 | return self.shared_y[ self.shared_starts[index] : self.shared_starts[index+1] ] 72 | 73 | def get_num_sentences(self): 74 | return self.num_sentences 75 | 76 | def get_num_batches(self): 77 | return self.num_sentences 78 | 79 | def _get_num_samples(self): 80 | return self.num_samples 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /dlm/io/mmapReader.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import dlm.io.logging as L 3 | import dlm.utils as U 4 | import numpy as np 5 | import theano 6 | import theano.tensor as T 7 | import math as M 8 | import sys 9 | import os 10 | 11 | class MemMapReader(): 12 | 13 | #### Constructor 14 | 15 | def __init__(self, dataset_path, batch_size=500, instance_weights_path=None): 16 | 17 | L.info("Initializing dataset from: " + os.path.abspath(dataset_path)) 18 | 19 | # Reading parameters from the mmap file 20 | fp = np.memmap(dataset_path, dtype='int32', mode='r') 21 | self.num_samples = fp[0] 22 | self.ngram = fp[1] 23 | fp = fp.reshape((self.num_samples + 3, self.ngram)) 24 | self.vocab_size = fp[1,0] 25 | self.num_classes = fp[2,0] 26 | 27 | # Setting minibatch size and number of mini batches 28 | self.batch_size = batch_size 29 | self.num_batches = int(M.ceil(self.num_samples / self.batch_size)) 30 | 31 | # Reading the matrix of samples 32 | x = fp[3:,0:self.ngram - 1] # Reading the context indices 33 | y = fp[3:,self.ngram - 1] # Reading the output word index 34 | self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') 35 | self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') 36 | 37 | self.is_weighted = False 38 | if instance_weights_path: 39 | instance_weights = np.loadtxt(instance_weights_path) 40 | U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.") 41 | self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX) 42 | self.is_weighted = True 43 | 44 | L.info(' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % ( 45 | U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches) 46 | ) 47 | ) 48 | 49 | #### Accessors 50 | 51 | def get_x(self, index): 52 | return self.shared_x[index * self.batch_size : (index+1) * self.batch_size] 53 | 54 | def get_y(self, index): 55 | return self.shared_y[index * self.batch_size : (index+1) * self.batch_size] 56 | 57 | def get_w(self, index): 58 | return self.shared_w[index * self.batch_size : (index+1) * self.batch_size] 59 | 60 | #### INFO 61 | 62 | def _get_num_samples(self): 63 | return self.num_samples 64 | 65 | def get_num_batches(self): 66 | return self.num_batches 67 | 68 | def get_ngram_size(self): 69 | return self.ngram 70 | 71 | def get_vocab_size(self): 72 | return self.vocab_size 73 | 74 | def get_num_classes(self): 75 | return self.num_classes 76 | 77 | def 
get_unigram_model(self): 78 | unigram_counts = np.bincount(self.shared_y.get_value()) 79 | unigram_counts = np.append(unigram_counts, np.zeros(self.num_classes - unigram_counts.size, dtype='int32')) 80 | sum_unigram_counts = np.sum(unigram_counts) 81 | 82 | unigram_model = unigram_counts / sum_unigram_counts 83 | unigram_model = unigram_model.astype(theano.config.floatX) 84 | return theano.shared(unigram_model,borrow=True) 85 | -------------------------------------------------------------------------------- /dlm/preprocess/convert_to_memmap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import sys, os 5 | import argparse 6 | 7 | # Parsing arguments 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("-i", "--input-idx-file", dest="idx_path", required=True, help="Path to the input text (idx) file.") 10 | parser.add_argument("-v", "--input-vocab-file", dest="vocab_path", help="Path to the vocab file.") 11 | parser.add_argument("-o", "--output-file", dest="output_path", required=True, help="Path to output file.") 12 | parser.add_argument("-n", "--no-header", dest="no_header", action='store_true', help="Use this flag to write a plain mmap file with no header information.") 13 | 14 | args = parser.parse_args() 15 | 16 | if not args.no_header: 17 | assert args.vocab_path, "[ERROR] Give a vocab file or use --no-header flag." 18 | 19 | def verify_matrix_file(matrix_path): 20 | print "Verifying the input file" 21 | nrows = 0 22 | ncols = -1 23 | with open(matrix_path, 'r') as data: 24 | for line in data: 25 | tokens = line.split() 26 | if ncols <= 0: 27 | ncols = len(tokens) 28 | else: 29 | assert ncols == len(tokens) 30 | try: 31 | map(int, tokens) 32 | except ValueError: 33 | print "[ERROR] Matrix file format invalid @ line: " + line 34 | sys.exit() 35 | nrows += 1 36 | if nrows % 10000000 == 0: 37 | print nrows 38 | assert nrows > 0 and ncols > 0 39 | return nrows, ncols 40 | 41 | 42 | 43 | if args.no_header: 44 | nrows, ncols = verify_matrix_file(args.idx_path) 45 | 46 | print "Number of rows: ", nrows 47 | print "Number of columns: ", ncols 48 | 49 | print "Creating the memory mapped file" 50 | print("Output file: " + os.path.abspath(args.output_path)) 51 | 52 | with open(args.idx_path, 'r') as data: 53 | fp = np.memmap(args.output_path, dtype='int32', mode='w+', shape=(nrows, ncols)) 54 | counter = 0 55 | for line in data: 56 | tokens = line.split() 57 | fp[counter] = tokens 58 | counter = counter + 1 59 | if counter % 10000000 == 0: 60 | print counter 61 | print counter, "samples mapped" 62 | fp.flush 63 | del fp 64 | else: 65 | print "Reading the vocab file" 66 | 67 | vocab_size = 0 68 | with open(args.vocab_path, 'r') as vocab_file: 69 | for line in vocab_file: 70 | vocab_size += 1 71 | assert vocab_size > 0 72 | 73 | num_samples, ngram_size = verify_matrix_file(args.idx_path) 74 | 75 | print "Number of samples: ", num_samples 76 | print "Ngram size: ", ngram_size 77 | print "Vocab size: ", vocab_size 78 | 79 | print "Creating the memory mapped file" 80 | print("Output file: " + os.path.abspath(args.output_path)) 81 | 82 | with open(args.idx_path, 'r') as data: 83 | fp = np.memmap(args.output_path, dtype='int32', mode='w+', shape=(num_samples + 3, ngram_size)) 84 | fp[0,0] = num_samples 85 | fp[0,1] = ngram_size 86 | fp[1,0] = vocab_size # vocab size 87 | fp[2,0] = vocab_size # number of classes 88 | counter = 3 89 | for line in data: 90 | tokens = line.split() 91 | fp[counter] = 
tokens 92 | counter = counter + 1 93 | if counter % 10000000 == 0: 94 | print counter 95 | print str(counter - 3) + " samples mapped" 96 | fp.flush 97 | del fp 98 | -------------------------------------------------------------------------------- /dlm/reranker/tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import imp 5 | try: 6 | import dlm 7 | except ImportError: 8 | print "[ERROR] dlm module not found. Add CoreLM root directory to your PYTHONPATH" 9 | sys.exit() 10 | 11 | import dlm.io.logging as L 12 | import dlm.utils as U 13 | import argparse 14 | from dlm.io.nbestReader import NBestList 15 | import codecs 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-c", "--command", dest="command", required=True, help="The command (topN|1best|featureN|correlN|augment)") 19 | parser.add_argument("-i", "--input-file", dest="input_path", required=True, help="Input n-best file") 20 | parser.add_argument("-s", "--input-scores", dest="oracle", help="Input oracle scores the n-best file") 21 | parser.add_argument("-o", "--output-file", dest="output_path", required=True, help="Output file") 22 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", help="The vocabulary file.") 23 | parser.add_argument("-m", "--model-file", dest="model_path", help="Input CoreLM model file") 24 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu)") 25 | args = parser.parse_args() 26 | 27 | input_nbest = NBestList(args.input_path, mode='r') 28 | 29 | mode = -1 30 | 31 | if args.command.startswith('top'): 32 | mode = 0 33 | N = int(args.command[3:]) # N in N-best 34 | output_nbest = NBestList(args.output_path, mode='w') 35 | elif args.command == '1best': 36 | mode = 1 37 | output_1best = codecs.open(args.output_path, mode='w', encoding='UTF-8') 38 | elif args.command.startswith('feature'): 39 | mode = 2 40 | N = int(args.command[7:]) # Nth feature 41 | output = open(args.output_path, mode='w') 42 | elif args.command.startswith('correl'): 43 | mode = 3 44 | N = int(args.command[6:]) # Nth feature 45 | U.xassert(args.oracle, "correlN command needs a file (-s) containing oracle scores") 46 | with open(args.oracle, mode='r') as oracles_file: 47 | oracles = map(float, oracles_file.read().splitlines()) 48 | #output = open(args.output_path, mode='w') 49 | elif args.command.startswith('augment'): 50 | U.set_theano_device(args.device) 51 | from dlm.reranker import augmenter 52 | augmenter.augment(args.model_path, args.input_path, args.vocab_path, args.output_path) 53 | else: 54 | L.error('Invalid command: ' + args.command) 55 | 56 | counter = 0 57 | features = [] 58 | for group in input_nbest: 59 | if mode == 0: 60 | for i in range(min(N, group.size())): 61 | output_nbest.write(group[i]) 62 | elif mode == 1: 63 | output_1best.write(group[0].hyp + "\n") 64 | elif mode == 2: 65 | for i in range(group.size()): 66 | features = group[i].features.split() 67 | output.write(features[N] + "\n") 68 | elif mode == 3: 69 | for i in range(group.size()): 70 | features.append(float(group[i].features.split()[N])) 71 | counter += 1 72 | if counter % 100 == 0: 73 | L.info("%i groups processed" % (counter)) 74 | L.info("Finished processing %i groups" % (counter)) 75 | 76 | if mode == 0: 77 | output_nbest.close() 78 | elif mode == 1: 79 | output_1best.close() 80 | elif mode == 2: 81 | output.close() 82 | elif mode == 3: 83 | import scipy.stats as S 84 | print 'PEARSON: ', 
S.pearsonr(features, oracles) 85 | print 'SPEARMAN:', S.spearmanr(features, oracles) 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /dlm/reranker/rerank.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import imp 6 | import shutil 7 | try: 8 | import dlm 9 | except ImportError: 10 | print "[ERROR] dlm module not found. Add CoreLM root directory to your PYTHONPATH" 11 | sys.exit() 12 | 13 | import dlm.utils as U 14 | import dlm.io.logging as L 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-i", "--input-nbest", dest="input_nbest", required=True, help="Input n-best file") 19 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", required=True, help="The vocabulary file that was used in training") 20 | parser.add_argument("-m", "--model-file", dest="model_path", required=True, help="Input CoreLM model file") 21 | parser.add_argument("-w", "--weights", dest="weights", required=True, help="Input weights file") 22 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu)") 23 | parser.add_argument("-o", "--output-dir", dest="out_dir", required=True, help="Output directory") 24 | parser.add_argument("-n", "--no-aug", dest="no_aug", action='store_true', help="Augmentation will be skipped, if this flag is set") 25 | parser.add_argument("-c", "--clean-up", dest="clean_up", action='store_true', help="Temporary files will be removed") 26 | parser.add_argument("-q", "--quiet", dest="quiet", action='store_true', help="Nothing will be printed in STDERR") 27 | args = parser.parse_args() 28 | 29 | if args.quiet: 30 | L.quiet = True 31 | 32 | U.set_theano_device(args.device) 33 | 34 | from dlm.io.nbestReader import NBestList 35 | import codecs 36 | import numpy as np 37 | 38 | U.mkdir_p(args.out_dir) 39 | 40 | from dlm.reranker import augmenter 41 | 42 | output_nbest_path = args.out_dir + '/augmented.nbest' 43 | 44 | if args.no_aug: 45 | shutil.copy(args.input_nbest, output_nbest_path) 46 | else: 47 | augmenter.augment(args.model_path, args.input_nbest, args.vocab_path, output_nbest_path) 48 | 49 | with open(args.weights, 'r') as input_weights: 50 | lines = input_weights.readlines() 51 | if len(lines) > 1: 52 | L.warning("Weights file has more than one line. 
I'll read the 1st and ignore the rest.") 53 | weights = np.asarray(lines[0].strip().split(" "), dtype=float) 54 | 55 | prefix = os.path.basename(args.input_nbest) 56 | input_aug_nbest = NBestList(output_nbest_path, mode='r') 57 | output_nbest = NBestList(args.out_dir + '/' + prefix + '.reranked.nbest', mode='w') 58 | output_1best = codecs.open(args.out_dir + '/' + prefix + '.reranked.1best', mode='w', encoding='UTF-8') 59 | 60 | def is_number(s): 61 | try: 62 | float(s) 63 | return True 64 | except ValueError: 65 | return False 66 | 67 | counter = 0 68 | for group in input_aug_nbest: 69 | index = 0 70 | scores = dict() 71 | for item in group: 72 | features = np.asarray([x for x in item.features.split() if is_number(x)], dtype=float) 73 | try: 74 | scores[index] = np.dot(features, weights) 75 | except ValueError: 76 | L.error('Number of features in the nbest and the weights file are not the same') 77 | index += 1 78 | sorted_indices = sorted(scores, key=scores.get, reverse=True) 79 | for idx in sorted_indices: 80 | output_nbest.write(group[idx]) 81 | output_1best.write(group[sorted_indices[0]].hyp + "\n") 82 | counter += 1 83 | if counter % 100 == 0: 84 | L.info("%i groups processed" % (counter)) 85 | L.info("Finished processing %i groups" % (counter)) 86 | 87 | output_nbest.close() 88 | output_1best.close() 89 | 90 | if args.clean_up: 91 | os.remove(output_nbest_path) 92 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import time 5 | import argparse 6 | import dlm.utils as U 7 | import dlm.io.logging as L 8 | 9 | ############### 10 | ## Arguments 11 | # 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-t", "--test-file", dest="test_path", required=True, help="The evaluation file (memory-mapped, nbest list or text file)") 15 | parser.add_argument("-f", "--format", dest="format", required=True, help="The evaluation file format (mmap|nbest|text)") 16 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", help="The vocabulary file that was used in training") 17 | parser.add_argument("-m", "--model-file", dest="model_path", required=True, help="Input CoreLM model file") 18 | parser.add_argument("-ulp", "--unnormalized-log-prob-file", dest="ulp_path", help="Output file for sentence-level UNNORMALIZED log-probabilities") 19 | parser.add_argument("-nlp", "--normalized-log-prob-file", dest="nlp_path", help="Output file for sentence-level NORMALIZED log-probabilities") 20 | parser.add_argument("-ppl", "--perplexity", action='store_true', help="Compute and output normalized perplexity") 21 | parser.add_argument("-un", "--unnormalized", action='store_true', help="Compute and output unnormalized perplexity") 22 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu)") 23 | args = parser.parse_args() 24 | 25 | U.set_theano_device(args.device, 1) 26 | 27 | from dlm.models.mlp import MLP 28 | from dlm import eval 29 | import theano 30 | import theano.tensor as T 31 | 32 | ######################### 33 | ## Loading model 34 | # 35 | 36 | classifier = MLP(model_path=args.model_path) 37 | 38 | ######################### 39 | ## Loading dataset 40 | # 41 | 42 | U.xassert(args.format == "mmap" or args.format == "nbest" or args.format == "text", "Invalid file format given: " + args.format) 43 | U.xassert(args.perplexity or args.nlp_path or args.ulp_path, "You 
should use one of (or more) -ppl, -nlp or -ulp") 44 | 45 | if args.format == "mmap": 46 | U.xassert((args.nlp_path is None) and (args.ulp_path is None), "Cannot compute log-probabilities for an mmap file") 47 | from dlm.io.mmapReader import MemMapReader 48 | testset = MemMapReader(dataset_path=args.test_path, batch_size=500) 49 | else: 50 | U.xassert(args.vocab_path, "Vocab file is required for non-mmap file formats") 51 | from dlm.io.textReader import TextReader 52 | is_nbest = False 53 | if args.format == "nbest": 54 | is_nbest = True 55 | testset = TextReader(dataset_path=args.test_path, is_nbest=is_nbest, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path) 56 | 57 | ######################### 58 | ## Compiling theano function 59 | # 60 | 61 | evaluator = eval.Evaluator(testset, classifier) 62 | 63 | ######################### 64 | ## Testing 65 | # 66 | 67 | start_time = time.time() 68 | 69 | if args.perplexity: 70 | L.info("Perplexity: %f" % (evaluator.perplexity())) 71 | if args.unnormalized: 72 | L.info("Unnormalized Perplexity: %f" % (evaluator.unnormalized_perplexity())) 73 | 74 | if args.nlp_path: 75 | with open(args.nlp_path, 'w') as output: 76 | for i in xrange(testset.get_num_sentences()): 77 | output.write(str(evaluator.get_sequence_log_prob(i)) + '\n') 78 | 79 | if args.ulp_path: 80 | with open(args.ulp_path, 'w') as output: 81 | for i in xrange(testset.get_num_sentences()): 82 | output.write(str(evaluator.get_unnormalized_sequence_log_prob(i)) + '\n') 83 | 84 | L.info("Ran for %.2fs" % (time.time() - start_time)) 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /dlm/reranker/oracle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import imp 5 | try: 6 | import dlm 7 | except ImportError: 8 | print "[ERROR] dlm module not found. 
Add CoreLM root directory to your PYTHONPATH" 9 | sys.exit() 10 | 11 | import dlm.utils as U 12 | import dlm.io.logging as L 13 | import argparse 14 | from dlm.io.nbestReader import NBestList 15 | import dlm.reranker.bleu as B 16 | import codecs 17 | from multiprocessing import Pool 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("-i", "--input-file", dest="input_path", required=True, help="Input n-best file") 21 | parser.add_argument("-r", "--reference-files", dest="ref_paths", required=True, help="A comma-seperated list of reference files") 22 | parser.add_argument("-o", "--output-nbest-file", dest="out_nbest_path", help="Output oracle n-best file") 23 | parser.add_argument("-b", "--output-1best-file", dest="out_1best_path", required=True, help="Output oracle 1-best file") 24 | parser.add_argument("-s", "--output-scores", dest="out_scores_path", help="Output oracle scores file") 25 | parser.add_argument("-m", "--smoothing-method", dest="method", required=True, help="Smoothing method (none|epsilon|lin|nist|chen)") 26 | parser.add_argument("-t", "--threads", dest="threads", type=int, default=14, help="Number of threads") 27 | parser.add_argument("-q", "--quiet", dest="quiet", action='store_true', help="Nothing will be printed in STDERR") 28 | args = parser.parse_args() 29 | 30 | if args.quiet: 31 | L.quiet = True 32 | 33 | methods = { 34 | 'none' : B.no_smoothing, 35 | 'epsilon' : B.add_epsilon_smoothing, 36 | 'lin' : B.lin_smoothing, 37 | 'nist' : B.nist_smoothing, 38 | 'chen' : B.chen_smoothing 39 | } 40 | 41 | ref_path_list = args.ref_paths.split(',') 42 | 43 | input_nbest = NBestList(args.input_path, mode='r', reference_list=ref_path_list) 44 | if args.out_nbest_path: 45 | output_nbest = NBestList(args.out_nbest_path, mode='w') 46 | if args.out_scores_path: 47 | output_scores = open(args.out_scores_path, mode='w') 48 | output_1best = codecs.open(args.out_1best_path, mode='w', encoding='UTF-8') 49 | 50 | U.xassert(methods.has_key(args.method), "Invalid smoothing method: " + args.method) 51 | scorer = methods[args.method] 52 | 53 | L.info('Processing the n-best list') 54 | 55 | def process_group(group): 56 | index = 0 57 | scores = dict() 58 | for item in group: 59 | scores[index] = scorer(item.hyp, group.refs) 60 | index += 1 61 | return scores 62 | 63 | pool = Pool(args.threads) 64 | 65 | counter = 0 66 | group_counter = 0 67 | flag = True 68 | while (flag): 69 | group_list = [] 70 | for i in range(args.threads): 71 | try: 72 | group_list.append(input_nbest.next()) 73 | except StopIteration: 74 | flag = False 75 | if len(group_list) > 0: 76 | outputs = pool.map(process_group, group_list) 77 | for i in range(len(group_list)): 78 | scores = outputs[i] 79 | group = group_list[i] 80 | sorted_indices = sorted(scores, key=scores.get, reverse=True) 81 | if args.out_scores_path: 82 | for idx in scores: 83 | output_scores.write(str(group.group_index) + ' ' + str(idx) + ' ' + str(scores[idx]) + "\n") 84 | if args.out_nbest_path: 85 | for idx in sorted_indices: 86 | output_nbest.write(group[idx]) 87 | output_1best.write(group[sorted_indices[0]].hyp + "\n") 88 | counter += 1 89 | group_counter += len(group_list) 90 | if counter % 5 == 0: 91 | L.info("%i groups processed" % (group_counter)) 92 | L.info("Finished processing %i groups" % (group_counter)) 93 | 94 | if args.out_scores_path: 95 | output_scores.close() 96 | if args.out_nbest_path: 97 | output_nbest.close() 98 | output_1best.close() 99 | -------------------------------------------------------------------------------- 
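oracle.py above reduces to one simple operation per n-best group: score every hypothesis against the group's references with the selected smoothed-BLEU function and keep the argmax. A minimal single-threaded sketch of that per-group step, assuming a scorer with the same (hypothesis, references) signature as the smoothing functions in dlm/reranker/bleu.py; the helper name oracle_best is illustrative and not part of the package:

def oracle_best(group, scorer):
	# group: an n-best group whose items expose .hyp and which carries its
	# reference translations in group.refs (as produced by NBestList above)
	scores = dict()
	for index, item in enumerate(group):
		scores[index] = scorer(item.hyp, group.refs)
	best_index = max(scores, key=scores.get)  # hypothesis with the highest sentence-level BLEU
	return best_index, scores
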
/dlm/misc/nplm_to_corelm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import argparse 5 | import os 6 | import dlm.utils as U 7 | import dlm.io.logging as L 8 | 9 | 10 | def convert_type(param): 11 | return np.float32(param) 12 | 13 | 14 | 15 | # Arguments for this script 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-m", "--nplm-model", dest="nplm_model", required=True, help="The input NPLM model file") 18 | parser.add_argument("-dir", "--directory", dest="out_dir", help="The output directory for log file, model, etc.") 19 | 20 | args = parser.parse_args() 21 | 22 | U.set_theano_device('cpu',1) 23 | from dlm.models.mlp import MLP 24 | 25 | 26 | if args.out_dir is None: 27 | args.out_dir = 'nplm_convert-' + U.curr_time() 28 | U.mkdir_p(args.out_dir) 29 | 30 | 31 | # Reading the NPLM Model 32 | args_nn = argparse.Namespace() 33 | model_dict = dict() 34 | lines = [] 35 | req_attribs = ['\config','\\vocab', '\input_vocab', '\output_vocab', '\input_embeddings', '\hidden_weights 1', '\hidden_biases 1', '\hidden_weights 2', '\hidden_biases 2', '\output_weights', '\output_biases','\end'] 36 | attrib = '' 37 | 38 | with open(args.nplm_model,'r') as f_model: 39 | for line in f_model: 40 | line = line.strip() 41 | if(line in req_attribs): 42 | if attrib != '': 43 | model_dict[attrib] = lines 44 | attrib = line 45 | lines = [] 46 | elif(line): 47 | lines.append(line) 48 | else: 49 | continue; 50 | 51 | 52 | # Storing the config parameters of the NPLM model 53 | config_dict = dict() 54 | for config_line in model_dict['\config']: 55 | config_arg,value = config_line.split() 56 | config_dict[config_arg] = value 57 | 58 | 59 | # Setting the args for the classifier 60 | args_nn.emb_dim = int(config_dict['input_embedding_dimension']) 61 | args_nn.num_hidden = config_dict['num_hidden'] + ',' + config_dict['output_embedding_dimension'] 62 | args_nn.vocab_size = int(config_dict['input_vocab_size']) 63 | args_nn.ngram_size = int(config_dict['ngram_size']) 64 | args_nn.num_classes = int(config_dict['output_vocab_size']) 65 | 66 | act_func = config_dict['activation_function'] 67 | if act_func == 'rectifier': 68 | act_func = 'relu' 69 | 70 | args_nn.activation_name = act_func 71 | 72 | # Creating the classifier with the arguments read 73 | L.info("Creating CoreLM model") 74 | classifier = MLP(args_nn) 75 | 76 | 77 | # Loading matrices 78 | embeddings = np.loadtxt(model_dict['\input_embeddings']) 79 | W1 = np.loadtxt(model_dict['\hidden_weights 1']) 80 | W1 = np.transpose(W1) 81 | b1 = np.loadtxt(model_dict['\hidden_biases 1']) 82 | W2 = np.loadtxt(model_dict['\hidden_weights 2']) 83 | W2 = np.transpose(W2) 84 | b2 = np.loadtxt(model_dict['\hidden_biases 2']) 85 | W3 = np.loadtxt(model_dict['\output_weights']) 86 | W3 = np.transpose(W3) 87 | b3 = np.loadtxt(model_dict['\output_biases']) 88 | params_nn =[embeddings, W1, b1, W2, b2, W3, b3] 89 | 90 | #Type Conversion 91 | params_nn = [convert_type(param) for param in params_nn] 92 | 93 | # Setting the classifier parameters 94 | classifier.set_params(params_nn) 95 | 96 | #Debugging 97 | #print [np.array_equal(x.get_value(),y) for x,y in zip(classifier.params,params_nn)] 98 | 99 | # Saving the vocab file 100 | vocab_file = args.out_dir + "/vocab" 101 | if '\input_vocab' in model_dict: 102 | with open(vocab_file,'w') as f_vocab: 103 | for word in model_dict['\input_vocab']: 104 | f_vocab.write(word+'\n') 105 | 106 | 107 | # Saving the CoreLM model 108 | 
model_file = args.out_dir + "/" + os.path.basename(args.nplm_model) + ".corelm" 109 | L.info("Saving CoreLM model: " + model_file) 110 | classifier.save_model(model_file) 111 | 112 | -------------------------------------------------------------------------------- /dlm/eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import sys 3 | import time 4 | from theano import * 5 | import theano.tensor as T 6 | from dlm.io.mmapReader import MemMapReader 7 | from dlm.models.mlp import MLP 8 | import dlm.utils as U 9 | import math 10 | import numpy as np 11 | 12 | class Evaluator(): 13 | 14 | def __init__(self, dataset, classifier): 15 | 16 | index = T.lscalar() 17 | x = classifier.input 18 | y = T.ivector('y') 19 | 20 | if dataset: 21 | self.dataset = dataset # Initializing the dataset 22 | self.num_batches = self.dataset.get_num_batches() # Number of minibatches in the dataset 23 | self.num_samples = self.dataset._get_num_samples() # Number of samples in the dataset 24 | 25 | self.neg_sum_batch_log_likelihood = theano.function( 26 | inputs=[index], 27 | outputs=-T.sum(T.log(classifier.p_y_given_x(y))), 28 | givens={ 29 | x: self.dataset.get_x(index), 30 | y: self.dataset.get_y(index) 31 | } 32 | ) 33 | 34 | self.unnormalized_neg_sum_batch_log_likelihood = theano.function( 35 | inputs=[index], 36 | outputs=-T.sum(classifier.unnormalized_p_y_given_x(y)), # which is: -T.sum(T.log(T.exp(classifier.unnormalized_p_y_given_x(y)))) 37 | givens={ 38 | x: self.dataset.get_x(index), 39 | y: self.dataset.get_y(index) 40 | } 41 | ) 42 | 43 | self.sum_batch_error = theano.function( 44 | inputs=[index], 45 | outputs=classifier.errors(y), 46 | givens={ 47 | x: self.dataset.get_x(index), 48 | y: self.dataset.get_y(index) 49 | } 50 | ) 51 | 52 | # x: A matrix (N * (ngram - 1)) representing the sequence of length N 53 | # y: A vector of class labels 54 | self.neg_sequence_log_prob = self.neg_sum_batch_log_likelihood 55 | 56 | self.denominator = theano.function( 57 | inputs=[index], 58 | outputs=classifier.log_Z_sqr, 59 | givens={ 60 | x: self.dataset.get_x(index) 61 | } 62 | ) 63 | 64 | self.get_p_matrix = theano.function( 65 | inputs=[index], 66 | outputs=classifier.p_y_given_x_matrix, 67 | givens={ 68 | x:self.dataset.get_x(index) 69 | } 70 | ) 71 | self.get_y_pred = theano.function( 72 | inputs=[index], 73 | outputs=classifier.y_pred, 74 | givens={ 75 | x:self.dataset.get_x(index) 76 | } 77 | ) 78 | # End of if 79 | 80 | self.ngram_log_prob = theano.function( 81 | inputs=[x, y], 82 | outputs=T.log(classifier.p_y_given_x(y)), 83 | ) 84 | 85 | 86 | def classification_error(self): 87 | return np.sum([self.sum_batch_error(i) for i in xrange(self.num_batches)]) / self.num_samples 88 | 89 | def mean_neg_log_likelihood(self): 90 | return math.fsum([self.neg_sum_batch_log_likelihood(i) for i in xrange(self.num_batches)]) / self.num_samples # np.sum() has some precision problems here 91 | 92 | def mean_unnormalized_neg_log_likelihood(self): 93 | return math.fsum([self.unnormalized_neg_sum_batch_log_likelihood(i) for i in xrange(self.num_batches)]) / self.num_samples # np.sum() has some precision problems here 94 | 95 | def perplexity(self): 96 | return math.exp(self.mean_neg_log_likelihood()) 97 | 98 | def unnormalized_perplexity(self): 99 | return math.exp(self.mean_unnormalized_neg_log_likelihood()) 100 | 101 | def get_sequence_log_prob(self, index): 102 | return - self.neg_sequence_log_prob(index) 103 | 104 | def 
get_unnormalized_sequence_log_prob(self, index): 105 | return - self.unnormalized_neg_sum_batch_log_likelihood(index) 106 | 107 | def get_ngram_log_prob(self, x, y): 108 | return self.ngram_log_prob(x, y) 109 | 110 | def get_denominator(self): 111 | return np.mean([self.denominator(i) for i in xrange(self.num_batches)]) 112 | 113 | def get_class(self, index, restricted_ids = []): 114 | if restricted_ids != []: 115 | return restricted_ids[np.argmax(self.get_p_matrix(index)[:,restricted_ids])] 116 | else: 117 | return self.get_y_pred(index)[0] 118 | -------------------------------------------------------------------------------- /dlm/trainer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import theano 3 | import theano.tensor as T 4 | from dlm import eval 5 | import dlm.utils as U 6 | import dlm.io.logging as L 7 | from dlm.algorithms.lr_tuner import LRTuner 8 | import time 9 | import numpy as np 10 | import sys 11 | import time 12 | 13 | 14 | def train(classifier, criterion, args, trainset, devset, testset=None): 15 | if args.algorithm == "sgd": 16 | from dlm.algorithms.sgd import SGD as Trainer 17 | else: 18 | L.error("Invalid training algorithm: " + args.algorithm) 19 | 20 | # Get number of minibatches from the training file 21 | num_train_batches = trainset.get_num_batches() 22 | 23 | # Initialize the trainer object 24 | trainer = Trainer(classifier, criterion, args.learning_rate, trainset, clip_threshold=args.clip_threshold) 25 | 26 | # Initialize the Learning Rate tuner, which adjusts learning rate based on the development/validation file 27 | lr_tuner = LRTuner(low=0.01*args.learning_rate, high=10*args.learning_rate, inc=0.01*args.learning_rate) 28 | validation_frequency = 5000 # minibatches 29 | 30 | # Logging and statistics 31 | total_num_iter = args.num_epochs * num_train_batches 32 | hook = Hook(classifier, devset, testset, total_num_iter, args.out_dir) 33 | L.info('Training') 34 | start_time = time.time() 35 | verbose_freq = 1000 # minibatches 36 | epoch = 0 37 | 38 | hook.evaluate(0) 39 | 40 | a = time.time() 41 | classifier.save_model(args.out_dir + '/model.epoch_0.gz', zipped=True) 42 | 43 | while (epoch < args.num_epochs): 44 | epoch = epoch + 1 45 | L.info("Epoch: " + U.red(epoch)) 46 | 47 | minibatch_avg_cost_sum = 0 48 | for minibatch_index in xrange(num_train_batches): 49 | # Makes an update of the paramters after processing the minibatch 50 | minibatch_avg_cost, gparams = trainer.step(minibatch_index) 51 | minibatch_avg_cost_sum += minibatch_avg_cost 52 | 53 | if minibatch_index % verbose_freq == 0: 54 | grad_norms = [np.linalg.norm(gparam) for gparam in gparams] 55 | L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f' 56 | % (minibatch_index, num_train_batches, minibatch_avg_cost_sum/(minibatch_index+1), trainer.get_learning_rate())) 57 | L.info('Grad Norms: [' + ', '.join(['%.6f' % gnorm for gnorm in grad_norms]) + ']') 58 | curr_iter = (epoch - 1) * num_train_batches + minibatch_index 59 | if curr_iter > 0 and curr_iter % validation_frequency == 0: 60 | hook.evaluate(curr_iter) 61 | 62 | L.info(U.blue("[" + time.ctime() + "] ") + '%i/%i, cost=%.2f, lr=%f' 63 | % (num_train_batches, num_train_batches, minibatch_avg_cost_sum/num_train_batches, trainer.get_learning_rate())) 64 | dev_ppl = hook.evaluate(curr_iter) 65 | lr = trainer.get_learning_rate() 66 | if args.enable_lr_adjust: 67 | lr = lr_tuner.adapt_lr(dev_ppl, lr) 68 | trainer.set_learning_rate(lr) 69 | 
classifier.save_model(args.out_dir + '/model.epoch_' + str(epoch) + '.gz', zipped=True) 70 | 71 | end_time = time.time() 72 | hook.evaluate(total_num_iter) 73 | L.info('Optimization complete') 74 | L.info('Ran for %.2fm' % ((end_time - start_time) / 60.)) 75 | 76 | 77 | class Hook: 78 | def __init__(self, classifier, devset, testset, total_num_iter, out_dir): 79 | self.classifier = classifier 80 | self.dev_eval = eval.Evaluator(dataset=devset, classifier=classifier) 81 | self.test_eval = None 82 | if testset: 83 | self.test_eval = eval.Evaluator(dataset=testset, classifier=classifier) 84 | self.best_iter = 0 85 | self.best_dev_perplexity = np.inf 86 | self.best_test_perplexity = np.inf 87 | self.t0 = time.time() 88 | self.total_num_iter = total_num_iter 89 | self.out_dir = out_dir 90 | 91 | def evaluate(self, curr_iter): 92 | denominator = self.dev_eval.get_denominator() 93 | dev_error = self.dev_eval.classification_error() 94 | dev_perplexity = self.dev_eval.perplexity() 95 | if self.test_eval: 96 | test_error = self.test_eval.classification_error() 97 | test_perplexity = self.test_eval.perplexity() 98 | 99 | if dev_perplexity < self.best_dev_perplexity: 100 | self.best_dev_perplexity = dev_perplexity 101 | self.best_iter = curr_iter 102 | if self.test_eval: 103 | self.best_test_perplexity = test_perplexity 104 | 105 | if curr_iter > 0: 106 | t1 = time.time() 107 | rem_time = int((self.total_num_iter - curr_iter) * (t1 - self.t0) / (curr_iter * 60)) 108 | rem_time = str(rem_time) + "m" 109 | else: 110 | rem_time = "" 111 | 112 | L.info(('DEV => Error=%.2f%%, PPL=' + U.b_yellow('%.2f @ %i') + ' (' + U.b_red('%.2f @ %i') + '), Denom=%.3f, %s') 113 | % (dev_error * 100., dev_perplexity, curr_iter, self.best_dev_perplexity, self.best_iter, denominator, rem_time)) 114 | if self.test_eval: 115 | L.info(('TEST => Error=%.2f%%, PPL=' + U.b_yellow('%.2f @ %i') + ' (' + U.b_red('%.2f @ %i') + ')') 116 | % (test_error * 100., test_perplexity, curr_iter, self.best_test_perplexity, self.best_iter)) 117 | 118 | return dev_perplexity 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # CoreLM
2 | 
3 | CoreLM is a flexible and reusable feed-forward neural network that can be used to train neural language models and joint models (Devlin et al., 2014) and to interface with popular SMT systems like [MOSES](http://www.statmt.org/moses/). It is implemented in Python using [Theano](http://deeplearning.net/software/theano/), which makes it easy to use and modify.
4 | 
5 | ## Features
6 | 
7 | * Implementation of the self-normalized log-likelihood (Devlin et al., 2014) and noise contrastive estimation (NCE) loss functions, to train fast neural language models.
8 | * Decoder integration with MOSES using its NeuralLM and BilingualLM feature functions, as well as rescoring of MOSES n-best lists using neural language models.
9 | * Efficient and optimized implementation using Theano, with GPU support for faster training and decoding.
10 | * A flexible network architecture: multiple hidden layers, various activation functions, multiple sets of features with different embeddings, etc.
11 | * Flexible training, with layer-specific and adjustable learning rates, different cost functions (log-likelihood and NCE), and L1/L2 regularization.
12 | * Preprocessing scripts for monolingual and bilingual language modeling.
13 | 
14 | ## Getting Started
15 | 
16 | ### Prerequisites
17 | * Python 2.7
18 | * Theano (see the [installation instructions](http://deeplearning.net/software/theano/install_ubuntu.html)) with CUDA support (to use the GPU)
19 | 
20 | ### Installation
21 | 1. Download and unzip the CoreLM package on your local machine. Alternatively, clone it using Git:
22 | ```
23 | git clone https://github.com/nusnlp/corelm /path/to/corelm
24 | ```
25 | 
26 | 2. Add the CoreLM directory to the PYTHONPATH environment variable. For bash users, add the following line to ~/.bashrc:
27 | ```
28 | export PYTHONPATH="${PYTHONPATH}:/path/to/corelm/"
29 | ```
30 | 
31 | ## Using CoreLM
32 | 
33 | 
34 | ### Preprocessing
35 | 
36 | The preprocessing scripts can be found in the [dlm/preprocess/](dlm/preprocess) directory. The following scripts are available; for detailed help, run the desired script with the `--help` option.
37 | 
38 | * **[monolingual.py](dlm/preprocess/monolingual.py)** : Preprocesses a text file for monolingual language modeling. The text file must contain one sentence per line.
39 | 
40 | * **[bilingual.py](dlm/preprocess/bilingual.py)** : Preprocesses sentence-aligned parallel corpora for bilingual language modeling.
41 | 
42 | * **[features.py](dlm/preprocess/features.py)** : Used for sequence labeling tasks. The input text file must have one sentence per line, and one feature per word is accepted. An example is shown below:
43 | ```
44 | word1_feature1 word2_feature2 word3_feature3 ... wordN_featureN
45 | ```
46 | * **[convert_to_memmap.py](dlm/preprocess/convert_to_memmap.py)** : Converts custom input to the memory-mapped format. The input must be a text file, with each line representing a training instance. The words or features must be replaced by their indices according to the supplied vocabulary file. The format is shown below:
47 | ```
48 | word_index_11 word_index_12 ... word_index_1M output_word_index_1
49 | ...
50 | ...
51 | word_index_N1 word_index_N2 ... word_index_NM output_word_index_N
52 | ```
53 | where M is the number of input words and N is the number of training instances.
54 | 
55 | 
56 | ### Training
57 | Training the neural network is done using the [train.py](train.py) script. The script takes a memory-mapped file generated by the preprocessing scripts. Use `--help` for the detailed list of options.
58 | 
59 | 
60 | ### Testing
61 | Evaluation of the neural network is done using the [test.py](test.py) script. It prints the perplexity and log-likelihood of the model on the test set, and can optionally output the predicted labels. To predict labels for custom test instances, use the [classify.py](classify.py) script. See `--help` for each script.
62 | 
63 | ### Integration with Moses
64 | Language and joint models trained with CoreLM can be integrated with Moses in two ways: re-ranking of n-best hypotheses and decoder integration.
65 | * **Re-ranking** : To re-rank SMT n-best lists (in Moses format) using CoreLM models, the weight of the new feature must first be tuned using [dlm/reranker/train.py](dlm/reranker/train.py). This can be done with MERT or PRO, selected via command-line options. After tuning the weights, re-ranking is performed with [dlm/reranker/rerank.py](dlm/reranker/rerank.py). Refer to `--help` of these scripts for the list of options.
66 | 67 | * **Decoder Integration** : Currently, CoreLM uses the NPLM interface to Moses for integration. CoreLM models can be converted to NPLM format using [corelm_to_nplm.py](dlm/misc/corelm_to_nplm.py) script. This can be integrated using `NeuralLM` and `BilingualLM` feature functions in Moses (See [Moses documentation](http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel)). 68 | 69 | 70 | 71 | 72 | ## License 73 | This project is licensed under the MIT license - see the [LICENSE.md](LICENSE.md) file for details 74 | 75 | -------------------------------------------------------------------------------- /dlm/models/mlp.py: -------------------------------------------------------------------------------- 1 | from dlm.models.components.lookuptable import LookupTable 2 | from dlm.models.components.linear import Linear 3 | from dlm.models.components.activation import Activation 4 | from dlm.models import classifier 5 | import dlm.utils as U 6 | import dlm.io.logging as L 7 | import theano.tensor as T 8 | import theano 9 | import numpy 10 | import math 11 | 12 | class MLP(classifier.Classifier): 13 | 14 | def __init__(self, args=None, model_path=None): 15 | 16 | ###################################################################### 17 | ## Parameters 18 | # 19 | 20 | U.xassert((args or model_path) and not (args and model_path), "args or model_path are mutually exclusive") 21 | 22 | if model_path: 23 | args, loaded_params = self.load_model(model_path) 24 | 25 | emb_dim = args.emb_dim 26 | num_hidden_list = map(int, args.num_hidden.split(',')) 27 | if num_hidden_list[0] <= 0: 28 | num_hidden_list = [] 29 | 30 | vocab_size = args.vocab_size 31 | self.ngram_size = args.ngram_size 32 | num_classes = args.num_classes 33 | activation_name = args.activation_name 34 | self.args = args 35 | self.L1 = 0 36 | self.L2_sqr = 0 37 | self.params = [] 38 | 39 | emb_path, vocab = None, None 40 | try: 41 | emb_path = args.emb_path 42 | vocab = args.vocab 43 | except AttributeError: 44 | pass 45 | 46 | rng = numpy.random.RandomState(1234) 47 | self.input = T.imatrix('input') 48 | 49 | ###################################################################### 50 | ## Lookup Table Layer 51 | # 52 | 53 | lookupTableLayer = LookupTable( 54 | rng=rng, 55 | input=self.input, 56 | vocab_size=vocab_size, 57 | emb_dim=emb_dim, 58 | emb_path=emb_path, 59 | vocab_path=vocab, 60 | add_weights=args.weighted_emb 61 | ) 62 | last_layer_output = lookupTableLayer.output 63 | last_layer_output_size = (self.ngram_size - 1) * emb_dim 64 | self.params += lookupTableLayer.params 65 | 66 | ###################################################################### 67 | ## Hidden Layer(s) 68 | # 69 | 70 | for i in range(0, len(num_hidden_list)): 71 | linearLayer = Linear( 72 | rng=rng, 73 | input=last_layer_output, 74 | n_in=last_layer_output_size, 75 | n_out=num_hidden_list[i], 76 | suffix=i 77 | ) 78 | last_layer_output = linearLayer.output 79 | last_layer_output_size = num_hidden_list[i] 80 | self.params += linearLayer.params 81 | 82 | activation = Activation( 83 | input=last_layer_output, 84 | func_name=activation_name 85 | ) 86 | last_layer_output = activation.output 87 | 88 | self.L1 = self.L1 + abs(linearLayer.W).sum() 89 | self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum() 90 | 91 | ###################################################################### 92 | ## Output Linear Layer 93 | # 94 | 95 | linearLayer = Linear( 96 | rng=rng, 97 | input=last_layer_output, 98 | n_in=last_layer_output_size, 99 | 
n_out=num_classes, 100 | #b_values = numpy.zeros(num_classes) - math.log(num_classes) 101 | b_values = numpy.full(shape=(num_classes),fill_value=(-math.log(num_classes)),dtype=theano.config.floatX), 102 | suffix='out' 103 | ) 104 | last_layer_output = linearLayer.output 105 | self.params += linearLayer.params 106 | 107 | self.L1 = self.L1 + abs(linearLayer.W).sum() 108 | self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum() 109 | 110 | ###################################################################### 111 | ## Model Output 112 | # 113 | 114 | self.output = last_layer_output 115 | self.p_y_given_x_matrix = T.nnet.softmax(last_layer_output) 116 | 117 | # Log Softmax 118 | last_layer_output_shifted = last_layer_output - last_layer_output.max(axis=1, keepdims=True) 119 | self.log_p_y_given_x_matrix = last_layer_output_shifted - T.log(T.sum(T.exp(last_layer_output_shifted),axis=1,keepdims=True)) 120 | 121 | 122 | #self.log_Z_sqr = T.log(T.mean(T.sum(T.exp(last_layer_output), axis=1))) ** 2 123 | #self.log_Z_sqr = T.sum(T.log(T.sum(T.exp(last_layer_output), axis=1))) ** 2 124 | self.log_Z_sqr = T.mean(T.log(T.sum(T.exp(last_layer_output), axis=1)) ** 2) 125 | 126 | ###################################################################### 127 | ## Model Predictions 128 | 129 | self.y_pred = T.argmax(self.p_y_given_x_matrix, axis=1) 130 | 131 | ###################################################################### 132 | ## Loading parameters from file (if given) 133 | # 134 | 135 | if model_path: 136 | self.set_params(loaded_params) 137 | 138 | ###################################################################### 139 | ## Model Functions 140 | # 141 | 142 | def p_y_given_x(self, y): 143 | return self.p_y_given_x_matrix[T.arange(y.shape[0]), y] 144 | 145 | def log_p_y_given_x(self, y): 146 | return self.log_p_y_given_x_matrix[T.arange(y.shape[0]), y] 147 | 148 | def unnormalized_p_y_given_x(self, y): 149 | return self.output[T.arange(y.shape[0]), y] 150 | 151 | def negative_log_likelihood(self, y, weights=None): 152 | if weights: 153 | return -T.sum(T.log(self.p_y_given_x(y)) * weights) / T.sum(weights) 154 | else: 155 | #return -T.mean( T.log(self.p_y_given_x(y))) # Unstable : can lead to NaN 156 | return -T.mean(self.log_p_y_given_x(y)) # Stable Version 157 | 158 | def errors(self, y): 159 | if y.ndim != self.y_pred.ndim: 160 | raise TypeError('y should have the same shape as self.y_pred', ('y', y.type, 'y_pred', self.y_pred.type)) 161 | if y.dtype.startswith('int'): 162 | return T.sum(T.neq(self.y_pred, y)) 163 | else: 164 | raise NotImplementedError() 165 | 166 | -------------------------------------------------------------------------------- /dlm/misc/corelm_to_nplm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import argparse 5 | import os 6 | import dlm.utils as U 7 | import dlm.io.logging as L 8 | 9 | 10 | def write_matrix(f, matrix): 11 | for row in matrix: 12 | f.write(str(row[0])) 13 | for val in row[1:]: 14 | f.write("\t"+str(val)) 15 | f.write("\n") 16 | 17 | def write_biases(f, biases): 18 | for bias in biases: 19 | f.write(str(bias) + "\n") 20 | 21 | # Arguments for this script 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("-m", "--corelm-model", dest="corelm_model", required=True, help="The input NPLM model file") 24 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", required=True, help="The input vocabulary") 25 | parser.add_argument("-dir", 
"--directory", dest="out_dir", help="The output directory for log file, model, etc.") 26 | 27 | args = parser.parse_args() 28 | 29 | U.set_theano_device('cpu',1) 30 | from dlm.models.mlp import MLP 31 | 32 | if args.out_dir is None: 33 | args.out_dir = 'corelm_convert-' + U.curr_time() 34 | U.mkdir_p(args.out_dir) 35 | 36 | # Loading CoreLM model and creating classifier class 37 | L.info("Loading CoreLM model") 38 | classifier = MLP(model_path=args.corelm_model) 39 | args_nn = classifier.args 40 | params_nn = classifier.params 41 | U.xassert(len(params_nn)==7, "CoreLM model is not compatible with NPLM architecture. 2 hidden layers and an output linear layer is required.") 42 | 43 | embeddings = params_nn[0].get_value() 44 | W1 = params_nn[1].get_value() 45 | W1 = np.transpose(W1) 46 | b1 = params_nn[2].get_value() 47 | W2 = params_nn[3].get_value() 48 | W2 = np.transpose(W2) 49 | b2 = params_nn[4].get_value() 50 | W3 = params_nn[5].get_value() 51 | W3 = np.transpose(W3) 52 | b3 = params_nn[6].get_value() 53 | 54 | 55 | # Storing vocabulary into an array 56 | has_null = False 57 | has_sentence_end = False 58 | vocab_list = [] 59 | with open(args.vocab_path,'r') as f_vocab: 60 | for word in f_vocab: 61 | word = word.strip() 62 | vocab_list.append(word) 63 | if word == "": 64 | has_null = True 65 | if word == "": 66 | has_sentence_end = True 67 | 68 | U.xassert(has_sentence_end, "End-of-sentence marker () has to be present in CoreLM model.") 69 | 70 | # adding null if it is not present 71 | if has_null == False: 72 | vocab_list.append("") 73 | 74 | 75 | # Writing to NPLM model 76 | model_file = args.out_dir + "/" + os.path.basename(args.corelm_model) + ".nplm" 77 | L.info("Writing NPLM Model: " + model_file) 78 | with open(model_file,'w') as f_model: 79 | 80 | # Writing the config parameters for the NPLM model 81 | f_model.write("\config\n") 82 | f_model.write("version 1\n") 83 | f_model.write("ngram_size " + str(args_nn.ngram_size) + "\n") 84 | if has_null == True: 85 | f_model.write("input_vocab_size " + str(args_nn.vocab_size)+"\n") 86 | else: 87 | f_model.write("input_vocab_size " + str(args_nn.vocab_size + 1)+"\n") # +1 is used to add the token which is not in corelm 88 | if has_null == True: 89 | f_model.write("output_vocab_size " + str(args_nn.num_classes)+"\n") 90 | else: 91 | f_model.write("output_vocab_size " + str(args_nn.num_classes + 1)+"\n") 92 | f_model.write("input_embedding_dimension " + str(args_nn.emb_dim) + "\n") 93 | f_model.write("num_hidden " + args_nn.num_hidden.split(',')[0] + "\n") 94 | f_model.write("output_embedding_dimension " + args_nn.num_hidden.split(',')[1] + "\n") 95 | 96 | act_func = args_nn.activation_name 97 | U.xassert(act_func in ['relu','tanh','hardtanh'], "Invalid activation function: " + act_func + " (NPLM supports relu, tanh and hardtanh)") 98 | if act_func == "relu": 99 | act_func = "rectifier" 100 | f_model.write("activation_function " + act_func + "\n") 101 | 102 | f_model.write("\n") 103 | 104 | # Writing the input vocabulary 105 | f_model.write("\input_vocab\n") 106 | for word in vocab_list: 107 | f_model.write(word+"\n") 108 | 109 | f_model.write("\n") 110 | 111 | # Writing the output vocabulary ( Currently it is same as input vocabulary) 112 | f_model.write("\output_vocab\n") 113 | for word in vocab_list: 114 | f_model.write(word+"\n") 115 | 116 | f_model.write("\n") 117 | 118 | np.set_printoptions(precision=8, suppress=True) 119 | rng = np.random.RandomState(1234) 120 | 121 | # Writing the input embeddings 122 | 
f_model.write("\input_embeddings\n") 123 | if has_null == False: 124 | null_row = np.asarray(rng.uniform(low=-0.01, high=0.01, size=(1,embeddings.shape[1])), dtype=embeddings.dtype) 125 | embeddings = np.append(embeddings, null_row, axis=0) 126 | write_matrix(f_model, embeddings) 127 | 128 | f_model.write("\n") 129 | 130 | # Writing the hidden layer weights and biases 131 | f_model.write("\hidden_weights 1\n") 132 | write_matrix(f_model, W1) 133 | 134 | f_model.write("\n") 135 | f_model.write("\hidden_biases 1\n") 136 | write_biases(f_model, b1) 137 | 138 | f_model.write("\n") 139 | f_model.write("\hidden_weights 2\n") 140 | write_matrix(f_model, W2) 141 | 142 | f_model.write("\n") 143 | f_model.write("\hidden_biases 2\n") 144 | write_biases(f_model, b2) 145 | 146 | f_model.write("\n") 147 | 148 | # Writing the output linear layer and biases 149 | f_model.write("\output_weights\n") 150 | if has_null == False: 151 | null_row = np.asarray(rng.uniform(low=-0.01, high=0.01, size=(1,W3.shape[1])), dtype=W3.dtype) 152 | W3 = np.append(W3, null_row, axis=0) 153 | write_matrix(f_model, W3) 154 | 155 | f_model.write("\n") 156 | f_model.write("\output_biases\n") 157 | write_biases(f_model, b3) 158 | if has_null == False: 159 | f_model.write("0.0\n") 160 | f_model.write("\n") 161 | 162 | f_model.write("\end\n") 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /dlm/reranker/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import shutil 6 | import imp 7 | try: 8 | import dlm 9 | except ImportError: 10 | print "[ERROR] dlm module not found. Add CoreLM root directory to your PYTHONPATH" 11 | sys.exit() 12 | 13 | import dlm.utils as U 14 | import dlm.io.logging as L 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-i", "--input-nbest", dest="input_nbest", required=True, help="Input n-best file") 19 | parser.add_argument("-v", "--vocab-file", dest="vocab_path", required=True, help="The vocabulary file that was used in training") 20 | parser.add_argument("-m", "--model-file", dest="model_path", required=True, help="Input CoreLM model file") 21 | parser.add_argument("-r", "--reference-files", dest="ref_paths", required=True, help="A comma-seperated list of reference files") 22 | parser.add_argument("-c", "--config", dest="input_config", required=True, help="Input moses config (ini) file") 23 | parser.add_argument("-o", "--output-dir", dest="out_dir", required=True, help="Output directory") 24 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu)") 25 | parser.add_argument("-t", "--threads", dest="threads", default = 14, type=int, help="Number of MERT threads") 26 | parser.add_argument("-iv", "--init-value", dest="init_value", default = '0.05', help="The initial value of the feature") 27 | parser.add_argument("-n", "--no-aug", dest="no_aug", action='store_true', help="Augmentation will be skipped, if this flag is set") 28 | parser.add_argument("-a", "--tuning-algorithm", dest="alg", default = 'mert', help="Tuning Algorithm (mert|pro|wpro)") 29 | parser.add_argument("-w", "--instance-weights", dest="instance_weights_path", help="Instance weights for wpro algorithm") 30 | parser.add_argument("-s", "--predictable-seed", dest="pred_seed", action='store_true', help="Tune with predictable seed to avoid randomness") 31 | args = 
parser.parse_args() 32 | 33 | U.set_theano_device(args.device) 34 | 35 | from dlm.reranker import augmenter 36 | from dlm.reranker import mosesIniReader as iniReader 37 | 38 | if os.environ.has_key('MOSES_ROOT'): 39 | moses_root = os.environ['MOSES_ROOT'] 40 | else: 41 | L.error("Set MOSES_ROOT variable to your moses root directory") 42 | 43 | U.mkdir_p(args.out_dir) 44 | 45 | #cmd = moses_root + '/bin/moses -show-weights -f ' + args.input_config + ' 2> /dev/null' 46 | #features = U.capture(cmd).strip().split('\n') 47 | features = iniReader.parseIni(args.input_config) 48 | 49 | output_nbest_path = args.out_dir + '/augmented.nbest' 50 | 51 | if args.no_aug: 52 | shutil.copy(args.input_nbest, output_nbest_path) 53 | else: 54 | augmenter.augment(args.model_path, args.input_nbest, args.vocab_path, output_nbest_path) 55 | 56 | L.info('Extracting stats and features') 57 | #L.warning('The optional arguments of extractor are not used yet') 58 | cmd = moses_root + '/bin/extractor -r ' + args.ref_paths + ' -n ' + output_nbest_path + ' --scfile ' + args.out_dir + '/statscore.data --ffile ' + args.out_dir + '/features.data' 59 | U.capture(cmd) 60 | 61 | with open(args.out_dir + '/init.opt', 'w') as init_opt: 62 | init_list = [] 63 | for line in features: 64 | tokens = line.split(" ") 65 | try: 66 | float(tokens[1]) 67 | init_list += tokens[1:] 68 | except ValueError: 69 | pass 70 | if not args.no_aug: 71 | init_list.append(args.init_value) 72 | dim = len(init_list) 73 | init_opt.write(' '.join(init_list) + '\n') 74 | init_opt.write(' '.join(['0' for i in range(dim)]) + '\n') 75 | init_opt.write(' '.join(['1' for i in range(dim)]) + '\n') 76 | 77 | seed_arg = '' 78 | if args.pred_seed: 79 | seed_arg = ' -r 1234 ' 80 | 81 | if (args.alg == 'pro' or args.alg == 'wpro'): 82 | # PRO 83 | if args.alg == 'pro': 84 | L.info("Running PRO") 85 | cmd = moses_root + '/bin/pro' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir +'/pro.data' + seed_arg 86 | else: 87 | L.info("Running WEIGHTED PRO") 88 | U.xassert(args.instance_weights_path, 'Instance weights are not given to wpro') 89 | cmd = moses_root + '/bin/proWeighted' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir +'/pro.data' + seed_arg + ' -w ' + args.instance_weights_path 90 | U.capture(cmd) 91 | cmd = moses_root + '/bin/megam_i686.opt -fvals -maxi 30 -nobias binary ' + args.out_dir + '/pro.data' 92 | pro_weights = U.capture(cmd) 93 | 94 | pro_weights_arr = pro_weights.strip().split('\n') 95 | weights_dict = dict() 96 | sum = 0.0 97 | highest_feature_index = 0 98 | 99 | for elem in pro_weights_arr: 100 | feature_index,weight = elem[1:].split() 101 | feature_index = int(feature_index) 102 | weight = float(weight) 103 | weights_dict[feature_index] = weight 104 | sum = sum + weight 105 | if feature_index >= highest_feature_index: 106 | highest_feature_index = feature_index 107 | 108 | # Write normalized weights to the file 109 | f_weights = open('weights.txt','w') 110 | for feature_index in xrange(highest_feature_index+1): 111 | weight = weights_dict[feature_index] 112 | f_weights.write(str(weight/sum) + ' '); 113 | #f_weights.write(str(weight) + ' '); 114 | elif (args.alg == 'mert'): 115 | # MERT 116 | #L.warning('The optional arguments of mert are not used yet') 117 | L.info('Running MERT') 118 | cmd = moses_root + '/bin/mert -d ' + str(dim) + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data --ifile ' + args.out_dir + 
'/init.opt --threads ' + str(args.threads) + seed_arg 119 | U.capture(cmd) 120 | else: 121 | L.error('Invalid tuning algorithm: ' + args.alg) 122 | 123 | U.xassert(os.path.isfile('weights.txt'), 'Optimization failed') 124 | 125 | shutil.move('weights.txt', args.out_dir + '/weights.txt') 126 | 127 | -------------------------------------------------------------------------------- /dlm/io/nbestReader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import dlm.utils as U 3 | import dlm.io.logging as L 4 | import codecs 5 | 6 | class NBestList(): 7 | def __init__(self, nbest_path, mode='r', reference_list=None): 8 | U.xassert(mode == 'r' or mode == 'w', "Invalid mode: " + mode) 9 | self.mode = mode 10 | self.nbest_file = codecs.open(nbest_path, mode=mode, encoding='UTF-8') 11 | self.prev_index = -1 12 | self.curr_item = None 13 | self.curr_index = 0 14 | self.eof_flag = False 15 | self.ref_manager = None 16 | if reference_list: 17 | U.xassert(mode == 'r', "Cannot accept a reference_list in 'w' mode") 18 | self.ref_manager = RefernceManager(reference_list) 19 | 20 | def __iter__(self): 21 | U.xassert(self.mode == 'r', "Iteration can only be done in 'r' mode") 22 | return self 23 | 24 | def next_item(self): 25 | U.xassert(self.mode == 'r', "next() method can only be used in 'r' mode") 26 | try: 27 | segments = self.nbest_file.next().split("|||") 28 | except StopIteration: 29 | self.close() 30 | raise StopIteration 31 | try: 32 | index = int(segments[0]) 33 | except ValueError: 34 | L.error("The first segment in an n-best list must be an integer") 35 | hyp = segments[1].strip() 36 | features = segments[2].strip() 37 | score = segments[3].strip() 38 | phrase_alignments = None 39 | word_alignments = None 40 | if len(segments) > 4: 41 | phrase_alignments = segments[4].strip() 42 | if len(segments) > 5: 43 | word_alignments = segments[5].strip() 44 | return NBestItem(index, hyp, features, score, phrase_alignments, word_alignments) 45 | 46 | def next(self): # Returns a group of NBestItems with the same index 47 | if self.eof_flag == True: 48 | raise StopIteration 49 | U.xassert(self.mode == 'r', "next_group() method can only be used in 'r' mode") 50 | group = NBestGroup(self.ref_manager) 51 | group.add(self.curr_item) # add the item that was read in the last next() call 52 | try: 53 | self.curr_item = self.next_item() 54 | except StopIteration: 55 | self.eof_flag = True 56 | return group 57 | if self.curr_index != self.curr_item.index: 58 | self.curr_index = self.curr_item.index 59 | return group 60 | while self.curr_index == self.curr_item.index: 61 | group.add(self.curr_item) 62 | try: 63 | self.curr_item = self.next_item() 64 | except StopIteration: 65 | self.eof_flag = True 66 | return group 67 | self.curr_index = self.curr_item.index 68 | return group 69 | 70 | def write(self, item): 71 | U.xassert(self.mode == 'w', "write() method can only be used in 'w' mode") 72 | self.nbest_file.write(unicode(item) + "\n") 73 | 74 | def close(self): 75 | self.nbest_file.close() 76 | 77 | 78 | 79 | class NBestItem: 80 | def __init__(self, index, hyp, features, score, phrase_alignments, word_alignments): 81 | self.index = index 82 | self.hyp = hyp 83 | self.features = features 84 | self.score = score 85 | self.phrase_alignments = phrase_alignments 86 | self.word_alignments = word_alignments 87 | 88 | def __unicode__(self): 89 | output = ' ||| '.join([unicode(self.index), self.hyp, self.features, self.score]) 90 | if self.phrase_alignments: 91 | output = 
output + ' ||| ' + self.phrase_alignments 92 | if self.word_alignments: 93 | output = output + ' ||| ' + self.word_alignments 94 | return output 95 | 96 | def append_feature(self, feature): 97 | self.features += ' ' + str(feature) 98 | 99 | 100 | class NBestGroup: 101 | def __init__(self, refrence_manager=None): 102 | self.group_index = -1 103 | self.group = [] 104 | self.ref_manager = refrence_manager 105 | 106 | def __unicode__(self): 107 | return '\n'.join([unicode(item) for item in self.group]) 108 | 109 | def __iter__(self): 110 | self.item_index = 0 111 | return self 112 | 113 | def __getitem__(self, index): 114 | return self.group[index] 115 | 116 | def add(self, item): 117 | if item is None: 118 | return 119 | if self.group_index == -1: 120 | self.group_index = item.index 121 | if self.ref_manager: 122 | self.refs = self.ref_manager.get_all_refs(self.group_index) 123 | else: 124 | U.xassert(item.index == self.group_index, "Cannot add an nbest item with an incompatible index") 125 | self.group.append(item) 126 | 127 | def next(self): 128 | #if self.item_index < len(self.group): 129 | try: 130 | item = self.group[self.item_index] 131 | self.item_index += 1 132 | return item 133 | #else: 134 | except IndexError: 135 | raise StopIteration 136 | 137 | def size(self): 138 | return len(self.group) 139 | 140 | def append_features(self, features_list): 141 | U.xassert(len(features_list) == len(self.group), 'Number of features and number of items in this group do not match') 142 | for i in range(len(self.group)): 143 | self.group[i].append_feature(features_list[i]) 144 | 145 | 146 | 147 | class RefernceManager: 148 | def __init__(self, paths_list): 149 | U.xassert(type(paths_list) is list, "The input to a RefernceManager class must be a list") 150 | self.ref_list = [] 151 | self.num_lines = -1 152 | self.num_refs = 0 153 | for path in paths_list: 154 | with codecs.open(path, mode='r', encoding='UTF-8') as f: 155 | self.num_refs += 1 156 | sentences = f.readlines() 157 | if self.num_lines == -1: 158 | self.num_lines = len(sentences) 159 | else: 160 | U.xassert(self.num_lines == len(sentences), "Reference files must have the same number of lines") 161 | self.ref_list.append(sentences) 162 | 163 | def get_all_refs(self, index): 164 | U.xassert(index < self.num_lines, "Index out of bound") 165 | return [self.ref_list[k][index] for k in range(self.num_refs)] 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os 4 | import argparse 5 | import dlm.utils as U 6 | import dlm.io.logging as L 7 | 8 | ############### 9 | ## Arguments 10 | # 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-tr", "--train-mmap", dest="trainset", required=True, help="The memory-mapped training file") 14 | parser.add_argument("-tu", "--tune-mmap", dest="devset", required=True, help="The memory-mapped development (tune) file") 15 | parser.add_argument("-ts", "--test-mmap", dest="testset", help="The memory-mapped evaluation (test) file") 16 | parser.add_argument("-d", "--device", dest="device", default="gpu", help="The computing device (cpu or gpu). 
Default: gpu") 17 | parser.add_argument("-E", "--emb-dim", dest="emb_dim", default=50, type=int, help="Word embeddings dimension. Default: 50") 18 | parser.add_argument("-H", "--hidden-units", dest="num_hidden", default="512", help="A comma seperated list for the number of units in each hidden layer. Default: 512") 19 | parser.add_argument("-A", "--activation", dest="activation_name", default="tanh", help="Activation function (tanh|hardtanh|sigmoid|fastsigmoid|hardsigmoid|softplus|relu|cappedrelu|softmax). Default: tanh") 20 | parser.add_argument("-a", "--training-algorithm", dest="algorithm", default="sgd", help="The training algorithm (only sgd is supported for now). Default: sgd") 21 | parser.add_argument("-b", "--batch-size", dest="batchsize", default=128, type=int, help="Minibatch size for training. Default: 128") 22 | parser.add_argument("-l", "--learning-rate", dest="learning_rate", default=0.01, type=float, help="Learning rate. Default: 0.01") 23 | parser.add_argument("-D", "--learning-rate-decay", dest="learning_rate_decay", default=0, type=float, help="Learning rate decay (e.g. 0.995) (TO DO). Default: 0") 24 | parser.add_argument("-M", "--momentum", dest="momentum", default=0, type=float, help="Momentum (TO DO). Default: 0") 25 | parser.add_argument("-lf","--loss-function", dest="loss_function", default="nll", help="Loss function (nll|nce). Default: nll (Negative Log Likelihood)") 26 | parser.add_argument("-ns","--noise-samples", dest="num_noise_samples", default=100 ,type=int, help="Number of noise samples for noise contrastive estimation. Default:100") 27 | parser.add_argument("-e", "--num-epochs", dest="num_epochs", default=50, type=int, help="Number of iterations (epochs). Default: 50") 28 | parser.add_argument("-c", "--self-norm-coef", dest="alpha", default=0, type=float, help="Self normalization coefficient (alpha). Default: 0") 29 | parser.add_argument("-L1", "--L1-regularizer", dest="L1_reg", default=0, type=float, help="L1 regularization coefficient. Default: 0") 30 | parser.add_argument("-L2", "--L2-regularizer", dest="L2_reg", default=0, type=float, help="L2 regularization coefficient. Default: 0") 31 | parser.add_argument("-dir", "--directory", dest="out_dir", help="The output directory for log file, model, etc.") 32 | parser.add_argument("-iw", "--instance-weights-path", dest="instance_weights_path", help="(optional) Instance weights file.") 33 | parser.add_argument("--clip-threshold", dest="clip_threshold", default=0, type=float, help="If threshold > 0, clips gradients to [-threshold, +threshold]. Default: 0 (disabled)") 34 | parser.add_argument("--weighted-emb", dest="weighted_emb", action='store_true', help="Use this flag to add per-word weights to embeddings.") 35 | parser.add_argument("--threads", dest="threads", default=8, type=int, help="Number of threads when device is CPU. 
Default: 8") 36 | parser.add_argument("--emb-path", dest="emb_path", help="(optional) Word embeddings file.") 37 | parser.add_argument("--vocab", dest="vocab", help="(optional) Only needed if --emb-path is used.") 38 | parser.add_argument("--quiet", dest="quiet", action='store_true', help="Use this flag to disable the logger.") 39 | parser.add_argument( "--adjust-learning-rate", dest="enable_lr_adjust", action='store_true', help="Enable learning rate adjustment") 40 | 41 | #parser.add_argument("-m","--model-file", dest="model_path", help="The file path to load the model from") 42 | 43 | args = parser.parse_args() 44 | 45 | args.cwd = os.getcwd() 46 | 47 | if args.out_dir is None: 48 | args.out_dir = 'corelm-' + U.curr_time() 49 | U.mkdir_p(args.out_dir) 50 | 51 | L.quiet = args.quiet 52 | L.set_file_path(os.path.abspath(args.out_dir) + "/log.txt") 53 | 54 | L.info('Command: ' + ' '.join(sys.argv)) 55 | 56 | curr_version = U.curr_version() 57 | if curr_version: 58 | L.info("Version: " + curr_version) 59 | 60 | if args.emb_path: 61 | U.xassert(args.vocab, 'When --emb-path is used, vocab file must be given too (using --vocab).') 62 | 63 | if args.loss_function == "nll": 64 | args.num_noise_samples = 0 65 | 66 | U.print_args(args) 67 | U.set_theano_device(args.device, args.threads) 68 | 69 | import dlm.trainer 70 | from dlm.io.mmapReader import MemMapReader 71 | from dlm.models.mlp import MLP 72 | 73 | ######################### 74 | ## Loading datasets 75 | # 76 | 77 | trainset = MemMapReader(args.trainset, batch_size=args.batchsize, instance_weights_path=args.instance_weights_path) 78 | devset = MemMapReader(args.devset) 79 | testset = None 80 | if args.testset: 81 | testset = MemMapReader(args.testset) 82 | 83 | 84 | ######################### 85 | ## Creating model 86 | # 87 | 88 | L.info('Building the model') 89 | args.vocab_size = trainset.get_vocab_size() 90 | args.ngram_size = trainset.get_ngram_size() 91 | args.num_classes = trainset.get_num_classes() 92 | 93 | classifier = MLP(args) 94 | 95 | L.info('Parameters: ' + str(classifier.params)) 96 | 97 | ######################### 98 | ## Training criterion 99 | # 100 | if args.loss_function == "nll": 101 | from dlm.criterions.weighted_nll import NegLogLikelihood 102 | criterion = NegLogLikelihood(classifier, args) 103 | elif args.loss_function == "nce": 104 | from dlm.criterions.nce import NCELikelihood 105 | noise_dist = trainset.get_unigram_model() 106 | criterion = NCELikelihood(classifier, args, noise_dist) 107 | else: 108 | L.error('Invalid loss function \'' + args.loss_function + '\'') 109 | 110 | ######################### 111 | ## Training 112 | # 113 | 114 | dlm.trainer.train(classifier, criterion, args, trainset, devset, testset) 115 | -------------------------------------------------------------------------------- /dlm/preprocess/monolingual.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import sys, os 5 | import tempfile 6 | import shutil 7 | import argparse 8 | try: 9 | import dlm 10 | except ImportError: 11 | print "[ERROR] dlm module not found. 
Add CoreLM root directory to your PYTHONPATH" 12 | sys.exit() 13 | import dlm.utils as U 14 | 15 | # Parsing arguments 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-i", "--input-file", dest="input_path", required=True, help="Path to the input text file.") 18 | parser.add_argument("-n", "--ngram-size", dest="ngram_size", required=True, type=int, help="N-gram size.") 19 | parser.add_argument("-o", "--output-dir", dest="output_dir_path", required=True, help="Path to the output directory.") 20 | parser.add_argument("--text", dest="text_output", action='store_true', help="Add this flag to produce text output.") 21 | parser.add_argument("--shuffle", dest="shuffle", action='store_true', help="Add this flag to shuffle the output.") 22 | parser.add_argument("--endp", dest="endp", action='store_true', help="Add this flag to add the sentence-end padding token.") 23 | 24 | # Mutually exclusive group of pruning arguments 25 | prune_args = parser.add_mutually_exclusive_group(required=True) 26 | prune_args.add_argument("--prune-vocab-size", dest="prune_vocab_size", type=int, help="Vocabulary size") 27 | prune_args.add_argument("--prune-threshold", dest="prune_threshold_count", type=int, help="Minimum number of occurrences for a word to be added to the vocabulary") 28 | prune_args.add_argument("--input-vocab-file", dest="input_vocab_path", help="Path to an existing vocabulary file") 29 | 30 | args = parser.parse_args() 31 | 32 | 33 | if (not os.path.exists(args.output_dir_path)): 34 | os.makedirs(args.output_dir_path) 35 | print("Output directory: " + os.path.abspath(args.output_dir_path)) 36 | 37 | prefix = args.output_dir_path + "/" + os.path.basename(args.input_path) 38 | 39 | if args.shuffle: 40 | output_path = prefix + ".idx.shuf.mmap" 41 | output_text_path = prefix + ".idx.shuf.txt" 42 | else: 43 | output_path = prefix + ".idx.mmap" 44 | output_text_path = prefix + ".idx.txt" 45 | 46 | word_to_id_dict = dict() # Word to Index Dictionary 47 | 48 | if args.input_vocab_path is None: 49 | # Counting the frequency of the words. 50 | word_to_freq_dict = dict() # Word Frequency Dictionary 51 | with open(args.input_path, 'r') as input_file: 52 | for line in input_file: 53 | line = line.strip() 54 | if len(line) == 0: 55 | continue 56 | tokens = line.split() 57 | for token in tokens: 58 | if not word_to_freq_dict.has_key(token): 59 | word_to_freq_dict[token] = 1 60 | else: 61 | word_to_freq_dict[token] += 1 62 | 63 | # Prune based on threshold 64 | if args.prune_threshold_count: 65 | for token, freq in word_to_freq_dict.items(): 66 | if freq < args.prune_threshold_count: 67 | del word_to_freq_dict[token] 68 | 69 | # Writing the vocab file and creating a word to id dictionary.
70 | vocab_path = prefix + ".vocab" 71 | word_to_id_dict[''] = 0 72 | word_to_id_dict[''] = 1 73 | word_to_id_dict[''] = 2 74 | added_tokens = '\n\n\n' 75 | if args.endp: 76 | word_to_id_dict[''] = 3 77 | added_tokens += '\n' 78 | with open(vocab_path, 'w') as f_vocab: 79 | curr_index = len(word_to_id_dict) 80 | f_vocab.write(added_tokens) 81 | tokens_freq_sorted = sorted(word_to_freq_dict, key=word_to_freq_dict.get, reverse=True) 82 | if args.prune_vocab_size is not None and args.prune_vocab_size < len(tokens_freq_sorted): 83 | tokens_freq_sorted = tokens_freq_sorted[0:args.prune_vocab_size] 84 | for token in tokens_freq_sorted: 85 | f_vocab.write(token+"\n") 86 | word_to_id_dict[token] = curr_index 87 | curr_index = curr_index + 1 88 | else: 89 | with open(args.input_vocab_path, 'r') as f_vocab: 90 | curr_index = 0 91 | for line in f_vocab: 92 | token = line.strip() 93 | if not word_to_id_dict.has_key(token): 94 | word_to_id_dict[token] = curr_index 95 | curr_index = curr_index + 1 96 | U.xassert(word_to_id_dict.has_key('') and word_to_id_dict.has_key('') and word_to_id_dict.has_key(''), "Missing or or in given vocab file") 97 | if args.endp: 98 | U.xassert(word_to_id_dict.has_key(''), "Missing in given vocab file while --endp flag is used") 99 | if word_to_id_dict.has_key(''): 100 | U.xassert(args.endp, "Given vocab file has but --endp flag is not activated") 101 | 102 | _, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.') 103 | 104 | # For shuffling only 105 | samples = [] # List of samples 106 | nsamples = 0 107 | 108 | # Reading input text file to create IDX file 109 | with open(args.input_path, 'r') as input_file, open(tmp_path, 'w') as tmp_file: 110 | next_id = 0 111 | for line in input_file: 112 | line = line.strip() 113 | if len(line) == 0: 114 | continue 115 | tokens = line.split() 116 | for i in range(args.ngram_size - 1): 117 | tokens.insert(0, '') 118 | if args.endp: 119 | tokens.append('') 120 | indices = [] 121 | for token in tokens: 122 | if not word_to_id_dict.has_key(token): 123 | token = "" 124 | indices.append(str(word_to_id_dict[token])) 125 | for i in range(args.ngram_size - 1, len(indices)): 126 | sample = ' '.join(indices[i - args.ngram_size + 1 : i + 1]) + "\n" 127 | if args.shuffle: 128 | samples.append(sample) 129 | else: 130 | tmp_file.write(sample) 131 | nsamples += 1 132 | 133 | # Shuffling the data and writing to tmp file 134 | if args.shuffle: 135 | permutation_arr = np.random.permutation(nsamples) 136 | with open(tmp_path, 'w') as tmp_file: 137 | for index in permutation_arr: 138 | tmp_file.write(samples[index]) 139 | 140 | 141 | # Creating the memory-mapped file 142 | with open(tmp_path, 'r') as data: 143 | fp = np.memmap(output_path, dtype='int32', mode='w+', shape=(nsamples + 3, args.ngram_size)) 144 | fp[0,0] = nsamples # number of samples 145 | fp[0,1] = args.ngram_size # n-gram size 146 | fp[1,0] = len(word_to_id_dict) # vocab size (MLP classes) 147 | fp[2,0] = len(word_to_id_dict) # number of word types (MLP classes) 148 | counter = 3 149 | for line in data: 150 | tokens = line.split() 151 | fp[counter] = tokens 152 | counter = counter + 1 153 | if counter % 10000000 == 0: 154 | print counter 155 | print str(counter-1) + " samples mapped" 156 | fp.flush 157 | del fp 158 | 159 | if args.text_output: 160 | shutil.move(tmp_path, output_text_path) 161 | else: 162 | os.remove(tmp_path) 163 | -------------------------------------------------------------------------------- /dlm/reranker/bleu.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import sys 3 | import math 4 | import dlm.utils as U 5 | 6 | ################################################################### 7 | ## BLEU utility functions 8 | # 9 | 10 | def get_ngram_counts(tokens): 11 | dicts = [{}, {}, {}, {}] 12 | 13 | for token in tokens: 14 | if dicts[0].has_key(token): 15 | dicts[0][token] += 1 16 | else: 17 | dicts[0][token] = 1 18 | 19 | for k in range(1,4): 20 | for i in range(len(tokens) - k): 21 | segment = ' '.join(tokens[i:i+k+1]) 22 | if dicts[k].has_key(segment): 23 | dicts[k][segment] += 1 24 | else: 25 | dicts[k][segment] = 1 26 | return dicts 27 | 28 | def get_max_ngram_counts(refs_list, hyp_len): 29 | max_counts = [{}, {}, {}, {}] 30 | closest_ref_len = 1000 31 | closest_ref_diff = 1000 32 | for ref in refs_list: 33 | ref_tokens = ref.split() 34 | abs_diff = abs(len(ref_tokens) - hyp_len) 35 | if abs_diff < closest_ref_diff: 36 | closest_ref_len = len(ref_tokens) 37 | closest_ref_diff = abs_diff 38 | dicts = get_ngram_counts(ref_tokens) 39 | for k in range(0,4): 40 | for ngram in dicts[k]: 41 | if not max_counts[k].has_key(ngram) or max_counts[k][ngram] < dicts[k][ngram]: 42 | max_counts[k][ngram] = dicts[k][ngram] 43 | return max_counts, closest_ref_len 44 | 45 | def clip_ngram_counts(hyp_dicts, ref_dicts): 46 | for k in range(0,4): 47 | for ngram in hyp_dicts[k].keys(): 48 | org_count = hyp_dicts[k][ngram] 49 | if ref_dicts[k].has_key(ngram): 50 | hyp_dicts[k][ngram] = min(org_count, ref_dicts[k][ngram]) 51 | else: 52 | hyp_dicts[k][ngram] = 0 53 | 54 | ################################################################### 55 | ## Sentence-level BLEU metrics 56 | # 57 | 58 | def no_smoothing(hyp, refs_list): 59 | l = [0, 0, 0, 0] 60 | m = [0, 0, 0, 0] 61 | log_p = [0, 0, 0, 0] 62 | 63 | hyp_tokens = hyp.split() 64 | 65 | hyp_dicts = get_ngram_counts(hyp_tokens) 66 | ref_dicts, closest_ref_len = get_max_ngram_counts(refs_list, len(hyp_tokens)) 67 | 68 | clip_ngram_counts(hyp_dicts, ref_dicts) 69 | 70 | sum_log_p = 0 71 | for k in range(0,4): 72 | l[k] = max(len(hyp_tokens) - k, 0) 73 | if l[k] == 0: # sentence length is less than 4 74 | log_p[k] = 0 75 | else: 76 | for w in hyp_dicts[k]: 77 | if ref_dicts[k].has_key(w): 78 | m[k] += hyp_dicts[k][w] 79 | if (m[k] == 0): 80 | return 0 81 | else: 82 | log_p[k] = math.log(m[k]) - math.log(l[k]) 83 | sum_log_p += log_p[k] 84 | log_brevity = min(0, 1 - closest_ref_len/len(hyp_tokens)) 85 | return math.exp(1/4 * sum_log_p + log_brevity) 86 | 87 | ################################################################### 88 | 89 | def add_epsilon_smoothing(hyp, refs_list, eps=0.01): 90 | l = [0, 0, 0, 0] 91 | m = [0, 0, 0, 0] 92 | log_p = [0, 0, 0, 0] 93 | 94 | hyp_tokens = hyp.split() 95 | 96 | hyp_dicts = get_ngram_counts(hyp_tokens) 97 | ref_dicts, closest_ref_len = get_max_ngram_counts(refs_list, len(hyp_tokens)) 98 | 99 | clip_ngram_counts(hyp_dicts, ref_dicts) 100 | 101 | sum_log_p = 0 102 | for k in range(0,4): 103 | l[k] = max(len(hyp_tokens) - k, 0) 104 | if l[k] == 0: # sentence length is less than 4 105 | log_p[k] = 0 106 | else: 107 | for w in hyp_dicts[k]: 108 | if ref_dicts[k].has_key(w): 109 | m[k] += hyp_dicts[k][w] 110 | if (m[k] == 0): 111 | log_p[k] = math.log(eps) - math.log(l[k]) 112 | else: 113 | log_p[k] = math.log(m[k]) - math.log(l[k]) 114 | sum_log_p += log_p[k] 115 | log_brevity = min(0, 1 - closest_ref_len/len(hyp_tokens)) 116 | return math.exp(1/4 * sum_log_p + 
log_brevity) 117 | 118 | ################################################################### 119 | 120 | # Lin and Och, 2004 121 | def lin_smoothing(hyp, refs_list): 122 | l = [0, 1, 1, 1] 123 | m = [0, 1, 1, 1] 124 | log_p = [0, 0, 0, 0] 125 | 126 | hyp_tokens = hyp.split() 127 | 128 | hyp_dicts = get_ngram_counts(hyp_tokens) 129 | ref_dicts, closest_ref_len = get_max_ngram_counts(refs_list, len(hyp_tokens)) 130 | 131 | clip_ngram_counts(hyp_dicts, ref_dicts) 132 | 133 | sum_log_p = 0 134 | for k in range(0,4): 135 | l[k] = max(len(hyp_tokens) - k, 0) 136 | if l[k] == 0: # sentence length is less than 4 137 | log_p[k] = 0 138 | else: 139 | for w in hyp_dicts[k]: 140 | if ref_dicts[k].has_key(w): 141 | m[k] += hyp_dicts[k][w] 142 | if (m[k] == 0): # It can happen when unigram count m[0] is zero 143 | return 0 144 | else: 145 | log_p[k] = math.log(m[k]) - math.log(l[k]) 146 | sum_log_p += log_p[k] 147 | log_brevity = min(0, 1 - closest_ref_len/len(hyp_tokens)) 148 | return math.exp(1/4 * sum_log_p + log_brevity) 149 | 150 | ################################################################### 151 | 152 | # NIST (mteval-v13a.pl) smoothing 153 | def nist_smoothing(hyp, refs_list): 154 | l = [0, 0, 0, 0] 155 | m = [0, 0, 0, 0] 156 | log_p = [0, 0, 0, 0] 157 | 158 | hyp_tokens = hyp.split() 159 | 160 | hyp_dicts = get_ngram_counts(hyp_tokens) 161 | ref_dicts, closest_ref_len = get_max_ngram_counts(refs_list, len(hyp_tokens)) 162 | 163 | clip_ngram_counts(hyp_dicts, ref_dicts) 164 | 165 | invcnt = 1 166 | sum_log_p = 0 167 | for k in range(0,4): 168 | l[k] = max(len(hyp_tokens) - k, 0) 169 | if l[k] == 0: # sentence length is less than 4 170 | log_p[k] = 0 171 | else: 172 | for w in hyp_dicts[k]: 173 | if ref_dicts[k].has_key(w): 174 | m[k] += hyp_dicts[k][w] 175 | if (m[k] == 0): 176 | invcnt *= 2 177 | log_p[k] = math.log(1/invcnt) - math.log(l[k]) 178 | else: 179 | log_p[k] = math.log(m[k]) - math.log(l[k]) 180 | sum_log_p += log_p[k] 181 | log_brevity = min(0, 1 - closest_ref_len/len(hyp_tokens)) 182 | return math.exp(1/4 * sum_log_p + log_brevity) 183 | 184 | ################################################################### 185 | 186 | # Chen and Cherry (2014) smoothing 4 187 | def chen_smoothing(hyp, refs_list, coef=5): 188 | l = [0, 0, 0, 0] 189 | m = [0, 0, 0, 0] 190 | log_p = [0, 0, 0, 0] 191 | 192 | hyp_tokens = hyp.split() 193 | 194 | hyp_dicts = get_ngram_counts(hyp_tokens) 195 | ref_dicts, closest_ref_len = get_max_ngram_counts(refs_list, len(hyp_tokens)) 196 | 197 | clip_ngram_counts(hyp_dicts, ref_dicts) 198 | 199 | invcnt = 1 200 | sum_log_p = 0 201 | for k in range(0,4): 202 | l[k] = max(len(hyp_tokens) - k, 0) 203 | if l[k] == 0: # sentence length is less than 4 204 | log_p[k] = 0 205 | else: 206 | for w in hyp_dicts[k]: 207 | if ref_dicts[k].has_key(w): 208 | m[k] += hyp_dicts[k][w] 209 | if (m[k] == 0): 210 | invcnt *= coef / math.log(len(hyp_tokens) + 1) 211 | log_p[k] = math.log(1/invcnt) - math.log(l[k]) 212 | else: 213 | log_p[k] = math.log(m[k]) - math.log(l[k]) 214 | sum_log_p += log_p[k] 215 | log_brevity = min(0, 1 - closest_ref_len/len(hyp_tokens)) 216 | return math.exp(1/4 * sum_log_p + log_brevity) 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /dlm/utils.py: -------------------------------------------------------------------------------- 1 | import subprocess as sub 2 | import sys 3 | import os, errno 4 | 5 | 
#-----------------------------------------------------------------------------------------------------------# 6 | 7 | def __shell(command): 8 | return sub.Popen(command, shell=True, stdout=sub.PIPE, stderr=sub.PIPE) 9 | 10 | # Currently the best 11 | def capture(command): 12 | out, err, code = capture_all(command) 13 | assert (code == 0), "Failed to run the command: " + command 14 | return out 15 | 16 | # Good, if more info is needed 17 | def capture_all(command): 18 | p = __shell(command) 19 | output, err = p.communicate() 20 | return output, err, p.returncode 21 | 22 | # Better to avoid 23 | def capture_no_assert(command): 24 | p = __shell(command) 25 | return p.stdout.read() 26 | 27 | # Not well-tested, but should be good 28 | def capture_output(command): 29 | try: 30 | eval("sub.check_output") 31 | except: 32 | error("subprocess check_output function is not supported in this python version:" + version()) 33 | output = sub.check_output(command, shell=True) 34 | return output 35 | 36 | #-----------------------------------------------------------------------------------------------------------# 37 | 38 | # Dummy object for holding other objects 39 | class Object(object): 40 | pass 41 | 42 | #-----------------------------------------------------------------------------------------------------------# 43 | 44 | import re 45 | 46 | class BColors: 47 | HEADER = '\033[95m' 48 | OKBLUE = '\033[94m' 49 | OKGREEN = '\033[92m' 50 | WARNING = '\033[93m' 51 | FAIL = '\033[91m' 52 | ENDC = '\033[0m' 53 | BOLD = '\033[1m' 54 | UNDERLINE = '\033[4m' 55 | WHITE = '\033[37m' 56 | YELLOW = '\033[33m' 57 | GREEN = '\033[32m' 58 | BLUE = '\033[34m' 59 | CYAN = '\033[36m' 60 | RED = '\033[31m' 61 | MAGENTA = '\033[35m' 62 | BLACK = '\033[30m' 63 | BHEADER = BOLD + '\033[95m' 64 | BOKBLUE = BOLD + '\033[94m' 65 | BOKGREEN = BOLD + '\033[92m' 66 | BWARNING = BOLD + '\033[93m' 67 | BFAIL = BOLD + '\033[91m' 68 | BUNDERLINE = BOLD + '\033[4m' 69 | BWHITE = BOLD + '\033[37m' 70 | BYELLOW = BOLD + '\033[33m' 71 | BGREEN = BOLD + '\033[32m' 72 | BBLUE = BOLD + '\033[34m' 73 | BCYAN = BOLD + '\033[36m' 74 | BRED = BOLD + '\033[31m' 75 | BMAGENTA = BOLD + '\033[35m' 76 | BBLACK = BOLD + '\033[30m' 77 | 78 | @staticmethod 79 | def cleared(s): 80 | return re.sub("\033\[[0-9][0-9]?m", "", s) 81 | 82 | def red(message): 83 | return BColors.RED + str(message) + BColors.ENDC 84 | 85 | def b_red(message): 86 | return BColors.BRED + str(message) + BColors.ENDC 87 | 88 | def blue(message): 89 | return BColors.BLUE + str(message) + BColors.ENDC 90 | 91 | def b_yellow(message): 92 | return BColors.BYELLOW + str(message) + BColors.ENDC 93 | 94 | def green(message): 95 | return BColors.GREEN + str(message) + BColors.ENDC 96 | 97 | def b_green(message): 98 | return BColors.BGREEN + str(message) + BColors.ENDC 99 | 100 | #-----------------------------------------------------------------------------------------------------------# 101 | 102 | def xassert(condition, message): 103 | if not condition: 104 | import dlm.io.logging as L 105 | L.error(message) 106 | 107 | def assert_value(value, valid_values): 108 | assert type(valid_values) == list, "valid_values must be a list, given: " + str(type(valid_values)) 109 | assert value in valid_values, "Invalid value: " + str(value) + " is not in " + str(valid_values) 110 | 111 | def version(): 112 | return '.'.join(map(str, sys.version_info)[0:3]) 113 | 114 | #-----------------------------------------------------------------------------------------------------------# 115 | 116 | def 
prepend_to_file(file_name, text): 117 | with open(file_name, "r+") as f: 118 | old = f.read() 119 | f.seek(0) 120 | f.write(text + old) 121 | 122 | def append_to_file(file_name, text): 123 | with open(file_name, "a") as f: 124 | f.write(text) 125 | 126 | def mkdir_p(path): 127 | try: 128 | os.makedirs(path) 129 | except OSError as exc: # Python >2.5 130 | if exc.errno == errno.EEXIST and os.path.isdir(path): 131 | pass 132 | else: raise 133 | 134 | def num_lines(path): 135 | return sum(1 for line in open(path)) 136 | 137 | #-----------------------------------------------------------------------------------------------------------# 138 | 139 | def get_all_windows(input_list, window_size): 140 | if window_size <= 1: 141 | return input_list 142 | output = [] 143 | for i in range(len(input_list) - window_size + 1): 144 | output.append(input_list[i:i+window_size]) 145 | return output 146 | 147 | #-----------------------------------------------------------------------------------------------------------# 148 | 149 | def is_gpu_free(gpu_id): 150 | out = capture('nvidia-smi -i ' + str(gpu_id)).strip() 151 | tokens = out.split('\n')[-2].split() 152 | return ' '.join(tokens[1:5]) == 'No running processes found' 153 | 154 | def set_theano_device(device, threads): 155 | import sys 156 | import dlm.io.logging as L 157 | xassert(device == "cpu" or device.startswith("gpu"), "The device can only be 'cpu', 'gpu' or 'gpu'") 158 | if device.startswith("gpu") and len(device) > 3: 159 | try: 160 | gpu_id = int(device[3:]) 161 | if not is_gpu_free(gpu_id): 162 | L.warning('The selected GPU (GPU' + str(gpu_id) + ') is apparently busy.') 163 | except ValueError: 164 | L.error("Unknown GPU device format: " + device) 165 | if device.startswith("gpu"): 166 | L.warning('Running on GPU yields non-deterministic results.') 167 | xassert(sys.modules.has_key('theano') == False, "dlm.utils.set_theano_device() function cannot be called after importing theano") 168 | os.environ['OMP_NUM_THREADS'] = str(threads) 169 | os.environ['THEANO_FLAGS'] = 'device=' + device 170 | os.environ['THEANO_FLAGS'] += ',force_device=True' 171 | os.environ['THEANO_FLAGS'] += ',floatX=float32' 172 | os.environ['THEANO_FLAGS'] += ',warn_float64=warn' 173 | os.environ['THEANO_FLAGS'] += ',cast_policy=numpy+floatX' 174 | #os.environ['THEANO_FLAGS'] += ',allow_gc=True' 175 | os.environ['THEANO_FLAGS'] += ',print_active_device=False' 176 | os.environ['THEANO_FLAGS'] += ',exception_verbosity=high' # Highly verbose debugging 177 | os.environ['THEANO_FLAGS'] += ',mode=FAST_RUN' 178 | os.environ['THEANO_FLAGS'] += ',nvcc.fastmath=False' # True: makes div and sqrt faster at the cost of precision, and possible bugs 179 | #os.environ['THEANO_FLAGS'] += ',optimizer_including=cudnn' # Comment out if CUDNN is not available 180 | try: 181 | import theano 182 | except EnvironmentError: 183 | L.exception() 184 | global logger 185 | if theano.config.device == "gpu": 186 | L.info( 187 | "Device: " + theano.config.device.upper() + " " 188 | + str(theano.sandbox.cuda.active_device_number()) 189 | + " (" + str(theano.sandbox.cuda.active_device_name()) + ")" 190 | ) 191 | else: 192 | L.info("Device: " + theano.config.device.upper()) 193 | 194 | #-----------------------------------------------------------------------------------------------------------# 195 | 196 | def print_args(args): 197 | import dlm.io.logging as L 198 | L.info("Arguments:") 199 | items = vars(args) 200 | for key in sorted(items.keys(), key=lambda s: s.lower()): 201 | value = items[key] 202 | if not 
value: 203 | value = "None" 204 | L.info(" " + key + ": " + BColors.MAGENTA + str(items[key]) + BColors.ENDC) 205 | 206 | def curr_time(): 207 | import time 208 | t = time.localtime() 209 | return '%i-%i-%i-%ih-%im-%is' % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec) 210 | 211 | def curr_version(): 212 | import dlm.io.logging as L 213 | info_path = os.path.dirname(sys.argv[0]) + '/.git/refs/heads/master' 214 | if os.path.exists(info_path): 215 | with open(info_path, 'r') as info_file: 216 | return info_file.next().strip() 217 | L.warning('Unable to read current version.') 218 | return None 219 | -------------------------------------------------------------------------------- /dlm/preprocess/features.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import tempfile 3 | import shutil 4 | import argparse 5 | try: 6 | import dlm 7 | except ImportError: 8 | print "[ERROR] dlm module not found. Add CoreLM root directory to your PYTHONPATH" 9 | sys.exit() 10 | import dlm.utils as U 11 | import dlm.io.logging as L 12 | import numpy as np 13 | 14 | def read_vocab(vocab_path): 15 | word_to_id_dict = dict() 16 | found_sent_marker = False 17 | with open(vocab_path,'r') as f_vocab: 18 | curr_index = 0 19 | for line in f_vocab: 20 | token = line.strip().split()[0] 21 | U.xassert((not word_to_id_dict.has_key(token)), "Given vocab file has duplicate entry for '" + token + "'.") 22 | word_to_id_dict[token] = curr_index 23 | curr_index = curr_index + 1 24 | return word_to_id_dict 25 | 26 | def replace_unk(word, dict): 27 | if word in dict: 28 | return word 29 | else: 30 | return "" 31 | 32 | 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument("-i", "--input-file", dest="input_path", required=True, help="Path to the input text file, words and features separated by underscorre(_) e.g. word_feature .") 35 | parser.add_argument("-l", "--labels-file", dest="labels_path", required=True, help="Path to the labels text file") 36 | parser.add_argument("-n", "--context", dest="context_size", required=True, type=int, help="Context Size.") 37 | parser.add_argument("-o", "--output-dir", dest="output_dir_path", required=True, help="Path to output directory.") 38 | parser.add_argument("--text", dest="text_output", action='store_true', help="Add this flag to produce text output.") 39 | parser.add_argument("--input-vocab-file", dest="input_vocab_path", help="Path to an input(words) vocabulary file") 40 | parser.add_argument("--labels-vocab-file", dest="labels_vocab_path", help="Path to an labels (POS, NER etc.) vocabulary file") 41 | parser.add_argument("--features-vocab-file", dest="features_vocab_path", help="Path to an features vocabulary file") 42 | parser.add_argument("--shuffle", dest="shuffle", action='store_true', help="Add this flag to shuffle the output.") 43 | parser.add_argument("--word-output", dest="word_out", action='store_true', help="Get output in non-index format, i.e. 
as words and features") 44 | 45 | args = parser.parse_args() 46 | 47 | if (not os.path.exists(args.output_dir_path)): 48 | os.makedirs(args.output_dir_path) 49 | print("Output directory: " + os.path.abspath(args.output_dir_path)) 50 | 51 | 52 | prefix = args.output_dir_path + "/" + os.path.basename(args.input_path) 53 | 54 | if args.shuffle: 55 | output_mmap_path = prefix + ".idx.shuf.mmap" 56 | output_text_path = prefix + ".idx.shuf.txt" 57 | output_words_path = prefix + ".shuf.txt" 58 | 59 | else: 60 | output_mmap_path = prefix + ".idx.mmap" 61 | output_text_path = prefix + ".idx.txt" 62 | output_words_path = prefix + ".txt" 63 | 64 | if args.word_out: 65 | f_words = open(output_words_path, 'w') 66 | 67 | 68 | input_word_to_id = read_vocab(args.input_vocab_path) 69 | feature_to_id = read_vocab(args.features_vocab_path) 70 | label_to_id = read_vocab(args.labels_vocab_path) 71 | input_vocab_size = len(input_word_to_id) 72 | feature_vocab_size = len(feature_to_id) 73 | label_vocab_size = len(label_to_id) 74 | 75 | 76 | half_context = args.context_size/2 77 | U.xassert(input_word_to_id.has_key(""), "Sentence marker not found in input vocabulary!") 78 | U.xassert(feature_to_id.has_key(""), "Sentence marker not found in feature vocabulary!") 79 | 80 | 81 | _, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.') 82 | # For shuffling only 83 | samples = [] # List of samples 84 | samples_idx = [] 85 | nsamples = 0 86 | 87 | 88 | # Read lines and write to the mmap file 89 | line_num=0 90 | nsamples= 0 91 | 92 | with open(args.input_path, 'r') as input_file, open(args.labels_path, 'r') as labels_file, open(tmp_path, 'w') as tmp_file: 93 | next_id = 0 94 | for line,labels_line in zip(input_file,labels_file): 95 | line_num += 1 # Increment the line number 96 | 97 | line = line.strip() 98 | labels_line = labels_line.strip() # Target labels line 99 | if len(line) == 0: 100 | continue 101 | 102 | tokens = line.split() 103 | ltokens = labels_line.split() 104 | U.xassert(len(tokens) == len(ltokens), "The number of labels does not match the input sentence does not match in line " + str(line_num) ) 105 | #for i in range(num_markers): 106 | # tokens.insert(0, '_') 107 | # tokens.append('_') 108 | 109 | indices = [] 110 | f_indices = [] 111 | for token_idx in xrange(len(ltokens)): 112 | word, feature = tokens[token_idx].split('_') 113 | label = ltokens[token_idx] 114 | U.xassert(feature_to_id.has_key(feature), "Feature " + feature + " not present in feature vocab!") 115 | 116 | sample = [] 117 | sample_idx = [] 118 | 119 | 120 | #### Add words to the sample ##### 121 | # Add sentence padding for words if it is at beginning of sentence 122 | for i in xrange(max(0, half_context - token_idx )): 123 | sample.append("") 124 | sample_idx.append(input_word_to_id[""]) 125 | 126 | sample_words = [replace_unk(token.split('_')[0],input_word_to_id) for token in tokens[max(0, token_idx - half_context): token_idx + half_context + 1]] 127 | sample = sample + sample_words 128 | sample_idx = sample_idx + [input_word_to_id[word] for word in sample_words] 129 | 130 | for i in xrange(max(0, token_idx + half_context + 1 - len(tokens))): 131 | sample.append("") 132 | sample_idx.append(input_word_to_id[""]) 133 | 134 | #### Add features to the sample ##### 135 | # Add sentence padding for features it is at beginning of sentence 136 | for i in xrange(max(0, half_context - token_idx )): 137 | sample.append("") 138 | sample_idx.append(feature_to_id[""]) 139 | 140 | sample_features = [token.split('_')[1] for token in tokens[max(0, token_idx 
- half_context): token_idx + half_context + 1]]
141 | 			sample = sample + sample_features
142 | 			sample_idx = sample_idx + [feature_to_id[feature] for feature in sample_features]
143 | 
144 | 			for i in xrange(max(0, token_idx + half_context + 1 - len(tokens))):
145 | 				sample.append("")
146 | 				sample_idx.append(feature_to_id[""])
147 | 
148 | 			#### Add POS tag to the sample ####
149 | 			sample.append(label)
150 | 			sample_idx.append(label_to_id[label])
151 | 
152 | 			if args.shuffle:
153 | 				samples.append(sample)
154 | 				samples_idx.append(sample_idx)
155 | 			else:
156 | 				tmp_file.write(" ".join([str(idx) for idx in sample_idx]) + "\n")
157 | 				if args.word_out:
158 | 					f_words.write(" ".join([word for word in sample]) + "\n")
159 | 
160 | 			nsamples += 1
161 | 			if nsamples % 100000 == 0:
162 | 				L.info(str(nsamples) + " samples processed.")
163 | 
164 | 
165 | 
166 | 			#print word, feature, label
167 | 
168 | 			#if not input_word_to_id.has_key(word):
169 | 			#	word = ""
170 | 			#indices.append(str(input_word_to_id[word]))
171 | 			#f_indices.append(str(feature_to_id[feature]))
172 | 
173 | # Shuffling the data and writing to tmp file
174 | if args.shuffle:
175 | 	L.info("Shuffling data.")
176 | 	permutation_arr = np.random.permutation(nsamples)
177 | 	with open(tmp_path, 'w') as tmp_file:
178 | 		for index in permutation_arr:
179 | 			tmp_file.write(" ".join([str(idx) for idx in samples_idx[index]]) + "\n")
180 | 			if args.word_out:
181 | 				f_words.write(" ".join([word for word in samples[index]]) + "\n")
182 | 
183 | L.info("Writing to MMap")
184 | # Creating the memory-mapped file
185 | with open(tmp_path, 'r') as data:
186 | 	fp = np.memmap(output_mmap_path, dtype='int32', mode='w+', shape=(nsamples + 5, args.context_size * 2 + 1))
187 | 	fp[0,0] = nsamples			# number of samples
188 | 	fp[0,1] = args.context_size * 2 + 1	# row width: word columns + feature columns + one label column
189 | 	fp[1,0] = 3				# number of input groups (words, features, labels)
190 | 	fp[2,0] = input_vocab_size
191 | 	fp[2,1] = args.context_size		# number of word columns per sample
192 | 	fp[3,0] = feature_vocab_size
193 | 	fp[3,1] = args.context_size		# number of feature columns per sample
194 | 	fp[4,0] = label_vocab_size
195 | 	fp[4,1] = 1				# one label column
196 | 	counter = 5				# the first 5 rows are the header
197 | 	for line in data:
198 | 		tokens = line.split()
199 | 		fp[counter] = tokens
200 | 		counter = counter + 1
201 | 		if counter % 100000 == 0:
202 | 			L.info(str(counter-5) + " samples mapped")
203 | 	L.info(str(counter-5) + " samples mapped")
204 | 	fp.flush()
205 | 	del fp
206 | 
207 | 
208 | shutil.move(tmp_path, output_text_path)	# note: the index text file is written regardless of the --text flag
209 | 
210 | if args.word_out:
211 | 	f_words.close()
212 | 
--------------------------------------------------------------------------------
/dlm/preprocess/bilingual.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import numpy as np
4 | import sys, os
5 | import tempfile
6 | import shutil
7 | import argparse
8 | try:
9 | 	import dlm
10 | except ImportError:
11 | 	print "[ERROR] dlm module not found. 
Add CoreLM root directory to your PYTHONPATH" 12 | sys.exit() 13 | import dlm.utils as U 14 | import dlm.io.logging as L 15 | 16 | 17 | def process_vocab(input_path, vocab_size, vocab_path, has_null): 18 | word_to_id_dict = dict() # Word to Index Dictionary 19 | word_to_freq_dict = dict() # Word Frequency Dictionary 20 | with open(input_path, 'r') as input_file: 21 | for line in input_file: 22 | line = line.strip() 23 | if len(line) == 0: 24 | continue 25 | tokens = line.split() 26 | for token in tokens: 27 | if not word_to_freq_dict.has_key(token): 28 | word_to_freq_dict[token] = 1 29 | else: 30 | word_to_freq_dict[token] += 1 31 | 32 | # Writing the vocab file and creating a word to id dictionary. 33 | curr_index = 0 34 | word_to_id_dict[''] = curr_index 35 | added_tokens = '\n' 36 | curr_index += 1 37 | if has_null: 38 | word_to_id_dict[''] = curr_index 39 | added_tokens += '\n' 40 | curr_index += 1 41 | word_to_id_dict[''] = curr_index 42 | added_tokens += '\n' 43 | curr_index += 1 44 | 45 | if args.endp: 46 | word_to_id_dict[''] = curr_index 47 | added_tokens += '\n' 48 | curr_index += 1 49 | with open(vocab_path, 'w') as f_vocab: 50 | f_vocab.write(added_tokens) 51 | tokens_freq_sorted = sorted(word_to_freq_dict, key=word_to_freq_dict.get, reverse=True) 52 | if vocab_size < len(tokens_freq_sorted): 53 | tokens_freq_sorted = tokens_freq_sorted[0:vocab_size] 54 | for token in tokens_freq_sorted: 55 | f_vocab.write(token+"\n") 56 | word_to_id_dict[token] = curr_index 57 | curr_index = curr_index + 1 58 | return word_to_id_dict 59 | 60 | def read_vocab(vocab_path, endp, has_null): 61 | word_to_id_dict = dict() 62 | with open(vocab_path,'r') as f_vocab: 63 | curr_index = 0 64 | for line in f_vocab: 65 | token = line.strip() 66 | if not word_to_id_dict.has_key(token): 67 | word_to_id_dict[token] = curr_index 68 | curr_index = curr_index + 1 69 | U.xassert(word_to_id_dict.has_key('') and word_to_id_dict.has_key(''), "Missing or in given vocab file") 70 | if has_null: 71 | U.xassert(word_to_id_dict.has_key(''), "Missing in given target vocab file") 72 | if endp: 73 | U.xassert(word_to_id_dict.has_key(''), "Missing in given vocab file while --endp flag is used") 74 | if word_to_id_dict.has_key(''): 75 | U.xassert(args.endp, "Given vocab file has but --endp flag is not activated") 76 | return word_to_id_dict 77 | 78 | def replace_unks(tokens, word_to_id_dict): 79 | replaced_tokens = [] 80 | for token in tokens: 81 | if not word_to_id_dict.has_key(token): 82 | token = "" 83 | replaced_tokens.append(token) 84 | return replaced_tokens 85 | 86 | # Parsing arguments 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument("-is", "--input-source-text", dest="src_input_path", required=True, help="Path to the source langauge training text file") 89 | parser.add_argument("-it", "--input-target-text", dest="trg_input_path", required=True, help="Path to the target language training text file") 90 | parser.add_argument("-ia", "--alignment-file", dest="alignment_path", required=True, help="Alignment file for training text") 91 | 92 | parser.add_argument("-cs", "--source-context", dest="src_context", required=True, type=int, help="(Size of source context window - 1)/ 2") 93 | parser.add_argument("-ct", "--target-context", dest="trg_context", required=True, type=int, help="Size of target ngram (including the output)") 94 | 95 | parser.add_argument("-o", "--output-dir", dest="output_dir_path", required=True, help="Path to output directory") 96 | 97 | parser.add_argument("--shuffle", dest="shuffle", 
action='store_true', help="Add this flag to shuffle the output") 98 | parser.add_argument("--endp", dest="endp", action='store_true', help="Add this flag to add sentence end padding ") 99 | parser.add_argument("--word-output", dest="word_out", action='store_true', help="Get output in non-index format, i.e. as ngrams") 100 | 101 | src_prune_args = parser.add_mutually_exclusive_group(required=True) 102 | src_prune_args.add_argument("-vs","--prune-source-vocab", dest="src_vocab_size", type=int, help="Source vocabulary size") 103 | src_prune_args.add_argument("--source-vocab-file", dest="src_vocab_path", help="Source vocabulary file path") 104 | 105 | trg_prune_args = parser.add_mutually_exclusive_group(required=True) 106 | trg_prune_args.add_argument("-vt","--prune-target-vocab", dest="trg_vocab_size", type=int, help="Target vocabulary size") 107 | trg_prune_args.add_argument("--target-vocab-file", dest="trg_vocab_path", help="Target vocabulary file path") 108 | 109 | output_prune_args = parser.add_mutually_exclusive_group(required=True) 110 | output_prune_args.add_argument("-vo","--prune-output-vocab", dest="output_vocab_size", type=int, help="Output vocabulary size. Defaults to target vocabulary size.") 111 | output_prune_args.add_argument("--output-vocab-file", dest="output_vocab_path", help="Output vocabulary file") 112 | 113 | args = parser.parse_args() 114 | 115 | # Format of the memmap file does not support less than 5 because the first row consists of parameters for the neural network 116 | U.xassert(args.trg_context + args.src_context*2 + 1 > 3, "Total ngram size must be greater than 3. ngrams < 3 are not supported by the current memmap format.") 117 | 118 | L.info("Source Window Size: " + str(args.src_context * 2 + 1)) 119 | L.info("Target Window Size: " + str(args.trg_context - 1)) 120 | L.info("Total Sample Size: " + str(args.trg_context + args.src_context * 2 + 1)) 121 | 122 | if (args.output_vocab_size is None): 123 | args.output_vocab_size = args.trg_vocab_size 124 | 125 | # The output directory is 126 | if (not os.path.exists(args.output_dir_path)): 127 | os.makedirs(args.output_dir_path) 128 | L.info("Output directory: " + os.path.abspath(args.output_dir_path)) 129 | 130 | # Prefix of files 131 | src_prefix = args.output_dir_path + "/" + os.path.basename(args.src_input_path) 132 | trg_prefix = args.output_dir_path + "/" + os.path.basename(args.trg_input_path) 133 | 134 | prefix = os.path.basename(args.src_input_path).split('.')[0] 135 | 136 | output_prefix = args.output_dir_path + "/output" 137 | 138 | # File paths 139 | if args.shuffle: 140 | raise NotImplementedError 141 | output_mmap_path = args.output_dir_path + "/" + prefix + ".idx.shuf.mmap" 142 | output_idx_path = args.output_dir_path + "/" + prefix + ".idx.shuf.txt" 143 | output_ngrams_path = args.output_dir_path + "/" + prefix + ".shuf.txt" 144 | else: 145 | output_mmap_path = args.output_dir_path + "/" + prefix + ".idx.mmap" 146 | output_idx_path = args.output_dir_path + "/" + prefix + ".idx.txt" 147 | output_ngrams_path = args.output_dir_path + "/" + prefix + ".txt" 148 | 149 | tune_output_path = "tune.idx.mmap" 150 | 151 | if args.src_vocab_path is None: 152 | src_word_to_id = process_vocab(args.src_input_path, args.src_vocab_size, src_prefix+'.vocab', has_null=False) # Word to index dictionary of source langauge 153 | else: 154 | src_word_to_id = read_vocab(args.src_vocab_path,args.endp, has_null=False) 155 | 156 | if args.trg_vocab_path is None: 157 | trg_word_to_id = process_vocab(args.trg_input_path, 
args.trg_vocab_size, trg_prefix+'.vocab', has_null=True) # Word to index dictionary of target language
158 | else:
159 | 	trg_word_to_id = read_vocab(args.trg_vocab_path, args.endp, has_null=True)
160 | 
161 | if args.output_vocab_path is None:
162 | 	output_word_to_id = process_vocab(args.trg_input_path, args.output_vocab_size, output_prefix+'.vocab', has_null=True) # Word to index dictionary of the output vocabulary
163 | else:
164 | 	output_word_to_id = read_vocab(args.output_vocab_path, args.endp, has_null=True)
165 | 
166 | svocab = len(src_word_to_id)
167 | tvocab = len(trg_word_to_id)
168 | ovocab = len(output_word_to_id)
169 | 
170 | ## Generating the mmap file
171 | _, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.')
172 | 
173 | # Word output
174 | if args.word_out:
175 | 	f_ngrams = open(output_ngrams_path, 'w')
176 | 
177 | # For shuffling only
178 | samples = []	# List of samples
179 | nsamples = 0
180 | 
181 | sentence_count = 0
182 | 
183 | with open(args.src_input_path,'r') as src_file, open(args.trg_input_path, 'r') as trg_file, open(args.alignment_path, 'r') as align_file, open(tmp_path,'w') as tmp_file:
184 | 	for sline,tline,aline in zip(src_file,trg_file,align_file):
185 | 		stokens = sline[:-1].split()
186 | 		ttokens = tline[:-1].split()
187 | 		atokens = aline[:-1].split()
188 | 		sentence_count += 1
189 | 
190 | 		if args.endp:
191 | 			stokens.append('')
192 | 			ttokens.append('')
193 | 
194 | 		stokens = replace_unks(stokens, src_word_to_id)
195 | 		otokens = replace_unks(ttokens, output_word_to_id)
196 | 		ttokens = replace_unks(ttokens, trg_word_to_id)
197 | 
198 | 		trg_aligns = [[] for t in range(len(ttokens))]
199 | 		for atoken in atokens:
200 | 			sindex,tindex = atoken.split("-")
201 | 			sindex,tindex = int(sindex), int(tindex)
202 | 			trg_aligns[tindex].append(sindex)
203 | 		trg_aligns[-1] = [len(stokens)-1]	# Alignment for the sentence-end token
204 | 
205 | 		for tindex, sindex_list in enumerate(trg_aligns):
206 | 			if sindex_list == []:	# No alignment for this target token: look at nearby tokens, giving preference to the right
207 | 				r_tindex = tindex + 1
208 | 				l_tindex = tindex - 1
209 | 				while r_tindex < len(ttokens) or l_tindex >= 0:
210 | 					if r_tindex < len(ttokens) and trg_aligns[r_tindex]:
211 | 						sindex_list = trg_aligns[r_tindex]
212 | 						break
213 | 					if l_tindex >= 0 and trg_aligns[l_tindex]:
214 | 						sindex_list = trg_aligns[l_tindex]
215 | 						break
216 | 					r_tindex = r_tindex + 1
217 | 					l_tindex = l_tindex - 1
218 | 
219 | 			if sindex_list == []:
220 | 				L.error("No alignments in line " + str(sentence_count))
221 | 
222 | 			mid = (len(sindex_list)-1)/2	# Middle of the source alignments
223 | 			sindex_align = sorted(sindex_list)[mid]
224 | 
225 | 			src_ngrams = []
226 | 			trg_ngrams = []
227 | 
228 | 			ngram_idx = []
229 | 
230 | 			# Get source context
231 | 			for i in range(max(0, args.src_context - sindex_align)):
232 | 				src_ngrams.append("")
233 | 			src_ngrams = src_ngrams + stokens[max(0, sindex_align - args.src_context): sindex_align + args.src_context + 1]
234 | 			for i in range(max(0, sindex_align + args.src_context + 1 - len(stokens))):
235 | 				src_ngrams.append("")
236 | 
237 | 			# Get target context and predicted word
238 | 			for i in range(max(0, args.trg_context - (tindex + 1))):
239 | 				trg_ngrams.append("")
240 | 			trg_ngrams = trg_ngrams + ttokens[max(0, tindex + 1 - args.trg_context): tindex]
241 | 
242 | 			output_word = otokens[tindex]
243 | 
244 | 			sample = " ".join(src_ngrams) + " " + " ".join(trg_ngrams) + " " + output_word + "\n"
245 | 			sample_idx = " ".join([str(src_word_to_id[stoken] + tvocab) for stoken in src_ngrams])
246 | 			sample_idx += " " + " ".join([str(trg_word_to_id[ttoken]) for ttoken in trg_ngrams])
247 | 			sample_idx += " " + str(output_word_to_id[output_word]) + "\n"
248 | 
249 | 			if args.shuffle:
250 | 				samples.append(sample)
251 | 				samples_idx.append(sample_idx)
252 | 			else:
253 | 				tmp_file.write(sample_idx)
254 | 				if args.word_out:
255 | 					f_ngrams.write(sample)
256 | 
257 | 			nsamples += 1
258 | 			if nsamples % 10000000 == 0:
259 | 				L.info(str(nsamples) + " samples processed.")
260 | 
261 | # Shuffling the data and writing to tmp file
262 | if args.shuffle:
263 | 	permutation_arr = np.random.permutation(nsamples)
264 | 	with open(tmp_path, 'w') as tmp_file:
265 | 		for index in permutation_arr:
266 | 			tmp_file.write(samples_idx[index])
267 | 			if args.word_out:
268 | 				f_ngrams.write(samples[index])
269 | 
270 | ngram_size = args.trg_context + args.src_context * 2 + 1
271 | 
272 | # Creating the memory-mapped file
273 | with open(tmp_path, 'r') as data:
274 | 	fp = np.memmap(output_mmap_path, dtype='int32', mode='w+', shape=(nsamples + 3, ngram_size))
275 | 	fp[0,0] = nsamples		# number of samples
276 | 	fp[0,1] = ngram_size		# n-gram size (row width)
277 | 	fp[1,0] = svocab + tvocab	# context vocab size (source + target)
278 | 	fp[2,0] = ovocab		# output vocab size
279 | 	counter = 3			# the first 3 rows are the header
280 | 	for line in data:
281 | 		tokens = line.split()
282 | 		fp[counter] = tokens
283 | 		counter = counter + 1
284 | 		if counter % 10000000 == 0:
285 | 			L.info(str(counter-3) + " samples mapped")
286 | 	L.info(str(counter-3) + " samples mapped")
287 | 	fp.flush()
288 | 	del fp
289 | 
290 | shutil.move(tmp_path, output_idx_path)
291 | 
292 | if args.word_out:
293 | 	f_ngrams.close()
294 | 
--------------------------------------------------------------------------------
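The three preprocessing scripts above (monolingual.py, features.py and bilingual.py) all emit the same kind of int32 memory-mapped matrix: a few header rows carrying the sample count, the row width and the vocabulary sizes, followed by one row of word indices per sample. The sketch below shows how such a file could be read back with NumPy. It is illustrative only and is not the project's own reader (dlm/io/mmapReader.py presumably fills that role); the function name read_mmap_matrix and the header_rows parameter are assumptions for this example. monolingual.py and bilingual.py write 3 header rows, while features.py writes 5.

import numpy as np

def read_mmap_matrix(path, header_rows=3):
	# Hypothetical helper (not part of CoreLM): load a memmap written by the
	# preprocessing scripts above. Cell [0,0] holds the number of samples and
	# cell [0,1] the row width, so the shape can be recovered from the file itself.
	flat = np.memmap(path, dtype='int32', mode='r')
	nsamples = int(flat[0])       # fp[0,0]: number of samples
	row_width = int(flat[1])      # fp[0,1]: sample / n-gram width
	data = flat.reshape(-1, row_width)
	header = np.array(data[:header_rows])                # vocab sizes etc.
	samples = data[header_rows:header_rows + nsamples]   # one row of indices per sample
	return header, samples

For a file produced by bilingual.py, header[1,0] is the combined source-plus-target context vocabulary size and header[2,0] the output vocabulary size, matching the assignments made above.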