├── README.md
└── src
    ├── cis
    │   ├── .gitignore
    │   ├── __init__.py
    │   └── deep
    │       ├── .gitignore
    │       ├── __init__.py
    │       └── utils
    │           ├── .gitignore
    │           ├── __init__.py
    │           ├── classification
    │           │   ├── __init__.py
    │           │   └── apps
    │           │       ├── __init__.py
    │           │       ├── calc_error_metrics.py
    │           │       ├── classify.py
    │           │       ├── classify_mode.py
    │           │       ├── classify_xval.py
    │           │       └── prepare_significance_test.py
    │           ├── clustering
    │           │   ├── __init__.py
    │           │   └── apps
    │           │       ├── __init__.py
    │           │       └── kmeans.py
    │           ├── embeddings
    │           │   └── __init__.py
    │           ├── lm
    │           │   └── __init__.py
    │           ├── misc
    │           │   ├── __init__.py
    │           │   └── apps
    │           │       ├── __init__.py
    │           │       ├── automatic_cluster_labeling.py
    │           │       └── combine_files.py
    │           ├── preprocessing
    │           │   ├── __init__.py
    │           │   ├── apps
    │           │   │   ├── __init__.py
    │           │   │   ├── convert_apnews_to_text.py
    │           │   │   ├── escape_regex.py
    │           │   │   ├── filter_file_by_lines.py
    │           │   │   ├── preprocess.py
    │           │   │   ├── splitter.py
    │           │   │   ├── text_to_bow.py
    │           │   │   ├── text_to_features.py
    │           │   │   ├── tokenizer.py
    │           │   │   └── word_count.py
    │           │   └── corpus.py
    │           ├── statistics
    │           │   ├── __init__.py
    │           │   └── apps
    │           │       ├── __init__.py
    │           │       └── calc_matrix_statistics.py
    │           ├── text.py
    │           ├── theano
    │           │   ├── .gitignore
    │           │   ├── __init__.py
    │           │   ├── gpu_test.py
    │           │   └── log_reg.py
    │           └── visualization
    │               ├── __init__.py
    │               └── apps
    │                   ├── __init__.py
    │                   └── visualize_by_tsne.py
    ├── common_functions.py
    ├── load_data.py
    ├── log.best.scitail.txt
    ├── logistic_sgd.py
    ├── logistic_sgd_biased.py
    ├── mlp.py
    ├── model_para_0.820930232558
    ├── preprocess_SciTail.py
    ├── train_SciTail_DeIsTe_model.py
    └── word2embeddings
        ├── .gitignore
        ├── AUTHORS.rst
        ├── MANIFEST.in
        ├── README.rst
        ├── __init__.py
        ├── apps
        │   ├── __init__.py
        │   ├── analyze_lbl_distribution.py
        │   ├── classify_imdb_docs.py
        │   ├── create_embeddings.py
        │   ├── extract_model_data.py
        │   ├── extract_words_with_we.py
        │   ├── prepare_brown_file.py
        │   ├── test_mlp.py
        │   ├── train_mlp.py
        │   ├── train_model.py
        │   ├── use_lm.py
        │   └── use_model.py
        ├── lm
        │   ├── __init__.py
        │   └── networks.py
        ├── nn
        │   ├── .gitignore
        │   ├── __init__.py
        │   ├── layers.py
        │   ├── networks.py
        │   ├── predictor.py
        │   ├── tools.py
        │   ├── trainer.py
        │   └── util.py
        └── tools
            ├── .gitignore
            ├── __init__.py
            ├── examples_generator.py
            ├── theano_extensions.py
            └── util.py
/README.md:
--------------------------------------------------------------------------------
1 | # SciTail
2 | This repository contains the code for our ACL 2018 paper "End-Task Oriented Textual Entailment via Deep Explorations of Inter-Sentence Interactions". The model achieves state-of-the-art performance on the SciTail textual entailment benchmark (82.1% accuracy). We release both the code and the pretrained model.
3 |
4 | To reproduce the result, run `train_SciTail_DeIsTe_model.py`. It requires pretrained word2vec embeddings. We also provide the pretrained model so that reloading it reproduces the numbers reported in the paper.
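
The reload logic itself lives in `train_SciTail_DeIsTe_model.py`, which is the authoritative way to reproduce the 82.1% accuracy. As a quick sanity check only, the released parameter file can in principle be inspected with the repository's own pickle helper. This is a minimal sketch, assuming the file `model_para_0.820930232558` was written with `cis.deep.utils.save_object_to_file` (i.e., via `cPickle`) and that it is run with Python 2 from the `src/` directory; neither assumption is documented here.

```python
# Hypothetical inspection snippet (Python 2, run from src/).
# Assumes the released parameter file is a cPickle dump produced by
# save_object_to_file; train_SciTail_DeIsTe_model.py remains the
# authoritative loader for reproducing the paper numbers.
from cis.deep.utils import load_object_from_file

params = load_object_from_file('model_para_0.820930232558')
print type(params)  # typically a list/tuple of numpy arrays holding trained weights
```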
5 | -------------------------------------------------------------------------------- /src/cis/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | -------------------------------------------------------------------------------- /src/cis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | -------------------------------------------------------------------------------- /src/cis/deep/__init__.py: -------------------------------------------------------------------------------- 1 | from pkgutil import extend_path 2 | __path__ = extend_path(__path__, __name__) -------------------------------------------------------------------------------- /src/cis/deep/utils/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | -------------------------------------------------------------------------------- /src/cis/deep/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains common utility classes and methods. 4 | """ 5 | from bz2 import BZ2File 6 | import cPickle 7 | import codecs 8 | import collections 9 | from datetime import date 10 | import hashlib 11 | import io 12 | from itertools import izip_longest 13 | import logging 14 | from operator import itemgetter 15 | import os 16 | import sys 17 | 18 | import numpy as np 19 | import itertools 20 | 21 | 22 | def are_generators_equal(gen1, gen2): 23 | """Indicate whether or not the given generators are equal. 24 | 25 | Generators cannot be compared as easily as lists. Here's the description of 26 | what happens: 27 | "This can actually short-circuit without necessarily having to look at all 28 | values. As pointed out by larsmans in the comments, we can't use izip() here 29 | since it might give wrong results if the generators produce a different 30 | number of elements – izip() will stop on the shortest iterator. We use a 31 | newly created object instance as fill value for izip_longest(), since object 32 | instances are also compared by object identity, so sentinel is guaranteed to 33 | compare unequal to everything else." 34 | [http://stackoverflow.com/questions/9983547/comparing-two-generators-in-python] 35 | """ 36 | return all(a == b for a, b in 37 | izip_longest(gen1, gen2, fillvalue=object())) 38 | 39 | def digest(string): 40 | """Calculate a hash for the given string. 41 | 42 | Parameters 43 | ---------- 44 | string : str 45 | string to calculate the hash for 46 | 47 | Examples 48 | -------- 49 | >>> digest('hello world') 50 | '2f05477fc24bb4faefd86517156dafdecec45b8ad3cf2522a563582b' 51 | """ 52 | return hashlib.sha224(string).hexdigest() 53 | 54 | def file_line_generator(filename, strip=True, comment=None): 55 | """Iterates over the lines in a file. 56 | 57 | Each line is one string. Uses utf8_file_open. 
58 | 59 | Parameters 60 | ---------- 61 | filename : str 62 | name of the file to load 63 | strip : bool 64 | indicates whether or not to strip each line after reading (removes line 65 | endings, but also tabs or spaces at the beginning of the line) 66 | comment : str 67 | if a line in the file starts with this string, then it's considered to 68 | be a comment and discarded. None if nothing should be discarded. 69 | 70 | Returns 71 | ------- 72 | list 73 | each line of the given file is one item in the list 74 | """ 75 | 76 | with utf8_file_open(filename) as f: 77 | 78 | for line in f: 79 | 80 | if strip: 81 | line = line.strip() 82 | 83 | if comment and line.startswith(comment): 84 | continue 85 | 86 | yield line 87 | 88 | raise StopIteration 89 | 90 | def flatten_iterable(it): 91 | """Flattens an iteratable object. 92 | 93 | Parameters 94 | ---------- 95 | it : iterable 96 | nested iterable 97 | 98 | Returns 99 | ------- 100 | generator 101 | generator that iterates over all items in the iterable 102 | """ 103 | 104 | for item in it: 105 | if isinstance(item, collections.Iterable) and \ 106 | not isinstance(item, basestring): 107 | 108 | for sub in flatten_iterable(item): 109 | yield sub 110 | else: 111 | yield item 112 | 113 | def generator_has_next(gen): 114 | """Check if the given generator contains more elements. 115 | 116 | This is a hack. If the generator contains more elements, the returned 117 | generator must be used, because the original generator "lost" an element. 118 | The returned generator however contains this element. This is possible by 119 | using itertools.chain. 120 | 121 | Returns 122 | ------- 123 | Any 124 | False: generator does not contain any more elements 125 | generator: generator does contain more elements, use this generator 126 | instead of the original one, otherwise you loose one element. 127 | """ 128 | 129 | try: 130 | elem = gen.next() 131 | return itertools.chain([elem], gen) 132 | except StopIteration: 133 | return False 134 | 135 | 136 | def load_object_from_file(filename): 137 | """Loads an object from the given filename. 138 | 139 | The given file must have been written using save_object. 140 | 141 | Parameters 142 | ---------- 143 | filename : string 144 | name of the persisted object 145 | """ 146 | # Caution: using utf8_file_open doesn't work with cPickle 147 | return cPickle.load(open(filename, 'rb')) 148 | 149 | def log_iterations(log, count, log_every): 150 | """Log how many iterations have been handled every log_every iterations. 151 | 152 | Parameters 153 | ---------- 154 | log : logger 155 | logger to be logged into 156 | count : int 157 | current count of iterations 158 | log_every : int 159 | the count is logged every log_every iterations 160 | """ 161 | 162 | if count % log_every == 0: 163 | log.info('iterations: ' + str(count)) 164 | 165 | 166 | def logger_config(logger, level=logging.INFO, log_dir=None): 167 | """Configure the given logger. 
168 | 169 | Parameters 170 | ---------- 171 | logger : logger 172 | logger to configure 173 | log_dir : str 174 | path where to store the log file, if None no log file is created 175 | """ 176 | logger.setLevel(level) 177 | formatter = _logger_config_create_formatter() 178 | logger.addHandler(_logger_config_create_console_handler(formatter, level)) 179 | 180 | if log_dir is not None: 181 | logger.addHandler(_logger_config_create_file_handler(formatter, level, 182 | log_dir)) 183 | 184 | def _logger_config_create_formatter(): 185 | """Return a formatter object.""" 186 | formatter = logging.Formatter( 187 | '%(asctime)s\t%(levelname)s\t%(module)s\t%(funcName)s\t%(message)s', 188 | '%Y-%m-%d %H:%M:%S') 189 | return formatter 190 | 191 | def _logger_config_create_console_handler(formatter, level): 192 | """Return a console handler.""" 193 | ch = logging.StreamHandler(sys.stdout) 194 | ch.setLevel(level) 195 | ch.setFormatter(formatter) 196 | return ch 197 | 198 | def _logger_config_create_file_handler(formatter, level, log_dir): 199 | """Return a log file handler.""" 200 | fh = logging.FileHandler(os.path.join(log_dir, 'log-' + 201 | date.today().strftime('%Y-%m-%d')), encoding='utf-8') 202 | fh.setLevel(level) 203 | fh.setFormatter(formatter) 204 | return fh 205 | 206 | def ndarray_to_string(array): 207 | """Converts the given ndarray into a unicode string. 208 | 209 | Parameters 210 | ---------- 211 | array : ndarray 212 | 213 | Returns 214 | ------- 215 | unicode 216 | """ 217 | array = np.asarray(array) 218 | 219 | if array.ndim == 1: 220 | return u' '.join([unicode(item) for item in array]) 221 | elif array.ndim == 2: 222 | return u'\n'.join([ndarray_to_string(line) for line in array]) 223 | 224 | raise ValueError(u'only 1d arrays supported') 225 | 226 | 227 | def save_object_to_file(obj, filename): 228 | """Saves the given object to file using cPickle. 229 | 230 | The object might provide extra routings for storing (e.g., __getstate__). 231 | 232 | Parameters 233 | ---------- 234 | obj : any 235 | object to store 236 | filename : string 237 | file to store the object to 238 | """ 239 | # Caution: using utf8_file_open doesn't work with cPickle 240 | cPickle.dump(obj, open(filename, "wb"), protocol=-1) 241 | 242 | def sort_dict_by_key(d, reverse=False): 243 | """Sort the given dictionary by its keys. 244 | 245 | Parameters 246 | ---------- 247 | d : dict 248 | dictionary to sort 249 | reverse : bool 250 | indicates if the sorting should be reversed 251 | 252 | Returns 253 | ------- 254 | list of tupels 255 | contains tupels of key and value ordered according to key 256 | 257 | Examples 258 | -------- 259 | >>> x = {'c':2, 'a':4, 'b':3, 'd':1, 'e':0} 260 | >>> sort_dict_by_key(x) 261 | [('a', 4), ('b', 3), ('c', 2), ('d', 1), ('e', 0)] 262 | 263 | >>> x = {'c':2, 'e':4, 'd':3, 'b':1, 'a':0} 264 | >>> sort_dict_by_key(x, True) 265 | [('e', 4), ('d', 3), ('c', 2), ('b', 1), ('a', 0)] 266 | """ 267 | return sorted(d.iteritems(), key=itemgetter(0), reverse=reverse) 268 | 269 | def sort_dict_by_label(d, reverse=False): 270 | """Sort the given dictionary by its values. 
271 | 272 | Parameters 273 | ---------- 274 | d : dict 275 | dictionary to sort 276 | reverse : bool 277 | indicates if the sorting should be reversed 278 | 279 | Returns 280 | ------- 281 | list of tupels 282 | contains tupels of key and value ordered according to value 283 | 284 | Examples 285 | -------- 286 | >>> x = {'c':2, 'a':4, 'b':3, 'd':1, 'e':0} 287 | >>> sort_dict_by_label(x) 288 | [('e', 0), ('d', 1), ('c', 2), ('b', 3), ('a', 4)] 289 | 290 | >>> x = {'c':2, 'e':4, 'd':3, 'b':1, 'a':0} 291 | >>> sort_dict_by_label(x, True) 292 | [('e', 4), ('d', 3), ('c', 2), ('b', 1), ('a', 0)] 293 | """ 294 | return sorted(d.iteritems(), key=itemgetter(1), reverse=reverse) 295 | 296 | def text_to_vocab_indices(vocab, tokens, unk=u''): 297 | """ 298 | Convert all tokens in the text into their indices in the given vocabulary. 299 | 300 | Tokens that do not exist in the vocabulary will receive the token 301 | index. 302 | 303 | Parameters 304 | ---------- 305 | vocabulary : dict(str, int) 306 | mapping from token text to index 307 | must contain an UNKNOWN token 308 | tokens : str or list(str) 309 | text to replace all tokens in 310 | unk : str 311 | unknown word token 312 | 313 | Returns 314 | ------- 315 | list(int) 316 | list that contains the vocabulary indices for all tokens instead of 317 | the tokens themselves 318 | list(str) 319 | list of the original input text having unknown tokens replaced by the 320 | unknown word token 321 | 322 | Examples 323 | >>> vocab = {u'i': 0, u'am': 1, u'home': 2, u'':-1} 324 | >>> text_to_vocab_indices(vocab, u'i am home now .') 325 | ([0, 1, 2, -1, -1], [u'i', u'am', u'home', u'', u'']) 326 | >>> text_to_vocab_indices(vocab, [u'i', u'am', u'home', u'now', u'.']) 327 | ([0, 1, 2, -1, -1], [u'i', u'am', u'home', u'', u'']) 328 | """ 329 | 330 | if isinstance(tokens, (str, unicode)): 331 | tokens = tokens.split() 332 | 333 | conv_tokens = [t if t in vocab else unk for t in tokens] 334 | sent_indices = [vocab[t] for t in conv_tokens] 335 | 336 | return sent_indices, conv_tokens 337 | 338 | def utf8_file_open(filename, mode='r'): 339 | """Return a file object for the given filename in the given mode. 340 | 341 | Open an utf-8 file in the given mode (see io.open for further details) and 342 | uses only \n as line endings. Can open bz2 files. 343 | 344 | Parameters 345 | ---------- 346 | filename : string 347 | name of the file to open 348 | mode : string 349 | open mode (see io.open for further details), default value: 'r' 350 | """ 351 | 352 | # It seems that utf8 files are read properly by BZ2File. 353 | if filename.endswith(u'.bz2'): 354 | return codecs.getreader("utf-8")(BZ2File(filename, mode, compresslevel=9)) 355 | 356 | return io.open(filename, mode, encoding='utf8', newline='\n') 357 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains common utility classes and methods for classification. 4 | """ 5 | from sklearn.metrics.metrics import accuracy_score, \ 6 | precision_recall_fscore_support 7 | 8 | 9 | def calc_metrics(true_labels, predicted_labels): 10 | """Provide accuracy, precision, recall, and f1 as error measure. 
11 | 12 | Parameters 13 | ---------- 14 | true_labels : list, ndarray 15 | true labels 16 | predicted_labels : list, ndarray 17 | predicted labels 18 | 19 | Returns 20 | ------- 21 | (float, float, float, float) 22 | accuracy, precision, recall, f1 23 | 24 | Example 25 | ------- 26 | >>> y_true = [0, 1, 1, 0] 27 | >>> y_pred = [0, 0, 1, 1] 28 | >>> calc_metrics(y_true, y_pred) 29 | (0.5, 0.5, 0.5, 0.5) 30 | """ 31 | acc = accuracy_score(true_labels, predicted_labels) 32 | p, r, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, 33 | average='micro') 34 | return (acc, p, r, f1) 35 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/classification/apps/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/calc_error_metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | example usage: 4 | -p -r -f 5 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\sanity_test-most_well-binary\features-predict-tmp 6 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\sanity_test-most_well-binary\features-predict-out-cleaned 7 | """ 8 | from argparse import ArgumentParser 9 | from logging import getLogger 10 | import sys 11 | 12 | from sklearn.metrics.metrics import accuracy_score, \ 13 | precision_recall_fscore_support 14 | 15 | from cis.deep.utils import logger_config, file_line_generator 16 | 17 | 18 | log = getLogger(__name__) 19 | logger_config(log) 20 | 21 | parser = ArgumentParser(description="""Calculate the error metrics accuracy, 22 | precision, recall, and f-measure for the given true and predicted 23 | labels. Labels must be numeric type. This application is a wrapper 24 | for sklearn.metrics.accuracy_score and 25 | sklearn.metrics.precision_recall_fscore_support. 
Look up their 26 | documentation to find the explanations of the parameters.""") 27 | parser.add_argument('true_labels', help='true labels, one per line') 28 | parser.add_argument('pred_labels', help='predicted labels, one per line') 29 | 30 | parser.add_argument('-p', '--precision', action='store_true', 31 | help='calculate precision') 32 | parser.add_argument('-r', '--recall', action='store_true', 33 | help='calculate recall') 34 | parser.add_argument('-f', '--f_measure', action='store_true', 35 | help='calculate f-measure') 36 | 37 | parser.add_argument('-b', '--beta', default=1.0, type=float, 38 | help='beta value of f-measure') 39 | parser.add_argument('-o', '--pos_label', default='1', 40 | help='label of the positive class in a binary classification task') 41 | parser.add_argument('-a', '--avg', choices=['none', 'micro', 'macro', 'samples', 42 | 'weighted'], default='none', 43 | help='label of the positive class in a binary classification task') 44 | 45 | def main(argv=None): 46 | 47 | if argv is None: 48 | argv = sys.argv[1:] 49 | 50 | args = parser.parse_args(argv) 51 | log.info('start parameters: ' + str(args)) 52 | 53 | log.info('loading data') 54 | true = [] 55 | pred = [] 56 | 57 | for line in file_line_generator(args.true_labels): 58 | true.append(line) 59 | 60 | for line in file_line_generator(args.pred_labels): 61 | pred.append(line) 62 | 63 | acc = accuracy_score(true, pred) 64 | log.info('accuracy: %f' % acc) 65 | 66 | if args.precision or args.recall or args.f_measure: 67 | p, r, f, _ = precision_recall_fscore_support(true, pred, args.beta, 68 | pos_label=args.pos_label, 69 | average=None if not args.avg else args.avg) 70 | 71 | if args.precision: 72 | log.info('precision: %f' % p) 73 | if args.recall: 74 | log.info('recall: %f' % r) 75 | if args.f_measure: 76 | log.info('f-measure: %f' % f) 77 | 78 | log.info('finished') 79 | 80 | if __name__ == "__main__": 81 | sys.exit(main()) 82 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/classify.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | """ 4 | 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import os 8 | import sys 9 | 10 | from sklearn.metrics.metrics import confusion_matrix 11 | from sklearn.svm import LinearSVC 12 | 13 | from cis.deep.utils import logger_config, file_line_generator, \ 14 | save_object_to_file 15 | from cis.deep.utils.classification import calc_metrics 16 | import numpy as np 17 | from sklearn.dummy import DummyClassifier 18 | 19 | # import pydevd 20 | # pydevd.settrace(host='129.187.148.250', stdoutToServer=True, 21 | # stderrToServer=True) 22 | 23 | log = getLogger(__name__) 24 | logger_config(log) 25 | 26 | parser = ArgumentParser( 27 | description='Train and test a classifier.') 28 | parser.add_argument('train_data', 29 | help="""File containing the features as dense matrix. bz2 and gz are 30 | supported.""") 31 | parser.add_argument('train_labels', 32 | help="""File containing the data labels. One label per line.""") 33 | parser.add_argument('test_data', 34 | help="""File containing the features as dense matrix. bz2 and gz are 35 | supported.""") 36 | parser.add_argument('test_labels', 37 | help="""File containing the data labels. 
One label per line.""") 38 | parser.add_argument('output_dir', 39 | help='directory to store the results in') 40 | 41 | parser.add_argument('-n', '--normalize', action='store_true', 42 | help="""Normalize each feature to zero mean and 1 std dev. That makes 43 | sense if the the values of different features are very different.""") 44 | parser.add_argument('-m', '--mode', action='store_true', 45 | help="""compute the results using mode, i.e., the majority class of the 46 | training data.""") 47 | 48 | def get_classification_result(true_labels, pred_labels): 49 | """Return classification resuls for one fold. 50 | 51 | Return an array containing accuracy, precision, recall, and f1, based on the 52 | given true and predicted labels. 53 | 54 | Keyword arguments: 55 | fold_no -- this fold's number 56 | true_labels -- true labels 57 | pred_labels -- predicted labels 58 | """ 59 | res = np.zeros((1, 4)) 60 | res[:] = calc_metrics(true_labels, pred_labels) 61 | return res 62 | 63 | def calc_results(train_features, train_labels, test_features, test_labels, 64 | normalize=False, mode=False): 65 | """Perform the k-fold cross validation. 66 | 67 | Perform the k-fold cross validation, collect the result and return the 68 | single test instance predictions, as well as the classification results for 69 | each single fold and for the combination of all folds. 70 | 71 | Keyword arguments: 72 | train_features -- all train_features 73 | train_labels -- all train_labels 74 | normalize -- normalize features to have zero mean and 1 std dev 75 | mode -- use mode (majority label) instead of liblinear 76 | """ 77 | 78 | if normalize and not mode: 79 | # compute the mean and std dev only on the training data, but also 80 | # apply it to the test data. 81 | mean = np.mean(train_features, axis=0) 82 | std_dev = np.std(train_features, axis=0, dtype=float) 83 | train_features = (train_features - mean) / std_dev 84 | test_features = (test_features - mean) / std_dev 85 | 86 | if mode: 87 | model = model = DummyClassifier(strategy='most_frequent') 88 | else: 89 | model = LinearSVC(random_state=84447) 90 | 91 | model.fit(train_features, train_labels) 92 | pred_labels = model.predict(test_features) 93 | 94 | single_predictions = np.transpose(np.vstack((xrange(test_labels.shape[0]), 95 | test_labels, pred_labels))) 96 | 97 | classification_result = get_classification_result(test_labels, pred_labels) 98 | 99 | if mode: 100 | weight_vectors = model.class_prior_ 101 | else: 102 | # Store the feature weights after the training 103 | weight_vectors = model.coef_ 104 | 105 | return single_predictions, classification_result, weight_vectors, model 106 | 107 | def main(argv=None): 108 | 109 | if argv is None: 110 | argv = sys.argv[1:] 111 | 112 | args = parser.parse_args(argv) 113 | log.info('start parameters: ' + str(args)) 114 | 115 | log.info('loading feature and label data') 116 | train_labels = np.asarray(map(int, list(file_line_generator(args.train_labels)))) 117 | train_features = np.loadtxt(args.train_data) 118 | 119 | if train_features.ndim == 1: 120 | train_features = train_features.reshape((train_features.shape[0], 1)) 121 | 122 | test_labels = np.asarray(map(int, list(file_line_generator(args.test_labels)))) 123 | test_features = np.loadtxt(args.test_data) 124 | 125 | if test_features.ndim == 1: 126 | test_features = test_features.reshape((test_features.shape[0], 1)) 127 | 128 | log.info('performing classification') 129 | single_predictions, classification_result, weight_vectors, model = \ 130 | calc_results(train_features, 
train_labels, test_features, 131 | test_labels, args.normalize, args.mode == True) 132 | 133 | log.info('storing results') 134 | save_object_to_file(model, os.path.join(args.output_dir, 'svm')) 135 | 136 | np.savetxt(os.path.join(args.output_dir, 'weights.csv'), 137 | weight_vectors, '%f', ';', '\n') 138 | 139 | header = 'instance_index;true_label;pred_label' 140 | np.savetxt(os.path.join(args.output_dir, 'predictions.csv'), 141 | single_predictions, '%d', ';', '\n', header=header) 142 | 143 | all_true_labels = single_predictions[:, 1] 144 | all_pred_labels = single_predictions[:, 2] 145 | confusion = confusion_matrix(all_true_labels, all_pred_labels) 146 | 147 | np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'), 148 | confusion, '%d', ';', '\n') 149 | 150 | header = 'accuracy;precision;recall;f1' 151 | np.savetxt(os.path.join(args.output_dir, 'metrics.csv'), 152 | classification_result, '%f', ';', '\n', header=header) 153 | 154 | log.info(classification_result) 155 | log.info('finished') 156 | 157 | if __name__ == "__main__": 158 | sys.exit(main()) 159 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/classify_mode.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | """ 4 | 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import os 8 | import sys 9 | 10 | from sklearn.cross_validation import StratifiedKFold 11 | from sklearn.metrics.metrics import confusion_matrix 12 | 13 | from cis.deep.utils import logger_config, file_line_generator 14 | from cis.deep.utils.classification import calc_metrics 15 | import numpy as np 16 | from sklearn.dummy import DummyClassifier 17 | 18 | 19 | # from sklearn.dummy import DummyClassifier 20 | log = getLogger(__name__) 21 | logger_config(log) 22 | 23 | parser = ArgumentParser( 24 | description="""Perform a 10-fold cross validation, always using the most 25 | frequent class as predicted value.""") 26 | parser.add_argument('label_file', 27 | help="""File containing the data labels. One label per line.""") 28 | parser.add_argument('output_dir', 29 | help='directory to store the results in') 30 | 31 | NO_OF_FOLDS = 10 32 | 33 | def get_classification_result(fold_no, true_labels, pred_labels): 34 | """Return classification resuls for one fold. 35 | 36 | Return an array containing accuracy, precision, recall, and f1, based on the 37 | given true and predicted labels. 38 | 39 | Keyword arguments: 40 | fold_no -- this fold's number 41 | true_labels -- true labels 42 | pred_labels -- predicted labels 43 | """ 44 | res = np.zeros(5) 45 | res[0] = fold_no 46 | 47 | acc, prec, rec, f1 = calc_metrics(true_labels, pred_labels) 48 | res[1:5] = [acc, prec, rec, f1] 49 | return res 50 | 51 | def do_cross_validation(labels): 52 | """Perform the k-fold cross validation. 53 | 54 | Perform the k-fold cross validation, collect the result and return the 55 | single test instance predictions, as well as the classification results for 56 | each single fold and for the combination of all folds. 57 | 58 | Keyword arguments: 59 | features -- all features 60 | labels -- all labels 61 | """ 62 | skf = StratifiedKFold(labels, NO_OF_FOLDS) 63 | single_predictions = [] # Store each single classification decision 64 | 65 | # Store classification results for each fold and for the entire task (i.e., 66 | # entire cross validation). 
67 | classification_result = np.zeros((NO_OF_FOLDS + 1, 5)) 68 | 69 | for cur_fold, (train_idx, test_idx) in enumerate(skf): 70 | model = DummyClassifier(strategy='most_frequent') 71 | model.fit(None, labels[train_idx]) 72 | pred_labels = model.predict(np.zeros(labels[test_idx].shape[0])) 73 | 74 | fold_array = np.empty(test_idx.shape[0]) 75 | fold_array.fill(cur_fold) 76 | single_predictions.append(np.transpose(np.vstack((fold_array, test_idx, 77 | labels[test_idx], pred_labels)))) 78 | classification_result[cur_fold, :] = get_classification_result(cur_fold, 79 | labels[test_idx], pred_labels) 80 | 81 | single_predictions = np.vstack(single_predictions) 82 | return single_predictions, classification_result 83 | 84 | def main(argv=None): 85 | 86 | if argv is None: 87 | argv = sys.argv[1:] 88 | 89 | args = parser.parse_args(argv) 90 | log.info('start parameters: ' + str(args)) 91 | 92 | log.info('loading feature and label data') 93 | labels = np.asarray(map(int, list(file_line_generator(args.label_file)))) 94 | 95 | log.info('performing cross validation') 96 | single_predictions, classification_result = do_cross_validation(labels) 97 | 98 | log.info('storing results') 99 | header = 'fold_no;instance_index;true_label;pred_label' 100 | np.savetxt(os.path.join(args.output_dir, 'predictions.csv'), 101 | single_predictions, '%d', ';', '\n', header=header) 102 | 103 | all_true_labels = single_predictions[:, 2] 104 | all_pred_labels = single_predictions[:, 3] 105 | confusion = confusion_matrix(all_true_labels, all_pred_labels) 106 | 107 | np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'), 108 | confusion, '%d', ';', '\n') 109 | 110 | classification_result[NO_OF_FOLDS, :] = get_classification_result(-1, 111 | all_true_labels, all_pred_labels) 112 | 113 | header = 'fold_no;accuracy;precision;recall;f1' 114 | np.savetxt(os.path.join(args.output_dir, 'metrics.csv'), 115 | classification_result, '%f', ';', '\n', header=header) 116 | 117 | log.info(classification_result) 118 | log.info('finished') 119 | 120 | if __name__ == "__main__": 121 | sys.exit(main()) 122 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/classify_xval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | example usage: 4 | -n 5 | X:\sa\experiments\contextual_polarity\vlbl\sentiment-wnd3_3-nce5\classification\1ep\distrib.out 6 | X:\sa\experiments\contextual_polarity\vlbl\sentiment-wnd3_3-nce5\classification\ebert,20140515-label 7 | . 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | from logging import getLogger 12 | import os 13 | import sys 14 | 15 | from sklearn.cross_validation import StratifiedKFold 16 | from sklearn.metrics.metrics import confusion_matrix 17 | from sklearn.svm import LinearSVC 18 | 19 | from cis.deep.utils import logger_config, file_line_generator 20 | from cis.deep.utils.classification import calc_metrics 21 | import numpy as np 22 | 23 | 24 | # from sklearn.dummy import DummyClassifier 25 | log = getLogger(__name__) 26 | logger_config(log) 27 | 28 | parser = ArgumentParser( 29 | description='Perform a 10-fold cross validation on given feature data.') 30 | parser.add_argument('feature_file', 31 | help="""File containing the features as dense matrix. bz2 and gz are 32 | supported.""") 33 | parser.add_argument('label_file', 34 | help="""File containing the data labels. 
One label per line.""") 35 | parser.add_argument('output_dir', 36 | help='directory to store the results in') 37 | 38 | parser.add_argument('-n', '--normalize', action='store_true', 39 | help="""Normalize each feature to zero mean and 1 std dev. That makes 40 | sense if the the values of different features are very different.""") 41 | 42 | NO_OF_FOLDS = 10 43 | 44 | def get_classification_result(fold_no, true_labels, pred_labels): 45 | """Return classification resuls for one fold. 46 | 47 | Return an array containing accuracy, precision, recall, and f1, based on the 48 | given true and predicted labels. 49 | 50 | Keyword arguments: 51 | fold_no -- this fold's number 52 | true_labels -- true labels 53 | pred_labels -- predicted labels 54 | """ 55 | res = np.zeros(5) 56 | res[0] = fold_no 57 | 58 | acc, prec, rec, f1 = calc_metrics(true_labels, pred_labels) 59 | res[1:5] = [acc, prec, rec, f1] 60 | return res 61 | 62 | def calc_results(train_features, train_labels, normalize=False): 63 | """Perform the k-fold cross validation. 64 | 65 | Perform the k-fold cross validation, collect the result and return the 66 | single test instance predictions, as well as the classification results for 67 | each single fold and for the combination of all folds. 68 | 69 | Keyword arguments: 70 | train_features -- all train_features 71 | train_labels -- all train_labels 72 | """ 73 | skf = StratifiedKFold(train_labels, NO_OF_FOLDS) 74 | single_predictions = [] # Store each single classification decision 75 | # Store the feature weights after the training 76 | weight_vectors = np.zeros((NO_OF_FOLDS, train_features.shape[1])) 77 | 78 | # Store classification results for each fold and for the entire task (i.e., 79 | # entire cross validation). 80 | classification_result = np.zeros((NO_OF_FOLDS + 1, 5)) 81 | 82 | for cur_fold, (train_idx, test_idx) in enumerate(skf): 83 | train_data = train_features[train_idx] 84 | test_data = train_features[test_idx] 85 | 86 | if normalize: 87 | # compute the mean and std dev only on the training data, but also 88 | # apply it to the test data. 
89 | mean = np.mean(train_features[train_idx, :], axis=0) 90 | std_dev = np.std(train_features[train_idx, :], axis=0, dtype=float) 91 | train_data = (train_data - mean) / std_dev 92 | test_data = (test_data - mean) / std_dev 93 | 94 | model = LinearSVC(random_state=84447) 95 | model.fit(train_data, train_labels[train_idx]) 96 | pred_labels = model.predict(test_data) 97 | 98 | fold_array = np.empty(test_idx.shape[0]) 99 | fold_array.fill(cur_fold) 100 | single_predictions.append(np.transpose(np.vstack((fold_array, test_idx, 101 | train_labels[test_idx], pred_labels)))) 102 | classification_result[cur_fold, :] = get_classification_result(cur_fold, 103 | train_labels[test_idx], pred_labels) 104 | weight_vectors[cur_fold, :] = model.coef_ 105 | 106 | single_predictions = np.vstack(single_predictions) 107 | return single_predictions, classification_result, weight_vectors 108 | 109 | def main(argv=None): 110 | 111 | if argv is None: 112 | argv = sys.argv[1:] 113 | 114 | args = parser.parse_args(argv) 115 | log.info('start parameters: ' + str(args)) 116 | 117 | log.info('loading feature and label data') 118 | labels = np.asarray(map(int, list(file_line_generator(args.label_file)))) 119 | features = np.loadtxt(args.feature_file) 120 | 121 | log.info('performing cross validation') 122 | single_predictions, classification_result, weight_vectors = \ 123 | calc_results(features, labels, args.normalize) 124 | 125 | log.info('storing results') 126 | np.savetxt(os.path.join(args.output_dir, 'svm-weights.csv'), 127 | weight_vectors, '%f', ';', '\n') 128 | 129 | header = 'fold_no;instance_index;true_label;pred_label' 130 | np.savetxt(os.path.join(args.output_dir, 'predictions.csv'), 131 | single_predictions, '%d', ';', '\n', header=header) 132 | 133 | all_true_labels = single_predictions[:, 2] 134 | all_pred_labels = single_predictions[:, 3] 135 | confusion = confusion_matrix(all_true_labels, all_pred_labels) 136 | 137 | np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'), 138 | confusion, '%d', ';', '\n') 139 | 140 | classification_result[NO_OF_FOLDS, :] = get_classification_result(-1, 141 | all_true_labels, all_pred_labels) 142 | 143 | header = 'fold_no;accuracy;precision;recall;f1' 144 | np.savetxt(os.path.join(args.output_dir, 'metrics.csv'), 145 | classification_result, '%f', ';', '\n', header=header) 146 | 147 | log.info(classification_result) 148 | log.info('finished') 149 | 150 | if __name__ == "__main__": 151 | sys.exit(main()) 152 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/prepare_significance_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | """ 4 | 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import sys 8 | 9 | from cis.deep.utils import logger_config, file_line_generator, utf8_file_open 10 | 11 | # import pydevd 12 | # pydevd.settrace(host='129.187.148.250', stdoutToServer=True, 13 | # stderrToServer=True) 14 | 15 | log = getLogger(__name__) 16 | logger_config(log) 17 | 18 | parser = ArgumentParser( 19 | description="""Prepare a predictions file created by classify.py for the 20 | use of Sebastian Padó's approximate randomization significance test.""") 21 | parser.add_argument('prediction_file', 22 | help="""File containing a classifiers prediction created by classify.py 23 | .""") 24 | parser.add_argument('outfile', 25 | help="""converted file""") 26 | 27 | def main(argv=None): 28 | 29 | if 
argv is None: 30 | argv = sys.argv[1:] 31 | 32 | args = parser.parse_args(argv) 33 | log.info('start parameters: ' + str(args)) 34 | 35 | log.info('converting file') 36 | 37 | with utf8_file_open(args.outfile, 'w') as outfile: 38 | 39 | for line in file_line_generator(args.prediction_file): 40 | 41 | if line.startswith(u'#'): 42 | continue 43 | 44 | (_, true_label, pred_label) = line.split(';') 45 | true_label = int(true_label) 46 | pred_label = int(pred_label) 47 | 48 | tp = 1 if true_label == 1 and pred_label == 1 else 0 49 | model_pos = 1 if pred_label == 1 else 0 50 | gold_pos = 1 if true_label == 1 else 0 51 | 52 | outfile.write(u'%d %d %d\n' % (tp, model_pos, gold_pos)) 53 | log.info('finished') 54 | 55 | if __name__ == "__main__": 56 | sys.exit(main()) 57 | -------------------------------------------------------------------------------- /src/cis/deep/utils/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from _collections import defaultdict 2 | from collections import Counter 3 | 4 | def purity(clusters, classes): 5 | """Compute purity for the given data. 6 | 7 | Parameters 8 | ---------- 9 | clusters : list(int) 10 | cluster ids of all examples 11 | classes : list(int) 12 | class ids of all examples 13 | """ 14 | 15 | d = defaultdict(list) 16 | 17 | # Get a list of class numbers of all examples in a cluster. 18 | for k, v in zip(clusters, classes): 19 | d[k].append(v) 20 | 21 | mayority = 0 22 | 23 | # Count the mayority class number and add it up over all clusters. 24 | for k in d: 25 | mayority += Counter(d[k]).most_common(1)[0][1] 26 | 27 | return float(mayority) / len(clusters) 28 | -------------------------------------------------------------------------------- /src/cis/deep/utils/clustering/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/clustering/apps/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/clustering/apps/kmeans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | example usage: 4 | -k 94 5 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\brown\5000-0.1l-200\features-predict-out-unique 6 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\brown\5000-0.1l-200\features-predict-clusters 7 | 8 | -s X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\brown\5000-0.1l-200\features-predict-out-unique 9 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\brown\5000-0.1l-200\features-predict-out-unique 10 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\brown\5000-0.1l-200\features-predict-clusters 11 | """ 12 | from argparse import ArgumentParser 13 | from logging import getLogger 14 | import sys 15 | 16 | from sklearn.cluster.k_means_ import KMeans 17 | 18 | from cis.deep.utils import logger_config, utf8_file_open, save_object_to_file 19 | import numpy as np 20 | 21 | 22 | log = getLogger(__name__) 23 | logger_config(log) 24 | 25 | parser = ArgumentParser(description="""Cluster given data points using 26 | k-means.""") 27 | 28 | parser.add_argument('data_points', help='data points to be clustered') 29 | parser.add_argument('outfile', help='output file') 30 | 31 | parser.add_argument('-m', '--model', help='save model into that file') 32 | parser.add_argument('-c', '--centroids', 
help='save centroids into that file') 33 | parser.add_argument('-i', '--max-iterations', dest='max_iter', type=int, 34 | default=300, help='Maximum number of iterations of the algorithm') 35 | parser.add_argument('-mr', '--root', action='store_true', 36 | help="""modify the data by taking the root of every entry before 37 | clustering""") 38 | parser.add_argument('-t', '--threads', type=int, default=1, 39 | help="""number of jobs using for the clustering""") 40 | 41 | cluster_group = parser.add_mutually_exclusive_group(required=True) 42 | cluster_group.add_argument('-k', '--clusters', type=int, 43 | help='number of clusters; either -k or -s must be given') 44 | cluster_group.add_argument('-s', '--start-points', dest='start_points', 45 | help="""file that contains the start points for all clusters; either -k 46 | or -s must be given""") 47 | 48 | def get_initial_centers(cluster_count, filename): 49 | """Return number of clusters and initial cluster centers or the method to 50 | create them. 51 | 52 | Parameters 53 | ---------- 54 | cluster_count : None/int 55 | number of clusters; if None, loads the cluster centroids from the given 56 | file 57 | filename : None/str 58 | name of file, which contains the cluster centroids; if None, 59 | cluster_count must be given 60 | 61 | Returns 62 | ------- 63 | if cluster_count is given: (int, str) 64 | cluster count and the method that will be used to choose the centroids 65 | later 66 | if cluster_count is not given (int, ndarray) 67 | cluster count and the centroids 68 | """ 69 | 70 | if cluster_count: 71 | return (cluster_count, 'k-means++') 72 | 73 | centers = np.loadtxt(filename) 74 | return (centers.shape[1], centers) 75 | 76 | def main(argv=None): 77 | 78 | if argv is None: 79 | argv = sys.argv[1:] 80 | 81 | args = parser.parse_args(argv) 82 | log.info('start parameters: ' + str(args)) 83 | 84 | log.info('loading data') 85 | data = np.loadtxt(args.data_points) 86 | 87 | if args.root is not None: 88 | data = np.sqrt(data) 89 | 90 | (k, initial_points) = get_initial_centers(args.clusters, args.start_points) 91 | 92 | log.info('calculate center points') 93 | kmeans = KMeans(k, initial_points, 1, args.max_iter, copy_x=False) 94 | predict = kmeans.fit_predict(data) 95 | 96 | log.info('storing results') 97 | 98 | if args.model: 99 | save_object_to_file(kmeans, args.model) 100 | 101 | with utf8_file_open(args.outfile, 'w') as outfile: 102 | 103 | for i in xrange(predict.shape[0]): 104 | outfile.write(u'%d\n' % predict[i]) 105 | 106 | if args.centroids: 107 | np.savetxt(args.centroids, kmeans.cluster_centers_) 108 | 109 | log.info('finished') 110 | 111 | if __name__ == "__main__": 112 | sys.exit(main()) 113 | -------------------------------------------------------------------------------- /src/cis/deep/utils/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Requires the enum34 package. 3 | """ 4 | 5 | from logging import getLogger 6 | 7 | from enum import Enum, IntEnum 8 | 9 | from cis.deep.utils import file_line_generator, logger_config, utf8_file_open,\ 10 | sort_dict_by_label 11 | import numpy as np 12 | 13 | 14 | log = getLogger(__name__) 15 | logger_config(log) 16 | 17 | class SpecialToken(Enum): 18 | """Enum for special tokens and their string expression. 19 | 20 | Get the enum entry's value with SpecialToken.PAD.value. 
21 | """ 22 | UNKNOWN = u'' 23 | SENT_START = u'' 24 | SENT_END = u'' 25 | PAD = u'' 26 | 27 | 28 | SPECIAL_TOKENS = [SpecialToken.UNKNOWN, SpecialToken.SENT_START, 29 | SpecialToken.SENT_END, SpecialToken.PAD] 30 | 31 | 32 | class SpecialTokenID(IntEnum): 33 | """Enum for ids of special tokens. 34 | 35 | Get the enum entry's value with SpecialTokenId.PAD.value. 36 | """ 37 | UNKNOWN = 0 38 | SENT_START = 1 39 | SENT_END = 2 40 | PAD = 3 41 | 42 | 43 | def compute_avg_text_embedding(text, vocab, embs): 44 | """Convert the given text into a compressed vector using average embeddings. 45 | 46 | Average all word vectors to a final document vector. 47 | 48 | Parameters 49 | ---------- 50 | text : str 51 | text to be compressed 52 | vocab: dict(str, int) 53 | vocabulary (see read_vocabulary_id_file) 54 | embs : ndarray(m*n) 55 | embeddings 56 | """ 57 | vec = np.zeros(embs.shape[1]) 58 | count = 0 59 | 60 | for tok in text.split(): 61 | vec += embs[vocab.get(tok, SpecialTokenID.UNKNOWN.value), :] 62 | count += 1 63 | 64 | return vec / float(count) 65 | 66 | def read_vocabulary_file(input_file, add_special_tokens=True): 67 | """Read the textual vocabulary into a list. Items that are empty after 68 | calling str.strip on them will be mapped to u''. 69 | 70 | Parameters 71 | ---------- 72 | input_file : str 73 | location of the vocabulary 74 | add_special_tokens : bool 75 | indicates whether or not to add special tokens to the front of the 76 | vocabulary, like for unknown tokens, etc. 77 | 78 | Returns 79 | ------- 80 | list(str) 81 | vocabulary from token to unique id 82 | """ 83 | vocab = list(file_line_generator(input_file)) 84 | 85 | if add_special_tokens: 86 | _add_special_tokens(vocab) 87 | 88 | return [v.strip() if v.strip() else u'' for v in vocab] 89 | 90 | def read_vocabulary_id_file(input_file, add_special_tokens=True): 91 | """Read the textual vocabulary into a map that maps the token to it's index. 92 | 93 | Each map entry points from the vocabulary token to the index in the 94 | vocabulary. 95 | 96 | Parameters 97 | ---------- 98 | input_file : str 99 | location of the vocabulary 100 | add_special_tokens : bool 101 | indicates whether or not to add special tokens to the front of the 102 | vocabulary, like for unknown tokens, etc. 103 | 104 | Returns 105 | ------- 106 | dict(str, int) 107 | vocabulary from token to unique id 108 | """ 109 | vocab = read_vocabulary_file(input_file, add_special_tokens) 110 | vocab_to_indices = {w : i for (i, w) in enumerate(vocab)} 111 | 112 | if len(vocab) != len(vocab_to_indices): 113 | log.warning("""Vocabulary contains duplicate items. They have been 114 | removed automatically.""") 115 | return vocab_to_indices 116 | 117 | def write_vocabulary_file(output_file, vocab): 118 | """Write the given vocabulary to the given file. 119 | 120 | The vocabulary items are stored in order of the vocab values, i.e., in the 121 | same order as they have been read by read_vocabulary_id_file. 122 | 123 | Parameters 124 | ---------- 125 | output_file : str 126 | filename of the output 127 | vocab : dict(str, int) 128 | vocabulary that has been read by read_vocabulary_id_file 129 | """ 130 | 131 | with utf8_file_open(output_file, 'w') as vocab_file: 132 | vocab_file.write(u'\n'.join(k[0] 133 | for k in sort_dict_by_label(vocab))) 134 | vocab_file.write(u'\n') 135 | 136 | def _add_special_tokens(vocab): 137 | """Add special tokens to the beginning of the given vocabulary. 138 | 139 | Adds the special tokens only if they don't already exist. 
If the vocabulary 140 | already contains some special tokens the order of them does not change. 141 | 142 | Parameters 143 | ---------- 144 | vocab : list(str) 145 | vocabulary items 146 | 147 | Returns 148 | ------- 149 | list(str) 150 | vocabulary with the special tokens inserted at the front 151 | """ 152 | if SpecialToken.PAD.value not in vocab: 153 | vocab.insert(0, SpecialToken.PAD.value) 154 | if SpecialToken.SENT_END.value not in vocab: 155 | vocab.insert(0, SpecialToken.SENT_END.value) 156 | if SpecialToken.SENT_START.value not in vocab: 157 | vocab.insert(0, SpecialToken.SENT_START.value) 158 | if SpecialToken.UNKNOWN.value not in vocab: 159 | vocab.insert(0, SpecialToken.UNKNOWN.value) 160 | 161 | return vocab 162 | -------------------------------------------------------------------------------- /src/cis/deep/utils/lm/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | 5 | def interpolate(model1, model2, weight): 6 | """Interpolate the probabilities of two models. 7 | 8 | Model 1 is weighted by the parameter, model 2 is weighted by (1 - weight). 9 | 10 | Parameters 11 | ---------- 12 | model1 : ndarray 13 | probabilities of model 1 14 | model2 : ndarray 15 | probabilities of model 2 16 | weight : float 17 | weight of model 1, model 2 will receive weight (1 - weight) 18 | 19 | Returns 20 | ------- 21 | float 22 | interpolated probability 23 | """ 24 | model1 = np.asarray(model1) 25 | model2 = np.asarray(model2) 26 | interpolated = weight * model1 + (1-weight) * model2 27 | return perplexity(interpolated) 28 | 29 | def perplexity(probabs): 30 | """Calculate perplexity given the list of probabs. 31 | 32 | Parameters 33 | ---------- 34 | probabs : iterable 35 | list of probabs 36 | 37 | Returns 38 | ------- 39 | float 40 | perplexity 41 | """ 42 | probabs = np.asarray(probabs) 43 | return np.exp(-np.sum(np.log(probabs)) / np.max(probabs.shape)) 44 | 45 | -------------------------------------------------------------------------------- /src/cis/deep/utils/misc/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | 5 | def softmax(M): 6 | """Calculate the row-wise softmax given a matrix. 
7 | 8 | Parameters 9 | ---------- 10 | M : 2d structure (m x n) 11 | 12 | Returns 13 | ------- 14 | ndarray(m x n) 15 | probabilities according to softmax computation, each row sum = 1 16 | """ 17 | M = np.asarray(M) 18 | 19 | if M.ndim == 1: 20 | M = np.atleast_2d(M) 21 | 22 | maxes = np.amax(M, axis=1) 23 | maxes = maxes.reshape(maxes.shape[0], 1) 24 | e = np.exp(M - maxes) 25 | dist = e / np.sum(e, axis=1, keepdims=True) 26 | return dist 27 | -------------------------------------------------------------------------------- /src/cis/deep/utils/misc/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/misc/apps/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/misc/apps/automatic_cluster_labeling.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | """ 4 | 5 | from _collections import defaultdict 6 | from argparse import ArgumentParser 7 | from collections import Counter 8 | from logging import getLogger 9 | import sys 10 | 11 | from cis.deep.utils import file_line_generator, logger_config, utf8_file_open, \ 12 | sort_dict_by_key 13 | 14 | 15 | log = getLogger(__name__) 16 | logger_config(log) 17 | 18 | parser = ArgumentParser( 19 | description="""Labels given clusters according to their majority class. 20 | """) 21 | parser.add_argument('data_file', 22 | help="""contains a line for each example which consists of the example's 23 | original label and its cluster id separated by a space""") 24 | parser.add_argument('predicted_labels', 25 | help="""output file containing the predicted labels for each item; one 26 | label per line""") 27 | parser.add_argument('-cl', '--cluster_labels', 28 | help="""output file containing the mapping of cluster ids to new labels 29 | """) 30 | 31 | def main(argv=None): 32 | if argv is None: 33 | argv = sys.argv[1:] 34 | 35 | args = parser.parse_args(argv) 36 | log.info('start parameters: ' + str(args)) 37 | 38 | log.info('loading data') 39 | items = [] 40 | 41 | for line in file_line_generator(args.data_file): 42 | items.append(tuple(line.split())) 43 | 44 | log.info('compute majority labels') 45 | cluster_to_label_count = defaultdict(Counter) 46 | 47 | # Count labels per cluster 48 | for (label, cluster_id) in items: 49 | cluster_to_label_count[cluster_id][label] += 1 50 | 51 | majority_labels = dict() 52 | 53 | # Get majority label per cluster 54 | for cluster_id in cluster_to_label_count: 55 | majority_labels[cluster_id] = cluster_to_label_count[cluster_id].most_common(1)[0][0] 56 | 57 | log.info('assign labels to examples') 58 | 59 | with utf8_file_open(args.predicted_labels, 'w') as pred_file: 60 | 61 | for example_line in file_line_generator(args.data_file): 62 | pred_file.write(majority_labels[example_line.split()[1]] + u'\n') 63 | 64 | 65 | if args.cluster_labels: 66 | 67 | with utf8_file_open(args.cluster_labels, 'w') as outfile: 68 | 69 | for (cluster_id, label) in sort_dict_by_key(majority_labels): 70 | outfile.write(u'%s %s\n' % (cluster_id, label)) 71 | 72 | log.info('finished') 73 | 74 | 75 | if __name__ == "__main__": 76 | sys.exit(main()) 77 | -------------------------------------------------------------------------------- /src/cis/deep/utils/misc/apps/combine_files.py: -------------------------------------------------------------------------------- 1 | # 
-*- coding: utf-8 -*- 2 | """ 3 | """ 4 | 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import sys 8 | 9 | from cis.deep.utils import file_line_generator, logger_config, utf8_file_open,\ 10 | log_iterations 11 | 12 | 13 | log = getLogger(__name__) 14 | logger_config(log) 15 | 16 | parser = ArgumentParser( 17 | description="""Takes two files and combines each line in file 1 with 18 | all lines in file 2.""") 19 | parser.add_argument('file1') 20 | parser.add_argument('file2', 21 | help="""use the smaller file as file2, it will be kept in memory""") 22 | parser.add_argument('out_file', 23 | help="""File to write the combination of both files into. 24 | Bz2 is supported.""") 25 | parser.add_argument('-s', '--separator', default=u' ') 26 | 27 | def main(argv=None): 28 | """See argument parser description.""" 29 | 30 | if argv is None: 31 | argv = sys.argv[1:] 32 | 33 | args = parser.parse_args(argv) 34 | log.info('start parameters: ' + str(args)) 35 | 36 | log.info('loading data') 37 | file2_content = list(file_line_generator(args.file2)) 38 | 39 | log.info('combining files') 40 | 41 | with utf8_file_open(args.out_file, 'w') as outfile: 42 | 43 | for c, line1 in enumerate(file_line_generator(args.file1)): 44 | log_iterations(log, c, 1000) 45 | 46 | for line2 in file2_content: 47 | outfile.write(line1 + args.separator + line2 + u'\n') 48 | 49 | log.info('finished') 50 | 51 | 52 | if __name__ == "__main__": 53 | sys.exit(main()) 54 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/preprocessing/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/preprocessing/apps/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/convert_apnews_to_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from argparse import ArgumentParser 3 | from logging import getLogger 4 | import sys 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from cis.deep.utils import utf8_file_open, logger_config, file_line_generator 10 | 11 | 12 | log = getLogger(__name__) 13 | logger_config(log) 14 | 15 | parser = ArgumentParser( 16 | description="""Converts the binary files of the AP News (Associated 17 | News) corpus provided by Yoshua Bengio into readable text.""") 18 | parser.add_argument('infile', type=str, help='input file') 19 | parser.add_argument('outfile', type=str, help='output file') 20 | parser.add_argument('vocabulary', type=str, help='vocabular file') 21 | 22 | def main(argv=None): 23 | """See argument parser description.""" 24 | 25 | if argv is None: 26 | argv = sys.argv[1:] 27 | 28 | args = parser.parse_args(argv) 29 | log.info('start parameters: ' + str(args)) 30 | 31 | vocab = pd.Series(file_line_generator(args.vocabulary, comment='##')) 32 | 33 | with open(args.infile, 'rb') as infile: 34 | integers = np.fromfile(infile, np.int32) 35 | 36 | with utf8_file_open(args.outfile, 'w') 
as outfile: 37 | outfile.write(u'\n'.join(vocab[integers])) 38 | outfile.write(u'\n') 39 | 40 | log.info('finished') 41 | 42 | if __name__ == "__main__": 43 | sys.exit(main()) 44 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/escape_regex.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | example usage: 4 | """ 5 | 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import re 9 | import sys 10 | 11 | from cis.deep.utils import utf8_file_open, logger_config 12 | 13 | 14 | log = getLogger(__name__) 15 | logger_config(log) 16 | 17 | parser = ArgumentParser(description="""Escape the given text file to remove all 18 | regular expressions.""") 19 | parser.add_argument('infile', 20 | help='file that might contain regular expressions') 21 | parser.add_argument('outfile', help='file having regular expressions escaped') 22 | 23 | def main(argv=None): 24 | 25 | if argv is None: 26 | argv = sys.argv[1:] 27 | 28 | args = parser.parse_args(argv) 29 | log.info('start parameters: ' + str(args)) 30 | 31 | log.info('transforming data') 32 | 33 | with utf8_file_open(args.infile) as infile: 34 | with utf8_file_open(args.outfile, 'w') as outfile: 35 | 36 | for line in infile: 37 | outfile.write(re.escape(line)) 38 | log.info('finished') 39 | 40 | if __name__ == "__main__": 41 | sys.exit(main()) 42 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/filter_file_by_lines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! /usr/bin/env python 3 | """ 4 | """ 5 | 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import os 9 | import sys 10 | 11 | from cis.deep.utils import logger_config, utf8_file_open, file_line_generator 12 | 13 | 14 | log = getLogger(__name__) 15 | logger_config(log) 16 | 17 | parser = ArgumentParser( 18 | description="""Filters a given file by lines indices.""") 19 | 20 | parser.add_argument('indices', help="""line numbers that will be included in 21 | the output; either comma separated string (e.g., 1,4,6) or file 22 | containing one index per line; 23 | Caution: make sure the indices are sorted; the indices are 0-based.""") 24 | parser.add_argument('infile', help='file to be filtered') 25 | parser.add_argument('outfile', help='filtered output file') 26 | parser.add_argument('-i', '--inverse', action='store_true', 27 | help="""inverse the indices, i.e., exclude the lines with the given 28 | line number""") 29 | 30 | def get_indices(indices): 31 | """Generates line indices to keep. 
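    Accepts either the path of a file holding one 0-based index per line or a
    comma-separated string; for example, get_indices(u'0,2,5') returns set([0, 2, 5]).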
32 | 33 | Parameters 34 | ---------- 35 | indices : str 36 | either name of a file containing indices one per line or a comma 37 | separated string 38 | 39 | Returns 40 | ------- 41 | int 42 | next index 43 | """ 44 | 45 | if os.path.exists(indices): 46 | return set(map(int, file_line_generator(indices, True))) 47 | 48 | return set((int(i.strip()) for i in indices.split(u','))) 49 | 50 | def main(argv=None): 51 | log.info('started application') 52 | 53 | if argv is None: 54 | argv = sys.argv[1:] 55 | 56 | args = parser.parse_args() 57 | log.info('start parameters: ' + str(args)) 58 | log.info('reading index file') 59 | idx = get_indices(args.indices) 60 | max_idx = max(idx) 61 | log.info('filtering file') 62 | 63 | with utf8_file_open(args.outfile, 'w') as outfile: 64 | 65 | for (cur_idx, line) in enumerate( 66 | file_line_generator(args.infile, False)): 67 | 68 | if not args.inverse: 69 | 70 | if cur_idx in idx: 71 | outfile.write(line) 72 | 73 | if cur_idx >= max_idx: 74 | break 75 | else: 76 | 77 | if cur_idx not in idx: 78 | outfile.write(line) 79 | 80 | 81 | log.info('finished') 82 | 83 | if __name__ == "__main__": 84 | sys.exit(main()) 85 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Example usage: 4 | --amazon 5 | NLTK_DATA_DIR = 'C:/Temp/NLTK data' 6 | x 7 | y 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | from logging import getLogger 12 | import sys 13 | 14 | from cis.deep.utils import logger_config, file_line_generator, utf8_file_open, \ 15 | log_iterations 16 | from cis.deep.utils.preprocessing.corpus import AmazonProductReviewCorpusReader 17 | import re 18 | from cis.deep.utils.text import tokenize 19 | import nltk 20 | 21 | log = getLogger(__name__) 22 | logger_config(log) 23 | 24 | parser = ArgumentParser(description=""" 25 | Preprocess a given file. Several preprocessing parameters are 26 | available. 
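    The steps are applied in a fixed order: digit replacement (-rd), HTML
    stripping (-sh), sentence splitting, then tokenization (-t).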
TODO: add lowercasing""") 27 | parser.add_argument('--amazon', action='store_true', 28 | help="""preprocess the Amazon product review corpus.""") 29 | 30 | parser.add_argument('-rd', '--replace_digits', 31 | help="""Replace all digits by the given string""") 32 | parser.add_argument('-sh', '--strip_html', action='store_true', 33 | help='strip html tags') 34 | parser.add_argument('-t', '--tokenize', action='store_true', 35 | help="""tokenize the text""") 36 | parser.add_argument('-ss', '--sentence_splitter', type=str, 37 | default='tokenizers/punkt/english.pickle', 38 | help='model file to be used for sentence splitting (default: ' + \ 39 | 'tokenizers/punkt/english.pickle)') 40 | parser.add_argument('-s', '--split_sentence', action='store_true', 41 | help='split sentences') 42 | parser.add_argument('infile', help='name of the input file') 43 | parser.add_argument('outfile', help='name of the output file') 44 | 45 | REGEX_FLAGS = re.UNICODE 46 | 47 | def main(argv=None): 48 | 49 | if argv is None: 50 | argv = sys.argv[1:] 51 | 52 | args = parser.parse_args(argv) 53 | log.info('start parameters: ' + str(args)) 54 | log.info('preprocessing data') 55 | 56 | if args.amazon is True: 57 | line_iterator = \ 58 | AmazonProductReviewCorpusReader(args.infile).review_generator() 59 | else: 60 | line_iterator = file_line_generator(args.infile) 61 | 62 | if args.sentence_splitter: 63 | sent_splitter = nltk.data.load(args.sentence_splitter) 64 | 65 | with utf8_file_open(args.outfile, 'w') as outfile: 66 | 67 | for (i, line) in enumerate(line_iterator): 68 | log_iterations(log, i, 100000) 69 | 70 | if args.replace_digits: 71 | line = re.sub(r'\d', args.replace_digits, line, 72 | 0, REGEX_FLAGS) 73 | 74 | if args.strip_html: 75 | line = nltk.clean_html(line) 76 | 77 | if args.sentence_splitter: 78 | line = sent_splitter.tokenize(line) 79 | else: 80 | line = [line] 81 | 82 | if args.tokenize: 83 | line = [tokenize(l) for l in line] 84 | 85 | if not args.tokenize: 86 | outfile.write(u'\n'.join(line)) 87 | else: 88 | outfile.write(u'\n'.join([u' '.join(l) for l in line])) 89 | 90 | outfile.write(u'\n') 91 | 92 | log.info('finished') 93 | 94 | if __name__ == "__main__": 95 | sys.exit(main()) 96 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/splitter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from argparse import ArgumentParser 3 | from logging import getLogger 4 | import sys 5 | 6 | import nltk 7 | 8 | from cis.deep.utils import utf8_file_open, logger_config 9 | 10 | 11 | log = getLogger(__name__) 12 | logger_config(log) 13 | 14 | parser = ArgumentParser( 15 | description="""Splits the given input file into sentences by NLTK\'s 16 | punkt sentence tokenizer and writes the result into the output file. 17 | It assumes English language if there is no language given. 
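    A different language can be used by passing the corresponding NLTK punkt
    model via the -m/--model option.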
It reads 18 | one line at a time, i.e., if there are line breaks not marking sentence 19 | boundaries, they won't be handled correctly.""") 20 | parser.add_argument('-m', '--model', type=str, 21 | default='tokenizers/punkt/english.pickle', 22 | help='model file to be used for sentence splitting (default: ' + \ 23 | 'tokenizers/punkt/english.pickle)') 24 | parser.add_argument('infile', type=str, help='input file') 25 | parser.add_argument('outfile', type=str, help='output file') 26 | 27 | def main(argv=None): 28 | """See argument parser description.""" 29 | 30 | if argv is None: 31 | argv = sys.argv[1:] 32 | 33 | args = parser.parse_args(argv) 34 | log.info('start parameters: ' + str(args)) 35 | 36 | with utf8_file_open(args.infile, 'r') as infile: 37 | with utf8_file_open(args.outfile, 'w') as outfile: 38 | sent_splitter = nltk.data.load(args.model) 39 | 40 | for line in infile: 41 | outfile.write('\n'.join(sent_splitter.tokenize(line.strip())) + 42 | '\n') 43 | 44 | log.info('finished') 45 | 46 | if __name__ == "__main__": 47 | sys.exit(main()) 48 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/text_to_bow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | -v X:\sa\embeddings\vlbl\sentiment-wnd3_3-nce5\vlbl.vocab 4 | ebert,20140515-n_grams 5 | ebert,20140515-n_grams.out 6 | """ 7 | 8 | from argparse import ArgumentParser 9 | from logging import getLogger 10 | import sys 11 | 12 | from sklearn.feature_extraction.text import CountVectorizer 13 | 14 | from cis.deep.utils import file_line_generator, logger_config, utf8_file_open 15 | from cis.deep.utils.embeddings import read_vocabulary_id_file 16 | import numpy as np 17 | 18 | 19 | log = getLogger(__name__) 20 | logger_config(log) 21 | 22 | parser = ArgumentParser( 23 | description="""Converts a given text file into a bag-of-words feature 24 | file. Currently, only tf is supported.""") 25 | parser.add_argument('infile', 26 | help="""Data file, containing all tokens. Each line will get its 27 | bow feature vector.""") 28 | parser.add_argument('out_feature_file', 29 | help="""File to write the features into. Bz2 is supported.""") 30 | 31 | parser.add_argument('-v', '--vocabulary', 32 | help="""Vocabulary file containing all valid words. If it's not given 33 | the vocabulary is inferred and stored afterwards. For additional 34 | information see 35 | sklearn.feature_extraction.text.CountVectorizer.__init__'s vocabulary 36 | parameter.""") 37 | parser.add_argument('-n', '--ngram', default='1,1', 38 | help="""comma-separated list of (min n-gram, max n-gram). For example 39 | "1,3" includes all unigrams, bigrams, and trigrams. For additional 40 | information see see CountVectorizer.__init__'s ngram_range parameter. 
41 | """) 42 | 43 | def main(argv=None): 44 | """See argument parser description.""" 45 | 46 | if argv is None: 47 | argv = sys.argv[1:] 48 | 49 | args = parser.parse_args(argv) 50 | log.info('start parameters: ' + str(args)) 51 | 52 | log.info('loading data') 53 | 54 | if args.vocabulary is None: 55 | vocab = args.vocabulary 56 | else: 57 | vocab = read_vocabulary_id_file(args.vocabulary) 58 | 59 | text = list(file_line_generator(args.infile)) 60 | 61 | ngram_range = map(int, tuple(args.ngram.split(','))) 62 | vectorizer = CountVectorizer(token_pattern='[^ ]+', min_df=0.0, 63 | vocabulary=vocab, ngram_range=ngram_range, dtype=int) 64 | 65 | log.info('creating features') 66 | bow = vectorizer.fit_transform(text) 67 | 68 | log.info('storing result') 69 | np.savetxt(args.out_feature_file, bow.todense(), fmt='%d') 70 | 71 | with utf8_file_open(args.out_feature_file + '.vocab', 'w') as vocab_file: 72 | vocab_file.write(u'\n'.join(vectorizer.get_feature_names())) 73 | 74 | log.info('finished') 75 | 76 | 77 | if __name__ == "__main__": 78 | sys.exit(main()) 79 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/text_to_features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | X:\sa\experiments\contextual_polarity\vlbl\sentiment-wnd3_3-nce5/classification\ebert,20140515-n_grams 4 | X:\sa\embeddings\vlbl\sentiment-wnd3_3-nce5\vLBL.vocab 5 | ./embs.txt 6 | ./features_out 7 | """ 8 | 9 | from argparse import ArgumentParser 10 | from logging import getLogger 11 | import os 12 | import sys 13 | 14 | from cis.deep.utils import file_line_generator, logger_config, utf8_file_open, \ 15 | ndarray_to_string 16 | from cis.deep.utils.embeddings import read_vocabulary_id_file, SpecialTokenID 17 | import numpy as np 18 | 19 | 20 | log = getLogger(__name__) 21 | logger_config(log) 22 | 23 | parser = ArgumentParser( 24 | description="""Converts a given text file into a features file. This 25 | is done by replacing each token in the text file by it's given feature 26 | vector. All features will be concatenated, i.e., there will be no space 27 | between.""") 28 | parser.add_argument('infile', 29 | help="""Data file, containing all tokens to be replaced by their 30 | features. The file can be compressed with bz2 or gz.""") 31 | parser.add_argument('vocabulary', 32 | help="""Vocabulary file containing all valid words. Tokens not contained 33 | in the vocabulary will be mapped to .""") 34 | parser.add_argument('feature_file', 35 | help="""File containing all token features. Each feature must be in a 36 | single row. The row index must correspond to the vocabulary index. 37 | Currently, only dense matrices are supported.""") 38 | parser.add_argument('out_feature_file', 39 | help="""File to write the features into. 
Bz2 is supported.""") 40 | 41 | parser.add_argument('-a', '--avg', action='store_true', 42 | help='Average the features for all words in one example (i.e., line).') 43 | 44 | 45 | def main(argv=None): 46 | """See argument parser description.""" 47 | 48 | if argv is None: 49 | argv = sys.argv[1:] 50 | 51 | args = parser.parse_args(argv) 52 | log.info('start parameters: ' + str(args)) 53 | 54 | log.info('loading data') 55 | vocab = read_vocabulary_id_file(args.vocabulary, False) 56 | 57 | _, ext = os.path.splitext(args.feature_file) 58 | 59 | if ext == 'npy': 60 | features = np.load(args.feature_file) 61 | else: 62 | features = np.loadtxt(args.feature_file) 63 | 64 | log.info('creating features') 65 | 66 | with utf8_file_open(args.out_feature_file, 'w') as outfile: 67 | 68 | for line in file_line_generator(args.infile): 69 | toks = line.split() 70 | cur_features = np.zeros((len(toks), features.shape[1])) 71 | 72 | for (i, tok) in enumerate(toks): 73 | cur_features[i, :] = features[ 74 | vocab.get(tok, SpecialTokenID.UNKNOWN.value)] 75 | 76 | if args.avg: 77 | res = ndarray_to_string(np.mean(cur_features, axis=0)) 78 | else: 79 | res = ndarray_to_string(np.reshape(cur_features, 80 | np.prod(cur_features.shape), order='C')) 81 | 82 | outfile.write(res + u'\n') 83 | 84 | log.info('finished') 85 | 86 | 87 | if __name__ == "__main__": 88 | sys.exit(main()) 89 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from argparse import ArgumentParser 3 | from logging import getLogger 4 | import sys 5 | 6 | from cis.deep.utils import logger_config, utf8_file_open 7 | from cis.deep.utils.text import tokenize 8 | 9 | 10 | log = getLogger(__name__) 11 | logger_config(log) 12 | 13 | parser = ArgumentParser( 14 | description="""Tokenizes the given input file by NLTK\'s recommended 15 | word tokenizer and writes the result into the output file.""") 16 | parser.add_argument('infile', help='input file') 17 | parser.add_argument('outfile', help='output file') 18 | 19 | def main(argv=None): 20 | """See argument parser description.""" 21 | 22 | if argv is None: 23 | argv = sys.argv[1:] 24 | 25 | args = parser.parse_args(argv) 26 | log.info('start parameters: ' + str(args)) 27 | 28 | with utf8_file_open(args.infile, 'r') as infile: 29 | with utf8_file_open(args.outfile, 'w') as outfile: 30 | 31 | for line in infile: 32 | outfile.write(' '.join(tokenize(line)) + '\n') 33 | 34 | log.info('finished') 35 | 36 | if __name__ == "__main__": 37 | sys.exit(main()) 38 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/word_count.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from argparse import ArgumentParser 3 | from collections import Counter 4 | from logging import getLogger 5 | import sys 6 | 7 | from cis.deep.utils import utf8_file_open, logger_config, sort_dict_by_label 8 | 9 | 10 | log = getLogger(__name__) 11 | logger_config(log) 12 | 13 | parser = ArgumentParser( 14 | description="""Count all tokens in the given input file and writes them 15 | with its count to the output file in descending order.""") 16 | parser.add_argument('-l', '--lowercase', action='store_true', 17 | help='lowercase words before counting') 18 | parser.add_argument('infile', help='input file') 19 | 
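# Illustrative usage (the file names here are hypothetical):
#   python word_count.py --lowercase corpus.tok.txt corpus.counts.txt
# The output file then holds one token per line together with its count, highest
# counts first, as stated in the parser description above.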
parser.add_argument('outfile', help='output file') 20 | 21 | def main(argv=None): 22 | """See argument parser description.""" 23 | 24 | if argv is None: 25 | argv = sys.argv[1:] 26 | 27 | args = parser.parse_args(argv) 28 | log.info('start parameters: ' + str(args)) 29 | 30 | counter = Counter() 31 | 32 | with utf8_file_open(args.infile, 'r') as infile: 33 | 34 | for line in infile: 35 | line = line.strip() 36 | 37 | if args.lowercase: 38 | line = line.lower() 39 | # line = line.decode('utf-8').strip() 40 | 41 | # log.info(line) 42 | # if line == '' or line.startswith(' 0.5) # The prediction thresholded 45 | # y = theano.printing.Print('y')(self.y) 46 | # left = theano.printing.Print('left')(-y * T.log(self.p_1)) 47 | # self.xent = theano.printing.Print('xent')(left - (1 - self.y) * T.log(1 - self.p_1)) # Cross-entropy loss function 48 | # self.cost = theano.printing.Print('cost')(self.xent.mean() + 0.01 * (self.w ** 2).sum()) # The cost to minimize 49 | # self.gw, self.gb = T.grad(self.cost, [self.w, self.b]) # Compute the gradient of the cost 50 | self.p_1 = 1 / (1 + T.exp(-T.dot(self.x, self.w) - self.b)) # Probability that target = 1 51 | self.prediction = self.p_1 > 0.5 # The prediction thresholded 52 | self.xent = -self.y * T.log(self.p_1) - (1 - self.y) * T.log(1 - self.p_1) # Cross-entropy loss function 53 | self.cost = self.xent.mean() + 0.01 * (self.w ** 2).sum() # The cost to minimize 54 | self.gw, self.gb = T.grad(self.cost, [self.w, self.b]) # Compute the gradient of the cost 55 | 56 | 57 | # Compile 58 | self.train = theano.function( 59 | inputs=[self.x, self.y], 60 | outputs=[self.prediction, self.xent], 61 | updates=((self.w, self.w - 0.1 * self.gw), (self.b, self.b - 0.1 * self.gb))) 62 | self.predict = theano.function(inputs=[self.x], outputs=self.prediction) 63 | 64 | def do_train(self): 65 | 66 | # Train 67 | for i in range(self.training_steps): 68 | print self.b.get_value(), type(self.b.get_value()) 69 | pred, err = self.train(self.D[0], self.D[1]) 70 | 71 | print floatX 72 | 73 | 74 | if not os.path.exists('test'): 75 | print 'create new model' 76 | model = LogisticRegression() 77 | model.create_graph() 78 | model.do_train() 79 | save_object_to_file(model, 'test') 80 | else: 81 | print 'load model' 82 | model = load_object_from_file('test') 83 | 84 | print "Final model:" 85 | print model.w.get_value(), model.b.get_value() 86 | print "target values for D:", model.D[1] 87 | print "prediction on D:", model.predict(model.D[0]) 88 | 89 | 90 | # import numpy 91 | # import theano 92 | # import theano.tensor as T 93 | # from theano import config 94 | # rng = numpy.random 95 | # floatX = config.floatX 96 | # 97 | # N = 400 98 | # feats = 784 99 | # D = (numpy.asarray(rng.randn(N, feats), floatX), numpy.asarray(rng.randint(size=N,low=0, high=2), floatX)) 100 | # training_steps = 10000 101 | # 102 | # # Declare Theano symbolic variables 103 | # x = T.matrix("x", floatX) 104 | # y = T.vector("y", floatX) 105 | # w = theano.shared(numpy.asarray(rng.randn(feats), floatX), name="w") 106 | # b = theano.shared(numpy.cast[floatX](0.), name="b") 107 | # print "Initial model:" 108 | # print w.get_value(), b.get_value() 109 | # 110 | # # Construct Theano expression graph 111 | # p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b)) # Probability that target = 1 112 | # prediction = p_1 > 0.5 # The prediction thresholded 113 | # xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1) # Cross-entropy loss function 114 | # cost = xent.mean() + 0.01 * (w ** 2).sum()# The cost to minimize 115 | # gw,gb = T.grad(cost, 
[w, b]) # Compute the gradient of the cost 116 | # 117 | # # Compile 118 | # train = theano.function( 119 | # inputs=[x,y], 120 | # outputs=[prediction, xent], 121 | # updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb))) 122 | # predict = theano.function(inputs=[x], outputs=prediction) 123 | # 124 | # # Train 125 | # for i in range(training_steps): 126 | # pred, err = train(D[0], D[1]) 127 | # 128 | # print "Final model:" 129 | # print w.get_value(), b.get_value() 130 | # print "target values for D:", D[1] 131 | # print "prediction on D:", predict(D[0]) -------------------------------------------------------------------------------- /src/cis/deep/utils/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains visualization methods. 4 | """ 5 | import matplotlib.pyplot as plt 6 | 7 | def render_points(data, width=1, height=1, margin=0.00): 8 | """Render text points to a pylab figure. 9 | 10 | Parameters 11 | ---------- 12 | points : [(str, (str, float, float))] 13 | data points to render, having the form [(color, (title, x, y))] 14 | width : int 15 | width of the graph in inches 16 | height : int 17 | height of the graph in inches 18 | margin : float 19 | amount of extra whitespace added at the edges 20 | """ 21 | plt.figure(figsize=(width, height), tight_layout=True) 22 | ax = plt.gca() 23 | 24 | minx = 0 25 | maxx = 0 26 | miny = 0 27 | maxy = 0 28 | 29 | for _, points in data: 30 | # get min and max coordinates of the figure 31 | for (title, x, y) in points: 32 | if minx > x: minx = x 33 | if maxx < x: maxx = x 34 | if miny > y: miny = y 35 | if maxy < y: maxy = y 36 | 37 | dx = maxx - minx 38 | dy = maxy - miny 39 | assert dx > 0 40 | assert dy > 0 41 | minx -= dx * margin 42 | miny -= dy * margin 43 | maxx += dx * margin 44 | maxy += dy * margin 45 | 46 | ax.set_autoscale_on(False) 47 | 48 | minx_pos = 50000000 49 | maxx_pos = -50000000 50 | miny_pos = 50000000 51 | maxy_pos = -50000000 52 | 53 | for color, points in data: 54 | # render the single points 55 | for pt in points: 56 | (title, x, y) = pt 57 | x = 1. * (x - minx) / (maxx - minx) 58 | y = 1. 
* (y - miny) / (maxy - miny) 59 | 60 | minx_pos = min(minx_pos, x) 61 | maxx_pos = max(maxx_pos, x) 62 | miny_pos = min(miny_pos, y) 63 | maxy_pos = max(maxy_pos, y) 64 | pos = (x, y) 65 | 66 | plt.annotate(title, pos, color=color) 67 | 68 | ax.set_xlim([minx_pos, maxx_pos]) 69 | ax.set_ylim([miny_pos, maxy_pos]) 70 | -------------------------------------------------------------------------------- /src/cis/deep/utils/visualization/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/visualization/apps/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/visualization/apps/visualize_by_tsne.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | The required t-sne implementation used can be found at 4 | https://github.com/turian/textSNE 5 | 6 | example usage: 7 | X:/sa/embeddings/ColWes08/combined-WilWieHof05 8 | -f X:/sa/embeddings/ColWes08/combined-non_WilWieHof05-shuffle-1500 9 | 10 | X:/sa/embeddings/vlbl/general-nce5-1ep/vLBL_1_14-03-05_07-09-05.embeddings_r-combined-WilWieHof05 11 | -f X:/sa/embeddings/vlbl/general-nce5-1ep/vLBL_1_14-03-05_07-09-05.embeddings_r-combined-non_WilWieHof05 12 | """ 13 | 14 | from argparse import ArgumentParser 15 | from logging import getLogger 16 | import sys 17 | 18 | from calc_tsne import tsne 19 | from cis.deep.utils import file_line_generator, logger_config 20 | from cis.deep.utils.visualization import render_points 21 | import numpy as np 22 | import pylab as plt 23 | 24 | 25 | log = getLogger(__name__) 26 | logger_config(log) 27 | 28 | parser = ArgumentParser( 29 | description="""This script creates a 2d visualization of different kinds 30 | of input, e.g., Collobert & Weston word embeddings or RAE query 31 | representations. The code is a modification of a original t-SNE code. 32 | """) 33 | parser.add_argument('file', type=str, help='first file to load') 34 | parser.add_argument('-f', '--file2', type=str, help='second file to load') 35 | parser.add_argument('-o', '--out', type=str, 36 | help='write the rendered image to the given output file') 37 | 38 | def scaleData(x): 39 | """Scales the given data between 0 and 1. 40 | This is necessary, because t-sne will fail for too big numbers. 41 | """ 42 | x -= np.min(x) 43 | x /= np.max(x) 44 | return x 45 | 46 | def getData(emb_file): 47 | """Load the data file. 
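    Each line must hold a token, a tab, and the token's space-separated
    embedding values, e.g. house<TAB>0.12 -0.40 0.77.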
48 | 49 | Parameters 50 | ---------- 51 | emb_file : str 52 | name of the data file in which the first tab-separated column contains 53 | the title and the second column the values of an item 54 | 55 | Returns 56 | ------- 57 | list(str) 58 | item titles 59 | list(ndarray) 60 | item values 61 | """ 62 | titles = [] 63 | data = [] 64 | 65 | for l in file_line_generator(emb_file): 66 | token, emb = l.split(u'\t') 67 | titles.append(token) 68 | data.append(np.fromstring(emb, sep=u' ')) 69 | 70 | return titles, np.asarray(data) 71 | 72 | def main(argv=None): 73 | 74 | if argv is None: 75 | argv = sys.argv[1:] 76 | 77 | args = parser.parse_args(argv) 78 | log.info('start parameters: ' + str(args)) 79 | 80 | log.info('loading data') 81 | titles, x = getData(args.file) 82 | 83 | file_size1 = len(titles) 84 | 85 | if args.file2 is not None: 86 | titles2, x2 = getData(args.file2) 87 | titles.extend(titles2) 88 | x = np.vstack((x, x2)) 89 | 90 | # x = scaleData(x) 91 | 92 | log.info('performing t-SNE') 93 | out = tsne(x, no_dims=2, perplexity=30, initial_dims=100, use_pca=False) 94 | 95 | points = [('green', [(title, point[0], point[1]) 96 | for title, point in zip(titles[:file_size1], out[:file_size1, :])])] 97 | 98 | if args.file2 is not None: 99 | points.append(('gray', [(title, point[0], point[1]) 100 | for title, point in zip(titles[file_size1:], out[file_size1:, :])])) 101 | 102 | # pca = PCA(n_components=2) 103 | # out = pca.fit_transform(x) 104 | 105 | # mds = MDS() 106 | # out = mds.fit_transform(x) 107 | 108 | log.info('rendering result') 109 | render_points(points, 20, 20) 110 | 111 | if args.out: 112 | plt.savefig(args.out, dpi=600) 113 | else: 114 | plt.show() 115 | 116 | log.info('finished') 117 | 118 | if __name__ == "__main__": 119 | sys.exit(main()) 120 | -------------------------------------------------------------------------------- /src/log.best.scitail.txt: -------------------------------------------------------------------------------- 1 | ... 
training 2 | Epoch 1 iter 100 average cost: 0.50105274111 uses 0.274652850628 min 3 | current dev_acc: 0.774814814815 ; max_dev_acc: 0.774814814815 4 | current acc: 0.763255813953 ; max_acc: 0.763255813953 5 | Epoch 1 iter 200 average cost: 0.427455786616 uses 0.3303023537 min 6 | current dev_acc: 0.788888888889 ; max_dev_acc: 0.788888888889 7 | current acc: 0.782325581395 ; max_acc: 0.782325581395 8 | Epoch 1 iter 300 average cost: 0.38460314619 uses 0.324084401131 min 9 | current dev_acc: 0.795555555556 ; max_dev_acc: 0.795555555556 10 | current acc: 0.799534883721 ; max_acc: 0.799534883721 11 | Epoch 1 iter 400 average cost: 0.36287988646 uses 0.327256147067 min 12 | current dev_acc: 0.800740740741 ; max_dev_acc: 0.800740740741 13 | current acc: 0.801860465116 ; max_acc: 0.801860465116 14 | Epoch 1 uses 1.50898130337 min 15 | Epoch 2 iter 500 average cost: 0.335963069856 uses 0.33199198246 min 16 | Epoch 2 iter 600 average cost: 0.301583123356 uses 0.296971384684 min 17 | current dev_acc: 0.817777777778 ; max_dev_acc: 0.817777777778 18 | current acc: 0.811162790698 ; max_acc: 0.811162790698 19 | Epoch 2 iter 700 average cost: 0.27583842698 uses 0.333897197247 min 20 | current dev_acc: 0.819259259259 ; max_dev_acc: 0.819259259259 21 | current acc: 0.807441860465 ; max_acc: 0.811162790698 22 | Epoch 2 iter 800 average cost: 0.256100475551 uses 0.333747748534 min 23 | current dev_acc: 0.823703703704 ; max_dev_acc: 0.823703703704 24 | current acc: 0.820465116279 ; max_acc: 0.820465116279 25 | Epoch 2 iter 900 average cost: 0.240837877587 uses 0.336418318748 min 26 | Epoch 2 uses 1.52058656613 min 27 | Epoch 3 iter 1000 average cost: 0.224601259664 uses 0.300214282672 min 28 | Epoch 3 iter 1100 average cost: 0.209052862727 uses 0.297549700737 min 29 | Epoch 3 iter 1200 average cost: 0.195848352684 uses 0.296437966824 min 30 | Epoch 3 iter 1300 average cost: 0.184916155527 uses 0.29627721707 min 31 | -------------------------------------------------------------------------------- /src/model_para_0.820930232558: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/model_para_0.820930232558 -------------------------------------------------------------------------------- /src/preprocess_SciTail.py: -------------------------------------------------------------------------------- 1 | import random 2 | from random import randint 3 | 4 | def Scitail_2_ExtreamPosNeg(): 5 | root_file="/save/wenpeng/datasets/SciTailV1/tsv_format/scitail_1.0_train.tsv" 6 | writefilename = "/save/wenpeng/datasets/SciTailV1/tsv_format/scitail_1.0_train_2_ExtreamPosNeg.txt" 7 | # files=['scitail_1.0_train.tsv', 'scitail_1.0_dev.tsv', 'scitail_1.0_test.tsv'] 8 | 'we creat 10 neg, 10 pos for each sentence' 9 | readfile = open(root_file, 'r') 10 | writefile = open(writefilename, 'w') 11 | for line in readfile: 12 | parts = line.strip().split('\t') 13 | sent1 = parts[0] 14 | sent2 = parts[1] 15 | sent1_wordlist = sent1.split() 16 | sent2_wordlist = sent2.split() 17 | sent1_len = len(sent1_wordlist) 18 | sent2_len = len(sent2_wordlist) 19 | 'sent1 pos' 20 | sent1_pos_list = [] 21 | sent1_pos_list.append(sent1) 22 | 23 | # for i in range(9): 24 | # left = randint(0, sent1_len/2) 25 | # right = randint(left + 1, sent1_len) 26 | # sent1_pos_ins = sent1_wordlist[left:right] 27 | # sent1_pos_list.append(' '.join(sent1_pos_ins)) 28 | # assert len(sent1_pos_list) == 10 29 | 'sent1 neg' 30 | sent1_neg_list = [] 
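        # The ten negatives built below are derived from sent1 itself: one word-order
        # reversal, eight copies with 'not' inserted at a random position, and one random shuffle.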
31 | 32 | sent1_neg_list.append(' '.join(sent1_wordlist[::-1])) #reverse 33 | for i in range(8): 34 | insert_point = randint(0, sent1_len - 1) 35 | sent1_neg_list.append(' '.join(sent1_wordlist[:insert_point]+['not']+sent1_wordlist[insert_point:])) 36 | random.Random(100).shuffle(sent1_wordlist) 37 | sent1_neg_list.append(' '.join(sent1_wordlist)) #shuffle 38 | assert len(sent1_neg_list) == 10 39 | 'write sent1 into file' 40 | for sent in sent1_pos_list: 41 | writefile.write(sent1+'\t'+sent+'\tentails\n') 42 | for sent in sent1_neg_list: 43 | writefile.write(sent1+'\t'+sent+'\tneutral\n') 44 | 45 | 46 | 'sent2 pos' 47 | sent2_pos_list = [] 48 | sent2_pos_list.append(sent2) 49 | 50 | # for i in range(9): 51 | # left = randint(0, sent2_len/2) 52 | # right = randint(left + 1, sent2_len) 53 | # sent2_pos_ins = sent2_wordlist[left:right] 54 | # sent2_pos_list.append(' '.join(sent2_pos_ins)) 55 | # assert len(sent2_pos_list) == 10 56 | 'sent2 neg' 57 | sent2_neg_list = [] 58 | 59 | sent2_neg_list.append(' '.join(sent2_wordlist[::-1])) #reverse 60 | for i in range(8): 61 | insert_point = randint(0, sent2_len - 1) 62 | sent2_neg_list.append(' '.join(sent2_wordlist[:insert_point]+['not']+sent2_wordlist[insert_point:])) 63 | random.Random(100).shuffle(sent2_wordlist) 64 | sent2_neg_list.append(' '.join(sent2_wordlist)) #shuffle 65 | assert len(sent2_neg_list) == 10 66 | 'write sent2 into file' 67 | for sent in sent2_pos_list: 68 | writefile.write(sent2+'\t'+sent+'\tentails\n') 69 | for sent in sent2_neg_list: 70 | writefile.write(sent2+'\t'+sent+'\tneutral\n') 71 | readfile.close() 72 | writefile.close() 73 | print 'write over' 74 | 75 | 76 | if __name__ == '__main__': 77 | Scitail_2_ExtreamPosNeg() 78 | -------------------------------------------------------------------------------- /src/train_SciTail_DeIsTe_model.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | import gzip 3 | import os 4 | import sys 5 | sys.setrecursionlimit(6000) 6 | import time 7 | 8 | import numpy as np 9 | import theano 10 | import theano.tensor as T 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | import random 13 | 14 | from logistic_sgd import LogisticRegression 15 | from mlp import HiddenLayer 16 | from theano.tensor.signal import downsample 17 | from random import shuffle 18 | from scipy.stats import mode 19 | 20 | from load_data import load_SciTailV1_dataset,load_word2vec, load_word2vec_to_init, extend_word2vec_lowercase 21 | from common_functions import Conv_for_Pair,dropout_layer, store_model_to_file, elementwise_is_two,Conv_with_Mask_with_Gate, Conv_with_Mask, create_conv_para, L2norm_paraList, ABCNN, create_ensemble_para, cosine_matrix1_matrix2_rowwise, Diversify_Reg, Gradient_Cost_Para, GRU_Batch_Tensor_Input_with_Mask, create_LSTM_para 22 | 23 | 24 | def evaluate_lenet5(learning_rate=0.01, n_epochs=10, L2_weight=0.000001, extra_size=4, emb_size=300, posi_emb_size=50,batch_size=50, filter_size=[3,3], maxSentLen=50, hidden_size=300): 25 | 26 | model_options = locals().copy() 27 | print "model options", model_options 28 | 29 | seed=1234 30 | np.random.seed(seed) 31 | rng = np.random.RandomState(seed) #random seed, control the model generates the same results 32 | 33 | 34 | all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id =load_SciTailV1_dataset(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence 35 | # test_sents_l, test_masks_l, test_sents_r, test_masks_r, 
test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) 36 | 37 | train_sents_l=np.asarray(all_sentences_l[0], dtype='int32') 38 | dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') 39 | test_sents_l=np.asarray(all_sentences_l[2], dtype='int32') 40 | 41 | train_masks_l=np.asarray(all_masks_l[0], dtype=theano.config.floatX) 42 | dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) 43 | test_masks_l=np.asarray(all_masks_l[2], dtype=theano.config.floatX) 44 | 45 | train_sents_r=np.asarray(all_sentences_r[0], dtype='int32') 46 | dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') 47 | test_sents_r=np.asarray(all_sentences_r[2] , dtype='int32') 48 | 49 | train_masks_r=np.asarray(all_masks_r[0], dtype=theano.config.floatX) 50 | dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) 51 | test_masks_r=np.asarray(all_masks_r[2], dtype=theano.config.floatX) 52 | 53 | 54 | train_labels_store=np.asarray(all_labels[0], dtype='int32') 55 | dev_labels_store=np.asarray(all_labels[1], dtype='int32') 56 | test_labels_store=np.asarray(all_labels[2], dtype='int32') 57 | 58 | train_size=len(train_labels_store) 59 | dev_size=len(dev_labels_store) 60 | test_size=len(test_labels_store) 61 | print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size 62 | 63 | vocab_size=len(word2id)+1 64 | 65 | 66 | rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution 67 | #here, we leave code for loading word2vec to initialize words 68 | rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) 69 | id2word = {y:x for x,y in word2id.iteritems()} 70 | word2vec=load_word2vec() 71 | rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) 72 | init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable 73 | 74 | posi_rand_values=rng.normal(0.0, 0.01, (maxSentLen, posi_emb_size)) #generate a matrix by Gaussian distribution 75 | posi_embeddings=theano.shared(value=np.array(posi_rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable 76 | 77 | 78 | #now, start to build the input form of the model 79 | sents_ids_l=T.imatrix() 80 | sents_mask_l=T.fmatrix() 81 | sents_ids_r=T.imatrix() 82 | sents_mask_r=T.fmatrix() 83 | labels=T.ivector() 84 | ###################### 85 | # BUILD ACTUAL MODEL # 86 | ###################### 87 | print '... 
building the model' 88 | 89 | def embed_input(emb_matrix, sent_ids): 90 | return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) 91 | 92 | embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM 93 | embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) 94 | 95 | 96 | 97 | '''create_AttentiveConv_params ''' 98 | conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size[0])) 99 | conv_W_posi, conv_b_posi=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+posi_emb_size, filter_size[0])) 100 | conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 1)) 101 | 102 | NN_para=[conv_W, conv_b,conv_W_posi, conv_b_posi,conv_W_context] 103 | 104 | ''' 105 | attentive convolution function 106 | ''' 107 | 108 | attentive_conv_layer = Conv_for_Pair(rng, 109 | origin_input_tensor3=embed_input_l, 110 | origin_input_tensor3_r = embed_input_r, 111 | input_tensor3=embed_input_l, 112 | input_tensor3_r = embed_input_r, 113 | mask_matrix = sents_mask_l, 114 | mask_matrix_r = sents_mask_r, 115 | image_shape=(batch_size, 1, emb_size, maxSentLen), 116 | image_shape_r = (batch_size, 1, emb_size, maxSentLen), 117 | filter_shape=(hidden_size, 1, emb_size, filter_size[0]), 118 | filter_shape_context=(hidden_size, 1,emb_size, 1), 119 | W=conv_W, b=conv_b, 120 | W_posi=conv_W_posi, b_posi=conv_b_posi, 121 | W_context=conv_W_context, b_context=conv_b_context, 122 | posi_emb_matrix = posi_embeddings, 123 | posi_emb_size = posi_emb_size) 124 | attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l 125 | attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r 126 | 127 | sent_embeddings_l = attentive_conv_layer.maxpool_vec_l 128 | sent_embeddings_r = attentive_conv_layer.maxpool_vec_r 129 | 130 | "form input to LR classifier" 131 | LR_input = T.concatenate([sent_embeddings_l,sent_embeddings_r,sent_embeddings_l*sent_embeddings_r,attentive_sent_embeddings_l,attentive_sent_embeddings_r,attentive_sent_embeddings_l*attentive_sent_embeddings_r],axis=1) 132 | LR_input_size=6*hidden_size 133 | 134 | U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 135 | LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class 136 | LR_para=[U_a, LR_b] 137 | 138 | 139 | layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector 140 | loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
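    # LR_input concatenated above packs six vectors (hence 6*hidden_size): the plain
    # max-pooled encodings of the two sentences and their element-wise product, plus the
    # attentive max-pooled encodings and their element-wise product.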
141 | 142 | 143 | 144 | 145 | 146 | params = [init_embeddings,posi_embeddings]+NN_para+LR_para 147 | # L2_reg = (init_embeddings**2).sum()+(conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum() 148 | 149 | cost=loss#+L2_weight*L2_reg 150 | 151 | updates = Gradient_Cost_Para(cost,params, learning_rate) 152 | 153 | 154 | train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') 155 | dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') 156 | 157 | test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') 158 | 159 | ############### 160 | # TRAIN MODEL # 161 | ############### 162 | print '... training' 163 | # early-stopping parameters 164 | patience = 50000000000 # look as this many examples regardless 165 | start_time = time.time() 166 | mid_time = start_time 167 | past_time= mid_time 168 | epoch = 0 169 | done_looping = False 170 | 171 | n_train_batches=train_size/batch_size 172 | train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] 173 | n_dev_batches=dev_size/batch_size 174 | dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] 175 | n_test_batches=test_size/batch_size 176 | test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size] 177 | 178 | 179 | max_acc_dev=0.0 180 | max_acc_test=0.0 181 | max_f1=0.0 182 | 183 | cost_i=0.0 184 | train_indices = range(train_size) 185 | 186 | while epoch < n_epochs: 187 | epoch = epoch + 1 188 | 189 | random.Random(100).shuffle(train_indices) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed 190 | 191 | iter_accu=0 192 | 193 | for batch_id in train_batch_start: #for each batch 194 | iter = (epoch - 1) * n_train_batches + iter_accu +1 195 | iter_accu+=1 196 | train_id_batch = train_indices[batch_id:batch_id+batch_size] 197 | cost_i+= train_model( 198 | train_sents_l[train_id_batch], 199 | train_masks_l[train_id_batch], 200 | train_sents_r[train_id_batch], 201 | train_masks_r[train_id_batch], 202 | train_labels_store[train_id_batch]) 203 | 204 | #after each 1000 batches, we test the performance of the model on all test data 205 | if iter%100==0: 206 | print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' 207 | past_time = time.time() 208 | dev_error_sum=0.0 209 | for dev_batch_id in dev_batch_start: # for each test batch 210 | dev_error_i=dev_model( 211 | dev_sents_l[dev_batch_id:dev_batch_id+batch_size], 212 | dev_masks_l[dev_batch_id:dev_batch_id+batch_size], 213 | dev_sents_r[dev_batch_id:dev_batch_id+batch_size], 214 | dev_masks_r[dev_batch_id:dev_batch_id+batch_size], 215 | dev_labels_store[dev_batch_id:dev_batch_id+batch_size]) 216 | 217 | dev_error_sum+=dev_error_i 218 | dev_acc=1.0-dev_error_sum/(len(dev_batch_start)) 219 | 220 | 221 | if dev_acc > max_acc_dev: 222 | max_acc_dev=dev_acc 223 | print '\tcurrent dev_acc:', dev_acc,' ; ','\tmax_dev_acc:', max_acc_dev 224 | 225 | 226 | error_sum=0.0 227 | for idd, test_batch_id in enumerate(test_batch_start): # for each test batch 228 | error_i=test_model( 229 | test_sents_l[test_batch_id:test_batch_id+batch_size], 230 | 
test_masks_l[test_batch_id:test_batch_id+batch_size], 231 | test_sents_r[test_batch_id:test_batch_id+batch_size], 232 | test_masks_r[test_batch_id:test_batch_id+batch_size], 233 | test_labels_store[test_batch_id:test_batch_id+batch_size]) 234 | 235 | error_sum+=error_i 236 | test_acc=1.0-error_sum/(len(test_batch_start)) 237 | if test_acc > max_acc_test: 238 | max_acc_test=test_acc 239 | store_model_to_file('/home/wenpeng/workspace/SciTail/src/model_para_'+str(max_acc_test), params) 240 | print '\t\tcurrent acc:', test_acc,' ; ','\t\tmax_acc:', max_acc_test 241 | else: 242 | print '\tcurrent dev_acc:', dev_acc,' ; ','\tmax_dev_acc:', max_acc_dev 243 | 244 | 245 | print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' 246 | mid_time = time.time() 247 | 248 | #print 'Batch_size: ', update_freq 249 | end_time = time.time() 250 | 251 | print >> sys.stderr, ('The code for file ' + 252 | os.path.split(__file__)[1] + 253 | ' ran for %.2fm' % ((end_time - start_time) / 60.)) 254 | 255 | return max_acc_test 256 | 257 | 258 | 259 | if __name__ == '__main__': 260 | evaluate_lenet5() 261 | # lr_list=[0.005,0.01,0.02,0.03,0.001] 262 | # batch_list=[10,20,30,40,50,60,70,80,100] 263 | # maxlen_list=[20,25,30,35,40,45,50,55] 264 | # 265 | # best_acc=0.0 266 | # best_lr=0.01 267 | # for lr in lr_list: 268 | # acc_test= evaluate_lenet5(learning_rate=lr) 269 | # if acc_test>best_acc: 270 | # best_lr=lr 271 | # best_acc=acc_test 272 | # print '\t\t\t\tcurrent best_acc:', best_acc 273 | # best_batch=50 274 | # for batch in batch_list: 275 | # acc_test= evaluate_lenet5(learning_rate=best_lr, batch_size=batch) 276 | # if acc_test>best_acc: 277 | # best_batch=batch 278 | # best_acc=acc_test 279 | # print '\t\t\t\tcurrent best_acc:', best_acc 280 | # 281 | # best_maxlen=40 282 | # for maxlen in maxlen_list: 283 | # acc_test= evaluate_lenet5(learning_rate=best_lr, batch_size=best_batch, maxSentLen=maxlen) 284 | # if acc_test>best_acc: 285 | # best_maxlen=maxlen 286 | # best_acc=acc_test 287 | # print '\t\t\t\tcurrent best_acc:', best_acc 288 | # print 'Hyper tune finished, best test acc: ', best_acc, ' by lr: ', best_lr, ' batch: ', best_batch, ' maxlen: ', best_maxlen 289 | -------------------------------------------------------------------------------- /src/word2embeddings/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | -------------------------------------------------------------------------------- /src/word2embeddings/AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Sebastian Ebert 2 | -------------------------------------------------------------------------------- /src/word2embeddings/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | -------------------------------------------------------------------------------- /src/word2embeddings/README.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/word2embeddings/README.rst -------------------------------------------------------------------------------- /src/word2embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | from pkgutil import extend_path 2 | __path__ = extend_path(__path__, __name__) -------------------------------------------------------------------------------- 
/src/word2embeddings/apps/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | 4 | 5 | def use_theano_development_version(): 6 | """Prepare usage of development version of Theano. 7 | 8 | Alters the PYTHONPATH variable by removing the paths to installed Theano 9 | versions and adding my own development installation. 10 | CAUTION: this function must be called before importing any of my or Theano's 11 | libraries. 12 | """ 13 | print '\nold path:' 14 | print '\n'.join(sys.path) 15 | 16 | # List of possible Theano installation paths for different servers at CIS 17 | # and on my local machine. 18 | possible_paths = ['/usr/lib/python2.7/site-packages/Theano-0.6.0-py2.7.egg', 19 | 'C:\\Anaconda\\lib\\site-packages\\theano-current', 20 | '/usr/local/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg', 21 | '/usr/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg', 22 | '/usr/lib/python2.7/site-packages/Theano-0.6.0-py2.7.egg', #delta 23 | ] 24 | 25 | for p in possible_paths: 26 | 27 | try: 28 | sys.path.remove(p) 29 | print 'removed ', p 30 | except ValueError: 31 | pass 32 | 33 | sys.path.insert(0, '/mounts/Users/cisintern/ebert/data/promotion/src/theano/') 34 | sys.path.insert(0, 'Z:\\data\\promotion\\src\\theano\\') 35 | #sys.path.insert(0, '/mounts/Users/student/irina/Programs/Theano/Theano/') 36 | 37 | print 'new path:' 38 | print '\n'.join(sys.path) 39 | 40 | from theano import version 41 | 42 | print '\nnew Theano version:', version.full_version 43 | 44 | # sys.path.remove('/usr/lib/python2.7/site-packages') 45 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/analyze_lbl_distribution.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | """ 4 | from argparse import ArgumentParser 5 | from logging import getLogger 6 | import sys 7 | 8 | from cis.deep.utils import logger_config, embeddings, file_line_generator, \ 9 | utf8_file_open 10 | import numpy as np 11 | 12 | 13 | log = getLogger(__name__) 14 | logger_config(log) 15 | 16 | parser = ArgumentParser(description='Analyze the most likely tokens given a ' + 17 | 'context and their probabilities.') 18 | parser.add_argument('vocabulary', help='vocabulary') 19 | parser.add_argument('distributions', 20 | help='file containing the LBL predictions') 21 | parser.add_argument('contexts', 22 | help='file containing contexts') 23 | parser.add_argument('out_file', 24 | help='result file') 25 | 26 | def main(argv=None): 27 | 28 | if argv is None: 29 | argv = sys.argv[1:] 30 | 31 | args = parser.parse_args(argv) 32 | log.info('start parameters: ' + str(args)) 33 | 34 | log.info('loading data') 35 | vocab = embeddings.read_vocabulary_file(args.vocabulary, False) 36 | contexts = list(file_line_generator(args.contexts)) 37 | dists = np.loadtxt(args.distributions) 38 | 39 | log.info('computing results') 40 | # Add X in the n-grams' centers 41 | # Assume we have the same context size left and right. 42 | x_pos = len(contexts[0].split()) // 2 43 | contexts = [sp[:x_pos] + ['X'] + sp[x_pos:] 44 | for sp in [c.split() for c in contexts]] 45 | 46 | # Sorts all words for each context descending. 47 | sort_words_per_context_value = np.sort(dists, 1)[: , ::-1] 48 | sort_words_per_context_idx = np.argsort(dists, 1)[: , ::-1] 49 | 50 | # Sorts all contexts according to their probability assigned to "similar". 
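    # 465 is presumably the hard-coded vocabulary index of the token "similar" in this
    # particular vocabulary; dists[:, 465] then holds P("similar" | context) for every context row.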
51 | sort_context_for_similar_idx = np.argsort(dists[:, 465])[::-1] 52 | sort_context_for_similar_value = np.sort(dists[:, 465])[::-1] 53 | 54 | log.info('writing data data') 55 | 56 | with utf8_file_open(args.out_file, 'w') as likelihood_file: 57 | 58 | # Write results to a file 59 | for (i, idx) in enumerate(sort_context_for_similar_idx): 60 | likelihood_file.write(u' '.join(contexts[idx]) + u'\t' + 61 | unicode(sort_context_for_similar_value[i]) + u'\n') 62 | 63 | # 10 most likely words for the current context 64 | for j in xrange(10): 65 | likelihood_file.write(vocab[sort_words_per_context_idx[idx, j]] + 66 | u'\t' + unicode(sort_words_per_context_value[idx, j]) + 67 | u'\n') 68 | 69 | likelihood_file.write(u'\n') 70 | 71 | log.info('finished') 72 | 73 | if __name__ == '__main__': 74 | sys.exit(main()) 75 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/classify_imdb_docs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Example usage: 4 | X:\sa\corpora\imdb\txt_sentoken 5 | X:\sa\embeddings\vlbl\wikipedia_small-general-nce5-960\vLBL.vocab 6 | X:\sa\embeddings\vlbl\wikipedia_small-general-nce5-960\vLBL_960_14-03-30_23-07-11.embeddings_q 7 | . 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | import json 12 | from logging import getLogger 13 | import os 14 | import sys 15 | 16 | from scipy.io import mmread 17 | from sklearn.cross_validation import StratifiedKFold 18 | from sklearn.metrics.metrics import accuracy_score, confusion_matrix, \ 19 | precision_recall_fscore_support 20 | from sklearn.svm import LinearSVC 21 | 22 | from cis.deep.utils import logger_config, file_line_generator, utf8_file_open 23 | import numpy as np 24 | from cis.deep.utils.embeddings import read_vocabulary_id_file, \ 25 | compute_avg_text_embedding 26 | import itertools 27 | from cis.deep.utils.classification import calc_metrics 28 | 29 | 30 | NO_OF_FOLDS = 10 31 | 32 | log = getLogger(__name__) 33 | logger_config(log) 34 | 35 | parser = ArgumentParser( 36 | description='Perform a 10-fold cross validation on the polarity ' + 37 | 'dataset v2.0 of [PanLee04].') 38 | parser.add_argument('corpus_dir', 39 | help='location of the pos and neg directories of the dataset') 40 | parser.add_argument('vocabulary', 41 | help='Vocabulary file that contains list of tokens.') 42 | parser.add_argument('embeddings', 43 | help='File that contains the trained word embeddings') 44 | parser.add_argument('output_dir', 45 | help='directory to store the results in') 46 | 47 | def convert_doc(doc, vocab, embs): 48 | """Convert the given document into a document vector. 49 | 50 | Average all word vectors to a final document vector. 51 | 52 | Parameters 53 | ---------- 54 | doc : str 55 | filename of the document 56 | vocab : dict(str, int) 57 | id vocabulary 58 | embs : ndarray 59 | embeddings 60 | """ 61 | 62 | with utf8_file_open(doc) as f: 63 | s = f.read() 64 | return compute_avg_text_embedding(s, vocab, embs) 65 | 66 | def do_cross_validation(features, labels): 67 | """Perform the k-fold cross validation. 68 | 69 | Perform the k-fold cross validation, collect the result and return the 70 | single test instance predictions, as well as the classification results for 71 | each single fold and for the combination of all folds. 
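    Folds are not drawn at random; each document's fold is determined by the
    cv<digit> prefix of its file name (see imdb_cross_folds).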
72 | 73 | Keyword arguments: 74 | features -- all features 75 | labels -- all labels 76 | classifier -- code of the classifier to create (see command line arguments) 77 | """ 78 | single_predictions = [] # Store each single classification decision 79 | # Store the feature weights after the training 80 | weight_vectors = np.zeros((NO_OF_FOLDS, len(features.values()[0]))) 81 | # Store classification results for each fold and for the entire task (i.e., 82 | # entire cross validation). 83 | classification_result = np.zeros((NO_OF_FOLDS + 1, 5)) 84 | 85 | for cur_fold, (train_names, test_names) in enumerate(imdb_cross_folds(features.keys())): 86 | train_data = [features[n] for n in train_names] 87 | train_labels = [labels[n] for n in train_names] 88 | model = train_model(train_data, train_labels) 89 | 90 | test_data = [features[n] for n in test_names] 91 | pred_labels = model.predict(test_data) 92 | true_labels = [] 93 | 94 | for i in xrange(len(test_data)): 95 | single_predictions.append([cur_fold, test_names[i], 96 | labels[test_names[i]], pred_labels[i]]) 97 | true_labels.append(labels[test_names[i]]) 98 | 99 | classification_result[cur_fold, :] = get_classification_result(cur_fold, 100 | true_labels, pred_labels) 101 | 102 | weight_vectors[cur_fold, :] = model.coef_ 103 | 104 | return single_predictions, classification_result, weight_vectors 105 | 106 | def get_classification_result(fold_no, true_labels, pred_labels): 107 | """Return classification resuls for one fold. 108 | 109 | Return an array containing accuracy, precision, recall, and f1, based on the 110 | given true and predicted labels. 111 | 112 | Parameters 113 | ---------- 114 | fold_no : int 115 | this fold's number 116 | true_labels list(int) 117 | true labels 118 | pred_labels list(int) 119 | predicted labels 120 | 121 | Returns 122 | ------- 123 | ndarray 124 | [fold number, accuracy, precision, recall, f1] 125 | """ 126 | res = calc_metrics(true_labels, pred_labels) 127 | return np.asarray([fold_no] + [r for r in res]) 128 | 129 | def imdb_cross_folds(filenames): 130 | """Get the docs for training and testing to be used in a 10-fold x 131 | validation. 132 | 133 | Parameters 134 | ---------- 135 | filenames : list(str) 136 | filenames of imdb docs; they contain the fold number 137 | 138 | Returns 139 | ------- 140 | list(str) 141 | names of training documents 142 | list(str) 143 | names of test documents 144 | """ 145 | 146 | for i in xrange(10): 147 | test = filter(lambda f: f.startswith(u'cv' + unicode(i)), filenames) 148 | training = filter(lambda f: not f.startswith(u'cv' + unicode(i)), filenames) 149 | yield (training, test) 150 | 151 | raise StopIteration() 152 | 153 | def load_data(corpus_dir, vocab, embs): 154 | """Load feature data and labels. 155 | 156 | Loads the documents from the imdb corpus and converts them into one feature 157 | vector per document by averaging the word representations of the text. 
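    Documents found under pos/ are labelled 1 and documents under neg/ are labelled 0.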
158 | 159 | Parameters 160 | ---------- 161 | corpus_dir : str 162 | location of the dataset 163 | vocab : dict(str, int) 164 | id vocabulary 165 | embs : ndarray(m*n) 166 | word embeddings 167 | 168 | Returns 169 | ------- 170 | features : dict(str, ndarray) 171 | map from a document name its document representations, which is the 172 | averaged word vectors 173 | labels : dict(str, int) 174 | map from a document name its label 175 | """ 176 | pos_docs = os.listdir(os.path.join(corpus_dir, u'pos')) 177 | num_pos_docs = len(pos_docs) 178 | pos_docs = [os.path.join(corpus_dir, u'pos/', d) for d in pos_docs] 179 | neg_docs = os.listdir(os.path.join(corpus_dir, u'neg')) 180 | neg_docs = [os.path.join(corpus_dir, u'neg/', d) for d in neg_docs] 181 | docs = pos_docs + neg_docs 182 | features = dict() 183 | labels = dict() 184 | 185 | for (count, d) in enumerate(docs): 186 | basename = os.path.basename(d) 187 | features[basename] = convert_doc(d, vocab, embs) 188 | labels[basename] = 1 if count < num_pos_docs else 0 189 | 190 | return features, labels 191 | 192 | def train_model(features, labels): 193 | """Create, train, and return a model using the given features and labels. 194 | 195 | Parameters 196 | ---------- 197 | features : list(ndarray) 198 | features of training instances 199 | labels : list(int) 200 | labels of training instances 201 | """ 202 | model = LinearSVC() 203 | model.fit(features, labels) 204 | return model 205 | 206 | def main(argv=None): 207 | 208 | if argv is None: 209 | argv = sys.argv[1:] 210 | 211 | args = parser.parse_args(argv) 212 | log.info('start parameters: ' + str(args)) 213 | 214 | log.info('loading embeddings') 215 | vocab = read_vocabulary_id_file(args.vocabulary) 216 | embs = np.loadtxt(args.embeddings) 217 | 218 | log.info('loading documents') 219 | features, labels = load_data(args.corpus_dir, vocab, embs) 220 | 221 | log.info('performing cross validation') 222 | single_predictions, classification_result, weight_vectors = \ 223 | do_cross_validation(features, labels) 224 | 225 | log.info('storing results') 226 | np.savetxt(os.path.join(args.output_dir, 'svm-weights.csv'), 227 | weight_vectors, '%f', ';', '\n') 228 | 229 | with utf8_file_open(os.path.join(args.output_dir, 'predictions.csv'), 'w') \ 230 | as pred_file: 231 | pred_file.write(u'fold_no;doc;true_label;pred_label\n') 232 | 233 | for sp in single_predictions: 234 | pred_file.write(u';'.join(map(unicode, sp)) + u'\n') 235 | 236 | all_true_labels = [sp[2] for sp in single_predictions] 237 | all_pred_labels = [sp[3] for sp in single_predictions] 238 | confusion = confusion_matrix(all_true_labels, all_pred_labels) 239 | 240 | np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'), 241 | confusion, '%d', ';', '\n') 242 | 243 | classification_result[NO_OF_FOLDS, :] = get_classification_result(-1, 244 | all_true_labels, all_pred_labels) 245 | 246 | header = u'fold_no;accuracy;precision;recall;f1' 247 | np.savetxt(os.path.join(args.output_dir, 'metrics.csv'), 248 | classification_result, '%f', u';', u'\n', header=header) 249 | 250 | log.info(classification_result) 251 | log.info('finished') 252 | 253 | if __name__ == '__main__': 254 | sys.exit(main()) 255 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/create_embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! 
/usr/bin/env python 3 | """ 4 | """ 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import logging 8 | import sys 9 | 10 | # from word2embeddings.apps import use_theano_development_version 11 | # use_theano_development_version() 12 | 13 | from cis.deep.utils import logger_config 14 | from word2embeddings.nn.trainer import HingeSentimentMiniBatchTrainer, \ 15 | HingeSentiment2MiniBatchTrainer, HingeMiniBatchTrainer, \ 16 | SimpleVLblNceTrainer, SimpleVLblNceSentimentTrainer, \ 17 | VLblNceTrainer, VLblNceSentimentTrainer, VLblNceDistributionalTrainer, \ 18 | NlblNceTrainer, NvLblNceTrainer, SLmNceTrainer, LblNceTrainer 19 | from word2embeddings.tools.util import debug 20 | 21 | log = getLogger(__name__) 22 | logger_config(log) 23 | 24 | parser = ArgumentParser() 25 | parser.add_argument('train_file', 26 | help='Document for training that contains tokenized text') 27 | 28 | parser.add_argument('--hidden-layers', dest='hidden_layers', 29 | help='Width of each hidden layer, comma separated. E.g., ' + 30 | '"28,64,32". This option only has an effect for mlp models and ' + 31 | 'for slm, where only one hidden layer is allowed.') 32 | 33 | parser.add_argument('vocabulary', 34 | help='Vocabulary file that contains list of tokens.\nCaution: Add ' + 35 | 'the special tokens , , , in this exact order at ' + 36 | 'the first positions in the vocabulary.') 37 | 38 | 39 | parser.add_argument('--sentiment-vocabulary', dest='sent_vocab', 40 | help='Vocabulary file that contains sentiment words') 41 | 42 | parser.add_argument('--predict-vocabulary', dest='pred_vocab', 43 | help='Vocabulary that contains the items that should be considered ' + 44 | 'during perplexity computation.\n' + 45 | 'Caution: Make sure this includes .\n' + 46 | 'Caution2: If this vocabulary does not contain a word that is seen ' + 47 | 'in prediction this word is not considered during perplexity ' + 48 | 'calculation.') 49 | 50 | 51 | parser.add_argument('--unigram', dest='unigram', 52 | help='file containing the unigram count (the probabilities are ' + 53 | 'calculated automatically given the counts\n ' + 54 | 'Caution: Add the ' + 55 | 'special tokens , , , in this exact order at the ' + 56 | 'first positions in the vocabulary.') 57 | parser.add_argument('--noise-samples', dest='noise_samples', type=int, 58 | help='number of noise samples per data sample') 59 | parser.add_argument('--nce-seed', dest='nce_seed', type=int, default=2345, 60 | help='seed for the noise sample generation in NCE') 61 | 62 | 63 | parser.add_argument('--validation-file', dest='validation_file', nargs='+', 64 | help='Files for validation that contains tokenized text. Multiple ' + 65 | 'files are supported, with the first file being the main validation ' + 66 | 'file, i.e., if --dump-best is active, then the performance on the ' + 67 | 'first file is considered.\n ' + 68 | 'Note: For all LBL based models the validation cost will be ' + 69 | 'different even if you provide the same validation file twice, ' + 70 | 'because the NCE cost computation involves a randomized process.') 71 | 72 | parser.add_argument('--perplexity', action='store_true', 73 | help='instead of calculating the error on the validation set, ' + 74 | 'additionally calculate the perplexity. Caution: does only work ' + 75 | 'for vLBL models. 
Note: using ppl in validation is slower.') 76 | 77 | 78 | parser.add_argument('--disable-padding', dest='disable_padding', 79 | action='store_true', default=False, 80 | help='Disable padding sentences while generating examples') 81 | 82 | parser.add_argument('--learn-eos', dest='learn_eos', 83 | action='store_true', default=False, 84 | help='Learn word embedding for the end-of-sentence token .') 85 | 86 | 87 | parser.add_argument('--load-model', dest='load_model', 88 | help='Proceed training with the given model file.') 89 | 90 | parser.add_argument('--model-type', dest='model_type', 91 | choices=['ColWes08', 'sent_1', 'sent_2', 'vlbl', 'nvlbl', 92 | 'vlbl_sent', 'simple_vlbl', 'simple_vlbl_sent', 'vlbl_dist', 93 | 'lbl', 'nlbl', 'slm'], 94 | default='ColWes08', 95 | help='Type of the model to use for training. All sentiment models ' + 96 | 'require a sentiment vocabulary.') 97 | 98 | parser.add_argument('--activation-func', dest='activation_func', default='rect', 99 | choices=['sigmoid', 'tanh', 'rect', 'softsign'], 100 | help='Activation function to use in non-linear models.') 101 | 102 | 103 | parser.add_argument('--left-context', dest='left_context', type=int, 104 | default=2, 105 | help='Left context window to be used measured from the current token') 106 | 107 | parser.add_argument('--right-context', dest='right_context', type=int, 108 | default=2, 109 | help='Right context window measured from the current token') 110 | 111 | parser.add_argument('--word-embedding-size', dest='word_embedding_size', 112 | type=int, default=64) 113 | 114 | 115 | # Argument for MiniBatchTrainer 116 | parser.add_argument('--epochs-limit', dest='epochs_limit', type=int, default=-1, 117 | help='maximal number of epochs to train (-1 for no limit)') 118 | 119 | parser.add_argument('--examples-limit', dest='examples_limit', type=int, 120 | default=-1, 121 | help='maximal number of examples to train (-1 for no limit)') 122 | 123 | parser.add_argument('--early-stopping', dest='early_stopping', type=int, 124 | default=-1, 125 | help='Stop the training when N consecutive validations resulted in ' + \ 126 | 'worse results than the validation before. -1 to deactivate this ' + \ 127 | 'feature.') 128 | 129 | 130 | parser.add_argument('--batch-size', dest='batch_size', type=int, default=16) 131 | 132 | 133 | parser.add_argument('--learning-rate', dest='learning_rate', 134 | default=0.1, 135 | help='Learning rate. If this parameter is a float value than the ' + 136 | 'learning rate is valid for all model parameters. Otherwise, it can ' + 137 | 'contain parameter specific learning rates in using the pattern ' + 138 | '"param_name1:param_learning_rate1,param_name2:param_learning_rate2\.' + 139 | 'You can also specify a learning rate for only some of your ' + 140 | 'parameters and assign the default learning rate for all other ' + 141 | 'parameters by specifying "default:default_learning_rate".') 142 | 143 | parser.add_argument('--lr-adaptation', dest='lr_adaptation_method', 144 | choices=['constant', 'linear', 'adagrad', 'MniTeh12'], 145 | default='constant', 146 | help='Sets the method that is used to reduce the learning rate. 
' + 147 | 'Supports "linear" (linear reduction) and "adagrad" (AdaGrad ' + 148 | 'algorithm), and "constant" (no reduction), "MniTeh12" (halves the ' + 149 | 'learning rate whenever the validation perplexity (if "--perplexity" ' + 150 | 'is given) or error (otherwise) goes up; for details see [MniTeh12])') 151 | 152 | parser.add_argument('--learning-method', dest='learning_method', 153 | choices=['fan_in', 'global'], default='global', 154 | help='Determine the method that learning rate is calculated. Two ' + 155 | 'options are available: {fan_in, global}') 156 | 157 | 158 | parser.add_argument('--l1-weight', dest='l1_weight', type=float, default=0.0, 159 | help='Weight of L1 regularization term. 0 to deactivate. ' + 160 | 'Only implemented for LBL models and SLM.') 161 | parser.add_argument('--l2-weight', dest='l2_weight', type=float, default=0.0, 162 | help='Weight of L2 regularization term. 0 to deactivate. ' + 163 | 'Only implemented for LBL models and SLM.') 164 | 165 | parser.add_argument('--dump-period', dest='dump_period', type=int, default=-1, 166 | help='A model will be dumped every x seconds/examples (-1 = no ' + 167 | 'dumping. Only the final model will be dumped.)') 168 | 169 | parser.add_argument('--load-params', dest='load_params', nargs=2, 170 | help='Load initial values from files. This parameter requires two ' + 171 | 'arguments: (i) and (ii) a comma separated list of ' + 172 | 'parameter names as specified by the individual model. Each parameter' + 173 | 'must be stored in csv file format in an own file. The single ' + 174 | 'parameter files are then expected to be named ' + 175 | '..\n ' + 176 | 'Example usage: ~/my_model "C,R" will load ~/my_model.C and ' + 177 | '~/my_model.R.\n ' + 178 | 'Gzip and bz2 files are supported.') 179 | 180 | parser.add_argument('--store-params', dest='store_params', 181 | help='Comma-separated list of parameter names that will be stored ' + 182 | 'each time the model is stored. The parameter names as specified by ' + 183 | 'the individual model. Each parameter is stored in a separate file, ' + 184 | 'e.g., paramter C is stored in .params.C.') 185 | 186 | parser.add_argument('--out-dir', dest='out_dir', default='.', 187 | help='directory where to store the output files') 188 | 189 | parser.add_argument('--dump-vocabulary', dest='dump_vocabulary', 190 | action='store_true', 191 | help='Dump the vocabulary after importing it to remove duplicates.') 192 | 193 | parser.add_argument('--dump-embeddings', dest='dump_embeddings', 194 | action='store_true', 195 | help='Dump the embeddings for every dumped model. Caution: might ' + 196 | 'be a big file.\n ' + 197 | 'Caution: This parameter is deprecated. It\'s not supported by the ' + 198 | 'new vLBL models. Use --store-params instead.') 199 | 200 | parser.add_argument('--validation-period', dest='validation_period', 201 | type=float, default=-1, 202 | help='A model will be evaluated every y seconds/examples. (-1 ' + 203 | 'for never). If a development file is given, the scores on the ' + 204 | 'training data and the validation data is computed, otherwise only ' + 205 | 'the former is computed.') 206 | 207 | parser.add_argument('--period-type', dest='period_type', default='examples', 208 | choices=['time', 'examples'], 209 | help='Set the period to be in seconds or number of examples ' + 210 | 'by setting the option to time or examples.') 211 | 212 | parser.add_argument('--dump-best', dest='dump_best', action='store_true', 213 | help='Save the best model every validation period. 
What "best" ' + \ 214 | 'means depends on the type of model. If "--perplexity" is given, ' + \ 215 | 'it\'s the model with the lowest perplexity. If not, it\'s the ' + \ 216 | 'model with the lowest training error.') 217 | 218 | parser.add_argument('--dump-each-epoch', dest='dump_each_epoch', 219 | action='store_true', help='Dump the model after each epoch') 220 | 221 | parser.add_argument('--dump-initial-model', dest='dump_initial_model', 222 | action='store_true', 223 | help='Dump the initial model before any training is done.') 224 | 225 | 226 | parser.add_argument('--error-function', dest='error_func', 227 | default='least_squares', choices=['cross_entropy', 'least_squares'], 228 | help='defines the used error function (default: least_squares); ' + 229 | 'This parameter is only valid for MLPs.') 230 | 231 | parser.add_argument('--count-examples', dest='count_examples', 232 | action='store_true', 233 | help='Only count the examples in the training file, don\'t train a ' + 234 | 'model.') 235 | 236 | 237 | parser.add_argument('--debug-host', dest='debug_host', 238 | help='Allow remote debugging at the given host IP. Make sure you ' + 239 | 'follow the instructions at ' + 240 | 'http://pydev.org/manual_adv_remote_debugger.html. Especially, the ' + 241 | 'pydevd source must be in the PYTHONPATH and ' + 242 | 'PATHS_FROM_ECLIPSE_TO_PYTHON in pydevd_file_utils.py must be adapted.') 243 | 244 | def main(argv=None): 245 | log.info('started application') 246 | 247 | log.warning('This script is obsolete. It will not be updated anymore and ' + 248 | 'will be deleted in the future. Use train_model.py instead.') 249 | 250 | if argv is None: 251 | argv = sys.argv[1:] 252 | 253 | args = parser.parse_args(argv) 254 | 255 | check_args(args) 256 | 257 | log.info('start parameters: ' + str(args)) 258 | 259 | if args.debug_host: 260 | import pydevd 261 | pydevd.settrace(host=args.debug_host, stdoutToServer=True, 262 | stderrToServer=True) 263 | 264 | if log.level == logging.DEBUG: 265 | sys.excepthook = debug 266 | 267 | log.info('creating trainer') 268 | 269 | if args.model_type == 'ColWes08': 270 | log.info('Using ColWes08 trainer') 271 | trainer = HingeMiniBatchTrainer() 272 | elif args.model_type == 'sent_1': 273 | log.info('Using sent_1 trainer') 274 | trainer = HingeSentimentMiniBatchTrainer() 275 | elif args.model_type == 'sent_2': 276 | log.info('Using sent_2 trainer') 277 | trainer = HingeSentiment2MiniBatchTrainer() 278 | elif args.model_type == 'simple_vlbl': 279 | log.info('Using simple LBL trainer that uses noise-contrastive estimation') 280 | trainer = SimpleVLblNceTrainer() 281 | elif args.model_type == 'simple_vlbl_sent': 282 | log.info('Using simple LBL trainer that uses noise-contrastive estimation to create sentiment embeddings') 283 | trainer = SimpleVLblNceSentimentTrainer() 284 | elif args.model_type == 'vlbl': 285 | log.info('Using LBL trainer that uses noise-contrastive estimation') 286 | trainer = VLblNceTrainer() 287 | elif args.model_type == 'vlbl_sent': 288 | log.info('Using LBL trainer that uses noise-contrastive estimation to create sentiment embeddings') 289 | trainer = VLblNceSentimentTrainer() 290 | elif args.model_type == 'nvlbl': 291 | log.info('Using non-linear vLBL NCE trainer') 292 | trainer = NvLblNceTrainer() 293 | elif args.model_type == 'lbl': 294 | log.info('Using linear LBL trainer that uses noise-contrastive estimation') 295 | trainer = LblNceTrainer() 296 | elif args.model_type == 'nlbl': 297 | log.info('Using non-linear LBL trainer that uses noise-contrastive 
estimation') 298 | trainer = NlblNceTrainer() 299 | elif args.model_type == 'vlbl_dist': 300 | log.info('Using LBL trainer that uses distributional representation of input') 301 | trainer = VLblNceDistributionalTrainer() 302 | elif args.model_type == 'slm': 303 | log.info('Using shallow neural network lm with NCE') 304 | trainer = SLmNceTrainer() 305 | else: 306 | raise ValueError('Unknown model type. Abort') 307 | 308 | if args.count_examples is True: 309 | log.info('counting examples') 310 | trainer.configure(args) 311 | count = trainer.count_examples(args.train_file) 312 | log.info('examples: %d' % count) 313 | else: 314 | trainer.prepare_usage(args) 315 | log.info('training is about to begin') 316 | trainer.run() 317 | 318 | log.info('finished') 319 | 320 | def check_args(args): 321 | 322 | 323 | 324 | # if args.epochs_limit == -1 and args.examples_limit == -1: 325 | # raise ValueError('Either epochs-limit or examples-limit must be given.') 326 | pass 327 | 328 | if __name__ == '__main__': 329 | sys.exit(main()) 330 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/extract_model_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains an application that extracts the vocabulary and embeddings 4 | from a given model file. 5 | """ 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import sys 9 | 10 | from cis.deep.utils import logger_config, load_object_from_file 11 | 12 | 13 | log = getLogger(__name__) 14 | logger_config(log) 15 | 16 | parser = ArgumentParser(description='extract parameters from a given model ' + 17 | 'file') 18 | parser.add_argument('model_file', help='model file') 19 | parser.add_argument('store_params', nargs=2, 20 | help='The first parameter is the filename, the second is a ' + 21 | 'comma-separated list of parameter names. For more information see ' + 22 | 'the documentation of the --load-params parameter in ' + 23 | 'create_embeddings.py.') 24 | 25 | parser.add_argument('-f', '--format', default='txt', choices=['txt', 'npy'], 26 | help='Format of the output files. txt = space separated csv format; ' + 27 | 'npy = binary numpy format') 28 | 29 | def main(argv=None): 30 | 31 | if argv is None: 32 | argv = sys.argv[1:] 33 | 34 | args = parser.parse_args(argv) 35 | log.info('start parameters: ' + str(args)) 36 | 37 | log.info('loading data') 38 | model = load_object_from_file(args.model_file) 39 | 40 | log.info('writing data') 41 | # trainer.dump_vocabulary(args.vocabulary_file) 42 | model.store_params(args.store_params[0], args.store_params[1], True, 43 | args.format) 44 | log.info('finished') 45 | 46 | if __name__ == '__main__': 47 | sys.exit(main()) 48 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/extract_words_with_we.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains an application that extracts the input and output embeddings 4 | from a given vlbl or vlbl_dist model file. 
5 | """ 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import sys 9 | import numpy as np 10 | 11 | from cis.deep.utils import logger_config, load_object_from_file, utf8_file_open, sort_dict_by_label 12 | from cis.deep.utils.embeddings import read_vocabulary_id_file 13 | 14 | log = getLogger(__name__) 15 | logger_config(log) 16 | 17 | parser = ArgumentParser(description='Extract input and output word embeddings ' + 18 | 'from a given vLBL or distributional vLBL model file. ' + 19 | 'Output format is "word space embedding". ' + 20 | 'In case of vLBL model, input embeddings are represented with R matrix; ' + 21 | 'output embeddings are represented with Q matrix. ' + 22 | 'In case of vLBL distributional model, input embeddings are ' + 23 | 'represented with D*R matrix, output embeddings are represented with ' + 24 | 'Q matrix.') 25 | parser.add_argument('model_file', help='vlbl or vlbl_dist model file') 26 | parser.add_argument('--model-type', dest='model_type', 27 | choices=['vlbl', 'vlbl_dist'], 28 | default='vlbl', 29 | help='Type of the model to use for embeddings extraction.') 30 | parser.add_argument("vocabulary", 31 | help="Vocabulary file that contains list of tokens.") 32 | parser.add_argument("result_file", 33 | help="Document to which the predictions will be written.") 34 | 35 | def main(argv=None): 36 | 37 | if argv is None: 38 | argv = sys.argv[1:] 39 | 40 | args = parser.parse_args(argv) 41 | log.info('start parameters: ' + str(args)) 42 | 43 | log.info('loading data') 44 | model = load_object_from_file(args.model_file) 45 | 46 | # read vocabulary from file 47 | vocab = sort_dict_by_label(read_vocabulary_id_file(args.vocabulary)) 48 | 49 | # get matrices from model 50 | r_matrix = model.R.get_value() 51 | q_matrix = model.Q.get_value() 52 | 53 | # get input embeddings 54 | if args.model_type == 'vlbl': 55 | in_we = r_matrix 56 | elif args.model_type == 'vlbl_dist': 57 | # this will not work with the old versions of models - because of sparsity 58 | d_matrix = model.D.get_value().todense() 59 | in_we = np.dot(d_matrix, r_matrix) 60 | # need to convert from numpy.matrix to numpy.ndarray 61 | in_we = in_we.view(type=np.ndarray) 62 | 63 | with utf8_file_open(args.result_file + ".in", 'w') as outfile: 64 | for (word, ind) in vocab: 65 | outfile.write(unicode(word) + u' ' + u' '.join(map(str, in_we[ind])) + u'\n') 66 | 67 | with utf8_file_open(args.result_file + ".out", 'w') as outfile: 68 | for (word, ind) in vocab: 69 | outfile.write(unicode(word) + u' ' + u' '.join(map(str, q_matrix[ind])) + u'\n') 70 | 71 | log.info('finished') 72 | 73 | if __name__ == "__main__": 74 | sys.exit(main()) 75 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/prepare_brown_file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | example usage: 4 | -r 5 | 18 6 | X:\sa\dictionary\Brown_clusters\brown-mod 7 | X:\sa\dictionary\Brown_clusters\brown-mod-fixed_length_right 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | from logging import getLogger 12 | import sys 13 | 14 | from cis.deep.utils import utf8_file_open, file_line_generator, logger_config 15 | from word2embeddings.tools.util import prepare_brown_signature 16 | 17 | 18 | log = getLogger(__name__) 19 | logger_config(log) 20 | 21 | parser = ArgumentParser(description='Prepare a given file that contains ' + 22 | 'Brown clustering signatures for words. 
Convert the variable length ' + 23 | 'signatures into fixed length signatures.') 24 | parser.add_argument('max_size', help='size of the fixed signatures', type=int) 25 | parser.add_argument('infile', help='input file with variable size signatures') 26 | parser.add_argument('outfile', help='output file with fixed size signatures') 27 | parser.add_argument('-r', '--right', default=False, 28 | action='store_true', 29 | help='pad the signatures to the right instead of to the left') 30 | 31 | def main(argv=None): 32 | 33 | if argv is None: 34 | argv = sys.argv[1:] 35 | 36 | args = parser.parse_args(argv) 37 | log.info('start parameters: ' + str(args)) 38 | 39 | log.info('transforming data') 40 | 41 | with utf8_file_open(args.outfile, 'w') as outfile: 42 | for line in file_line_generator(args.infile): 43 | token, signature = line.split(u'\t') 44 | outfile.write(u'%s\t%s\n' % (token, prepare_brown_signature( 45 | signature, args.max_size, args.right))) 46 | 47 | log.info('finished') 48 | 49 | if __name__ == '__main__': 50 | sys.exit(main()) 51 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/test_mlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! /usr/bin/env python 3 | """ 4 | Example usage: 5 | X:\sa\experiments\contextual_polarity\mlp\easy_test\1-1-features-predict 6 | X:\sa\experiments\contextual_polarity\mlp\easy_test\1-1-features-predict-out 7 | MultiLayerPerceptron_1_13-12-05_18-20-37.model 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | from logging import getLogger 12 | import logging 13 | import sys 14 | 15 | 16 | # CAUTION: remove the Theano path before importing any of my or Theano's 17 | # libraries 18 | 19 | # print '\n' 20 | # local 21 | # if 'C:\\Anaconda\\lib\\site-packages\\theano_test-current' in sys.path: 22 | # sys.path.remove('C:\\Anaconda\\lib\\site-packages\\theano_test-current') 23 | # print 'removed old theano_test path' 24 | # # Calculus 25 | # if '/usr/local/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg' in sys.path: 26 | # sys.path.remove('/usr/local/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg') 27 | # print 'removed old theano_test path' 28 | # Omega 29 | # if '/usr/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg' in sys.path: 30 | # sys.path.remove('/usr/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg') 31 | # print 'removed old theano_test path' 32 | # sys.path.insert(0, '/mounts/Users/cisintern/ebert/data/promotion/src/deep/src/word2embeddings/main/resources/theano_develop') 33 | # print '\n'.join(sys.path) 34 | # print '\n' 35 | # sys.path.remove('/usr/lib/python2.7/site-packages') 36 | # exit() 37 | 38 | from cis.deep.utils import utf8_file_open, logger_config 39 | from word2embeddings.nn.predictor import MlpPredictor 40 | from word2embeddings.tools.util import debug 41 | 42 | log = getLogger(__name__) 43 | logger_config(log) 44 | 45 | from theano import version 46 | print version.full_version 47 | 48 | parser = ArgumentParser() 49 | parser.add_argument('--disable-padding', dest='disable_padding', 50 | action='store_true', default=False, 51 | help='Disable padding sentences while generating examples') 52 | parser.add_argument('--binary', 53 | action='store_true', 54 | help='Predict binary values, i.e., round output values to {0, 1}') 55 | 56 | parser.add_argument('predict_file', 57 | help='Document with examples to predict the label of.') 58 | parser.add_argument('result_file', 59 | 
help='Document to which the predictions will be written.') 60 | parser.add_argument('load_model', 61 | help='Proceed training with the given model file.') 62 | 63 | # Argument for MiniBatchTrainer 64 | # parser.add_argument('--batch-size', dest='batch_size', type=int, default=16) 65 | 66 | 67 | def main(argv=None): 68 | log.info('started application') 69 | 70 | if argv is None: 71 | argv = sys.argv[1:] 72 | 73 | args = parser.parse_args() 74 | log.info('start parameters: ' + str(args)) 75 | 76 | if log.level == logging.DEBUG: 77 | sys.excepthook = debug 78 | 79 | log.info('creating predictor') 80 | predictor = MlpPredictor() 81 | predictor.prepare_usage(args) 82 | log.info('starting prediction') 83 | predictions = predictor.run() 84 | 85 | log.info('storing results') 86 | with utf8_file_open(args.result_file, 'w') as outfile: 87 | 88 | for p in predictions: 89 | 90 | if args.binary: 91 | outfile.write(unicode((p > 0.5).astype(int)) + u'\n') 92 | else: 93 | outfile.write(unicode(p) + u'\n') 94 | 95 | log.info('finished') 96 | 97 | if __name__ == '__main__': 98 | sys.exit(main()) 99 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/train_mlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! /usr/bin/env python 3 | """ 4 | Example usage: 5 | --dev-file X:\sa\experiments\contextual_polarity\mlp\easy_test\1-1-features-test 6 | --epochs-limit 10 7 | --batch-size 2 8 | --examples-limit 4 9 | --dump-period -1 10 | --validation-period 1000 11 | --error-function "cross_entropy" 12 | X:\sa\experiments\contextual_polarity\mlp\easy_test\1-1-features 13 | 1 14 | 1 15 | "1" 16 | """ 17 | 18 | from argparse import ArgumentParser 19 | from logging import getLogger 20 | import logging 21 | import sys 22 | 23 | from cis.deep.utils import logger_config 24 | from word2embeddings.nn.trainer import MlpTrainer 25 | from word2embeddings.tools.util import debug 26 | 27 | 28 | # import cProfile 29 | # CAUTION: remove the Theano path before importing any of my or Theanos 30 | # libraries 31 | # print '\n' 32 | if 'C:\\Anaconda\\lib\\site-packages\\theano-current' in sys.path: 33 | sys.path.remove('C:\\Anaconda\\lib\\site-packages\\theano-current') 34 | print 'removed old theano path' 35 | # Calculus 36 | if '/usr/local/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg' in sys.path: 37 | sys.path.remove('/usr/local/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg') 38 | print 'removed old theano path' 39 | # Omega 40 | # if '/usr/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg' in sys.path: 41 | # sys.path.remove('/usr/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg') 42 | # print 'removed old theano path' 43 | # print '\n'.join(sys.path) 44 | # print '\n' 45 | # exit() 46 | 47 | 48 | log = getLogger(__name__) 49 | logger_config(log) 50 | 51 | parser = ArgumentParser() 52 | parser.add_argument('train_file', 53 | help='Document for training that contains tokenized text') 54 | parser.add_argument('input_size', type=int, help='size of the input') 55 | parser.add_argument('output_size', type=int, help='size of the output') 56 | parser.add_argument('hidden_layers', default='32', 57 | help='Width of each hidden layer, comma separated. E.g., "128,64,32"') 58 | 59 | parser.add_argument('--dev-file', dest='dev_file', 60 | help='Document for dev that contains tokenized text. 
If no file ' + 61 | 'is given validation will only be performed on the training data.') 62 | 63 | parser.add_argument('--disable-padding', dest='disable_padding', 64 | action='store_true', default=False, 65 | help='Disable padding sentences while generating examples') 66 | 67 | parser.add_argument('--load-model', dest='load_model', 68 | help='Proceed training with the given model file.') 69 | 70 | # Argument for MiniBatchTrainer 71 | parser.add_argument('--epochs-limit', dest='epochs_limit', 72 | type=int, default=1) 73 | parser.add_argument('--batch-size', dest='batch_size', type=int, default=16) 74 | parser.add_argument('--learning-rate', dest='learning_rate', 75 | type=float, default=0.1) 76 | parser.add_argument('--decay-learning', dest='decay_learning', 77 | choices=['linear'], default='', help='Supports "linear" decay for now.') 78 | parser.add_argument('--learning-method', dest='learning_method', 79 | choices=['fan_in', 'global'], default='global', 80 | help='Determine the method that learning rate is calculated. Two ' + 81 | 'options are available: {fan_in, global}') 82 | parser.add_argument('--dump-period', dest='dump_period', type=int, 83 | default=1800, 84 | help='A model will be dumped every x seconds (-1 for never, i.e., ' + 85 | 'only the final and the best model after training will be dumped.)') 86 | parser.add_argument('--validation-period', dest='validation_period', 87 | type=float, default=5e5, 88 | help='A model will be evaluated every y seconds/examples. (-1 ' + 89 | 'for never). If a development file is given, the scores on the ' + 90 | 'training data and the validation data is computed, otherwise only ' + 91 | 'the former is computed.') 92 | parser.add_argument('--period-type', dest='period_type', default='examples', 93 | choices=['time', 'examples'], 94 | help='Set the period to be in seconds or number of examples ' + 95 | 'by setting the option to time or examples.') 96 | parser.add_argument('--save-best', dest='save_best', action='store_true', 97 | help='Save the best model every validation period.') 98 | parser.add_argument('--dump-each-epoch', dest='dump_each_epoch', 99 | action='store_true', help='Dump the model after each epoch') 100 | parser.add_argument('--examples-limit', dest='examples_limit', type=float, 101 | help='Size of example to be used', default=1e9) 102 | parser.add_argument('--error-function', dest='error_func', 103 | default='least_squares', choices=['cross_entropy', 'least_squares'], 104 | help='defines the used error function (default: least_squares)') 105 | 106 | def main(argv=None): 107 | log.info('started application') 108 | 109 | if argv is None: 110 | argv = sys.argv[1:] 111 | 112 | args = parser.parse_args() 113 | log.info('start parameters: ' + str(args)) 114 | 115 | if log.level == logging.DEBUG: 116 | sys.excepthook = debug 117 | 118 | log.info('creating trainer') 119 | trainer = MlpTrainer() 120 | trainer.prepare_usage(args) 121 | log.info('starting training') 122 | trainer.run() 123 | log.info('finished') 124 | 125 | if __name__ == '__main__': 126 | # cProfile.run('main()') 127 | 128 | sys.exit(main()) 129 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/train_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! 
/usr/bin/env python 3 | """ 4 | """ 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import logging 8 | import sys 9 | 10 | #from word2embeddings.apps import use_theano_development_version 11 | #use_theano_development_version() 12 | 13 | from cis.deep.utils import logger_config 14 | from word2embeddings.nn.trainer import HingeSentimentMiniBatchTrainer, \ 15 | HingeSentiment2MiniBatchTrainer, HingeMiniBatchTrainer, \ 16 | SimpleVLblNceTrainer, SimpleVLblNceSentimentTrainer, \ 17 | VLblNceTrainer, VLblNceSentimentTrainer, VLblNceDistributionalTrainer,\ 18 | NlblNceTrainer, NvLblNceTrainer, SLmNceTrainer, LblNceTrainer 19 | from word2embeddings.tools.util import debug 20 | 21 | log = getLogger(__name__) 22 | logger_config(log) 23 | 24 | parser = ArgumentParser() 25 | parser.add_argument('train_file', 26 | help='Document for training that contains tokenized text') 27 | 28 | parser.add_argument('--hidden-layers', dest='hidden_layers', 29 | help='Width of each hidden layer, comma separated. E.g., ' + 30 | '"28,64,32". This option only has an effect for mlp models and ' + 31 | 'for slm, where only one hidden layer is allowed.') 32 | 33 | parser.add_argument('vocabulary', 34 | help='Vocabulary file that contains list of tokens.\nCaution: Add ' + 35 | 'the special tokens , , , in this exact order at ' + 36 | 'the first positions in the vocabulary.') 37 | 38 | 39 | parser.add_argument('--sentiment-vocabulary', dest='sent_vocab', 40 | help='Vocabulary file that contains sentiment words') 41 | 42 | parser.add_argument('--predict-vocabulary', dest='pred_vocab', 43 | help='Vocabulary that contains the items that should be considered ' + 44 | 'during perplexity computation.\n' + 45 | 'Caution: Make sure this includes .\n' + 46 | 'Caution2: If this vocabulary does not contain a word that is seen ' + 47 | 'in prediction this word is not considered during perplexity ' + 48 | 'calculation.') 49 | 50 | 51 | parser.add_argument('--unigram', dest='unigram', 52 | help='file containing the unigram count (the probabilities are ' + 53 | 'calculated automatically given the counts\n ' + 54 | 'Caution: Add the ' + 55 | 'special tokens , , , in this exact order at the ' + 56 | 'first positions in the vocabulary.') 57 | parser.add_argument('--noise-samples', dest='noise_samples', type=int, 58 | help='number of noise samples per data sample') 59 | parser.add_argument('--nce-seed', dest='nce_seed', type=int, default=2345, 60 | help='seed for the noise sample generation in NCE') 61 | 62 | 63 | parser.add_argument('--validation-file', dest='validation_file', nargs='+', 64 | help='Files for validation that contains tokenized text. Multiple ' + 65 | 'files are supported, with the first file being the main validation ' + 66 | 'file, i.e., if --dump-best is active, then the performance on the ' + 67 | 'first file is considered.\n ' + 68 | 'Note: For all LBL based models the validation cost will be ' + 69 | 'different even if you provide the same validation file twice, ' + 70 | 'because the NCE cost computation involves a randomized process.') 71 | 72 | parser.add_argument('--perplexity', action='store_true', 73 | help='instead of calculating the error on the validation set, ' + 74 | 'additionally calculate the perplexity. Caution: does only work ' + 75 | 'for vLBL models. 
Note: using ppl in validation is slower.') 76 | 77 | 78 | parser.add_argument('--disable-padding', dest='disable_padding', 79 | action='store_true', default=False, 80 | help='Disable padding sentences while generating examples') 81 | 82 | parser.add_argument('--learn-eos', dest='learn_eos', 83 | action='store_true', default=False, 84 | help='Learn word embedding for the end-of-sentence token .') 85 | 86 | 87 | parser.add_argument('--load-model', dest='load_model', 88 | help='Proceed training with the given model file.') 89 | 90 | parser.add_argument('--model-type', dest='model_type', 91 | choices=['ColWes08', 'sent_1', 'sent_2', 'vlbl', 'nvlbl', 92 | 'vlbl_sent', 'simple_vlbl', 'simple_vlbl_sent', 'vlbl_dist', 93 | 'lbl', 'nlbl', 'slm'], 94 | default='ColWes08', 95 | help='Type of the model to use for training. All sentiment models ' + 96 | 'require a sentiment vocabulary.') 97 | 98 | parser.add_argument('--activation-func', dest='activation_func', default='rect', 99 | choices=['sigmoid', 'tanh', 'rect', 'softsign'], 100 | help='Activation function to use in non-linear models.') 101 | 102 | 103 | parser.add_argument('--left-context', dest='left_context', type=int, 104 | default=2, 105 | help='Left context window to be used measured from the current token') 106 | 107 | parser.add_argument('--right-context', dest='right_context', type=int, 108 | default=2, 109 | help='Right context window measured from the current token') 110 | 111 | parser.add_argument('--word-embedding-size', dest='word_embedding_size', 112 | type=int, default=64) 113 | 114 | 115 | # Argument for MiniBatchTrainer 116 | parser.add_argument('--epochs-limit', dest='epochs_limit', type=int, default=-1, 117 | help='maximal number of epochs to train (-1 for no limit)') 118 | 119 | parser.add_argument('--examples-limit', dest='examples_limit', type=int, 120 | default=-1, 121 | help='maximal number of examples to train (-1 for no limit)') 122 | 123 | parser.add_argument('--early-stopping', dest='early_stopping', type=int, 124 | default=-1, 125 | help='Stop the training when N consecutive validations resulted in ' + \ 126 | 'worse results than the validation before. -1 to deactivate this ' + \ 127 | 'feature.') 128 | 129 | 130 | parser.add_argument('--batch-size', dest='batch_size', type=int, default=16) 131 | 132 | 133 | parser.add_argument('--learning-rate', dest='learning_rate', 134 | default=0.1, 135 | help='Learning rate. If this parameter is a float value than the ' + 136 | 'learning rate is valid for all model parameters. Otherwise, it can ' + 137 | 'contain parameter specific learning rates in using the pattern ' + 138 | '"param_name1:param_learning_rate1,param_name2:param_learning_rate2\.' + 139 | 'You can also specify a learning rate for only some of your ' + 140 | 'parameters and assign the default learning rate for all other ' + 141 | 'parameters by specifying "default:default_learning_rate".') 142 | 143 | parser.add_argument('--lr-adaptation', dest='lr_adaptation_method', 144 | choices=['constant', 'linear', 'adagrad', 'MniTeh12'], 145 | default='constant', 146 | help='Sets the method that is used to reduce the learning rate. 
' + 147 | 'Supports "linear" (linear reduction) and "adagrad" (AdaGrad ' + 148 | 'algorithm), and "constant" (no reduction), "MniTeh12" (halves the ' + 149 | 'learning rate whenever the validation perplexity (if "--perplexity" ' + 150 | 'is given) or error (otherwise) goes up; for details see [MniTeh12])') 151 | 152 | parser.add_argument('--learning-method', dest='learning_method', 153 | choices=['fan_in', 'global'], default='global', 154 | help='Determine the method that learning rate is calculated. Two ' + 155 | 'options are available: {fan_in, global}') 156 | 157 | 158 | parser.add_argument('--l1-weight', dest='l1_weight', type=float, default=0.0, 159 | help='Weight of L1 regularization term. 0 to deactivate. ' + 160 | 'Only implemented for LBL models and SLM.') 161 | parser.add_argument('--l2-weight', dest='l2_weight', type=float, default=0.0, 162 | help='Weight of L2 regularization term. 0 to deactivate. ' + 163 | 'Only implemented for LBL models and SLM.') 164 | 165 | parser.add_argument('--dump-period', dest='dump_period', type=int, default=-1, 166 | help='A model will be dumped every x seconds/examples (-1 = no ' + 167 | 'dumping. Only the final model will be dumped.)') 168 | 169 | parser.add_argument('--load-params', dest='load_params', nargs=2, 170 | help='Load initial values from files. This parameter requires two ' + 171 | 'arguments: (i) and (ii) a comma separated list of ' + 172 | 'parameter names as specified by the individual model. Each parameter' + 173 | 'must be stored in csv file format in an own file. The single ' + 174 | 'parameter files are then expected to be named ' + 175 | '..\n ' + 176 | 'Example usage: ~/my_model "C,R" will load ~/my_model.C and ' + 177 | '~/my_model.R.\n ' + 178 | 'Gzip and bz2 files are supported.') 179 | 180 | parser.add_argument('--store-params', dest='store_params', 181 | help='Comma-separated list of parameter names that will be stored ' + 182 | 'each time the model is stored. The parameter names as specified by ' + 183 | 'the individual model. Each parameter is stored in a separate file, ' + 184 | 'e.g., paramter C is stored in .params.C.') 185 | 186 | parser.add_argument('--out-dir', dest='out_dir', default='.', 187 | help='directory where to store the output files') 188 | 189 | parser.add_argument('--dump-vocabulary', dest='dump_vocabulary', 190 | action='store_true', 191 | help='Dump the vocabulary after importing it to remove duplicates.') 192 | 193 | parser.add_argument('--dump-embeddings', dest='dump_embeddings', 194 | action='store_true', 195 | help='Dump the embeddings for every dumped model. Caution: might ' + 196 | 'be a big file.\n ' + 197 | 'Caution: This parameter is deprecated. It\'s not supported by the ' + 198 | 'new vLBL models. Use --store-params instead.') 199 | 200 | parser.add_argument('--validation-period', dest='validation_period', 201 | type=float, default=-1, 202 | help='A model will be evaluated every y seconds/examples. (-1 ' + 203 | 'for never). If a development file is given, the scores on the ' + 204 | 'training data and the validation data is computed, otherwise only ' + 205 | 'the former is computed.') 206 | 207 | parser.add_argument('--period-type', dest='period_type', default='examples', 208 | choices=['time', 'examples'], 209 | help='Set the period to be in seconds or number of examples ' + 210 | 'by setting the option to time or examples.') 211 | 212 | parser.add_argument('--dump-best', dest='dump_best', action='store_true', 213 | help='Save the best model every validation period. 
What "best" ' + \ 214 | 'means depends on the type of model. If "--perplexity" is given, ' + \ 215 | 'it\'s the model with the lowest perplexity. If not, it\'s the ' + \ 216 | 'model with the lowest training error.') 217 | 218 | parser.add_argument('--dump-each-epoch', dest='dump_each_epoch', 219 | action='store_true', help='Dump the model after each epoch') 220 | 221 | parser.add_argument('--dump-initial-model', dest='dump_initial_model', 222 | action='store_true', 223 | help='Dump the initial model before any training is done.') 224 | 225 | 226 | parser.add_argument('--error-function', dest='error_func', 227 | default='least_squares', choices=['cross_entropy', 'least_squares'], 228 | help='defines the used error function (default: least_squares); ' + 229 | 'This parameter is only valid for MLPs.') 230 | 231 | parser.add_argument('--count-examples', dest='count_examples', 232 | action='store_true', 233 | help='Only count the examples in the training file, don\'t train a ' + 234 | 'model.') 235 | 236 | 237 | parser.add_argument('--debug-host', dest='debug_host', 238 | help='Allow remote debugging at the given host IP. Make sure you ' + 239 | 'follow the instructions at ' + 240 | 'http://pydev.org/manual_adv_remote_debugger.html. Especially, the ' + 241 | 'pydevd source must be in the PYTHONPATH and ' + 242 | 'PATHS_FROM_ECLIPSE_TO_PYTHON in pydevd_file_utils.py must be adapted.') 243 | 244 | def main(argv=None): 245 | log.info('started application') 246 | 247 | if argv is None: 248 | argv = sys.argv[1:] 249 | 250 | args = parser.parse_args(argv) 251 | 252 | check_args(args) 253 | 254 | log.info('start parameters: ' + str(args)) 255 | 256 | if args.debug_host: 257 | import pydevd 258 | pydevd.settrace(host=args.debug_host, stdoutToServer=True, 259 | stderrToServer=True) 260 | 261 | if log.level == logging.DEBUG: 262 | sys.excepthook = debug 263 | 264 | log.info('creating trainer') 265 | 266 | if args.model_type == 'ColWes08': 267 | log.info('Using ColWes08 trainer') 268 | trainer = HingeMiniBatchTrainer() 269 | elif args.model_type == 'sent_1': 270 | log.info('Using sent_1 trainer') 271 | trainer = HingeSentimentMiniBatchTrainer() 272 | elif args.model_type == 'sent_2': 273 | log.info('Using sent_2 trainer') 274 | trainer = HingeSentiment2MiniBatchTrainer() 275 | elif args.model_type == 'simple_vlbl': 276 | log.info('Using simple LBL trainer that uses noise-contrastive estimation') 277 | trainer = SimpleVLblNceTrainer() 278 | elif args.model_type == 'simple_vlbl_sent': 279 | log.info('Using simple LBL trainer that uses noise-contrastive estimation to create sentiment embeddings') 280 | trainer = SimpleVLblNceSentimentTrainer() 281 | elif args.model_type == 'vlbl': 282 | log.info('Using LBL trainer that uses noise-contrastive estimation') 283 | trainer = VLblNceTrainer() 284 | elif args.model_type == 'vlbl_sent': 285 | log.info('Using LBL trainer that uses noise-contrastive estimation to create sentiment embeddings') 286 | trainer = VLblNceSentimentTrainer() 287 | elif args.model_type == 'nvlbl': 288 | log.info('Using non-linear vLBL NCE trainer') 289 | trainer = NvLblNceTrainer() 290 | elif args.model_type == 'lbl': 291 | log.info('Using linear LBL trainer that uses noise-contrastive estimation') 292 | trainer = LblNceTrainer() 293 | elif args.model_type == 'nlbl': 294 | log.info('Using non-linear LBL trainer that uses noise-contrastive estimation') 295 | trainer = NlblNceTrainer() 296 | elif args.model_type == 'vlbl_dist': 297 | log.info('Using LBL trainer that uses distributional 
representation of input') 298 | trainer = VLblNceDistributionalTrainer() 299 | elif args.model_type == 'slm': 300 | log.info('Using shallow neural network lm with NCE') 301 | trainer = SLmNceTrainer() 302 | else: 303 | raise ValueError('Unknown model type. Abort') 304 | 305 | if args.count_examples is True: 306 | log.info('counting examples') 307 | trainer.configure(args) 308 | count = trainer.count_examples(args.train_file) 309 | log.info('examples: %d' % count) 310 | else: 311 | trainer.prepare_usage(args) 312 | log.info('training is about to begin') 313 | trainer.run() 314 | 315 | log.info('finished') 316 | 317 | def check_args(args): 318 | 319 | 320 | 321 | # if args.epochs_limit == -1 and args.examples_limit == -1: 322 | # raise ValueError('Either epochs-limit or examples-limit must be given.') 323 | pass 324 | 325 | if __name__ == '__main__': 326 | sys.exit(main()) 327 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/use_lm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! /usr/bin/env python 3 | """ 4 | """ 5 | 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import logging 9 | import sys 10 | 11 | from cis.deep.utils import logger_config 12 | from word2embeddings.nn.predictor import vLblNCEPredictor 13 | from word2embeddings.tools.util import debug 14 | 15 | 16 | log = getLogger(__name__) 17 | logger_config(log) 18 | 19 | parser = ArgumentParser() 20 | parser.add_argument('predict_file', 21 | help='Document with examples to predict the label of.') 22 | 23 | parser.add_argument('result_file', 24 | help='Document to which the predictions will be written.') 25 | 26 | parser.add_argument('vocabulary', 27 | help='Vocabulary file that contains list of tokens.') 28 | 29 | parser.add_argument('load_model', 30 | help='Proceed training with the given model file.') 31 | 32 | 33 | parser.add_argument('--predict-vocabulary', dest='pred_vocab', 34 | help='Vocabulary that contains the items that should be considered ' + 35 | 'during perplexity computation.\n' + 36 | 'Caution: Make sure this includes .\n' + 37 | 'Caution2: If this vocabulary does not contain a word that is seen ' + 38 | 'in prediction this word is not considered during perplexity ' + 39 | 'calculation.') 40 | 41 | parser.add_argument('--batch-size', dest='batch_size', type=int, default=100) 42 | 43 | 44 | parser.add_argument('-a', '--store_argmax', action='store_true', 45 | help='Store the most likely vocabulary item.') 46 | 47 | parser.add_argument('-r', '--store_rank', action='store_true', 48 | help='Store the rank of each vocabulary entry according to the ' + 49 | 'softmax.') 50 | 51 | parser.add_argument('-sm', '--store_softmax', action='store_true', 52 | help='Store the whole softmax distributions. Caution: The vocabulary ' + 53 | 'size can be very high. Therefore, the softmax output, which is a ' + 54 | 'distribution over all vocabulary items, might become very large, too.') 55 | 56 | parser.add_argument('-nr', '--normalize_with_root', action='store_true', 57 | help='Compute the root of the sm distribution and normalize the ' + 58 | 'vectors to unit length. This only has an effect when -sm is given.') 59 | 60 | parser.add_argument('-ppl', '--perplexity', action='store_true', 61 | help='Instead of calculating only the other model outputs, e.g., ' + 62 | 'softmax, etc., also compute the perplexity on the given text. 
' + 63 | 'If this parameter is given, the predict_file parameter must point ' + 64 | 'to a text file that is iterate over just as in the training, i.e., ' + 65 | 'using a window approach. That means, it does not handle single ' + 66 | 'contexts per line anymore. Caution: does only work for vLBL models. ' + 67 | 'Note: using ppl in validation is slower.') 68 | 69 | parser.add_argument('-save-word', '--save_word', action='store_true', 70 | help='Works only with -ppl parameter. Store next to probability word ' + 71 | 'from prediction vocabulary. Used during post-processing for the '+ 72 | 'right interpolation.') 73 | 74 | parser.add_argument('-pr', '--predictions', action='store_true', 75 | help='Store predicted embeddings for each context.') 76 | 77 | parser.add_argument('-i', '--information', action='store_true', 78 | help='Store additional information for every prediction, e.g., (k ' + 79 | 'nearest neighboring words).') 80 | 81 | parser.add_argument('--debug-host', dest='debug_host', 82 | help='Allow remote debugging at the given host IP. Make sure you ' + 83 | 'follow the instructions at ' + 84 | 'http://pydev.org/manual_adv_remote_debugger.html. Especially, the ' + 85 | 'pydevd source must be in the PYTHONPATH and ' + 86 | 'PATHS_FROM_ECLIPSE_TO_PYTHON in pydevd_file_utils.py must be adapted.') 87 | 88 | 89 | def main(argv=None): 90 | log.info('started application') 91 | 92 | log.warning('This script is obsolete. It will not be updated anymore and ' + 93 | 'will be deleted in the future. Use use_model.py instead.') 94 | 95 | if argv is None: 96 | argv = sys.argv[1:] 97 | 98 | args = parser.parse_args() 99 | log.info('start parameters: ' + str(args)) 100 | 101 | if args.debug_host: 102 | import pydevd 103 | pydevd.settrace(host=args.debug_host, stdoutToServer=True, 104 | stderrToServer=True) 105 | 106 | if log.level == logging.DEBUG: 107 | sys.excepthook = debug 108 | 109 | log.info('creating predictor') 110 | predictor = vLblNCEPredictor() 111 | predictor.prepare_usage(args) 112 | log.info('starting prediction') 113 | predictor.run() 114 | log.info('finished') 115 | 116 | if __name__ == '__main__': 117 | sys.exit(main()) 118 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/use_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! 
/usr/bin/env python 3 | """ 4 | """ 5 | 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import logging 9 | import sys 10 | 11 | from cis.deep.utils import logger_config 12 | from word2embeddings.nn.predictor import vLblNCEPredictor 13 | from word2embeddings.tools.util import debug 14 | 15 | 16 | log = getLogger(__name__) 17 | logger_config(log) 18 | 19 | parser = ArgumentParser() 20 | parser.add_argument('predict_file', 21 | help='Document with examples to predict the label of.') 22 | 23 | parser.add_argument('result_file', 24 | help='Document to which the predictions will be written.') 25 | 26 | parser.add_argument('vocabulary', 27 | help='Vocabulary file that contains list of tokens.') 28 | 29 | parser.add_argument('load_model', 30 | help='Proceed training with the given model file.') 31 | 32 | 33 | parser.add_argument('--predict-vocabulary', dest='pred_vocab', 34 | help='Vocabulary that contains the items that should be considered ' + 35 | 'during perplexity computation.\n' + 36 | 'Caution: Make sure this includes .\n' + 37 | 'Caution2: If this vocabulary does not contain a word that is seen ' + 38 | 'in prediction this word is not considered during perplexity ' + 39 | 'calculation.') 40 | 41 | parser.add_argument('--batch-size', dest='batch_size', type=int, default=100) 42 | 43 | 44 | parser.add_argument('-a', '--store_argmax', action='store_true', 45 | help='Store the most likely vocabulary item.') 46 | 47 | parser.add_argument('-r', '--store_rank', action='store_true', 48 | help='Store the rank of each vocabulary entry according to the ' + 49 | 'softmax.') 50 | 51 | parser.add_argument('-sm', '--store_softmax', action='store_true', 52 | help='Store the whole softmax distributions. Caution: The vocabulary ' + 53 | 'size can be very high. Therefore, the softmax output, which is a ' + 54 | 'distribution over all vocabulary items, might become very large, too.') 55 | 56 | parser.add_argument('-nr', '--normalize_with_root', action='store_true', 57 | help='Compute the root of the sm distribution and normalize the ' + 58 | 'vectors to unit length. This only has an effect when -sm is given.') 59 | 60 | parser.add_argument('-ppl', '--perplexity', action='store_true', 61 | help='Instead of calculating only the other model outputs, e.g., ' + 62 | 'softmax, etc., also compute the perplexity on the given text. ' + 63 | 'If this parameter is given, the predict_file parameter must point ' + 64 | 'to a text file that is iterate over just as in the training, i.e., ' + 65 | 'using a window approach. That means, it does not handle single ' + 66 | 'contexts per line anymore. Caution: does only work for vLBL models. ' + 67 | 'Note: using ppl in validation is slower.') 68 | 69 | parser.add_argument('-save-word', '--save_word', action='store_true', 70 | help='Works only with -ppl parameter. Store next to probability word ' + 71 | 'from prediction vocabulary. Used during post-processing for the '+ 72 | 'right interpolation.') 73 | 74 | parser.add_argument('-pr', '--predictions', action='store_true', 75 | help='Store predicted embeddings for each context.') 76 | 77 | parser.add_argument('-i', '--information', action='store_true', 78 | help='Store additional information for every prediction, e.g., (k ' + 79 | 'nearest neighboring words).') 80 | 81 | parser.add_argument('--debug-host', dest='debug_host', 82 | help='Allow remote debugging at the given host IP. Make sure you ' + 83 | 'follow the instructions at ' + 84 | 'http://pydev.org/manual_adv_remote_debugger.html. 
Especially, the ' + 85 | 'pydevd source must be in the PYTHONPATH and ' + 86 | 'PATHS_FROM_ECLIPSE_TO_PYTHON in pydevd_file_utils.py must be adapted.') 87 | 88 | 89 | def main(argv=None): 90 | log.info('started application') 91 | 92 | if argv is None: 93 | argv = sys.argv[1:] 94 | 95 | args = parser.parse_args() 96 | log.info('start parameters: ' + str(args)) 97 | 98 | if args.debug_host: 99 | import pydevd 100 | pydevd.settrace(host=args.debug_host, stdoutToServer=True, 101 | stderrToServer=True) 102 | 103 | if log.level == logging.DEBUG: 104 | sys.excepthook = debug 105 | 106 | log.info('creating predictor') 107 | predictor = vLblNCEPredictor() 108 | predictor.prepare_usage(args) 109 | log.info('starting prediction') 110 | predictor.run() 111 | log.info('finished') 112 | 113 | if __name__ == '__main__': 114 | sys.exit(main()) 115 | -------------------------------------------------------------------------------- /src/word2embeddings/lm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/word2embeddings/lm/__init__.py -------------------------------------------------------------------------------- /src/word2embeddings/lm/networks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Language modeling networks used by the trainer.""" 5 | 6 | import theano 7 | 8 | from word2embeddings.nn.layers import EmbeddingLayer, HingeLayer 9 | from word2embeddings.nn.networks import Network, StackedBiasedHidden 10 | 11 | 12 | class WordPhraseNetwork(Network): 13 | """Model to distinguish between corrupted phrases and observed ones.""" 14 | 15 | def __init__(self, name='WordPhrase', emb_matrix_shape=None, no_of_tokens=1, 16 | hidden_layers=[1]): 17 | super(WordPhraseNetwork, self).__init__(name=name) 18 | _, word_size = emb_matrix_shape 19 | layers = [no_of_tokens * word_size] 20 | layers.extend(hidden_layers) 21 | layers.append(1) 22 | self.word_embedding = EmbeddingLayer(name='w_embedding', 23 | shape=emb_matrix_shape) 24 | self.hidden_stack = StackedBiasedHidden(name='w_stack', layers=layers) 25 | self.loss = HingeLayer(name='loss') 26 | 27 | self.layers = [self.word_embedding, self.hidden_stack, self.loss] 28 | 29 | def link(self, inputs): 30 | self.inputs = inputs 31 | observed_phrases = inputs[0] 32 | corrupted_phrases = inputs[1] 33 | observed_words = self.word_embedding.link([observed_phrases])[0] 34 | observed_scores = self.hidden_stack.link([observed_words])[0] 35 | corrupted_scores = theano.clone(observed_scores, 36 | {observed_phrases: corrupted_phrases}) 37 | self.outputs = self.loss.link([observed_scores, corrupted_scores]) 38 | return self.outputs 39 | 40 | def get_word_embeddings(self): 41 | return self.word_embedding.weights.get_value() 42 | -------------------------------------------------------------------------------- /src/word2embeddings/nn/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | /layers.pyc 3 | /util.pyc 4 | -------------------------------------------------------------------------------- /src/word2embeddings/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/word2embeddings/nn/__init__.py 
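The `WordPhraseNetwork` in `lm/networks.py` above scores an observed phrase and a corrupted copy of it (the same window with a replaced token) and passes both score vectors to a `HingeLayer`. That layer's code is not included in this listing; the sketch below only illustrates the pairwise hinge ranking loss such a layer presumably computes, in the spirit of the 'ColWes08' (Collobert & Weston, 2008) trainer. The function name, the margin of 1.0, and the NumPy formulation are assumptions for illustration, not the repository's implementation.

```python
# Illustrative sketch only -- not part of the repository.
# Assumed form of the pairwise hinge ranking loss behind HingeLayer:
# an observed window should score at least `margin` higher than its
# corrupted counterpart; violations contribute linearly to the loss.
import numpy as np

def hinge_ranking_loss(observed_scores, corrupted_scores, margin=1.0):
    """Mean of max(0, margin - s_observed + s_corrupted) over all pairs."""
    observed_scores = np.asarray(observed_scores, dtype=float)
    corrupted_scores = np.asarray(corrupted_scores, dtype=float)
    return np.mean(np.maximum(0.0, margin - observed_scores + corrupted_scores))

# First pair is ranked correctly with room to spare (no loss);
# second pair violates the margin and contributes 1.2, so the mean is 0.6.
print(hinge_ranking_loss([2.0, 0.1], [-1.0, 0.3]))
```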
-------------------------------------------------------------------------------- /src/word2embeddings/nn/predictor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains classes for predicting with trained models. 4 | """ 5 | from logging import getLogger 6 | 7 | from scipy.stats import rankdata 8 | from theano import tensor as T 9 | 10 | from cis.deep.utils import load_object_from_file, text_to_vocab_indices, \ 11 | file_line_generator, sort_dict_by_label, utf8_file_open, log_iterations, \ 12 | ndarray_to_string 13 | from cis.deep.utils import logger_config 14 | from cis.deep.utils.embeddings import read_vocabulary_id_file 15 | from cis.deep.utils.theano import debug_print 16 | import numpy as np 17 | from word2embeddings.nn.layers import floatX, intX 18 | from word2embeddings.nn.trainer import MiniBatchRunner 19 | from word2embeddings.tools.examples_generator import PlainExampleGenerator, \ 20 | PaddedWindowExamplesGenerator 21 | 22 | 23 | log = getLogger(__name__) 24 | logger_config(log) 25 | 26 | class MiniBatchPredictor(MiniBatchRunner): 27 | """Base class for predictors that work in mini batches.""" 28 | example_iterator_type = PlainExampleGenerator 29 | 30 | def __init__(self): 31 | super(MiniBatchPredictor, self).__init__() 32 | 33 | def configure(self, args): 34 | super(MiniBatchPredictor, self).configure(args) 35 | self.predict_file = args.predict_file 36 | 37 | def get_model(self): 38 | self.model = load_object_from_file(self.load_model) 39 | self.predictor_method = self.model.predictor 40 | 41 | def run(self): 42 | """Predict the output of the model on all test examples. 43 | 44 | Returns 45 | ------- 46 | list 47 | list of predictions 48 | """ 49 | predictions = [] 50 | 51 | for example in self.epoch_iter(self.predict_file): 52 | predictions.append(self.model.predictor([example])) 53 | 54 | return predictions 55 | 56 | def predict_single(self): 57 | """Predict the output of the model on all test examples, yielding one 58 | example at a time. 59 | 60 | Returns 61 | ------- 62 | list 63 | list of results for the current example 64 | """ 65 | for example in self.epoch_iter(self.predict_file): 66 | yield self.predictor_method([example]) 67 | 68 | 69 | class LblPredictor(MiniBatchPredictor): 70 | 71 | def __init__(self): 72 | super(LblPredictor, self).__init__() 73 | self.input_data = T.matrix('input_data', dtype=floatX) 74 | # self.label = T.matrix('label', dtype=intX) 75 | self.inputs = [self.input_data] 76 | 77 | def configure(self, args): 78 | super(LblPredictor, self).configure(args) 79 | self.vocab = read_vocabulary_id_file(args.vocabulary) 80 | self.vocab_size = len(self.vocab.keys()) 81 | self.effective_vocab_size = len(self.vocab.keys()) 82 | 83 | def process_example(self, example): 84 | """Convert the given example into usable data structures. 85 | 86 | Splits vectors into their single values and converts the labels into ints 87 | and the data into floats. 
88 | 89 | Returns 90 | ------- 91 | list(str) 92 | input text 93 | """ 94 | # return example.split(' ') 95 | return text_to_vocab_indices(self.vocab, example)[0] 96 | 97 | 98 | class vLblNCEPredictor(MiniBatchPredictor): 99 | def __init__(self): 100 | super(vLblNCEPredictor, self).__init__() 101 | self.h_indices = debug_print(T.imatrix('h'), 'h') 102 | self.inputs = [self.h_indices] 103 | 104 | def configure(self, args): 105 | super(vLblNCEPredictor, self).configure(args) 106 | self.vocab = read_vocabulary_id_file(args.vocabulary) 107 | self.vocab_size = len(self.vocab.keys()) 108 | self.effective_vocab_size = len(self.vocab.keys()) 109 | self.perplexity = args.perplexity 110 | self.save_word = args.save_word 111 | self.result_file = args.result_file 112 | self.store_rank = args.store_rank 113 | self.store_argmax = args.store_argmax 114 | self.store_softmax = args.store_softmax 115 | self.normalize_with_root = args.normalize_with_root 116 | self.information = args.information 117 | self.predictions = args.predictions 118 | 119 | # This code is taken from SimpleVLblNceTrainer 120 | if args.pred_vocab: 121 | # Element i contains the index of the i'th prediction vocabulary 122 | # token in the original vocabulary. 123 | self.vocab_mapping_list = list() 124 | 125 | # Mapping from the model vocabulary to the prediction vocabulary 126 | # indices 127 | self.vocab_mapping = dict() 128 | 129 | for i, token in enumerate(file_line_generator(args.pred_vocab)): 130 | 131 | if token not in self.vocab: 132 | raise ValueError(('Token "%s" in prediction vocabulary ' + 133 | 'does not exist in model vocabulary.') % token) 134 | 135 | self.vocab_mapping_list.append(self.vocab[token]) 136 | self.vocab_mapping[self.vocab[token]] = i 137 | else: 138 | self.vocab_mapping_list = range(len(self.vocab)) 139 | self.vocab_mapping = dict( 140 | zip(self.vocab_mapping_list, self.vocab_mapping_list)) 141 | 142 | if self.perplexity: 143 | self.example_iterator_type = PaddedWindowExamplesGenerator 144 | self.example_processor = self._process_example_full_text 145 | self.learn_eos = True # We need to set this, because otherwise PaddedWindowExamplesGenerator will ignore end-of-sentence tags () 146 | self.disable_padding = False 147 | self.w_indices = debug_print(T.imatrix('w'), 'w') 148 | self.inputs.append(self.w_indices) 149 | else: 150 | self.example_processor = self._process_example_context_per_line 151 | 152 | def get_model(self): 153 | super(vLblNCEPredictor, self).get_model() 154 | 155 | if self.perplexity: 156 | self.left_context = self.model.left_context 157 | self.right_context = self.model.right_context 158 | 159 | def predict_single(self): 160 | """Predict the output of the model on all test examples, yielding one 161 | example at a time. 162 | 163 | Returns 164 | ------- 165 | list 166 | list of results for the current example 167 | """ 168 | for example in self.epoch_iter(self.predict_file): 169 | example = [example] 170 | 171 | if self.perplexity: 172 | # Pass only the context, not the target word 173 | yield example, self.predictor_method(zip(*example)[0]) 174 | else: 175 | yield example, self.predictor_method(example) 176 | 177 | def process_example(self, example): 178 | """Convert the given example into usable data structures. 179 | 180 | Splits vectors into their single values and converts the labels into ints 181 | and the data into floats. 
182 | 183 | Returns 184 | ------- 185 | list(str) 186 | input text 187 | """ 188 | log.debug(example) 189 | res = self.example_processor(example) 190 | log.debug(res) 191 | return res[0] 192 | 193 | def _process_example_context_per_line(self, example): 194 | """Process the given example that contains only the context and not the 195 | target word. 196 | """ 197 | return text_to_vocab_indices(self.vocab, example) 198 | 199 | def _process_example_full_text(self, example): 200 | """Process the given example that contains context and target word. 201 | 202 | The implementation is taken from SimpleVLblNceTrainer.process_example. 203 | """ 204 | idx, example = text_to_vocab_indices(self.vocab, example) 205 | return (idx[:self.model.left_context] if self.model.right_context == 0 else 206 | idx[:self.model.left_context] + idx[self.model.left_context + 1:], 207 | idx[self.model.left_context]), example 208 | 209 | def run(self): 210 | vocab = dict(self.vocab) 211 | 212 | # Get a mapping from index to word 213 | vocab_entries = sort_dict_by_label(vocab) 214 | vocab_entries = zip(*vocab_entries)[0] 215 | log_probabs = 0. 216 | num_ppl_examples = 0 217 | num_examples = 0 218 | 219 | with utf8_file_open(self.result_file, 'w') as outfile: 220 | 221 | for batch, _ in self.next_batch(self.predict_file): 222 | # Handle each prediction 223 | # for (cur_count, (example, predictions)) in enumerate(self.predict_single()): 224 | 225 | log_iterations(log, num_examples, 10000) 226 | num_examples += len(batch) 227 | 228 | if self.perplexity: 229 | batch = zip(*batch) 230 | # Pass only the context, not the target word 231 | predictions = self.predictor_method(batch[0]) 232 | else: 233 | self.predictor_method(batch) 234 | 235 | if self.store_softmax or self.store_rank or self.store_argmax \ 236 | or self.information or self.perplexity: 237 | sm, probabs, cur_log_probabs, cur_num_ppl_examples = \ 238 | self._calc_probabilities_from_similarity(batch[1], predictions[1]) 239 | num_ppl_examples += cur_num_ppl_examples 240 | 241 | if self.store_rank or self.information: 242 | # rankdata sorts ascending, i.e., distances, but we have 243 | # similarities, hence, 1-sm 244 | ranks = rankdata(1 - sm, method='min').astype(int) 245 | 246 | if self.store_rank: 247 | outfile.write(ndarray_to_string(ranks)) 248 | 249 | if self.information: 250 | unique_ranks = set(ranks) 251 | hard_idx = vocab[u'hard'] 252 | sorted_unique_ranks = ' '.join(map(str, sorted(unique_ranks))) 253 | sorted_unique_ranks = '' 254 | top_ten_entries = ' '.join([vocab_entries[i] for i in np.argsort(1 - sm)[:10]]) 255 | print '#%d\t%s\t%s' % (ranks[hard_idx], 256 | sorted_unique_ranks, 257 | top_ten_entries) 258 | 259 | if self.store_argmax: 260 | maximum = np.argmax(sm) 261 | # outfile.write(vocab_entries[maximum] + u' (%d)\t' % maximum) 262 | outfile.write(vocab_entries[maximum]) 263 | 264 | if self.store_softmax: 265 | 266 | if self.normalize_with_root: 267 | sm = np.sqrt(sm) 268 | sm = sm / np.linalg.norm(sm, 2, axis=-1) 269 | 270 | outfile.write(ndarray_to_string(sm)) 271 | 272 | if self.perplexity: 273 | 274 | if self.save_word: 275 | indices_in_predict_vocab = [self.vocab_mapping[batch[1][i]] for i in range(len(batch[1]))] 276 | indices_in_original_vocab = [self.vocab_mapping_list[i] for i in indices_in_predict_vocab] 277 | words = [self.vocab.keys()[self.vocab.values().index(i)] for i in indices_in_original_vocab] 278 | 279 | outfile.write( u'\n'.join("%s %s" % (x, y) for x, y in zip(map(unicode, probabs), words)) ) 280 | else: 281 | 
outfile.write(u'\n'.join(map(unicode, probabs))) 282 | 283 | log_probabs += cur_log_probabs if cur_log_probabs is not np.nan else 0. 284 | 285 | if self.predictions: 286 | outfile.write(ndarray_to_string(predictions[0][0])) 287 | 288 | outfile.write(u'\n') 289 | 290 | # print all results 291 | # for predictions in predictions: 292 | # outfile.write(ndarray_to_string(predictions[0][0]) + u'\t') 293 | # 294 | # if args.store_softmax: 295 | # outfile.write(ndarray_to_string(predictions[1][0]) + u'\t') 296 | # 297 | # outfile.write(vocab_entries[predictions[2][0]] + u' (%d)' % predictions[2][0]) 298 | # outfile.write(u'\n') 299 | # # outfile.write(unicode(predictions) + u'\n') 300 | if self.perplexity: 301 | ppl = np.exp(-1. / (num_ppl_examples) * log_probabs) 302 | log.info('Perplexity on %d examples is %f', num_ppl_examples, ppl) 303 | 304 | 305 | class MlpPredictor(MiniBatchPredictor): 306 | 307 | def __init__(self): 308 | super(MlpPredictor, self).__init__() 309 | self.input_data = T.matrix('input_data', dtype=floatX) 310 | self.label = T.matrix('label', dtype=intX) 311 | self.inputs = [self.label, self.input_data] 312 | 313 | def process_example(self, example): 314 | """Convert the given example in handable data structures. 315 | 316 | Splits vectors in their single values and converts the labels into ints 317 | and the data into floats. 318 | 319 | Returns 320 | ------- 321 | list(float) 322 | values 323 | """ 324 | return map(float, example.split(' ')) 325 | -------------------------------------------------------------------------------- /src/word2embeddings/nn/tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """tools.py: Tools for dealing with Networks.""" 5 | 6 | from cis.deep.utils import file_line_generator 7 | import numpy as np 8 | 9 | 10 | LOG_FORMAT = '%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s' 11 | 12 | 13 | def read_unigram_distribution(filename): 14 | """Read the unigram distribution for all vocabulary items from the file. 15 | 16 | 1 probability per line. 17 | Caution: Don't forget to add the 4 special tokens, e.g., . Besides 18 | we don't want to draw them as noise, therefore they should have 19 | a count of 0. 20 | """ 21 | unigram_dist = read_unigram_frequencies(filename) 22 | 23 | # Note: use the same datatype as Theano's floatX here, to avoid problems. 24 | return np.asarray(unigram_dist, 'float32') / np.sum(unigram_dist) 25 | 26 | def read_unigram_frequencies(filename): 27 | """Read the unigram frequencies for all vocabulary items from the file. 28 | 29 | 1 frequency per line. 30 | Caution: Don't forget to add the 4 special tokens, e.g., . Besides 31 | we don't want to draw them as noise, therefore they should have 32 | a count of 0. 
33 | """ 34 | unigram_dist = [] 35 | 36 | for line in file_line_generator(filename): 37 | unigram_dist.append(int(line)) 38 | 39 | return unigram_dist 40 | -------------------------------------------------------------------------------- /src/word2embeddings/nn/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """util.py: Useful functions""" 5 | 6 | import numpy 7 | from theano import tensor as T 8 | 9 | 10 | def zero_value(shape, dtype): 11 | return numpy.zeros(shape, dtype=dtype) 12 | 13 | def random_value_GloBen10(shape, dtype, random_generator=None, no_of_units=None): 14 | """ 15 | Return a randomly initialized matrix using a uniform distribution. 16 | 17 | Returns a randomly initialized matrix using the method proposed in 18 | [GloBen10]. 19 | 20 | Parameters 21 | ---------- 22 | shape : (int, int) 23 | size of the matrix that needs to be initialized 24 | dtype : dtype 25 | datatype of the random values 26 | random_generator : numpy.random.RandomState 27 | random number generator; if None a new instance will automatically be 28 | created 29 | no_of_units : (int, int) 30 | number of input and output dimensions; if None it will be the same as 31 | shape 32 | """ 33 | # `W` is initialized with `W_values` which is uniformely sampled 34 | # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) 35 | # for tanh activation function 36 | # the output of uniform if converted using asarray to dtype 37 | # theano.config.floatX so that the code is runable on GPU 38 | # Note : optimal initialization of weights is dependent on the 39 | # activation function used (among other things). 40 | # For example, results presented in [Xavier10] suggest that you 41 | # should use 4 times larger initial weights for sigmoid 42 | # compared to tanh 43 | # We have no info for other function, so we use the same as 44 | # tanh. 45 | if not random_generator: 46 | random_generator = numpy.random.RandomState(1234) 47 | 48 | if no_of_units is None: 49 | total_dimensions = numpy.sum(shape) 50 | else: 51 | total_dimensions = numpy.sum(no_of_units) 52 | 53 | low = -numpy.sqrt(6. / total_dimensions) 54 | high = numpy.sqrt(6. / total_dimensions) 55 | random_values = random_generator.uniform(low=low, high=high, size=shape) 56 | W_values = numpy.asarray(random_values, dtype=dtype) 57 | return W_values 58 | 59 | def random_value_normal(shape, dtype, random_generator=None): 60 | """Return a randomly initialized matrix using a normal distribution. 61 | 62 | Returns random numbers from a zero-mean Gaussian with 0.01 std dev. This 63 | std dev value has been proposed by [Hin10]. 64 | 65 | Parameters 66 | ---------- 67 | shape : (int, int) 68 | size of the matrix that needs to be initialized 69 | dtype : dtype 70 | datatype of the random values 71 | random_generator : numpy.random.RandomState 72 | random number generator; if None a new instance will automatically be 73 | created 74 | """ 75 | 76 | if not random_generator: 77 | random_generator = numpy.random.RandomState(1234) 78 | 79 | random_values = random_generator.normal(scale=0.01, size=shape) 80 | W_values = numpy.asarray(random_values, dtype=dtype) 81 | return W_values 82 | 83 | def threshold(x): 84 | """An approximation of sigmoid. 85 | 86 | More approximate and faster than ultra_fast_sigmoid. 87 | 88 | Approx in 3 parts: 0, scaled linear, 1 89 | 90 | Removing the slope and shift does not make it faster. 
91 | 92 | """ 93 | # x = theano.printing.Print('x')(x) 94 | # gt = theano.printing.Print('gt')(T.gt(x, 0.5)) 95 | # return gt 96 | return T.gt(x, 0.5) 97 | 98 | -------------------------------------------------------------------------------- /src/word2embeddings/tools/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | /theano_extensions.pyc 3 | -------------------------------------------------------------------------------- /src/word2embeddings/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/word2embeddings/tools/__init__.py -------------------------------------------------------------------------------- /src/word2embeddings/tools/examples_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """examples_generator.py: Examples generator for language models training.""" 5 | 6 | from cis.deep.utils import file_line_generator 7 | from cis.deep.utils.embeddings import SpecialToken 8 | 9 | 10 | class Error(Exception): 11 | """ Base class to be used for other module's exceptions.""" 12 | 13 | 14 | class SpanNotUsedError(Error): 15 | """ Raised if the a token of a particular span is not picked.""" 16 | 17 | 18 | class ExampleGenerator(object): 19 | 20 | def configure(self, options): 21 | """Configure the example generator with the given options. 22 | 23 | Parameters 24 | ---------- 25 | options : dict 26 | options dictionary 27 | """ 28 | pass 29 | 30 | def example_iter(self, filename): 31 | """Iterate over the examples in the given file. 32 | 33 | Must be implemented by each sub class. 34 | 35 | Parameters 36 | ---------- 37 | filename : str 38 | name of the file containing the examples 39 | 40 | Yields 41 | ------ 42 | examples 43 | """ 44 | raise NotImplementedError 45 | 46 | 47 | class PlainExampleGenerator(ExampleGenerator): 48 | """Reads a file containing only examples. 49 | 50 | Each line is considered one example. 51 | """ 52 | def example_iter(self, filename): 53 | 54 | for example in file_line_generator(filename): 55 | yield example 56 | 57 | raise StopIteration 58 | 59 | 60 | class LabeledExampleGenerator(PlainExampleGenerator): 61 | """Reads a file containing label and data for every example. 62 | 63 | Each line in the file is an example. The first column corresponds to the 64 | label, the second column is the input of the classifier. Columns are tab- 65 | separated. Vector-based inputs or outputs are space separated. However, this 66 | generator does not convert the values in any way. Instead, it just returns 67 | the values as strings. 
68 | """ 69 | 70 | def example_iter(self, filename): 71 | 72 | for line in super(LabeledExampleGenerator, self).example_iter(filename): 73 | yield line.split('\t') 74 | 75 | raise StopIteration 76 | 77 | 78 | class PaddedWindowExamplesGenerator(PlainExampleGenerator): 79 | """ Generates sequence of fixed-width window of tokens.""" 80 | 81 | def configure(self, options): 82 | self.left_context = options.left_context 83 | self.right_context = options.right_context 84 | self.disable_padding = options.disable_padding 85 | self.learn_eos = options.learn_eos 86 | 87 | def example_iter(self, filename): 88 | 89 | for line in super(PaddedWindowExamplesGenerator, self).example_iter(filename): 90 | line = line.split() 91 | 92 | if not self.disable_padding: 93 | line = self.pad_sent(line) 94 | elif self.learn_eos: # add eos token if we need to learn it, but do not want to do padding 95 | line.append(SpecialToken.SENT_END.value) 96 | 97 | for example in self.sent_examples(line): 98 | yield example 99 | 100 | def is_valid_example(self, _): 101 | """Checks if the given example is a valid example to process. 102 | 103 | Every subclass can specify what is a valid example. 104 | """ 105 | return True 106 | 107 | def pad_sent(self, tokens): 108 | sent = [SpecialToken.SENT_START.value] 109 | sent.extend(tokens) 110 | sent.append(SpecialToken.SENT_END.value) 111 | return sent 112 | 113 | def sent_examples(self, sent): 114 | """Turns a sentence into a number of examples. 115 | An example is like {'sources': [list of feature vectors]} 116 | """ 117 | length = len(sent) 118 | 119 | # if the padding is disabled start pos from leftcontext+1 120 | start_offset = self.left_context if self.disable_padding else 1 121 | end_offset = self.right_context if self.disable_padding else 1 122 | 123 | # if we want to learn end-of-sentence during padding, 124 | # then move end_offset to let pos cover eos token 125 | if not self.disable_padding and self.learn_eos: 126 | end_offset -= 1 127 | 128 | for pos in range(start_offset, length - end_offset): 129 | left_context = sent[max(0, pos - self.left_context): pos] 130 | right_context = sent[pos + 1: pos + 1 + self.right_context] 131 | 132 | left_diff = self.left_context - len(left_context) 133 | 134 | if left_diff > 0: 135 | left_context = left_diff * [SpecialToken.PAD.value] + \ 136 | left_context 137 | 138 | right_diff = self.right_context - len(right_context) 139 | 140 | if right_diff > 0: 141 | right_context = right_context + right_diff * \ 142 | [SpecialToken.PAD.value] 143 | 144 | example = left_context + [sent[pos]] + right_context 145 | 146 | if not self.is_valid_example(example): 147 | continue 148 | 149 | yield example 150 | 151 | 152 | class SentimentExamplesGenerator(PaddedWindowExamplesGenerator): 153 | """Extract special sentiment training examples. 154 | 155 | Extracts positive instances from the text, having the requirement that 156 | the center word of the example is contained in a sentiment vocabulary. 
157 | 158 | Attributes 159 | ---------- 160 | vocab : dict 161 | sentiment vocabulary 162 | """ 163 | 164 | def configure(self, options): 165 | """ 166 | Parameters 167 | ---------- 168 | options.vocab : dict 169 | sentiment vocabulary 170 | 171 | """ 172 | super(SentimentExamplesGenerator, self).configure(options) 173 | self.sent_vocab = options.sent_vocab 174 | 175 | def is_valid_example(self, example): 176 | return example[self.left_context] in self.sent_vocab 177 | 178 | 179 | class SentimentAnywhereExamplesGenerator(SentimentExamplesGenerator): 180 | """Extract special sentiment training examples. 181 | 182 | Extracts examples from the text, having the requirement that 183 | at least one token of the example is contained in a sentiment vocabulary. 184 | """ 185 | 186 | def is_valid_example(self, example): 187 | return any((e in self.sent_vocab for e in example)) 188 | -------------------------------------------------------------------------------- /src/word2embeddings/tools/theano_extensions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy 3 | from theano import Op, Apply 4 | from theano.sandbox.rng_mrg import MRG_RandomStreams 5 | from theano.tensor.basic import as_tensor_variable 6 | 7 | import theano.tensor as T 8 | 9 | 10 | class MRG_RandomStreams2(MRG_RandomStreams): 11 | """Module component with similar interface to numpy.random 12 | (numpy.random.RandomState) 13 | """ 14 | 15 | def __init__(self, seed=12345, use_cuda=None): 16 | """ 17 | :type seed: int or list of 6 int. 18 | 19 | :param seed: a default seed to initialize the random state. 20 | If a single int is given, it will be replicated 6 times. 21 | The first 3 values of the seed must all be less than M1 = 2147483647, 22 | and not all 0; and the last 3 values must all be less than 23 | M2 = 2147462579, and not all 0. 24 | 25 | """ 26 | super(MRG_RandomStreams2, self).__init__(seed, use_cuda) 27 | 28 | def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int32', 29 | nstreams=None): 30 | """ 31 | Sample `n` (currently `n` needs to be 1) times from a multinomial 32 | distribution defined by probabilities pvals. 33 | 34 | Example : pvals = [[.98, .01, .01], [.01, .98, .01]] will 35 | probably result in [[1,0,0],[0,1,0]]. 36 | 37 | .. note:: 38 | -`size` and `ndim` are only there keep the same signature as other 39 | uniform, binomial, normal, etc. 40 | todo : adapt multinomial to take that into account 41 | 42 | -Does not do any value checking on pvals, i.e. there is no 43 | check that the elements are non-negative, less than 1, or 44 | sum to 1. 
passing pvals = [[-2., 2.]] will result in 45 | sampling [[0, 0]] 46 | """ 47 | if pvals is None: 48 | raise TypeError('You have to specify pvals') 49 | pvals = as_tensor_variable(pvals) 50 | if size is not None: 51 | if any([isinstance(i, int) and i <= 0 for i in size]): 52 | raise ValueError( 53 | 'The specified size contains a dimension with value <= 0', 54 | size) 55 | 56 | if n == 1 and pvals.ndim == 1: 57 | if ndim is not None: 58 | raise ValueError('Provided an ndim argument to ' + 59 | 'MRG_RandomStreams2.multinomial, which does not use ' + 60 | 'the ndim argument.') 61 | unis = self.uniform(size=size, ndim=2, nstreams=nstreams) 62 | op = MultinomialFromUniform2(dtype) 63 | return op(pvals, unis) 64 | else: 65 | raise NotImplementedError('MRG_RandomStreams2.multinomial only ' + 66 | ' implemented with n == 1 and pvals.ndim = 2') 67 | 68 | 69 | class MultinomialFromUniform2(Op): 70 | '''Converts samples from a uniform into sample from a multinomial. 71 | 72 | This random number generator is faster than the standard one of Theano, 73 | because it stops earlier and doesn't return matrices of zeros and ones, 74 | indicating which index was drawn. Instead it returns the index of the drawn 75 | element. 76 | ''' 77 | def __init__(self, odtype): 78 | self.odtype = odtype 79 | 80 | def __eq__(self, other): 81 | return type(self) == type(other) and self.odtype == other.odtype 82 | 83 | def __hash__(self): 84 | return hash((type(self), self.odtype)) 85 | 86 | def __str__(self): 87 | return '%s{%s}' % (self.__class__.__name__, self.odtype) 88 | 89 | def __setstate__(self, dct): 90 | self.__dict__.update(dct) 91 | try: 92 | self.odtype 93 | except AttributeError: 94 | self.odtype = 'auto' 95 | 96 | def make_node(self, pvals, unis): 97 | pvals = T.as_tensor_variable(pvals) 98 | unis = T.as_tensor_variable(unis) 99 | if pvals.ndim != 1: 100 | raise NotImplementedError('pvals ndim should be 1', pvals.ndim) 101 | if unis.ndim != 2: 102 | raise NotImplementedError('unis ndim should be 2', unis.ndim) 103 | if self.odtype == 'auto': 104 | odtype = pvals.dtype 105 | else: 106 | odtype = self.odtype 107 | out = T.tensor(dtype=odtype, broadcastable=unis.type.broadcastable) 108 | return Apply(self, [pvals, unis], [out]) 109 | 110 | def grad(self, ins, outgrads): 111 | pvals, unis = ins 112 | (gz,) = outgrads 113 | return [T.zeros_like(x) for x in ins] 114 | 115 | # def c_code_cache_version(self): 116 | # return (5,) 117 | 118 | def c_code(self, node, name, ins, outs, sub): 119 | (pvals, unis) = ins 120 | (z,) = outs 121 | 122 | fail = sub['fail'] 123 | return """ 124 | if (PyArray_NDIM(%(pvals)s) != 1) 125 | { 126 | PyErr_Format(PyExc_TypeError, "pvals wrong rank"); 127 | %(fail)s; 128 | } 129 | if (PyArray_NDIM(%(unis)s) != 2) 130 | { 131 | PyErr_Format(PyExc_TypeError, "unis wrong rank"); 132 | %(fail)s; 133 | } 134 | 135 | if ((NULL == %(z)s) 136 | || ((PyArray_DIMS(%(z)s))[0] != (PyArray_DIMS(%(unis)s))[0]) 137 | || ((PyArray_DIMS(%(z)s))[1] != (PyArray_DIMS(%(unis)s))[1]) 138 | ) 139 | { 140 | Py_XDECREF(%(z)s); 141 | %(z)s = (PyArrayObject*) PyArray_ZEROS(2, 142 | PyArray_DIMS(%(unis)s), 143 | type_num_%(z)s, 144 | 0); 145 | if (!%(z)s) 146 | { 147 | PyErr_SetString(PyExc_MemoryError, "failed to alloc z output"); 148 | %(fail)s; 149 | } 150 | } 151 | 152 | { // NESTED SCOPE 153 | 154 | const int nb_outcomes = PyArray_DIMS(%(pvals)s)[0]; 155 | const int nb_rows = PyArray_DIMS(%(unis)s)[0]; 156 | const int nb_cols = PyArray_DIMS(%(unis)s)[1]; 157 | 158 | // 159 | // For each multinomial, loop over 
each possible outcome 160 | // 161 | for (int row = 0; row < nb_rows; ++row) 162 | { 163 | for (int col = 0; col < nb_cols; ++col) { 164 | // std::cout << row << 'x' << col << std::endl; 165 | 166 | dtype_%(pvals)s cummul = 0.; 167 | const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR2(%(unis)s, row, col); 168 | dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, row, col); 169 | *z_nm = -1; 170 | 171 | // std::cout << "unis " << (int)(*unis_n * 100) << std::endl; 172 | // std::cout << "z_nm " << (int)(*z_nm * 100) << std::endl; 173 | 174 | for (int m = 0; m < nb_outcomes; ++m) 175 | { 176 | const dtype_%(pvals)s* pvals_m = (dtype_%(pvals)s*)PyArray_GETPTR1(%(pvals)s, m); 177 | cummul += *pvals_m; 178 | // std::cout << "cummul " << (int)(cummul * 100) << std::endl; 179 | 180 | if (cummul > *unis_n) 181 | { 182 | *z_nm = m; 183 | // *z_nm = 17; 184 | break; 185 | } 186 | 187 | } 188 | 189 | // If we reached the end, use the last value. 190 | // If we have a real distribution [0,1], than this should never 191 | // happen, right? I got a segmentation fault when removing it. 192 | // 2014-04-08 193 | // This might happen due to rounding errors. 2014-05-01 194 | if (*z_nm == -1) { 195 | *z_nm = nb_outcomes - 1; 196 | } 197 | } 198 | } 199 | } // END NESTED SCOPE 200 | """ % locals() 201 | 202 | def perform(self, node, ins, outs): 203 | (pvals, unis) = ins 204 | (z,) = outs 205 | 206 | if z[0] is None or z[0].shape != numpy.sum(unis.shape): 207 | z[0] = numpy.zeros(unis.shape, dtype=node.outputs[0].dtype) 208 | 209 | z[0][:, :] = -1 210 | 211 | nb_outcomes = pvals.shape[0] 212 | 213 | for row in xrange(unis.shape[0]): 214 | for col in xrange(unis.shape[1]): 215 | cummul = 0 216 | unis_n = unis[row, col] 217 | 218 | for m in range(nb_outcomes): 219 | cummul += pvals[m] 220 | 221 | if cummul > unis_n: 222 | z[0][row, col] = m 223 | # z[0][row, col] = 13 224 | break 225 | 226 | # If we reached the end, use the last value. 227 | # If we have a real distribution [0,1], than this should never 228 | # happen, right? I got a segmentation fault when removing it. 229 | # 2014-04-08 230 | # This might happen due to rounding errors. 2014-05-01 231 | if z[0][row, col] == -1: 232 | z[0][row, col] = nb_outcomes - 1; 233 | -------------------------------------------------------------------------------- /src/word2embeddings/tools/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """util.py: Collection of useful utilities.""" 5 | 6 | from itertools import islice, izip_longest 7 | import re 8 | import sys 9 | 10 | from cis.deep.utils import file_line_generator 11 | 12 | 13 | LOG_FORMAT = '%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s' 14 | 15 | def Enum(**enums): 16 | """An enumeration factory class.""" 17 | obj = type('Enum', (), enums) 18 | obj.named_value = dict([(a, v) for a, v in vars(obj).items() if not a.startswith('__')]) 19 | obj.value_named = dict([(v, a) for a, v in obj.named_value.items()]) 20 | return obj 21 | 22 | def debug(type_, value, tb): 23 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 24 | # we are in interactive mode or we don't have a tty-like 25 | # device, so we call the default hook 26 | sys.__excepthook__(type_, value, tb) 27 | else: 28 | import traceback, pdb 29 | # we are NOT in interactive mode, print the exception... 30 | traceback.print_exception(type_, value, tb) 31 | print('\n') 32 | # ...then start the debugger in post-mortem mode. 
33 | pdb.pm() 34 | 35 | def extract_results_from_logfile(logfile, result='train_error', fmt='new', 36 | no_of_val_files=1): 37 | """Extract results from a given logfile and returns them as ndarray. 38 | 39 | Parameters 40 | ---------- 41 | logfile : str 42 | path of the logfile 43 | result : str 44 | type of the result to be extracted; one of 'train_error', 45 | 'val_error', 'val_ppl' 46 | format : str 47 | 'new' or 'old', new format allows several validation files; old format 48 | only allowed 1 validation file. 49 | no_of_val_files : int 50 | number of validation files used in the logfile; is only matters if 51 | result = 'val_error' or 'val_perplexity' 52 | 53 | Returns 54 | ------- 55 | ndarray 56 | contains all results in an array 57 | """ 58 | 59 | if fmt == 'old': 60 | val_method_name = 'validate' 61 | else: 62 | val_method_name = '_validate_single_file' 63 | 64 | 65 | if result == 'train_error': 66 | pattern = re.compile(r'run\tAverage loss on .*? training set is (.*)', 67 | re.UNICODE) 68 | elif result == 'val_error': 69 | pattern = re.compile( 70 | r'%s\tAverage loss on .*? validation set is (.*)' % val_method_name, 71 | re.UNICODE) 72 | elif result == 'val_ppl': 73 | pattern = re.compile( 74 | r'%s\tPerplexity on .*? validation set is (.*)' % val_method_name, 75 | re.UNICODE) 76 | else: 77 | raise ValueError('Unknown result type to be extracted from logfile: %s' 78 | % result) 79 | 80 | values = list() 81 | 82 | for line in file_line_generator(logfile): 83 | match = re.search(pattern, line) 84 | 85 | if not match: 86 | continue 87 | 88 | values.append(float(match.group(1))) 89 | 90 | # Converts the 1d list of results into one list per validation file. 91 | if (result == 'val_error' or result == 'val_ppl') and no_of_val_files != 1: 92 | values = list(grouper_recipes(values, no_of_val_files)) 93 | values = zip(*values) 94 | 95 | return values 96 | 97 | def grouper(iterable, n): 98 | """Group n items from the iterable into a group. 99 | 100 | Parameters 101 | ---------- 102 | iterable : any 103 | iterator to get the items from 104 | n : int 105 | number of items to form one group 106 | 107 | Returns 108 | ------- 109 | tuple(items) 110 | tuple of n items taken from the iterator 111 | """ 112 | chunk = tuple(islice(iterable, n)) 113 | 114 | if not chunk: 115 | return 116 | yield chunk 117 | 118 | def grouper_recipes(iterable, n, fillvalue=None): 119 | """Collect data into fixed-length chunks or blocks. 120 | Grouper taken from https://docs.python.org/2/library/itertools.html. 121 | """ 122 | # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx 123 | args = [iter(iterable)] * n 124 | return izip_longest(fillvalue=fillvalue, *args) 125 | 126 | def prepare_brown_signature(signature, max_size, add_right=False): 127 | """Convert variable length signatures into fixed length ones. 128 | 129 | Prepends zeros to the front of the signature. 
130 | 131 | Parameters 132 | ---------- 133 | signature : str 134 | brown signature a string (space separated) 135 | max_size : int 136 | size of the fixed signature 137 | add_right : bool 138 | indicates whether to add the padding zeros to the right of the signature 139 | instead of the left 140 | 141 | Returns 142 | ------- 143 | str 144 | fixed length brown signature 145 | 146 | Example 147 | ------- 148 | >>> prepare_brown_signature(u'1 1', 4) 149 | u'0 0 1 1' 150 | 151 | >>> prepare_brown_signature(u'1 1 1 1', 4) 152 | u'1 1 1 1' 153 | 154 | >>> prepare_brown_signature(u'1 1', 4, True) 155 | u'1 1 0 0' 156 | 157 | >>> prepare_brown_signature(u'1 1 1 1', 4, True) 158 | u'1 1 1 1' 159 | """ 160 | sig_len = len(signature.split()) 161 | needed_padding = max_size - sig_len 162 | 163 | if needed_padding == 0: 164 | return signature 165 | 166 | padding = u' '.join([u'0' for _ in xrange(needed_padding)]) 167 | return padding + u' ' + signature \ 168 | if not add_right else signature + u' ' + padding 169 | --------------------------------------------------------------------------------
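As a closing illustration, here is a small self-contained sketch (not part of the repository) that mirrors the fixed-width window extraction implemented by PaddedWindowExamplesGenerator in src/word2embeddings/tools/examples_generator.py above, with padding enabled and learn_eos=True as configured by vLblNCEPredictor. The literal tokens '<s>', '</s>' and '<pad>' are stand-ins for the SpecialToken values from cis.deep.utils.embeddings, which are not shown in this dump.

def window_examples(tokens, left_context=2, right_context=1):
    # pad_sent(): wrap the sentence in sentence-start / sentence-end markers
    sent = ['<s>'] + tokens + ['</s>']
    # learn_eos=True: also yield a window centered on the end-of-sentence token
    for pos in range(1, len(sent)):
        left = sent[max(0, pos - left_context):pos]
        right = sent[pos + 1:pos + 1 + right_context]
        # pad short contexts at the sentence borders
        left = ['<pad>'] * (left_context - len(left)) + left
        right = right + ['<pad>'] * (right_context - len(right))
        yield left + [sent[pos]] + right

print(list(window_examples('the cat sat'.split())))
# [['<pad>', '<s>', 'the', 'cat'], ['<s>', 'the', 'cat', 'sat'],
#  ['the', 'cat', 'sat', '</s>'], ['cat', 'sat', '</s>', '<pad>']]

The center word of each window sits at index left_context, which is exactly the position the sentiment example generators check against their sentiment vocabulary.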