├── README.md
└── src
    ├── cis
    │   ├── .gitignore
    │   ├── __init__.py
    │   └── deep
    │       ├── .gitignore
    │       ├── __init__.py
    │       └── utils
    │           ├── .gitignore
    │           ├── __init__.py
    │           ├── classification
    │           │   ├── __init__.py
    │           │   └── apps
    │           │       ├── __init__.py
    │           │       ├── calc_error_metrics.py
    │           │       ├── classify.py
    │           │       ├── classify_mode.py
    │           │       ├── classify_xval.py
    │           │       └── prepare_significance_test.py
    │           ├── clustering
    │           │   ├── __init__.py
    │           │   └── apps
    │           │       ├── __init__.py
    │           │       └── kmeans.py
    │           ├── embeddings
    │           │   └── __init__.py
    │           ├── lm
    │           │   └── __init__.py
    │           ├── misc
    │           │   ├── __init__.py
    │           │   └── apps
    │           │       ├── __init__.py
    │           │       ├── automatic_cluster_labeling.py
    │           │       └── combine_files.py
    │           ├── preprocessing
    │           │   ├── __init__.py
    │           │   ├── apps
    │           │   │   ├── __init__.py
    │           │   │   ├── convert_apnews_to_text.py
    │           │   │   ├── escape_regex.py
    │           │   │   ├── filter_file_by_lines.py
    │           │   │   ├── preprocess.py
    │           │   │   ├── splitter.py
    │           │   │   ├── text_to_bow.py
    │           │   │   ├── text_to_features.py
    │           │   │   ├── tokenizer.py
    │           │   │   └── word_count.py
    │           │   └── corpus.py
    │           ├── statistics
    │           │   ├── __init__.py
    │           │   └── apps
    │           │       ├── __init__.py
    │           │       └── calc_matrix_statistics.py
    │           ├── text.py
    │           ├── theano
    │           │   ├── .gitignore
    │           │   ├── __init__.py
    │           │   ├── gpu_test.py
    │           │   └── log_reg.py
    │           └── visualization
    │               ├── __init__.py
    │               └── apps
    │                   ├── __init__.py
    │                   └── visualize_by_tsne.py
    ├── common_functions.py
    ├── load_data.py
    ├── log.best.scitail.txt
    ├── logistic_sgd.py
    ├── logistic_sgd_biased.py
    ├── mlp.py
    ├── model_para_0.820930232558
    ├── preprocess_SciTail.py
    ├── train_SciTail_DeIsTe_model.py
    └── word2embeddings
        ├── .gitignore
        ├── AUTHORS.rst
        ├── MANIFEST.in
        ├── README.rst
        ├── __init__.py
        ├── apps
        │   ├── __init__.py
        │   ├── analyze_lbl_distribution.py
        │   ├── classify_imdb_docs.py
        │   ├── create_embeddings.py
        │   ├── extract_model_data.py
        │   ├── extract_words_with_we.py
        │   ├── prepare_brown_file.py
        │   ├── test_mlp.py
        │   ├── train_mlp.py
        │   ├── train_model.py
        │   ├── use_lm.py
        │   └── use_model.py
        ├── lm
        │   ├── __init__.py
        │   └── networks.py
        ├── nn
        │   ├── .gitignore
        │   ├── __init__.py
        │   ├── layers.py
        │   ├── networks.py
        │   ├── predictor.py
        │   ├── tools.py
        │   ├── trainer.py
        │   └── util.py
        └── tools
            ├── .gitignore
            ├── __init__.py
            ├── examples_generator.py
            ├── theano_extensions.py
            └── util.py
/README.md:
--------------------------------------------------------------------------------
1 | # SciTail
2 | This repository contains the code for our ACL 2018 paper "End-Task Oriented Textual Entailment via Deep Explorations of Inter-Sentence Interactions". The model achieves state-of-the-art performance on the SciTail textual entailment benchmark (82.1% accuracy). We release both the code and the pretrained model.
3 |
4 | To reproduce the result, run `train_SciTail_DeIsTe_model.py`. It requires pretrained word2vec embeddings. We also provide the pretrained model so that reloading it reproduces the numbers reported in the paper.
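
The reload logic itself lives in `train_SciTail_DeIsTe_model.py`, which is the authoritative way to reproduce the 82.1% accuracy. As a quick sanity check only, the released parameter file can in principle be inspected with the repository's own pickle helper. This is a minimal sketch, assuming the file `model_para_0.820930232558` was written with `cis.deep.utils.save_object_to_file` (i.e., via `cPickle`) and that it is run with Python 2 from the `src/` directory; neither assumption is documented here.

```python
# Hypothetical inspection snippet (Python 2, run from src/).
# Assumes the released parameter file is a cPickle dump produced by
# save_object_to_file; train_SciTail_DeIsTe_model.py remains the
# authoritative loader for reproducing the paper numbers.
from cis.deep.utils import load_object_from_file

params = load_object_from_file('model_para_0.820930232558')
print type(params)  # typically a list/tuple of numpy arrays holding trained weights
```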
5 | -------------------------------------------------------------------------------- /src/cis/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | -------------------------------------------------------------------------------- /src/cis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | -------------------------------------------------------------------------------- /src/cis/deep/__init__.py: -------------------------------------------------------------------------------- 1 | from pkgutil import extend_path 2 | __path__ = extend_path(__path__, __name__) -------------------------------------------------------------------------------- /src/cis/deep/utils/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | -------------------------------------------------------------------------------- /src/cis/deep/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains common utility classes and methods. 4 | """ 5 | from bz2 import BZ2File 6 | import cPickle 7 | import codecs 8 | import collections 9 | from datetime import date 10 | import hashlib 11 | import io 12 | from itertools import izip_longest 13 | import logging 14 | from operator import itemgetter 15 | import os 16 | import sys 17 | 18 | import numpy as np 19 | import itertools 20 | 21 | 22 | def are_generators_equal(gen1, gen2): 23 | """Indicate whether or not the given generators are equal. 24 | 25 | Generators cannot be compared as easily as lists. Here's the description of 26 | what happens: 27 | "This can actually short-circuit without necessarily having to look at all 28 | values. As pointed out by larsmans in the comments, we can't use izip() here 29 | since it might give wrong results if the generators produce a different 30 | number of elements – izip() will stop on the shortest iterator. We use a 31 | newly created object instance as fill value for izip_longest(), since object 32 | instances are also compared by object identity, so sentinel is guaranteed to 33 | compare unequal to everything else." 34 | [http://stackoverflow.com/questions/9983547/comparing-two-generators-in-python] 35 | """ 36 | return all(a == b for a, b in 37 | izip_longest(gen1, gen2, fillvalue=object())) 38 | 39 | def digest(string): 40 | """Calculate a hash for the given string. 41 | 42 | Parameters 43 | ---------- 44 | string : str 45 | string to calculate the hash for 46 | 47 | Examples 48 | -------- 49 | >>> digest('hello world') 50 | '2f05477fc24bb4faefd86517156dafdecec45b8ad3cf2522a563582b' 51 | """ 52 | return hashlib.sha224(string).hexdigest() 53 | 54 | def file_line_generator(filename, strip=True, comment=None): 55 | """Iterates over the lines in a file. 56 | 57 | Each line is one string. Uses utf8_file_open. 
58 | 59 | Parameters 60 | ---------- 61 | filename : str 62 | name of the file to load 63 | strip : bool 64 | indicates whether or not to strip each line after reading (removes line 65 | endings, but also tabs or spaces at the beginning of the line) 66 | comment : str 67 | if a line in the file starts with this string, then it's considered to 68 | be a comment and discarded. None if nothing should be discarded. 69 | 70 | Returns 71 | ------- 72 | list 73 | each line of the given file is one item in the list 74 | """ 75 | 76 | with utf8_file_open(filename) as f: 77 | 78 | for line in f: 79 | 80 | if strip: 81 | line = line.strip() 82 | 83 | if comment and line.startswith(comment): 84 | continue 85 | 86 | yield line 87 | 88 | raise StopIteration 89 | 90 | def flatten_iterable(it): 91 | """Flattens an iteratable object. 92 | 93 | Parameters 94 | ---------- 95 | it : iterable 96 | nested iterable 97 | 98 | Returns 99 | ------- 100 | generator 101 | generator that iterates over all items in the iterable 102 | """ 103 | 104 | for item in it: 105 | if isinstance(item, collections.Iterable) and \ 106 | not isinstance(item, basestring): 107 | 108 | for sub in flatten_iterable(item): 109 | yield sub 110 | else: 111 | yield item 112 | 113 | def generator_has_next(gen): 114 | """Check if the given generator contains more elements. 115 | 116 | This is a hack. If the generator contains more elements, the returned 117 | generator must be used, because the original generator "lost" an element. 118 | The returned generator however contains this element. This is possible by 119 | using itertools.chain. 120 | 121 | Returns 122 | ------- 123 | Any 124 | False: generator does not contain any more elements 125 | generator: generator does contain more elements, use this generator 126 | instead of the original one, otherwise you loose one element. 127 | """ 128 | 129 | try: 130 | elem = gen.next() 131 | return itertools.chain([elem], gen) 132 | except StopIteration: 133 | return False 134 | 135 | 136 | def load_object_from_file(filename): 137 | """Loads an object from the given filename. 138 | 139 | The given file must have been written using save_object. 140 | 141 | Parameters 142 | ---------- 143 | filename : string 144 | name of the persisted object 145 | """ 146 | # Caution: using utf8_file_open doesn't work with cPickle 147 | return cPickle.load(open(filename, 'rb')) 148 | 149 | def log_iterations(log, count, log_every): 150 | """Log how many iterations have been handled every log_every iterations. 151 | 152 | Parameters 153 | ---------- 154 | log : logger 155 | logger to be logged into 156 | count : int 157 | current count of iterations 158 | log_every : int 159 | the count is logged every log_every iterations 160 | """ 161 | 162 | if count % log_every == 0: 163 | log.info('iterations: ' + str(count)) 164 | 165 | 166 | def logger_config(logger, level=logging.INFO, log_dir=None): 167 | """Configure the given logger. 
168 | 169 | Parameters 170 | ---------- 171 | logger : logger 172 | logger to configure 173 | log_dir : str 174 | path where to store the log file, if None no log file is created 175 | """ 176 | logger.setLevel(level) 177 | formatter = _logger_config_create_formatter() 178 | logger.addHandler(_logger_config_create_console_handler(formatter, level)) 179 | 180 | if log_dir is not None: 181 | logger.addHandler(_logger_config_create_file_handler(formatter, level, 182 | log_dir)) 183 | 184 | def _logger_config_create_formatter(): 185 | """Return a formatter object.""" 186 | formatter = logging.Formatter( 187 | '%(asctime)s\t%(levelname)s\t%(module)s\t%(funcName)s\t%(message)s', 188 | '%Y-%m-%d %H:%M:%S') 189 | return formatter 190 | 191 | def _logger_config_create_console_handler(formatter, level): 192 | """Return a console handler.""" 193 | ch = logging.StreamHandler(sys.stdout) 194 | ch.setLevel(level) 195 | ch.setFormatter(formatter) 196 | return ch 197 | 198 | def _logger_config_create_file_handler(formatter, level, log_dir): 199 | """Return a log file handler.""" 200 | fh = logging.FileHandler(os.path.join(log_dir, 'log-' + 201 | date.today().strftime('%Y-%m-%d')), encoding='utf-8') 202 | fh.setLevel(level) 203 | fh.setFormatter(formatter) 204 | return fh 205 | 206 | def ndarray_to_string(array): 207 | """Converts the given ndarray into a unicode string. 208 | 209 | Parameters 210 | ---------- 211 | array : ndarray 212 | 213 | Returns 214 | ------- 215 | unicode 216 | """ 217 | array = np.asarray(array) 218 | 219 | if array.ndim == 1: 220 | return u' '.join([unicode(item) for item in array]) 221 | elif array.ndim == 2: 222 | return u'\n'.join([ndarray_to_string(line) for line in array]) 223 | 224 | raise ValueError(u'only 1d arrays supported') 225 | 226 | 227 | def save_object_to_file(obj, filename): 228 | """Saves the given object to file using cPickle. 229 | 230 | The object might provide extra routings for storing (e.g., __getstate__). 231 | 232 | Parameters 233 | ---------- 234 | obj : any 235 | object to store 236 | filename : string 237 | file to store the object to 238 | """ 239 | # Caution: using utf8_file_open doesn't work with cPickle 240 | cPickle.dump(obj, open(filename, "wb"), protocol=-1) 241 | 242 | def sort_dict_by_key(d, reverse=False): 243 | """Sort the given dictionary by its keys. 244 | 245 | Parameters 246 | ---------- 247 | d : dict 248 | dictionary to sort 249 | reverse : bool 250 | indicates if the sorting should be reversed 251 | 252 | Returns 253 | ------- 254 | list of tupels 255 | contains tupels of key and value ordered according to key 256 | 257 | Examples 258 | -------- 259 | >>> x = {'c':2, 'a':4, 'b':3, 'd':1, 'e':0} 260 | >>> sort_dict_by_key(x) 261 | [('a', 4), ('b', 3), ('c', 2), ('d', 1), ('e', 0)] 262 | 263 | >>> x = {'c':2, 'e':4, 'd':3, 'b':1, 'a':0} 264 | >>> sort_dict_by_key(x, True) 265 | [('e', 4), ('d', 3), ('c', 2), ('b', 1), ('a', 0)] 266 | """ 267 | return sorted(d.iteritems(), key=itemgetter(0), reverse=reverse) 268 | 269 | def sort_dict_by_label(d, reverse=False): 270 | """Sort the given dictionary by its values. 
271 | 272 | Parameters 273 | ---------- 274 | d : dict 275 | dictionary to sort 276 | reverse : bool 277 | indicates if the sorting should be reversed 278 | 279 | Returns 280 | ------- 281 | list of tupels 282 | contains tupels of key and value ordered according to value 283 | 284 | Examples 285 | -------- 286 | >>> x = {'c':2, 'a':4, 'b':3, 'd':1, 'e':0} 287 | >>> sort_dict_by_label(x) 288 | [('e', 0), ('d', 1), ('c', 2), ('b', 3), ('a', 4)] 289 | 290 | >>> x = {'c':2, 'e':4, 'd':3, 'b':1, 'a':0} 291 | >>> sort_dict_by_label(x, True) 292 | [('e', 4), ('d', 3), ('c', 2), ('b', 1), ('a', 0)] 293 | """ 294 | return sorted(d.iteritems(), key=itemgetter(1), reverse=reverse) 295 | 296 | def text_to_vocab_indices(vocab, tokens, unk=u''): 297 | """ 298 | Convert all tokens in the text into their indices in the given vocabulary. 299 | 300 | Tokens that do not exist in the vocabulary will receive the token 301 | index. 302 | 303 | Parameters 304 | ---------- 305 | vocabulary : dict(str, int) 306 | mapping from token text to index 307 | must contain an UNKNOWN token 308 | tokens : str or list(str) 309 | text to replace all tokens in 310 | unk : str 311 | unknown word token 312 | 313 | Returns 314 | ------- 315 | list(int) 316 | list that contains the vocabulary indices for all tokens instead of 317 | the tokens themselves 318 | list(str) 319 | list of the original input text having unknown tokens replaced by the 320 | unknown word token 321 | 322 | Examples 323 | >>> vocab = {u'i': 0, u'am': 1, u'home': 2, u'':-1} 324 | >>> text_to_vocab_indices(vocab, u'i am home now .') 325 | ([0, 1, 2, -1, -1], [u'i', u'am', u'home', u'', u'']) 326 | >>> text_to_vocab_indices(vocab, [u'i', u'am', u'home', u'now', u'.']) 327 | ([0, 1, 2, -1, -1], [u'i', u'am', u'home', u'', u'']) 328 | """ 329 | 330 | if isinstance(tokens, (str, unicode)): 331 | tokens = tokens.split() 332 | 333 | conv_tokens = [t if t in vocab else unk for t in tokens] 334 | sent_indices = [vocab[t] for t in conv_tokens] 335 | 336 | return sent_indices, conv_tokens 337 | 338 | def utf8_file_open(filename, mode='r'): 339 | """Return a file object for the given filename in the given mode. 340 | 341 | Open an utf-8 file in the given mode (see io.open for further details) and 342 | uses only \n as line endings. Can open bz2 files. 343 | 344 | Parameters 345 | ---------- 346 | filename : string 347 | name of the file to open 348 | mode : string 349 | open mode (see io.open for further details), default value: 'r' 350 | """ 351 | 352 | # It seems that utf8 files are read properly by BZ2File. 353 | if filename.endswith(u'.bz2'): 354 | return codecs.getreader("utf-8")(BZ2File(filename, mode, compresslevel=9)) 355 | 356 | return io.open(filename, mode, encoding='utf8', newline='\n') 357 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains common utility classes and methods for classification. 4 | """ 5 | from sklearn.metrics.metrics import accuracy_score, \ 6 | precision_recall_fscore_support 7 | 8 | 9 | def calc_metrics(true_labels, predicted_labels): 10 | """Provide accuracy, precision, recall, and f1 as error measure. 
11 | 12 | Parameters 13 | ---------- 14 | true_labels : list, ndarray 15 | true labels 16 | predicted_labels : list, ndarray 17 | predicted labels 18 | 19 | Returns 20 | ------- 21 | (float, float, float, float) 22 | accuracy, precision, recall, f1 23 | 24 | Example 25 | ------- 26 | >>> y_true = [0, 1, 1, 0] 27 | >>> y_pred = [0, 0, 1, 1] 28 | >>> calc_metrics(y_true, y_pred) 29 | (0.5, 0.5, 0.5, 0.5) 30 | """ 31 | acc = accuracy_score(true_labels, predicted_labels) 32 | p, r, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, 33 | average='micro') 34 | return (acc, p, r, f1) 35 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/classification/apps/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/calc_error_metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | example usage: 4 | -p -r -f 5 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\sanity_test-most_well-binary\features-predict-tmp 6 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\sanity_test-most_well-binary\features-predict-out-cleaned 7 | """ 8 | from argparse import ArgumentParser 9 | from logging import getLogger 10 | import sys 11 | 12 | from sklearn.metrics.metrics import accuracy_score, \ 13 | precision_recall_fscore_support 14 | 15 | from cis.deep.utils import logger_config, file_line_generator 16 | 17 | 18 | log = getLogger(__name__) 19 | logger_config(log) 20 | 21 | parser = ArgumentParser(description="""Calculate the error metrics accuracy, 22 | precision, recall, and f-measure for the given true and predicted 23 | labels. Labels must be numeric type. This application is a wrapper 24 | for sklearn.metrics.accuracy_score and 25 | sklearn.metrics.precision_recall_fscore_support. 
Look up their 26 | documentation to find the explanations of the parameters.""") 27 | parser.add_argument('true_labels', help='true labels, one per line') 28 | parser.add_argument('pred_labels', help='predicted labels, one per line') 29 | 30 | parser.add_argument('-p', '--precision', action='store_true', 31 | help='calculate precision') 32 | parser.add_argument('-r', '--recall', action='store_true', 33 | help='calculate recall') 34 | parser.add_argument('-f', '--f_measure', action='store_true', 35 | help='calculate f-measure') 36 | 37 | parser.add_argument('-b', '--beta', default=1.0, type=float, 38 | help='beta value of f-measure') 39 | parser.add_argument('-o', '--pos_label', default='1', 40 | help='label of the positive class in a binary classification task') 41 | parser.add_argument('-a', '--avg', choices=['none', 'micro', 'macro', 'samples', 42 | 'weighted'], default='none', 43 | help='label of the positive class in a binary classification task') 44 | 45 | def main(argv=None): 46 | 47 | if argv is None: 48 | argv = sys.argv[1:] 49 | 50 | args = parser.parse_args(argv) 51 | log.info('start parameters: ' + str(args)) 52 | 53 | log.info('loading data') 54 | true = [] 55 | pred = [] 56 | 57 | for line in file_line_generator(args.true_labels): 58 | true.append(line) 59 | 60 | for line in file_line_generator(args.pred_labels): 61 | pred.append(line) 62 | 63 | acc = accuracy_score(true, pred) 64 | log.info('accuracy: %f' % acc) 65 | 66 | if args.precision or args.recall or args.f_measure: 67 | p, r, f, _ = precision_recall_fscore_support(true, pred, args.beta, 68 | pos_label=args.pos_label, 69 | average=None if not args.avg else args.avg) 70 | 71 | if args.precision: 72 | log.info('precision: %f' % p) 73 | if args.recall: 74 | log.info('recall: %f' % r) 75 | if args.f_measure: 76 | log.info('f-measure: %f' % f) 77 | 78 | log.info('finished') 79 | 80 | if __name__ == "__main__": 81 | sys.exit(main()) 82 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/classify.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | """ 4 | 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import os 8 | import sys 9 | 10 | from sklearn.metrics.metrics import confusion_matrix 11 | from sklearn.svm import LinearSVC 12 | 13 | from cis.deep.utils import logger_config, file_line_generator, \ 14 | save_object_to_file 15 | from cis.deep.utils.classification import calc_metrics 16 | import numpy as np 17 | from sklearn.dummy import DummyClassifier 18 | 19 | # import pydevd 20 | # pydevd.settrace(host='129.187.148.250', stdoutToServer=True, 21 | # stderrToServer=True) 22 | 23 | log = getLogger(__name__) 24 | logger_config(log) 25 | 26 | parser = ArgumentParser( 27 | description='Train and test a classifier.') 28 | parser.add_argument('train_data', 29 | help="""File containing the features as dense matrix. bz2 and gz are 30 | supported.""") 31 | parser.add_argument('train_labels', 32 | help="""File containing the data labels. One label per line.""") 33 | parser.add_argument('test_data', 34 | help="""File containing the features as dense matrix. bz2 and gz are 35 | supported.""") 36 | parser.add_argument('test_labels', 37 | help="""File containing the data labels. 
One label per line.""") 38 | parser.add_argument('output_dir', 39 | help='directory to store the results in') 40 | 41 | parser.add_argument('-n', '--normalize', action='store_true', 42 | help="""Normalize each feature to zero mean and 1 std dev. That makes 43 | sense if the the values of different features are very different.""") 44 | parser.add_argument('-m', '--mode', action='store_true', 45 | help="""compute the results using mode, i.e., the majority class of the 46 | training data.""") 47 | 48 | def get_classification_result(true_labels, pred_labels): 49 | """Return classification resuls for one fold. 50 | 51 | Return an array containing accuracy, precision, recall, and f1, based on the 52 | given true and predicted labels. 53 | 54 | Keyword arguments: 55 | fold_no -- this fold's number 56 | true_labels -- true labels 57 | pred_labels -- predicted labels 58 | """ 59 | res = np.zeros((1, 4)) 60 | res[:] = calc_metrics(true_labels, pred_labels) 61 | return res 62 | 63 | def calc_results(train_features, train_labels, test_features, test_labels, 64 | normalize=False, mode=False): 65 | """Perform the k-fold cross validation. 66 | 67 | Perform the k-fold cross validation, collect the result and return the 68 | single test instance predictions, as well as the classification results for 69 | each single fold and for the combination of all folds. 70 | 71 | Keyword arguments: 72 | train_features -- all train_features 73 | train_labels -- all train_labels 74 | normalize -- normalize features to have zero mean and 1 std dev 75 | mode -- use mode (majority label) instead of liblinear 76 | """ 77 | 78 | if normalize and not mode: 79 | # compute the mean and std dev only on the training data, but also 80 | # apply it to the test data. 81 | mean = np.mean(train_features, axis=0) 82 | std_dev = np.std(train_features, axis=0, dtype=float) 83 | train_features = (train_features - mean) / std_dev 84 | test_features = (test_features - mean) / std_dev 85 | 86 | if mode: 87 | model = model = DummyClassifier(strategy='most_frequent') 88 | else: 89 | model = LinearSVC(random_state=84447) 90 | 91 | model.fit(train_features, train_labels) 92 | pred_labels = model.predict(test_features) 93 | 94 | single_predictions = np.transpose(np.vstack((xrange(test_labels.shape[0]), 95 | test_labels, pred_labels))) 96 | 97 | classification_result = get_classification_result(test_labels, pred_labels) 98 | 99 | if mode: 100 | weight_vectors = model.class_prior_ 101 | else: 102 | # Store the feature weights after the training 103 | weight_vectors = model.coef_ 104 | 105 | return single_predictions, classification_result, weight_vectors, model 106 | 107 | def main(argv=None): 108 | 109 | if argv is None: 110 | argv = sys.argv[1:] 111 | 112 | args = parser.parse_args(argv) 113 | log.info('start parameters: ' + str(args)) 114 | 115 | log.info('loading feature and label data') 116 | train_labels = np.asarray(map(int, list(file_line_generator(args.train_labels)))) 117 | train_features = np.loadtxt(args.train_data) 118 | 119 | if train_features.ndim == 1: 120 | train_features = train_features.reshape((train_features.shape[0], 1)) 121 | 122 | test_labels = np.asarray(map(int, list(file_line_generator(args.test_labels)))) 123 | test_features = np.loadtxt(args.test_data) 124 | 125 | if test_features.ndim == 1: 126 | test_features = test_features.reshape((test_features.shape[0], 1)) 127 | 128 | log.info('performing classification') 129 | single_predictions, classification_result, weight_vectors, model = \ 130 | calc_results(train_features, 
train_labels, test_features, 131 | test_labels, args.normalize, args.mode == True) 132 | 133 | log.info('storing results') 134 | save_object_to_file(model, os.path.join(args.output_dir, 'svm')) 135 | 136 | np.savetxt(os.path.join(args.output_dir, 'weights.csv'), 137 | weight_vectors, '%f', ';', '\n') 138 | 139 | header = 'instance_index;true_label;pred_label' 140 | np.savetxt(os.path.join(args.output_dir, 'predictions.csv'), 141 | single_predictions, '%d', ';', '\n', header=header) 142 | 143 | all_true_labels = single_predictions[:, 1] 144 | all_pred_labels = single_predictions[:, 2] 145 | confusion = confusion_matrix(all_true_labels, all_pred_labels) 146 | 147 | np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'), 148 | confusion, '%d', ';', '\n') 149 | 150 | header = 'accuracy;precision;recall;f1' 151 | np.savetxt(os.path.join(args.output_dir, 'metrics.csv'), 152 | classification_result, '%f', ';', '\n', header=header) 153 | 154 | log.info(classification_result) 155 | log.info('finished') 156 | 157 | if __name__ == "__main__": 158 | sys.exit(main()) 159 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/classify_mode.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | """ 4 | 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import os 8 | import sys 9 | 10 | from sklearn.cross_validation import StratifiedKFold 11 | from sklearn.metrics.metrics import confusion_matrix 12 | 13 | from cis.deep.utils import logger_config, file_line_generator 14 | from cis.deep.utils.classification import calc_metrics 15 | import numpy as np 16 | from sklearn.dummy import DummyClassifier 17 | 18 | 19 | # from sklearn.dummy import DummyClassifier 20 | log = getLogger(__name__) 21 | logger_config(log) 22 | 23 | parser = ArgumentParser( 24 | description="""Perform a 10-fold cross validation, always using the most 25 | frequent class as predicted value.""") 26 | parser.add_argument('label_file', 27 | help="""File containing the data labels. One label per line.""") 28 | parser.add_argument('output_dir', 29 | help='directory to store the results in') 30 | 31 | NO_OF_FOLDS = 10 32 | 33 | def get_classification_result(fold_no, true_labels, pred_labels): 34 | """Return classification resuls for one fold. 35 | 36 | Return an array containing accuracy, precision, recall, and f1, based on the 37 | given true and predicted labels. 38 | 39 | Keyword arguments: 40 | fold_no -- this fold's number 41 | true_labels -- true labels 42 | pred_labels -- predicted labels 43 | """ 44 | res = np.zeros(5) 45 | res[0] = fold_no 46 | 47 | acc, prec, rec, f1 = calc_metrics(true_labels, pred_labels) 48 | res[1:5] = [acc, prec, rec, f1] 49 | return res 50 | 51 | def do_cross_validation(labels): 52 | """Perform the k-fold cross validation. 53 | 54 | Perform the k-fold cross validation, collect the result and return the 55 | single test instance predictions, as well as the classification results for 56 | each single fold and for the combination of all folds. 57 | 58 | Keyword arguments: 59 | features -- all features 60 | labels -- all labels 61 | """ 62 | skf = StratifiedKFold(labels, NO_OF_FOLDS) 63 | single_predictions = [] # Store each single classification decision 64 | 65 | # Store classification results for each fold and for the entire task (i.e., 66 | # entire cross validation). 
67 | classification_result = np.zeros((NO_OF_FOLDS + 1, 5)) 68 | 69 | for cur_fold, (train_idx, test_idx) in enumerate(skf): 70 | model = DummyClassifier(strategy='most_frequent') 71 | model.fit(None, labels[train_idx]) 72 | pred_labels = model.predict(np.zeros(labels[test_idx].shape[0])) 73 | 74 | fold_array = np.empty(test_idx.shape[0]) 75 | fold_array.fill(cur_fold) 76 | single_predictions.append(np.transpose(np.vstack((fold_array, test_idx, 77 | labels[test_idx], pred_labels)))) 78 | classification_result[cur_fold, :] = get_classification_result(cur_fold, 79 | labels[test_idx], pred_labels) 80 | 81 | single_predictions = np.vstack(single_predictions) 82 | return single_predictions, classification_result 83 | 84 | def main(argv=None): 85 | 86 | if argv is None: 87 | argv = sys.argv[1:] 88 | 89 | args = parser.parse_args(argv) 90 | log.info('start parameters: ' + str(args)) 91 | 92 | log.info('loading feature and label data') 93 | labels = np.asarray(map(int, list(file_line_generator(args.label_file)))) 94 | 95 | log.info('performing cross validation') 96 | single_predictions, classification_result = do_cross_validation(labels) 97 | 98 | log.info('storing results') 99 | header = 'fold_no;instance_index;true_label;pred_label' 100 | np.savetxt(os.path.join(args.output_dir, 'predictions.csv'), 101 | single_predictions, '%d', ';', '\n', header=header) 102 | 103 | all_true_labels = single_predictions[:, 2] 104 | all_pred_labels = single_predictions[:, 3] 105 | confusion = confusion_matrix(all_true_labels, all_pred_labels) 106 | 107 | np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'), 108 | confusion, '%d', ';', '\n') 109 | 110 | classification_result[NO_OF_FOLDS, :] = get_classification_result(-1, 111 | all_true_labels, all_pred_labels) 112 | 113 | header = 'fold_no;accuracy;precision;recall;f1' 114 | np.savetxt(os.path.join(args.output_dir, 'metrics.csv'), 115 | classification_result, '%f', ';', '\n', header=header) 116 | 117 | log.info(classification_result) 118 | log.info('finished') 119 | 120 | if __name__ == "__main__": 121 | sys.exit(main()) 122 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/classify_xval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | example usage: 4 | -n 5 | X:\sa\experiments\contextual_polarity\vlbl\sentiment-wnd3_3-nce5\classification\1ep\distrib.out 6 | X:\sa\experiments\contextual_polarity\vlbl\sentiment-wnd3_3-nce5\classification\ebert,20140515-label 7 | . 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | from logging import getLogger 12 | import os 13 | import sys 14 | 15 | from sklearn.cross_validation import StratifiedKFold 16 | from sklearn.metrics.metrics import confusion_matrix 17 | from sklearn.svm import LinearSVC 18 | 19 | from cis.deep.utils import logger_config, file_line_generator 20 | from cis.deep.utils.classification import calc_metrics 21 | import numpy as np 22 | 23 | 24 | # from sklearn.dummy import DummyClassifier 25 | log = getLogger(__name__) 26 | logger_config(log) 27 | 28 | parser = ArgumentParser( 29 | description='Perform a 10-fold cross validation on given feature data.') 30 | parser.add_argument('feature_file', 31 | help="""File containing the features as dense matrix. bz2 and gz are 32 | supported.""") 33 | parser.add_argument('label_file', 34 | help="""File containing the data labels. 
One label per line.""") 35 | parser.add_argument('output_dir', 36 | help='directory to store the results in') 37 | 38 | parser.add_argument('-n', '--normalize', action='store_true', 39 | help="""Normalize each feature to zero mean and 1 std dev. That makes 40 | sense if the the values of different features are very different.""") 41 | 42 | NO_OF_FOLDS = 10 43 | 44 | def get_classification_result(fold_no, true_labels, pred_labels): 45 | """Return classification resuls for one fold. 46 | 47 | Return an array containing accuracy, precision, recall, and f1, based on the 48 | given true and predicted labels. 49 | 50 | Keyword arguments: 51 | fold_no -- this fold's number 52 | true_labels -- true labels 53 | pred_labels -- predicted labels 54 | """ 55 | res = np.zeros(5) 56 | res[0] = fold_no 57 | 58 | acc, prec, rec, f1 = calc_metrics(true_labels, pred_labels) 59 | res[1:5] = [acc, prec, rec, f1] 60 | return res 61 | 62 | def calc_results(train_features, train_labels, normalize=False): 63 | """Perform the k-fold cross validation. 64 | 65 | Perform the k-fold cross validation, collect the result and return the 66 | single test instance predictions, as well as the classification results for 67 | each single fold and for the combination of all folds. 68 | 69 | Keyword arguments: 70 | train_features -- all train_features 71 | train_labels -- all train_labels 72 | """ 73 | skf = StratifiedKFold(train_labels, NO_OF_FOLDS) 74 | single_predictions = [] # Store each single classification decision 75 | # Store the feature weights after the training 76 | weight_vectors = np.zeros((NO_OF_FOLDS, train_features.shape[1])) 77 | 78 | # Store classification results for each fold and for the entire task (i.e., 79 | # entire cross validation). 80 | classification_result = np.zeros((NO_OF_FOLDS + 1, 5)) 81 | 82 | for cur_fold, (train_idx, test_idx) in enumerate(skf): 83 | train_data = train_features[train_idx] 84 | test_data = train_features[test_idx] 85 | 86 | if normalize: 87 | # compute the mean and std dev only on the training data, but also 88 | # apply it to the test data. 
89 | mean = np.mean(train_features[train_idx, :], axis=0) 90 | std_dev = np.std(train_features[train_idx, :], axis=0, dtype=float) 91 | train_data = (train_data - mean) / std_dev 92 | test_data = (test_data - mean) / std_dev 93 | 94 | model = LinearSVC(random_state=84447) 95 | model.fit(train_data, train_labels[train_idx]) 96 | pred_labels = model.predict(test_data) 97 | 98 | fold_array = np.empty(test_idx.shape[0]) 99 | fold_array.fill(cur_fold) 100 | single_predictions.append(np.transpose(np.vstack((fold_array, test_idx, 101 | train_labels[test_idx], pred_labels)))) 102 | classification_result[cur_fold, :] = get_classification_result(cur_fold, 103 | train_labels[test_idx], pred_labels) 104 | weight_vectors[cur_fold, :] = model.coef_ 105 | 106 | single_predictions = np.vstack(single_predictions) 107 | return single_predictions, classification_result, weight_vectors 108 | 109 | def main(argv=None): 110 | 111 | if argv is None: 112 | argv = sys.argv[1:] 113 | 114 | args = parser.parse_args(argv) 115 | log.info('start parameters: ' + str(args)) 116 | 117 | log.info('loading feature and label data') 118 | labels = np.asarray(map(int, list(file_line_generator(args.label_file)))) 119 | features = np.loadtxt(args.feature_file) 120 | 121 | log.info('performing cross validation') 122 | single_predictions, classification_result, weight_vectors = \ 123 | calc_results(features, labels, args.normalize) 124 | 125 | log.info('storing results') 126 | np.savetxt(os.path.join(args.output_dir, 'svm-weights.csv'), 127 | weight_vectors, '%f', ';', '\n') 128 | 129 | header = 'fold_no;instance_index;true_label;pred_label' 130 | np.savetxt(os.path.join(args.output_dir, 'predictions.csv'), 131 | single_predictions, '%d', ';', '\n', header=header) 132 | 133 | all_true_labels = single_predictions[:, 2] 134 | all_pred_labels = single_predictions[:, 3] 135 | confusion = confusion_matrix(all_true_labels, all_pred_labels) 136 | 137 | np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'), 138 | confusion, '%d', ';', '\n') 139 | 140 | classification_result[NO_OF_FOLDS, :] = get_classification_result(-1, 141 | all_true_labels, all_pred_labels) 142 | 143 | header = 'fold_no;accuracy;precision;recall;f1' 144 | np.savetxt(os.path.join(args.output_dir, 'metrics.csv'), 145 | classification_result, '%f', ';', '\n', header=header) 146 | 147 | log.info(classification_result) 148 | log.info('finished') 149 | 150 | if __name__ == "__main__": 151 | sys.exit(main()) 152 | -------------------------------------------------------------------------------- /src/cis/deep/utils/classification/apps/prepare_significance_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | """ 4 | 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import sys 8 | 9 | from cis.deep.utils import logger_config, file_line_generator, utf8_file_open 10 | 11 | # import pydevd 12 | # pydevd.settrace(host='129.187.148.250', stdoutToServer=True, 13 | # stderrToServer=True) 14 | 15 | log = getLogger(__name__) 16 | logger_config(log) 17 | 18 | parser = ArgumentParser( 19 | description="""Prepare a predictions file created by classify.py for the 20 | use of Sebastian Padó's approximate randomization significance test.""") 21 | parser.add_argument('prediction_file', 22 | help="""File containing a classifiers prediction created by classify.py 23 | .""") 24 | parser.add_argument('outfile', 25 | help="""converted file""") 26 | 27 | def main(argv=None): 28 | 29 | if 
argv is None: 30 | argv = sys.argv[1:] 31 | 32 | args = parser.parse_args(argv) 33 | log.info('start parameters: ' + str(args)) 34 | 35 | log.info('converting file') 36 | 37 | with utf8_file_open(args.outfile, 'w') as outfile: 38 | 39 | for line in file_line_generator(args.prediction_file): 40 | 41 | if line.startswith(u'#'): 42 | continue 43 | 44 | (_, true_label, pred_label) = line.split(';') 45 | true_label = int(true_label) 46 | pred_label = int(pred_label) 47 | 48 | tp = 1 if true_label == 1 and pred_label == 1 else 0 49 | model_pos = 1 if pred_label == 1 else 0 50 | gold_pos = 1 if true_label == 1 else 0 51 | 52 | outfile.write(u'%d %d %d\n' % (tp, model_pos, gold_pos)) 53 | log.info('finished') 54 | 55 | if __name__ == "__main__": 56 | sys.exit(main()) 57 | -------------------------------------------------------------------------------- /src/cis/deep/utils/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from _collections import defaultdict 2 | from collections import Counter 3 | 4 | def purity(clusters, classes): 5 | """Compute purity for the given data. 6 | 7 | Parameters 8 | ---------- 9 | clusters : list(int) 10 | cluster ids of all examples 11 | classes : list(int) 12 | class ids of all examples 13 | """ 14 | 15 | d = defaultdict(list) 16 | 17 | # Get a list of class numbers of all examples in a cluster. 18 | for k, v in zip(clusters, classes): 19 | d[k].append(v) 20 | 21 | mayority = 0 22 | 23 | # Count the mayority class number and add it up over all clusters. 24 | for k in d: 25 | mayority += Counter(d[k]).most_common(1)[0][1] 26 | 27 | return float(mayority) / len(clusters) 28 | -------------------------------------------------------------------------------- /src/cis/deep/utils/clustering/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/clustering/apps/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/clustering/apps/kmeans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | example usage: 4 | -k 94 5 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\brown\5000-0.1l-200\features-predict-out-unique 6 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\brown\5000-0.1l-200\features-predict-clusters 7 | 8 | -s X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\brown\5000-0.1l-200\features-predict-out-unique 9 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\brown\5000-0.1l-200\features-predict-out-unique 10 | X:\sa\experiments\contextual_polarity\mlp\sent_1\amazon\brown\5000-0.1l-200\features-predict-clusters 11 | """ 12 | from argparse import ArgumentParser 13 | from logging import getLogger 14 | import sys 15 | 16 | from sklearn.cluster.k_means_ import KMeans 17 | 18 | from cis.deep.utils import logger_config, utf8_file_open, save_object_to_file 19 | import numpy as np 20 | 21 | 22 | log = getLogger(__name__) 23 | logger_config(log) 24 | 25 | parser = ArgumentParser(description="""Cluster given data points using 26 | k-means.""") 27 | 28 | parser.add_argument('data_points', help='data points to be clustered') 29 | parser.add_argument('outfile', help='output file') 30 | 31 | parser.add_argument('-m', '--model', help='save model into that file') 32 | parser.add_argument('-c', '--centroids', 
help='save centroids into that file') 33 | parser.add_argument('-i', '--max-iterations', dest='max_iter', type=int, 34 | default=300, help='Maximum number of iterations of the algorithm') 35 | parser.add_argument('-mr', '--root', action='store_true', 36 | help="""modify the data by taking the root of every entry before 37 | clustering""") 38 | parser.add_argument('-t', '--threads', type=int, default=1, 39 | help="""number of jobs using for the clustering""") 40 | 41 | cluster_group = parser.add_mutually_exclusive_group(required=True) 42 | cluster_group.add_argument('-k', '--clusters', type=int, 43 | help='number of clusters; either -k or -s must be given') 44 | cluster_group.add_argument('-s', '--start-points', dest='start_points', 45 | help="""file that contains the start points for all clusters; either -k 46 | or -s must be given""") 47 | 48 | def get_initial_centers(cluster_count, filename): 49 | """Return number of clusters and initial cluster centers or the method to 50 | create them. 51 | 52 | Parameters 53 | ---------- 54 | cluster_count : None/int 55 | number of clusters; if None, loads the cluster centroids from the given 56 | file 57 | filename : None/str 58 | name of file, which contains the cluster centroids; if None, 59 | cluster_count must be given 60 | 61 | Returns 62 | ------- 63 | if cluster_count is given: (int, str) 64 | cluster count and the method that will be used to choose the centroids 65 | later 66 | if cluster_count is not given (int, ndarray) 67 | cluster count and the centroids 68 | """ 69 | 70 | if cluster_count: 71 | return (cluster_count, 'k-means++') 72 | 73 | centers = np.loadtxt(filename) 74 | return (centers.shape[1], centers) 75 | 76 | def main(argv=None): 77 | 78 | if argv is None: 79 | argv = sys.argv[1:] 80 | 81 | args = parser.parse_args(argv) 82 | log.info('start parameters: ' + str(args)) 83 | 84 | log.info('loading data') 85 | data = np.loadtxt(args.data_points) 86 | 87 | if args.root is not None: 88 | data = np.sqrt(data) 89 | 90 | (k, initial_points) = get_initial_centers(args.clusters, args.start_points) 91 | 92 | log.info('calculate center points') 93 | kmeans = KMeans(k, initial_points, 1, args.max_iter, copy_x=False) 94 | predict = kmeans.fit_predict(data) 95 | 96 | log.info('storing results') 97 | 98 | if args.model: 99 | save_object_to_file(kmeans, args.model) 100 | 101 | with utf8_file_open(args.outfile, 'w') as outfile: 102 | 103 | for i in xrange(predict.shape[0]): 104 | outfile.write(u'%d\n' % predict[i]) 105 | 106 | if args.centroids: 107 | np.savetxt(args.centroids, kmeans.cluster_centers_) 108 | 109 | log.info('finished') 110 | 111 | if __name__ == "__main__": 112 | sys.exit(main()) 113 | -------------------------------------------------------------------------------- /src/cis/deep/utils/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Requires the enum34 package. 3 | """ 4 | 5 | from logging import getLogger 6 | 7 | from enum import Enum, IntEnum 8 | 9 | from cis.deep.utils import file_line_generator, logger_config, utf8_file_open,\ 10 | sort_dict_by_label 11 | import numpy as np 12 | 13 | 14 | log = getLogger(__name__) 15 | logger_config(log) 16 | 17 | class SpecialToken(Enum): 18 | """Enum for special tokens and their string expression. 19 | 20 | Get the enum entry's value with SpecialToken.PAD.value. 
21 | """ 22 | UNKNOWN = u'' 23 | SENT_START = u'' 24 | SENT_END = u'' 25 | PAD = u'' 26 | 27 | 28 | SPECIAL_TOKENS = [SpecialToken.UNKNOWN, SpecialToken.SENT_START, 29 | SpecialToken.SENT_END, SpecialToken.PAD] 30 | 31 | 32 | class SpecialTokenID(IntEnum): 33 | """Enum for ids of special tokens. 34 | 35 | Get the enum entry's value with SpecialTokenId.PAD.value. 36 | """ 37 | UNKNOWN = 0 38 | SENT_START = 1 39 | SENT_END = 2 40 | PAD = 3 41 | 42 | 43 | def compute_avg_text_embedding(text, vocab, embs): 44 | """Convert the given text into a compressed vector using average embeddings. 45 | 46 | Average all word vectors to a final document vector. 47 | 48 | Parameters 49 | ---------- 50 | text : str 51 | text to be compressed 52 | vocab: dict(str, int) 53 | vocabulary (see read_vocabulary_id_file) 54 | embs : ndarray(m*n) 55 | embeddings 56 | """ 57 | vec = np.zeros(embs.shape[1]) 58 | count = 0 59 | 60 | for tok in text.split(): 61 | vec += embs[vocab.get(tok, SpecialTokenID.UNKNOWN.value), :] 62 | count += 1 63 | 64 | return vec / float(count) 65 | 66 | def read_vocabulary_file(input_file, add_special_tokens=True): 67 | """Read the textual vocabulary into a list. Items that are empty after 68 | calling str.strip on them will be mapped to u''. 69 | 70 | Parameters 71 | ---------- 72 | input_file : str 73 | location of the vocabulary 74 | add_special_tokens : bool 75 | indicates whether or not to add special tokens to the front of the 76 | vocabulary, like for unknown tokens, etc. 77 | 78 | Returns 79 | ------- 80 | list(str) 81 | vocabulary from token to unique id 82 | """ 83 | vocab = list(file_line_generator(input_file)) 84 | 85 | if add_special_tokens: 86 | _add_special_tokens(vocab) 87 | 88 | return [v.strip() if v.strip() else u'' for v in vocab] 89 | 90 | def read_vocabulary_id_file(input_file, add_special_tokens=True): 91 | """Read the textual vocabulary into a map that maps the token to it's index. 92 | 93 | Each map entry points from the vocabulary token to the index in the 94 | vocabulary. 95 | 96 | Parameters 97 | ---------- 98 | input_file : str 99 | location of the vocabulary 100 | add_special_tokens : bool 101 | indicates whether or not to add special tokens to the front of the 102 | vocabulary, like for unknown tokens, etc. 103 | 104 | Returns 105 | ------- 106 | dict(str, int) 107 | vocabulary from token to unique id 108 | """ 109 | vocab = read_vocabulary_file(input_file, add_special_tokens) 110 | vocab_to_indices = {w : i for (i, w) in enumerate(vocab)} 111 | 112 | if len(vocab) != len(vocab_to_indices): 113 | log.warning("""Vocabulary contains duplicate items. They have been 114 | removed automatically.""") 115 | return vocab_to_indices 116 | 117 | def write_vocabulary_file(output_file, vocab): 118 | """Write the given vocabulary to the given file. 119 | 120 | The vocabulary items are stored in order of the vocab values, i.e., in the 121 | same order as they have been read by read_vocabulary_id_file. 122 | 123 | Parameters 124 | ---------- 125 | output_file : str 126 | filename of the output 127 | vocab : dict(str, int) 128 | vocabulary that has been read by read_vocabulary_id_file 129 | """ 130 | 131 | with utf8_file_open(output_file, 'w') as vocab_file: 132 | vocab_file.write(u'\n'.join(k[0] 133 | for k in sort_dict_by_label(vocab))) 134 | vocab_file.write(u'\n') 135 | 136 | def _add_special_tokens(vocab): 137 | """Add special tokens to the beginning of the given vocabulary. 138 | 139 | Adds the special tokens only if they don't already exist. 
If the vocabulary 140 | already contains some special tokens the order of them does not change. 141 | 142 | Parameters 143 | ---------- 144 | vocab : list(str) 145 | vocabulary items 146 | 147 | Returns 148 | ------- 149 | list(str) 150 | vocabulary with the special tokens inserted at the front 151 | """ 152 | if SpecialToken.PAD.value not in vocab: 153 | vocab.insert(0, SpecialToken.PAD.value) 154 | if SpecialToken.SENT_END.value not in vocab: 155 | vocab.insert(0, SpecialToken.SENT_END.value) 156 | if SpecialToken.SENT_START.value not in vocab: 157 | vocab.insert(0, SpecialToken.SENT_START.value) 158 | if SpecialToken.UNKNOWN.value not in vocab: 159 | vocab.insert(0, SpecialToken.UNKNOWN.value) 160 | 161 | return vocab 162 | -------------------------------------------------------------------------------- /src/cis/deep/utils/lm/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | 5 | def interpolate(model1, model2, weight): 6 | """Interpolate the probabilities of two models. 7 | 8 | Model 1 is weighted by the parameter, model 2 is weighted by (1 - weight). 9 | 10 | Parameters 11 | ---------- 12 | model1 : ndarray 13 | probabilities of model 1 14 | model2 : ndarray 15 | probabilities of model 2 16 | weight : float 17 | weight of model 1, model 2 will receive weight (1 - weight) 18 | 19 | Returns 20 | ------- 21 | float 22 | interpolated probability 23 | """ 24 | model1 = np.asarray(model1) 25 | model2 = np.asarray(model2) 26 | interpolated = weight * model1 + (1-weight) * model2 27 | return perplexity(interpolated) 28 | 29 | def perplexity(probabs): 30 | """Calculate perplexity given the list of probabs. 31 | 32 | Parameters 33 | ---------- 34 | probabs : iterable 35 | list of probabs 36 | 37 | Returns 38 | ------- 39 | float 40 | perplexity 41 | """ 42 | probabs = np.asarray(probabs) 43 | return np.exp(-np.sum(np.log(probabs)) / np.max(probabs.shape)) 44 | 45 | -------------------------------------------------------------------------------- /src/cis/deep/utils/misc/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | 5 | def softmax(M): 6 | """Calculate the row-wise softmax given a matrix. 
7 | 8 | Parameters 9 | ---------- 10 | M : 2d structure (m x n) 11 | 12 | Returns 13 | ------- 14 | ndarray(m x n) 15 | probabilities according to softmax computation, each row sum = 1 16 | """ 17 | M = np.asarray(M) 18 | 19 | if M.ndim == 1: 20 | M = np.atleast_2d(M) 21 | 22 | maxes = np.amax(M, axis=1) 23 | maxes = maxes.reshape(maxes.shape[0], 1) 24 | e = np.exp(M - maxes) 25 | dist = e / np.sum(e, axis=1, keepdims=True) 26 | return dist 27 | -------------------------------------------------------------------------------- /src/cis/deep/utils/misc/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/misc/apps/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/misc/apps/automatic_cluster_labeling.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | """ 4 | 5 | from _collections import defaultdict 6 | from argparse import ArgumentParser 7 | from collections import Counter 8 | from logging import getLogger 9 | import sys 10 | 11 | from cis.deep.utils import file_line_generator, logger_config, utf8_file_open, \ 12 | sort_dict_by_key 13 | 14 | 15 | log = getLogger(__name__) 16 | logger_config(log) 17 | 18 | parser = ArgumentParser( 19 | description="""Labels given clusters according to their majority class. 20 | """) 21 | parser.add_argument('data_file', 22 | help="""contains a line for each example which consists of the example's 23 | original label and its cluster id separated by a space""") 24 | parser.add_argument('predicted_labels', 25 | help="""output file containing the predicted labels for each item; one 26 | label per line""") 27 | parser.add_argument('-cl', '--cluster_labels', 28 | help="""output file containing the mapping of cluster ids to new labels 29 | """) 30 | 31 | def main(argv=None): 32 | if argv is None: 33 | argv = sys.argv[1:] 34 | 35 | args = parser.parse_args(argv) 36 | log.info('start parameters: ' + str(args)) 37 | 38 | log.info('loading data') 39 | items = [] 40 | 41 | for line in file_line_generator(args.data_file): 42 | items.append(tuple(line.split())) 43 | 44 | log.info('compute majority labels') 45 | cluster_to_label_count = defaultdict(Counter) 46 | 47 | # Count labels per cluster 48 | for (label, cluster_id) in items: 49 | cluster_to_label_count[cluster_id][label] += 1 50 | 51 | majority_labels = dict() 52 | 53 | # Get majority label per cluster 54 | for cluster_id in cluster_to_label_count: 55 | majority_labels[cluster_id] = cluster_to_label_count[cluster_id].most_common(1)[0][0] 56 | 57 | log.info('assign labels to examples') 58 | 59 | with utf8_file_open(args.predicted_labels, 'w') as pred_file: 60 | 61 | for example_line in file_line_generator(args.data_file): 62 | pred_file.write(majority_labels[example_line.split()[1]] + u'\n') 63 | 64 | 65 | if args.cluster_labels: 66 | 67 | with utf8_file_open(args.cluster_labels, 'w') as outfile: 68 | 69 | for (cluster_id, label) in sort_dict_by_key(majority_labels): 70 | outfile.write(u'%s %s\n' % (cluster_id, label)) 71 | 72 | log.info('finished') 73 | 74 | 75 | if __name__ == "__main__": 76 | sys.exit(main()) 77 | -------------------------------------------------------------------------------- /src/cis/deep/utils/misc/apps/combine_files.py: -------------------------------------------------------------------------------- 1 | # 
-*- coding: utf-8 -*- 2 | """ 3 | """ 4 | 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import sys 8 | 9 | from cis.deep.utils import file_line_generator, logger_config, utf8_file_open,\ 10 | log_iterations 11 | 12 | 13 | log = getLogger(__name__) 14 | logger_config(log) 15 | 16 | parser = ArgumentParser( 17 | description="""Takes two files and combines each line in file 1 with 18 | all lines in file 2.""") 19 | parser.add_argument('file1') 20 | parser.add_argument('file2', 21 | help="""use the smaller file as file2, it will be kept in memory""") 22 | parser.add_argument('out_file', 23 | help="""File to write the combination of both files into. 24 | Bz2 is supported.""") 25 | parser.add_argument('-s', '--separator', default=u' ') 26 | 27 | def main(argv=None): 28 | """See argument parser description.""" 29 | 30 | if argv is None: 31 | argv = sys.argv[1:] 32 | 33 | args = parser.parse_args(argv) 34 | log.info('start parameters: ' + str(args)) 35 | 36 | log.info('loading data') 37 | file2_content = list(file_line_generator(args.file2)) 38 | 39 | log.info('combining files') 40 | 41 | with utf8_file_open(args.out_file, 'w') as outfile: 42 | 43 | for c, line1 in enumerate(file_line_generator(args.file1)): 44 | log_iterations(log, c, 1000) 45 | 46 | for line2 in file2_content: 47 | outfile.write(line1 + args.separator + line2 + u'\n') 48 | 49 | log.info('finished') 50 | 51 | 52 | if __name__ == "__main__": 53 | sys.exit(main()) 54 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/preprocessing/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/preprocessing/apps/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/convert_apnews_to_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from argparse import ArgumentParser 3 | from logging import getLogger 4 | import sys 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from cis.deep.utils import utf8_file_open, logger_config, file_line_generator 10 | 11 | 12 | log = getLogger(__name__) 13 | logger_config(log) 14 | 15 | parser = ArgumentParser( 16 | description="""Converts the binary files of the AP News (Associated 17 | News) corpus provided by Yoshua Bengio into readable text.""") 18 | parser.add_argument('infile', type=str, help='input file') 19 | parser.add_argument('outfile', type=str, help='output file') 20 | parser.add_argument('vocabulary', type=str, help='vocabular file') 21 | 22 | def main(argv=None): 23 | """See argument parser description.""" 24 | 25 | if argv is None: 26 | argv = sys.argv[1:] 27 | 28 | args = parser.parse_args(argv) 29 | log.info('start parameters: ' + str(args)) 30 | 31 | vocab = pd.Series(file_line_generator(args.vocabulary, comment='##')) 32 | 33 | with open(args.infile, 'rb') as infile: 34 | integers = np.fromfile(infile, np.int32) 35 | 36 | with utf8_file_open(args.outfile, 'w') 
as outfile: 37 | outfile.write(u'\n'.join(vocab[integers])) 38 | outfile.write(u'\n') 39 | 40 | log.info('finished') 41 | 42 | if __name__ == "__main__": 43 | sys.exit(main()) 44 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/escape_regex.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | example usage: 4 | """ 5 | 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import re 9 | import sys 10 | 11 | from cis.deep.utils import utf8_file_open, logger_config 12 | 13 | 14 | log = getLogger(__name__) 15 | logger_config(log) 16 | 17 | parser = ArgumentParser(description="""Escape the given text file to remove all 18 | regular expressions.""") 19 | parser.add_argument('infile', 20 | help='file that might contain regular expressions') 21 | parser.add_argument('outfile', help='file having regular expressions escaped') 22 | 23 | def main(argv=None): 24 | 25 | if argv is None: 26 | argv = sys.argv[1:] 27 | 28 | args = parser.parse_args(argv) 29 | log.info('start parameters: ' + str(args)) 30 | 31 | log.info('transforming data') 32 | 33 | with utf8_file_open(args.infile) as infile: 34 | with utf8_file_open(args.outfile, 'w') as outfile: 35 | 36 | for line in infile: 37 | outfile.write(re.escape(line)) 38 | log.info('finished') 39 | 40 | if __name__ == "__main__": 41 | sys.exit(main()) 42 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/filter_file_by_lines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! /usr/bin/env python 3 | """ 4 | """ 5 | 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import os 9 | import sys 10 | 11 | from cis.deep.utils import logger_config, utf8_file_open, file_line_generator 12 | 13 | 14 | log = getLogger(__name__) 15 | logger_config(log) 16 | 17 | parser = ArgumentParser( 18 | description="""Filters a given file by lines indices.""") 19 | 20 | parser.add_argument('indices', help="""line numbers that will be included in 21 | the output; either comma separated string (e.g., 1,4,6) or file 22 | containing one index per line; 23 | Caution: make sure the indices are sorted; the indices are 0-based.""") 24 | parser.add_argument('infile', help='file to be filtered') 25 | parser.add_argument('outfile', help='filtered output file') 26 | parser.add_argument('-i', '--inverse', action='store_true', 27 | help="""inverse the indices, i.e., exclude the lines with the given 28 | line number""") 29 | 30 | def get_indices(indices): 31 | """Generates line indices to keep. 
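    Accepts either the path of a file holding one 0-based index per line or a
    comma-separated string; for example, get_indices(u'0,2,5') returns set([0, 2, 5]).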
32 | 33 | Parameters 34 | ---------- 35 | indices : str 36 | either name of a file containing indices one per line or a comma 37 | separated string 38 | 39 | Returns 40 | ------- 41 | int 42 | next index 43 | """ 44 | 45 | if os.path.exists(indices): 46 | return set(map(int, file_line_generator(indices, True))) 47 | 48 | return set((int(i.strip()) for i in indices.split(u','))) 49 | 50 | def main(argv=None): 51 | log.info('started application') 52 | 53 | if argv is None: 54 | argv = sys.argv[1:] 55 | 56 | args = parser.parse_args() 57 | log.info('start parameters: ' + str(args)) 58 | log.info('reading index file') 59 | idx = get_indices(args.indices) 60 | max_idx = max(idx) 61 | log.info('filtering file') 62 | 63 | with utf8_file_open(args.outfile, 'w') as outfile: 64 | 65 | for (cur_idx, line) in enumerate( 66 | file_line_generator(args.infile, False)): 67 | 68 | if not args.inverse: 69 | 70 | if cur_idx in idx: 71 | outfile.write(line) 72 | 73 | if cur_idx >= max_idx: 74 | break 75 | else: 76 | 77 | if cur_idx not in idx: 78 | outfile.write(line) 79 | 80 | 81 | log.info('finished') 82 | 83 | if __name__ == "__main__": 84 | sys.exit(main()) 85 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Example usage: 4 | --amazon 5 | NLTK_DATA_DIR = 'C:/Temp/NLTK data' 6 | x 7 | y 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | from logging import getLogger 12 | import sys 13 | 14 | from cis.deep.utils import logger_config, file_line_generator, utf8_file_open, \ 15 | log_iterations 16 | from cis.deep.utils.preprocessing.corpus import AmazonProductReviewCorpusReader 17 | import re 18 | from cis.deep.utils.text import tokenize 19 | import nltk 20 | 21 | log = getLogger(__name__) 22 | logger_config(log) 23 | 24 | parser = ArgumentParser(description=""" 25 | Preprocess a given file. Several preprocessing parameters are 26 | available. 
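    The steps are applied in a fixed order: digit replacement (-rd), HTML
    stripping (-sh), sentence splitting, then tokenization (-t).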
TODO: add lowercasing""") 27 | parser.add_argument('--amazon', action='store_true', 28 | help="""preprocess the Amazon product review corpus.""") 29 | 30 | parser.add_argument('-rd', '--replace_digits', 31 | help="""Replace all digits by the given string""") 32 | parser.add_argument('-sh', '--strip_html', action='store_true', 33 | help='strip html tags') 34 | parser.add_argument('-t', '--tokenize', action='store_true', 35 | help="""tokenize the text""") 36 | parser.add_argument('-ss', '--sentence_splitter', type=str, 37 | default='tokenizers/punkt/english.pickle', 38 | help='model file to be used for sentence splitting (default: ' + \ 39 | 'tokenizers/punkt/english.pickle)') 40 | parser.add_argument('-s', '--split_sentence', action='store_true', 41 | help='split sentences') 42 | parser.add_argument('infile', help='name of the input file') 43 | parser.add_argument('outfile', help='name of the output file') 44 | 45 | REGEX_FLAGS = re.UNICODE 46 | 47 | def main(argv=None): 48 | 49 | if argv is None: 50 | argv = sys.argv[1:] 51 | 52 | args = parser.parse_args(argv) 53 | log.info('start parameters: ' + str(args)) 54 | log.info('preprocessing data') 55 | 56 | if args.amazon is True: 57 | line_iterator = \ 58 | AmazonProductReviewCorpusReader(args.infile).review_generator() 59 | else: 60 | line_iterator = file_line_generator(args.infile) 61 | 62 | if args.sentence_splitter: 63 | sent_splitter = nltk.data.load(args.sentence_splitter) 64 | 65 | with utf8_file_open(args.outfile, 'w') as outfile: 66 | 67 | for (i, line) in enumerate(line_iterator): 68 | log_iterations(log, i, 100000) 69 | 70 | if args.replace_digits: 71 | line = re.sub(r'\d', args.replace_digits, line, 72 | 0, REGEX_FLAGS) 73 | 74 | if args.strip_html: 75 | line = nltk.clean_html(line) 76 | 77 | if args.sentence_splitter: 78 | line = sent_splitter.tokenize(line) 79 | else: 80 | line = [line] 81 | 82 | if args.tokenize: 83 | line = [tokenize(l) for l in line] 84 | 85 | if not args.tokenize: 86 | outfile.write(u'\n'.join(line)) 87 | else: 88 | outfile.write(u'\n'.join([u' '.join(l) for l in line])) 89 | 90 | outfile.write(u'\n') 91 | 92 | log.info('finished') 93 | 94 | if __name__ == "__main__": 95 | sys.exit(main()) 96 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/splitter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from argparse import ArgumentParser 3 | from logging import getLogger 4 | import sys 5 | 6 | import nltk 7 | 8 | from cis.deep.utils import utf8_file_open, logger_config 9 | 10 | 11 | log = getLogger(__name__) 12 | logger_config(log) 13 | 14 | parser = ArgumentParser( 15 | description="""Splits the given input file into sentences by NLTK\'s 16 | punkt sentence tokenizer and writes the result into the output file. 17 | It assumes English language if there is no language given. 
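    A different language can be used by passing the corresponding NLTK punkt
    model via the -m/--model option.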
It reads 18 | one line at a time, i.e., if there are line breaks not marking sentence 19 | boundaries, they won't be handled correctly.""") 20 | parser.add_argument('-m', '--model', type=str, 21 | default='tokenizers/punkt/english.pickle', 22 | help='model file to be used for sentence splitting (default: ' + \ 23 | 'tokenizers/punkt/english.pickle)') 24 | parser.add_argument('infile', type=str, help='input file') 25 | parser.add_argument('outfile', type=str, help='output file') 26 | 27 | def main(argv=None): 28 | """See argument parser description.""" 29 | 30 | if argv is None: 31 | argv = sys.argv[1:] 32 | 33 | args = parser.parse_args(argv) 34 | log.info('start parameters: ' + str(args)) 35 | 36 | with utf8_file_open(args.infile, 'r') as infile: 37 | with utf8_file_open(args.outfile, 'w') as outfile: 38 | sent_splitter = nltk.data.load(args.model) 39 | 40 | for line in infile: 41 | outfile.write('\n'.join(sent_splitter.tokenize(line.strip())) + 42 | '\n') 43 | 44 | log.info('finished') 45 | 46 | if __name__ == "__main__": 47 | sys.exit(main()) 48 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/text_to_bow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | -v X:\sa\embeddings\vlbl\sentiment-wnd3_3-nce5\vlbl.vocab 4 | ebert,20140515-n_grams 5 | ebert,20140515-n_grams.out 6 | """ 7 | 8 | from argparse import ArgumentParser 9 | from logging import getLogger 10 | import sys 11 | 12 | from sklearn.feature_extraction.text import CountVectorizer 13 | 14 | from cis.deep.utils import file_line_generator, logger_config, utf8_file_open 15 | from cis.deep.utils.embeddings import read_vocabulary_id_file 16 | import numpy as np 17 | 18 | 19 | log = getLogger(__name__) 20 | logger_config(log) 21 | 22 | parser = ArgumentParser( 23 | description="""Converts a given text file into a bag-of-words feature 24 | file. Currently, only tf is supported.""") 25 | parser.add_argument('infile', 26 | help="""Data file, containing all tokens. Each line will get its 27 | bow feature vector.""") 28 | parser.add_argument('out_feature_file', 29 | help="""File to write the features into. Bz2 is supported.""") 30 | 31 | parser.add_argument('-v', '--vocabulary', 32 | help="""Vocabulary file containing all valid words. If it's not given 33 | the vocabulary is inferred and stored afterwards. For additional 34 | information see 35 | sklearn.feature_extraction.text.CountVectorizer.__init__'s vocabulary 36 | parameter.""") 37 | parser.add_argument('-n', '--ngram', default='1,1', 38 | help="""comma-separated list of (min n-gram, max n-gram). For example 39 | "1,3" includes all unigrams, bigrams, and trigrams. For additional 40 | information see see CountVectorizer.__init__'s ngram_range parameter. 
41 | """) 42 | 43 | def main(argv=None): 44 | """See argument parser description.""" 45 | 46 | if argv is None: 47 | argv = sys.argv[1:] 48 | 49 | args = parser.parse_args(argv) 50 | log.info('start parameters: ' + str(args)) 51 | 52 | log.info('loading data') 53 | 54 | if args.vocabulary is None: 55 | vocab = args.vocabulary 56 | else: 57 | vocab = read_vocabulary_id_file(args.vocabulary) 58 | 59 | text = list(file_line_generator(args.infile)) 60 | 61 | ngram_range = map(int, tuple(args.ngram.split(','))) 62 | vectorizer = CountVectorizer(token_pattern='[^ ]+', min_df=0.0, 63 | vocabulary=vocab, ngram_range=ngram_range, dtype=int) 64 | 65 | log.info('creating features') 66 | bow = vectorizer.fit_transform(text) 67 | 68 | log.info('storing result') 69 | np.savetxt(args.out_feature_file, bow.todense(), fmt='%d') 70 | 71 | with utf8_file_open(args.out_feature_file + '.vocab', 'w') as vocab_file: 72 | vocab_file.write(u'\n'.join(vectorizer.get_feature_names())) 73 | 74 | log.info('finished') 75 | 76 | 77 | if __name__ == "__main__": 78 | sys.exit(main()) 79 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/text_to_features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | X:\sa\experiments\contextual_polarity\vlbl\sentiment-wnd3_3-nce5/classification\ebert,20140515-n_grams 4 | X:\sa\embeddings\vlbl\sentiment-wnd3_3-nce5\vLBL.vocab 5 | ./embs.txt 6 | ./features_out 7 | """ 8 | 9 | from argparse import ArgumentParser 10 | from logging import getLogger 11 | import os 12 | import sys 13 | 14 | from cis.deep.utils import file_line_generator, logger_config, utf8_file_open, \ 15 | ndarray_to_string 16 | from cis.deep.utils.embeddings import read_vocabulary_id_file, SpecialTokenID 17 | import numpy as np 18 | 19 | 20 | log = getLogger(__name__) 21 | logger_config(log) 22 | 23 | parser = ArgumentParser( 24 | description="""Converts a given text file into a features file. This 25 | is done by replacing each token in the text file by it's given feature 26 | vector. All features will be concatenated, i.e., there will be no space 27 | between.""") 28 | parser.add_argument('infile', 29 | help="""Data file, containing all tokens to be replaced by their 30 | features. The file can be compressed with bz2 or gz.""") 31 | parser.add_argument('vocabulary', 32 | help="""Vocabulary file containing all valid words. Tokens not contained 33 | in the vocabulary will be mapped to .""") 34 | parser.add_argument('feature_file', 35 | help="""File containing all token features. Each feature must be in a 36 | single row. The row index must correspond to the vocabulary index. 37 | Currently, only dense matrices are supported.""") 38 | parser.add_argument('out_feature_file', 39 | help="""File to write the features into. 
Bz2 is supported.""") 40 | 41 | parser.add_argument('-a', '--avg', action='store_true', 42 | help='Average the features for all words in one example (i.e., line).') 43 | 44 | 45 | def main(argv=None): 46 | """See argument parser description.""" 47 | 48 | if argv is None: 49 | argv = sys.argv[1:] 50 | 51 | args = parser.parse_args(argv) 52 | log.info('start parameters: ' + str(args)) 53 | 54 | log.info('loading data') 55 | vocab = read_vocabulary_id_file(args.vocabulary, False) 56 | 57 | _, ext = os.path.splitext(args.feature_file) 58 | 59 | if ext == 'npy': 60 | features = np.load(args.feature_file) 61 | else: 62 | features = np.loadtxt(args.feature_file) 63 | 64 | log.info('creating features') 65 | 66 | with utf8_file_open(args.out_feature_file, 'w') as outfile: 67 | 68 | for line in file_line_generator(args.infile): 69 | toks = line.split() 70 | cur_features = np.zeros((len(toks), features.shape[1])) 71 | 72 | for (i, tok) in enumerate(toks): 73 | cur_features[i, :] = features[ 74 | vocab.get(tok, SpecialTokenID.UNKNOWN.value)] 75 | 76 | if args.avg: 77 | res = ndarray_to_string(np.mean(cur_features, axis=0)) 78 | else: 79 | res = ndarray_to_string(np.reshape(cur_features, 80 | np.prod(cur_features.shape), order='C')) 81 | 82 | outfile.write(res + u'\n') 83 | 84 | log.info('finished') 85 | 86 | 87 | if __name__ == "__main__": 88 | sys.exit(main()) 89 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from argparse import ArgumentParser 3 | from logging import getLogger 4 | import sys 5 | 6 | from cis.deep.utils import logger_config, utf8_file_open 7 | from cis.deep.utils.text import tokenize 8 | 9 | 10 | log = getLogger(__name__) 11 | logger_config(log) 12 | 13 | parser = ArgumentParser( 14 | description="""Tokenizes the given input file by NLTK\'s recommended 15 | word tokenizer and writes the result into the output file.""") 16 | parser.add_argument('infile', help='input file') 17 | parser.add_argument('outfile', help='output file') 18 | 19 | def main(argv=None): 20 | """See argument parser description.""" 21 | 22 | if argv is None: 23 | argv = sys.argv[1:] 24 | 25 | args = parser.parse_args(argv) 26 | log.info('start parameters: ' + str(args)) 27 | 28 | with utf8_file_open(args.infile, 'r') as infile: 29 | with utf8_file_open(args.outfile, 'w') as outfile: 30 | 31 | for line in infile: 32 | outfile.write(' '.join(tokenize(line)) + '\n') 33 | 34 | log.info('finished') 35 | 36 | if __name__ == "__main__": 37 | sys.exit(main()) 38 | -------------------------------------------------------------------------------- /src/cis/deep/utils/preprocessing/apps/word_count.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from argparse import ArgumentParser 3 | from collections import Counter 4 | from logging import getLogger 5 | import sys 6 | 7 | from cis.deep.utils import utf8_file_open, logger_config, sort_dict_by_label 8 | 9 | 10 | log = getLogger(__name__) 11 | logger_config(log) 12 | 13 | parser = ArgumentParser( 14 | description="""Count all tokens in the given input file and writes them 15 | with its count to the output file in descending order.""") 16 | parser.add_argument('-l', '--lowercase', action='store_true', 17 | help='lowercase words before counting') 18 | parser.add_argument('infile', help='input file') 19 | 
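# Illustrative usage (the file names here are hypothetical):
#   python word_count.py --lowercase corpus.tok.txt corpus.counts.txt
# The output file then holds one token per line together with its count, highest
# counts first, as stated in the parser description above.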
parser.add_argument('outfile', help='output file') 20 | 21 | def main(argv=None): 22 | """See argument parser description.""" 23 | 24 | if argv is None: 25 | argv = sys.argv[1:] 26 | 27 | args = parser.parse_args(argv) 28 | log.info('start parameters: ' + str(args)) 29 | 30 | counter = Counter() 31 | 32 | with utf8_file_open(args.infile, 'r') as infile: 33 | 34 | for line in infile: 35 | line = line.strip() 36 | 37 | if args.lowercase: 38 | line = line.lower() 39 | # line = line.decode('utf-8').strip() 40 | 41 | # log.info(line) 42 | # if line == '' or line.startswith(' 0.5) # The prediction thresholded 45 | # y = theano.printing.Print('y')(self.y) 46 | # left = theano.printing.Print('left')(-y * T.log(self.p_1)) 47 | # self.xent = theano.printing.Print('xent')(left - (1 - self.y) * T.log(1 - self.p_1)) # Cross-entropy loss function 48 | # self.cost = theano.printing.Print('cost')(self.xent.mean() + 0.01 * (self.w ** 2).sum()) # The cost to minimize 49 | # self.gw, self.gb = T.grad(self.cost, [self.w, self.b]) # Compute the gradient of the cost 50 | self.p_1 = 1 / (1 + T.exp(-T.dot(self.x, self.w) - self.b)) # Probability that target = 1 51 | self.prediction = self.p_1 > 0.5 # The prediction thresholded 52 | self.xent = -self.y * T.log(self.p_1) - (1 - self.y) * T.log(1 - self.p_1) # Cross-entropy loss function 53 | self.cost = self.xent.mean() + 0.01 * (self.w ** 2).sum() # The cost to minimize 54 | self.gw, self.gb = T.grad(self.cost, [self.w, self.b]) # Compute the gradient of the cost 55 | 56 | 57 | # Compile 58 | self.train = theano.function( 59 | inputs=[self.x, self.y], 60 | outputs=[self.prediction, self.xent], 61 | updates=((self.w, self.w - 0.1 * self.gw), (self.b, self.b - 0.1 * self.gb))) 62 | self.predict = theano.function(inputs=[self.x], outputs=self.prediction) 63 | 64 | def do_train(self): 65 | 66 | # Train 67 | for i in range(self.training_steps): 68 | print self.b.get_value(), type(self.b.get_value()) 69 | pred, err = self.train(self.D[0], self.D[1]) 70 | 71 | print floatX 72 | 73 | 74 | if not os.path.exists('test'): 75 | print 'create new model' 76 | model = LogisticRegression() 77 | model.create_graph() 78 | model.do_train() 79 | save_object_to_file(model, 'test') 80 | else: 81 | print 'load model' 82 | model = load_object_from_file('test') 83 | 84 | print "Final model:" 85 | print model.w.get_value(), model.b.get_value() 86 | print "target values for D:", model.D[1] 87 | print "prediction on D:", model.predict(model.D[0]) 88 | 89 | 90 | # import numpy 91 | # import theano 92 | # import theano.tensor as T 93 | # from theano import config 94 | # rng = numpy.random 95 | # floatX = config.floatX 96 | # 97 | # N = 400 98 | # feats = 784 99 | # D = (numpy.asarray(rng.randn(N, feats), floatX), numpy.asarray(rng.randint(size=N,low=0, high=2), floatX)) 100 | # training_steps = 10000 101 | # 102 | # # Declare Theano symbolic variables 103 | # x = T.matrix("x", floatX) 104 | # y = T.vector("y", floatX) 105 | # w = theano.shared(numpy.asarray(rng.randn(feats), floatX), name="w") 106 | # b = theano.shared(numpy.cast[floatX](0.), name="b") 107 | # print "Initial model:" 108 | # print w.get_value(), b.get_value() 109 | # 110 | # # Construct Theano expression graph 111 | # p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b)) # Probability that target = 1 112 | # prediction = p_1 > 0.5 # The prediction thresholded 113 | # xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1) # Cross-entropy loss function 114 | # cost = xent.mean() + 0.01 * (w ** 2).sum()# The cost to minimize 115 | # gw,gb = T.grad(cost, 
[w, b]) # Compute the gradient of the cost 116 | # 117 | # # Compile 118 | # train = theano.function( 119 | # inputs=[x,y], 120 | # outputs=[prediction, xent], 121 | # updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb))) 122 | # predict = theano.function(inputs=[x], outputs=prediction) 123 | # 124 | # # Train 125 | # for i in range(training_steps): 126 | # pred, err = train(D[0], D[1]) 127 | # 128 | # print "Final model:" 129 | # print w.get_value(), b.get_value() 130 | # print "target values for D:", D[1] 131 | # print "prediction on D:", predict(D[0]) -------------------------------------------------------------------------------- /src/cis/deep/utils/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains visualization methods. 4 | """ 5 | import matplotlib.pyplot as plt 6 | 7 | def render_points(data, width=1, height=1, margin=0.00): 8 | """Render text points to a pylab figure. 9 | 10 | Parameters 11 | ---------- 12 | points : [(str, (str, float, float))] 13 | data points to render, having the form [(color, (title, x, y))] 14 | width : int 15 | width of the graph in inches 16 | height : int 17 | height of the graph in inches 18 | margin : float 19 | amount of extra whitespace added at the edges 20 | """ 21 | plt.figure(figsize=(width, height), tight_layout=True) 22 | ax = plt.gca() 23 | 24 | minx = 0 25 | maxx = 0 26 | miny = 0 27 | maxy = 0 28 | 29 | for _, points in data: 30 | # get min and max coordinates of the figure 31 | for (title, x, y) in points: 32 | if minx > x: minx = x 33 | if maxx < x: maxx = x 34 | if miny > y: miny = y 35 | if maxy < y: maxy = y 36 | 37 | dx = maxx - minx 38 | dy = maxy - miny 39 | assert dx > 0 40 | assert dy > 0 41 | minx -= dx * margin 42 | miny -= dy * margin 43 | maxx += dx * margin 44 | maxy += dy * margin 45 | 46 | ax.set_autoscale_on(False) 47 | 48 | minx_pos = 50000000 49 | maxx_pos = -50000000 50 | miny_pos = 50000000 51 | maxy_pos = -50000000 52 | 53 | for color, points in data: 54 | # render the single points 55 | for pt in points: 56 | (title, x, y) = pt 57 | x = 1. * (x - minx) / (maxx - minx) 58 | y = 1. 
* (y - miny) / (maxy - miny) 59 | 60 | minx_pos = min(minx_pos, x) 61 | maxx_pos = max(maxx_pos, x) 62 | miny_pos = min(miny_pos, y) 63 | maxy_pos = max(maxy_pos, y) 64 | pos = (x, y) 65 | 66 | plt.annotate(title, pos, color=color) 67 | 68 | ax.set_xlim([minx_pos, maxx_pos]) 69 | ax.set_ylim([miny_pos, maxy_pos]) 70 | -------------------------------------------------------------------------------- /src/cis/deep/utils/visualization/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/cis/deep/utils/visualization/apps/__init__.py -------------------------------------------------------------------------------- /src/cis/deep/utils/visualization/apps/visualize_by_tsne.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | The required t-sne implementation used can be found at 4 | https://github.com/turian/textSNE 5 | 6 | example usage: 7 | X:/sa/embeddings/ColWes08/combined-WilWieHof05 8 | -f X:/sa/embeddings/ColWes08/combined-non_WilWieHof05-shuffle-1500 9 | 10 | X:/sa/embeddings/vlbl/general-nce5-1ep/vLBL_1_14-03-05_07-09-05.embeddings_r-combined-WilWieHof05 11 | -f X:/sa/embeddings/vlbl/general-nce5-1ep/vLBL_1_14-03-05_07-09-05.embeddings_r-combined-non_WilWieHof05 12 | """ 13 | 14 | from argparse import ArgumentParser 15 | from logging import getLogger 16 | import sys 17 | 18 | from calc_tsne import tsne 19 | from cis.deep.utils import file_line_generator, logger_config 20 | from cis.deep.utils.visualization import render_points 21 | import numpy as np 22 | import pylab as plt 23 | 24 | 25 | log = getLogger(__name__) 26 | logger_config(log) 27 | 28 | parser = ArgumentParser( 29 | description="""This script creates a 2d visualization of different kinds 30 | of input, e.g., Collobert & Weston word embeddings or RAE query 31 | representations. The code is a modification of a original t-SNE code. 32 | """) 33 | parser.add_argument('file', type=str, help='first file to load') 34 | parser.add_argument('-f', '--file2', type=str, help='second file to load') 35 | parser.add_argument('-o', '--out', type=str, 36 | help='write the rendered image to the given output file') 37 | 38 | def scaleData(x): 39 | """Scales the given data between 0 and 1. 40 | This is necessary, because t-sne will fail for too big numbers. 41 | """ 42 | x -= np.min(x) 43 | x /= np.max(x) 44 | return x 45 | 46 | def getData(emb_file): 47 | """Load the data file. 
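    Each line must hold a token, a tab, and the token's space-separated
    embedding values, e.g. house<TAB>0.12 -0.40 0.77.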
48 | 49 | Parameters 50 | ---------- 51 | emb_file : str 52 | name of the data file in which the first tab-separated column contains 53 | the title and the second column the values of an item 54 | 55 | Returns 56 | ------- 57 | list(str) 58 | item titles 59 | list(ndarray) 60 | item values 61 | """ 62 | titles = [] 63 | data = [] 64 | 65 | for l in file_line_generator(emb_file): 66 | token, emb = l.split(u'\t') 67 | titles.append(token) 68 | data.append(np.fromstring(emb, sep=u' ')) 69 | 70 | return titles, np.asarray(data) 71 | 72 | def main(argv=None): 73 | 74 | if argv is None: 75 | argv = sys.argv[1:] 76 | 77 | args = parser.parse_args(argv) 78 | log.info('start parameters: ' + str(args)) 79 | 80 | log.info('loading data') 81 | titles, x = getData(args.file) 82 | 83 | file_size1 = len(titles) 84 | 85 | if args.file2 is not None: 86 | titles2, x2 = getData(args.file2) 87 | titles.extend(titles2) 88 | x = np.vstack((x, x2)) 89 | 90 | # x = scaleData(x) 91 | 92 | log.info('performing t-SNE') 93 | out = tsne(x, no_dims=2, perplexity=30, initial_dims=100, use_pca=False) 94 | 95 | points = [('green', [(title, point[0], point[1]) 96 | for title, point in zip(titles[:file_size1], out[:file_size1, :])])] 97 | 98 | if args.file2 is not None: 99 | points.append(('gray', [(title, point[0], point[1]) 100 | for title, point in zip(titles[file_size1:], out[file_size1:, :])])) 101 | 102 | # pca = PCA(n_components=2) 103 | # out = pca.fit_transform(x) 104 | 105 | # mds = MDS() 106 | # out = mds.fit_transform(x) 107 | 108 | log.info('rendering result') 109 | render_points(points, 20, 20) 110 | 111 | if args.out: 112 | plt.savefig(args.out, dpi=600) 113 | else: 114 | plt.show() 115 | 116 | log.info('finished') 117 | 118 | if __name__ == "__main__": 119 | sys.exit(main()) 120 | -------------------------------------------------------------------------------- /src/log.best.scitail.txt: -------------------------------------------------------------------------------- 1 | ... 
training 2 | Epoch 1 iter 100 average cost: 0.50105274111 uses 0.274652850628 min 3 | current dev_acc: 0.774814814815 ; max_dev_acc: 0.774814814815 4 | current acc: 0.763255813953 ; max_acc: 0.763255813953 5 | Epoch 1 iter 200 average cost: 0.427455786616 uses 0.3303023537 min 6 | current dev_acc: 0.788888888889 ; max_dev_acc: 0.788888888889 7 | current acc: 0.782325581395 ; max_acc: 0.782325581395 8 | Epoch 1 iter 300 average cost: 0.38460314619 uses 0.324084401131 min 9 | current dev_acc: 0.795555555556 ; max_dev_acc: 0.795555555556 10 | current acc: 0.799534883721 ; max_acc: 0.799534883721 11 | Epoch 1 iter 400 average cost: 0.36287988646 uses 0.327256147067 min 12 | current dev_acc: 0.800740740741 ; max_dev_acc: 0.800740740741 13 | current acc: 0.801860465116 ; max_acc: 0.801860465116 14 | Epoch 1 uses 1.50898130337 min 15 | Epoch 2 iter 500 average cost: 0.335963069856 uses 0.33199198246 min 16 | Epoch 2 iter 600 average cost: 0.301583123356 uses 0.296971384684 min 17 | current dev_acc: 0.817777777778 ; max_dev_acc: 0.817777777778 18 | current acc: 0.811162790698 ; max_acc: 0.811162790698 19 | Epoch 2 iter 700 average cost: 0.27583842698 uses 0.333897197247 min 20 | current dev_acc: 0.819259259259 ; max_dev_acc: 0.819259259259 21 | current acc: 0.807441860465 ; max_acc: 0.811162790698 22 | Epoch 2 iter 800 average cost: 0.256100475551 uses 0.333747748534 min 23 | current dev_acc: 0.823703703704 ; max_dev_acc: 0.823703703704 24 | current acc: 0.820465116279 ; max_acc: 0.820465116279 25 | Epoch 2 iter 900 average cost: 0.240837877587 uses 0.336418318748 min 26 | Epoch 2 uses 1.52058656613 min 27 | Epoch 3 iter 1000 average cost: 0.224601259664 uses 0.300214282672 min 28 | Epoch 3 iter 1100 average cost: 0.209052862727 uses 0.297549700737 min 29 | Epoch 3 iter 1200 average cost: 0.195848352684 uses 0.296437966824 min 30 | Epoch 3 iter 1300 average cost: 0.184916155527 uses 0.29627721707 min 31 | -------------------------------------------------------------------------------- /src/model_para_0.820930232558: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/model_para_0.820930232558 -------------------------------------------------------------------------------- /src/preprocess_SciTail.py: -------------------------------------------------------------------------------- 1 | import random 2 | from random import randint 3 | 4 | def Scitail_2_ExtreamPosNeg(): 5 | root_file="/save/wenpeng/datasets/SciTailV1/tsv_format/scitail_1.0_train.tsv" 6 | writefilename = "/save/wenpeng/datasets/SciTailV1/tsv_format/scitail_1.0_train_2_ExtreamPosNeg.txt" 7 | # files=['scitail_1.0_train.tsv', 'scitail_1.0_dev.tsv', 'scitail_1.0_test.tsv'] 8 | 'we creat 10 neg, 10 pos for each sentence' 9 | readfile = open(root_file, 'r') 10 | writefile = open(writefilename, 'w') 11 | for line in readfile: 12 | parts = line.strip().split('\t') 13 | sent1 = parts[0] 14 | sent2 = parts[1] 15 | sent1_wordlist = sent1.split() 16 | sent2_wordlist = sent2.split() 17 | sent1_len = len(sent1_wordlist) 18 | sent2_len = len(sent2_wordlist) 19 | 'sent1 pos' 20 | sent1_pos_list = [] 21 | sent1_pos_list.append(sent1) 22 | 23 | # for i in range(9): 24 | # left = randint(0, sent1_len/2) 25 | # right = randint(left + 1, sent1_len) 26 | # sent1_pos_ins = sent1_wordlist[left:right] 27 | # sent1_pos_list.append(' '.join(sent1_pos_ins)) 28 | # assert len(sent1_pos_list) == 10 29 | 'sent1 neg' 30 | sent1_neg_list = [] 
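        # The ten negatives built below are derived from sent1 itself: one word-order
        # reversal, eight copies with 'not' inserted at a random position, and one random shuffle.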
31 | 32 | sent1_neg_list.append(' '.join(sent1_wordlist[::-1])) #reverse 33 | for i in range(8): 34 | insert_point = randint(0, sent1_len - 1) 35 | sent1_neg_list.append(' '.join(sent1_wordlist[:insert_point]+['not']+sent1_wordlist[insert_point:])) 36 | random.Random(100).shuffle(sent1_wordlist) 37 | sent1_neg_list.append(' '.join(sent1_wordlist)) #shuffle 38 | assert len(sent1_neg_list) == 10 39 | 'write sent1 into file' 40 | for sent in sent1_pos_list: 41 | writefile.write(sent1+'\t'+sent+'\tentails\n') 42 | for sent in sent1_neg_list: 43 | writefile.write(sent1+'\t'+sent+'\tneutral\n') 44 | 45 | 46 | 'sent2 pos' 47 | sent2_pos_list = [] 48 | sent2_pos_list.append(sent2) 49 | 50 | # for i in range(9): 51 | # left = randint(0, sent2_len/2) 52 | # right = randint(left + 1, sent2_len) 53 | # sent2_pos_ins = sent2_wordlist[left:right] 54 | # sent2_pos_list.append(' '.join(sent2_pos_ins)) 55 | # assert len(sent2_pos_list) == 10 56 | 'sent2 neg' 57 | sent2_neg_list = [] 58 | 59 | sent2_neg_list.append(' '.join(sent2_wordlist[::-1])) #reverse 60 | for i in range(8): 61 | insert_point = randint(0, sent2_len - 1) 62 | sent2_neg_list.append(' '.join(sent2_wordlist[:insert_point]+['not']+sent2_wordlist[insert_point:])) 63 | random.Random(100).shuffle(sent2_wordlist) 64 | sent2_neg_list.append(' '.join(sent2_wordlist)) #shuffle 65 | assert len(sent2_neg_list) == 10 66 | 'write sent2 into file' 67 | for sent in sent2_pos_list: 68 | writefile.write(sent2+'\t'+sent+'\tentails\n') 69 | for sent in sent2_neg_list: 70 | writefile.write(sent2+'\t'+sent+'\tneutral\n') 71 | readfile.close() 72 | writefile.close() 73 | print 'write over' 74 | 75 | 76 | if __name__ == '__main__': 77 | Scitail_2_ExtreamPosNeg() 78 | -------------------------------------------------------------------------------- /src/train_SciTail_DeIsTe_model.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | import gzip 3 | import os 4 | import sys 5 | sys.setrecursionlimit(6000) 6 | import time 7 | 8 | import numpy as np 9 | import theano 10 | import theano.tensor as T 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | import random 13 | 14 | from logistic_sgd import LogisticRegression 15 | from mlp import HiddenLayer 16 | from theano.tensor.signal import downsample 17 | from random import shuffle 18 | from scipy.stats import mode 19 | 20 | from load_data import load_SciTailV1_dataset,load_word2vec, load_word2vec_to_init, extend_word2vec_lowercase 21 | from common_functions import Conv_for_Pair,dropout_layer, store_model_to_file, elementwise_is_two,Conv_with_Mask_with_Gate, Conv_with_Mask, create_conv_para, L2norm_paraList, ABCNN, create_ensemble_para, cosine_matrix1_matrix2_rowwise, Diversify_Reg, Gradient_Cost_Para, GRU_Batch_Tensor_Input_with_Mask, create_LSTM_para 22 | 23 | 24 | def evaluate_lenet5(learning_rate=0.01, n_epochs=10, L2_weight=0.000001, extra_size=4, emb_size=300, posi_emb_size=50,batch_size=50, filter_size=[3,3], maxSentLen=50, hidden_size=300): 25 | 26 | model_options = locals().copy() 27 | print "model options", model_options 28 | 29 | seed=1234 30 | np.random.seed(seed) 31 | rng = np.random.RandomState(seed) #random seed, control the model generates the same results 32 | 33 | 34 | all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id =load_SciTailV1_dataset(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence 35 | # test_sents_l, test_masks_l, test_sents_r, test_masks_r, 
test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) 36 | 37 | train_sents_l=np.asarray(all_sentences_l[0], dtype='int32') 38 | dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') 39 | test_sents_l=np.asarray(all_sentences_l[2], dtype='int32') 40 | 41 | train_masks_l=np.asarray(all_masks_l[0], dtype=theano.config.floatX) 42 | dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) 43 | test_masks_l=np.asarray(all_masks_l[2], dtype=theano.config.floatX) 44 | 45 | train_sents_r=np.asarray(all_sentences_r[0], dtype='int32') 46 | dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') 47 | test_sents_r=np.asarray(all_sentences_r[2] , dtype='int32') 48 | 49 | train_masks_r=np.asarray(all_masks_r[0], dtype=theano.config.floatX) 50 | dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) 51 | test_masks_r=np.asarray(all_masks_r[2], dtype=theano.config.floatX) 52 | 53 | 54 | train_labels_store=np.asarray(all_labels[0], dtype='int32') 55 | dev_labels_store=np.asarray(all_labels[1], dtype='int32') 56 | test_labels_store=np.asarray(all_labels[2], dtype='int32') 57 | 58 | train_size=len(train_labels_store) 59 | dev_size=len(dev_labels_store) 60 | test_size=len(test_labels_store) 61 | print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size 62 | 63 | vocab_size=len(word2id)+1 64 | 65 | 66 | rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution 67 | #here, we leave code for loading word2vec to initialize words 68 | rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) 69 | id2word = {y:x for x,y in word2id.iteritems()} 70 | word2vec=load_word2vec() 71 | rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) 72 | init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable 73 | 74 | posi_rand_values=rng.normal(0.0, 0.01, (maxSentLen, posi_emb_size)) #generate a matrix by Gaussian distribution 75 | posi_embeddings=theano.shared(value=np.array(posi_rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable 76 | 77 | 78 | #now, start to build the input form of the model 79 | sents_ids_l=T.imatrix() 80 | sents_mask_l=T.fmatrix() 81 | sents_ids_r=T.imatrix() 82 | sents_mask_r=T.fmatrix() 83 | labels=T.ivector() 84 | ###################### 85 | # BUILD ACTUAL MODEL # 86 | ###################### 87 | print '... 
building the model' 88 | 89 | def embed_input(emb_matrix, sent_ids): 90 | return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) 91 | 92 | embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM 93 | embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) 94 | 95 | 96 | 97 | '''create_AttentiveConv_params ''' 98 | conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size[0])) 99 | conv_W_posi, conv_b_posi=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+posi_emb_size, filter_size[0])) 100 | conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 1)) 101 | 102 | NN_para=[conv_W, conv_b,conv_W_posi, conv_b_posi,conv_W_context] 103 | 104 | ''' 105 | attentive convolution function 106 | ''' 107 | 108 | attentive_conv_layer = Conv_for_Pair(rng, 109 | origin_input_tensor3=embed_input_l, 110 | origin_input_tensor3_r = embed_input_r, 111 | input_tensor3=embed_input_l, 112 | input_tensor3_r = embed_input_r, 113 | mask_matrix = sents_mask_l, 114 | mask_matrix_r = sents_mask_r, 115 | image_shape=(batch_size, 1, emb_size, maxSentLen), 116 | image_shape_r = (batch_size, 1, emb_size, maxSentLen), 117 | filter_shape=(hidden_size, 1, emb_size, filter_size[0]), 118 | filter_shape_context=(hidden_size, 1,emb_size, 1), 119 | W=conv_W, b=conv_b, 120 | W_posi=conv_W_posi, b_posi=conv_b_posi, 121 | W_context=conv_W_context, b_context=conv_b_context, 122 | posi_emb_matrix = posi_embeddings, 123 | posi_emb_size = posi_emb_size) 124 | attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l 125 | attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r 126 | 127 | sent_embeddings_l = attentive_conv_layer.maxpool_vec_l 128 | sent_embeddings_r = attentive_conv_layer.maxpool_vec_r 129 | 130 | "form input to LR classifier" 131 | LR_input = T.concatenate([sent_embeddings_l,sent_embeddings_r,sent_embeddings_l*sent_embeddings_r,attentive_sent_embeddings_l,attentive_sent_embeddings_r,attentive_sent_embeddings_l*attentive_sent_embeddings_r],axis=1) 132 | LR_input_size=6*hidden_size 133 | 134 | U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 135 | LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class 136 | LR_para=[U_a, LR_b] 137 | 138 | 139 | layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector 140 | loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
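    # LR_input concatenated above packs six vectors (hence 6*hidden_size): the plain
    # max-pooled encodings of the two sentences and their element-wise product, plus the
    # attentive max-pooled encodings and their element-wise product.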
141 | 142 | 143 | 144 | 145 | 146 | params = [init_embeddings,posi_embeddings]+NN_para+LR_para 147 | # L2_reg = (init_embeddings**2).sum()+(conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum() 148 | 149 | cost=loss#+L2_weight*L2_reg 150 | 151 | updates = Gradient_Cost_Para(cost,params, learning_rate) 152 | 153 | 154 | train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') 155 | dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') 156 | 157 | test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') 158 | 159 | ############### 160 | # TRAIN MODEL # 161 | ############### 162 | print '... training' 163 | # early-stopping parameters 164 | patience = 50000000000 # look as this many examples regardless 165 | start_time = time.time() 166 | mid_time = start_time 167 | past_time= mid_time 168 | epoch = 0 169 | done_looping = False 170 | 171 | n_train_batches=train_size/batch_size 172 | train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] 173 | n_dev_batches=dev_size/batch_size 174 | dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] 175 | n_test_batches=test_size/batch_size 176 | test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size] 177 | 178 | 179 | max_acc_dev=0.0 180 | max_acc_test=0.0 181 | max_f1=0.0 182 | 183 | cost_i=0.0 184 | train_indices = range(train_size) 185 | 186 | while epoch < n_epochs: 187 | epoch = epoch + 1 188 | 189 | random.Random(100).shuffle(train_indices) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed 190 | 191 | iter_accu=0 192 | 193 | for batch_id in train_batch_start: #for each batch 194 | iter = (epoch - 1) * n_train_batches + iter_accu +1 195 | iter_accu+=1 196 | train_id_batch = train_indices[batch_id:batch_id+batch_size] 197 | cost_i+= train_model( 198 | train_sents_l[train_id_batch], 199 | train_masks_l[train_id_batch], 200 | train_sents_r[train_id_batch], 201 | train_masks_r[train_id_batch], 202 | train_labels_store[train_id_batch]) 203 | 204 | #after each 1000 batches, we test the performance of the model on all test data 205 | if iter%100==0: 206 | print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' 207 | past_time = time.time() 208 | dev_error_sum=0.0 209 | for dev_batch_id in dev_batch_start: # for each test batch 210 | dev_error_i=dev_model( 211 | dev_sents_l[dev_batch_id:dev_batch_id+batch_size], 212 | dev_masks_l[dev_batch_id:dev_batch_id+batch_size], 213 | dev_sents_r[dev_batch_id:dev_batch_id+batch_size], 214 | dev_masks_r[dev_batch_id:dev_batch_id+batch_size], 215 | dev_labels_store[dev_batch_id:dev_batch_id+batch_size]) 216 | 217 | dev_error_sum+=dev_error_i 218 | dev_acc=1.0-dev_error_sum/(len(dev_batch_start)) 219 | 220 | 221 | if dev_acc > max_acc_dev: 222 | max_acc_dev=dev_acc 223 | print '\tcurrent dev_acc:', dev_acc,' ; ','\tmax_dev_acc:', max_acc_dev 224 | 225 | 226 | error_sum=0.0 227 | for idd, test_batch_id in enumerate(test_batch_start): # for each test batch 228 | error_i=test_model( 229 | test_sents_l[test_batch_id:test_batch_id+batch_size], 230 | 
test_masks_l[test_batch_id:test_batch_id+batch_size], 231 | test_sents_r[test_batch_id:test_batch_id+batch_size], 232 | test_masks_r[test_batch_id:test_batch_id+batch_size], 233 | test_labels_store[test_batch_id:test_batch_id+batch_size]) 234 | 235 | error_sum+=error_i 236 | test_acc=1.0-error_sum/(len(test_batch_start)) 237 | if test_acc > max_acc_test: 238 | max_acc_test=test_acc 239 | store_model_to_file('/home/wenpeng/workspace/SciTail/src/model_para_'+str(max_acc_test), params) 240 | print '\t\tcurrent acc:', test_acc,' ; ','\t\tmax_acc:', max_acc_test 241 | else: 242 | print '\tcurrent dev_acc:', dev_acc,' ; ','\tmax_dev_acc:', max_acc_dev 243 | 244 | 245 | print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' 246 | mid_time = time.time() 247 | 248 | #print 'Batch_size: ', update_freq 249 | end_time = time.time() 250 | 251 | print >> sys.stderr, ('The code for file ' + 252 | os.path.split(__file__)[1] + 253 | ' ran for %.2fm' % ((end_time - start_time) / 60.)) 254 | 255 | return max_acc_test 256 | 257 | 258 | 259 | if __name__ == '__main__': 260 | evaluate_lenet5() 261 | # lr_list=[0.005,0.01,0.02,0.03,0.001] 262 | # batch_list=[10,20,30,40,50,60,70,80,100] 263 | # maxlen_list=[20,25,30,35,40,45,50,55] 264 | # 265 | # best_acc=0.0 266 | # best_lr=0.01 267 | # for lr in lr_list: 268 | # acc_test= evaluate_lenet5(learning_rate=lr) 269 | # if acc_test>best_acc: 270 | # best_lr=lr 271 | # best_acc=acc_test 272 | # print '\t\t\t\tcurrent best_acc:', best_acc 273 | # best_batch=50 274 | # for batch in batch_list: 275 | # acc_test= evaluate_lenet5(learning_rate=best_lr, batch_size=batch) 276 | # if acc_test>best_acc: 277 | # best_batch=batch 278 | # best_acc=acc_test 279 | # print '\t\t\t\tcurrent best_acc:', best_acc 280 | # 281 | # best_maxlen=40 282 | # for maxlen in maxlen_list: 283 | # acc_test= evaluate_lenet5(learning_rate=best_lr, batch_size=best_batch, maxSentLen=maxlen) 284 | # if acc_test>best_acc: 285 | # best_maxlen=maxlen 286 | # best_acc=acc_test 287 | # print '\t\t\t\tcurrent best_acc:', best_acc 288 | # print 'Hyper tune finished, best test acc: ', best_acc, ' by lr: ', best_lr, ' batch: ', best_batch, ' maxlen: ', best_maxlen 289 | -------------------------------------------------------------------------------- /src/word2embeddings/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | -------------------------------------------------------------------------------- /src/word2embeddings/AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Sebastian Ebert 2 | -------------------------------------------------------------------------------- /src/word2embeddings/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | -------------------------------------------------------------------------------- /src/word2embeddings/README.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/word2embeddings/README.rst -------------------------------------------------------------------------------- /src/word2embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | from pkgutil import extend_path 2 | __path__ = extend_path(__path__, __name__) -------------------------------------------------------------------------------- 
/src/word2embeddings/apps/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | 4 | 5 | def use_theano_development_version(): 6 | """Prepare usage of development version of Theano. 7 | 8 | Alters the PYTHONPATH variable by removing the paths to installed Theano 9 | versions and adding my own development installation. 10 | CAUTION: this function must be called before importing any of my or Theano's 11 | libraries. 12 | """ 13 | print '\nold path:' 14 | print '\n'.join(sys.path) 15 | 16 | # List of possible Theano installation paths for different servers at CIS 17 | # and on my local machine. 18 | possible_paths = ['/usr/lib/python2.7/site-packages/Theano-0.6.0-py2.7.egg', 19 | 'C:\\Anaconda\\lib\\site-packages\\theano-current', 20 | '/usr/local/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg', 21 | '/usr/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg', 22 | '/usr/lib/python2.7/site-packages/Theano-0.6.0-py2.7.egg', #delta 23 | ] 24 | 25 | for p in possible_paths: 26 | 27 | try: 28 | sys.path.remove(p) 29 | print 'removed ', p 30 | except ValueError: 31 | pass 32 | 33 | sys.path.insert(0, '/mounts/Users/cisintern/ebert/data/promotion/src/theano/') 34 | sys.path.insert(0, 'Z:\\data\\promotion\\src\\theano\\') 35 | #sys.path.insert(0, '/mounts/Users/student/irina/Programs/Theano/Theano/') 36 | 37 | print 'new path:' 38 | print '\n'.join(sys.path) 39 | 40 | from theano import version 41 | 42 | print '\nnew Theano version:', version.full_version 43 | 44 | # sys.path.remove('/usr/lib/python2.7/site-packages') 45 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/analyze_lbl_distribution.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | """ 4 | from argparse import ArgumentParser 5 | from logging import getLogger 6 | import sys 7 | 8 | from cis.deep.utils import logger_config, embeddings, file_line_generator, \ 9 | utf8_file_open 10 | import numpy as np 11 | 12 | 13 | log = getLogger(__name__) 14 | logger_config(log) 15 | 16 | parser = ArgumentParser(description='Analyze the most likely tokens given a ' + 17 | 'context and their probabilities.') 18 | parser.add_argument('vocabulary', help='vocabulary') 19 | parser.add_argument('distributions', 20 | help='file containing the LBL predictions') 21 | parser.add_argument('contexts', 22 | help='file containing contexts') 23 | parser.add_argument('out_file', 24 | help='result file') 25 | 26 | def main(argv=None): 27 | 28 | if argv is None: 29 | argv = sys.argv[1:] 30 | 31 | args = parser.parse_args(argv) 32 | log.info('start parameters: ' + str(args)) 33 | 34 | log.info('loading data') 35 | vocab = embeddings.read_vocabulary_file(args.vocabulary, False) 36 | contexts = list(file_line_generator(args.contexts)) 37 | dists = np.loadtxt(args.distributions) 38 | 39 | log.info('computing results') 40 | # Add X in the n-grams' centers 41 | # Assume we have the same context size left and right. 42 | x_pos = len(contexts[0].split()) // 2 43 | contexts = [sp[:x_pos] + ['X'] + sp[x_pos:] 44 | for sp in [c.split() for c in contexts]] 45 | 46 | # Sorts all words for each context descending. 47 | sort_words_per_context_value = np.sort(dists, 1)[: , ::-1] 48 | sort_words_per_context_idx = np.argsort(dists, 1)[: , ::-1] 49 | 50 | # Sorts all contexts according to their probability assigned to "similar". 
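    # 465 is presumably the hard-coded vocabulary index of the token "similar" in this
    # particular vocabulary; dists[:, 465] then holds P("similar" | context) for every context row.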
51 | sort_context_for_similar_idx = np.argsort(dists[:, 465])[::-1] 52 | sort_context_for_similar_value = np.sort(dists[:, 465])[::-1] 53 | 54 | log.info('writing data data') 55 | 56 | with utf8_file_open(args.out_file, 'w') as likelihood_file: 57 | 58 | # Write results to a file 59 | for (i, idx) in enumerate(sort_context_for_similar_idx): 60 | likelihood_file.write(u' '.join(contexts[idx]) + u'\t' + 61 | unicode(sort_context_for_similar_value[i]) + u'\n') 62 | 63 | # 10 most likely words for the current context 64 | for j in xrange(10): 65 | likelihood_file.write(vocab[sort_words_per_context_idx[idx, j]] + 66 | u'\t' + unicode(sort_words_per_context_value[idx, j]) + 67 | u'\n') 68 | 69 | likelihood_file.write(u'\n') 70 | 71 | log.info('finished') 72 | 73 | if __name__ == '__main__': 74 | sys.exit(main()) 75 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/classify_imdb_docs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Example usage: 4 | X:\sa\corpora\imdb\txt_sentoken 5 | X:\sa\embeddings\vlbl\wikipedia_small-general-nce5-960\vLBL.vocab 6 | X:\sa\embeddings\vlbl\wikipedia_small-general-nce5-960\vLBL_960_14-03-30_23-07-11.embeddings_q 7 | . 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | import json 12 | from logging import getLogger 13 | import os 14 | import sys 15 | 16 | from scipy.io import mmread 17 | from sklearn.cross_validation import StratifiedKFold 18 | from sklearn.metrics.metrics import accuracy_score, confusion_matrix, \ 19 | precision_recall_fscore_support 20 | from sklearn.svm import LinearSVC 21 | 22 | from cis.deep.utils import logger_config, file_line_generator, utf8_file_open 23 | import numpy as np 24 | from cis.deep.utils.embeddings import read_vocabulary_id_file, \ 25 | compute_avg_text_embedding 26 | import itertools 27 | from cis.deep.utils.classification import calc_metrics 28 | 29 | 30 | NO_OF_FOLDS = 10 31 | 32 | log = getLogger(__name__) 33 | logger_config(log) 34 | 35 | parser = ArgumentParser( 36 | description='Perform a 10-fold cross validation on the polarity ' + 37 | 'dataset v2.0 of [PanLee04].') 38 | parser.add_argument('corpus_dir', 39 | help='location of the pos and neg directories of the dataset') 40 | parser.add_argument('vocabulary', 41 | help='Vocabulary file that contains list of tokens.') 42 | parser.add_argument('embeddings', 43 | help='File that contains the trained word embeddings') 44 | parser.add_argument('output_dir', 45 | help='directory to store the results in') 46 | 47 | def convert_doc(doc, vocab, embs): 48 | """Convert the given document into a document vector. 49 | 50 | Average all word vectors to a final document vector. 51 | 52 | Parameters 53 | ---------- 54 | doc : str 55 | filename of the document 56 | vocab : dict(str, int) 57 | id vocabulary 58 | embs : ndarray 59 | embeddings 60 | """ 61 | 62 | with utf8_file_open(doc) as f: 63 | s = f.read() 64 | return compute_avg_text_embedding(s, vocab, embs) 65 | 66 | def do_cross_validation(features, labels): 67 | """Perform the k-fold cross validation. 68 | 69 | Perform the k-fold cross validation, collect the result and return the 70 | single test instance predictions, as well as the classification results for 71 | each single fold and for the combination of all folds. 
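    Folds are not drawn at random; each document's fold is determined by the
    cv<digit> prefix of its file name (see imdb_cross_folds).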
72 | 73 | Keyword arguments: 74 | features -- all features 75 | labels -- all labels 76 | classifier -- code of the classifier to create (see command line arguments) 77 | """ 78 | single_predictions = [] # Store each single classification decision 79 | # Store the feature weights after the training 80 | weight_vectors = np.zeros((NO_OF_FOLDS, len(features.values()[0]))) 81 | # Store classification results for each fold and for the entire task (i.e., 82 | # entire cross validation). 83 | classification_result = np.zeros((NO_OF_FOLDS + 1, 5)) 84 | 85 | for cur_fold, (train_names, test_names) in enumerate(imdb_cross_folds(features.keys())): 86 | train_data = [features[n] for n in train_names] 87 | train_labels = [labels[n] for n in train_names] 88 | model = train_model(train_data, train_labels) 89 | 90 | test_data = [features[n] for n in test_names] 91 | pred_labels = model.predict(test_data) 92 | true_labels = [] 93 | 94 | for i in xrange(len(test_data)): 95 | single_predictions.append([cur_fold, test_names[i], 96 | labels[test_names[i]], pred_labels[i]]) 97 | true_labels.append(labels[test_names[i]]) 98 | 99 | classification_result[cur_fold, :] = get_classification_result(cur_fold, 100 | true_labels, pred_labels) 101 | 102 | weight_vectors[cur_fold, :] = model.coef_ 103 | 104 | return single_predictions, classification_result, weight_vectors 105 | 106 | def get_classification_result(fold_no, true_labels, pred_labels): 107 | """Return classification resuls for one fold. 108 | 109 | Return an array containing accuracy, precision, recall, and f1, based on the 110 | given true and predicted labels. 111 | 112 | Parameters 113 | ---------- 114 | fold_no : int 115 | this fold's number 116 | true_labels list(int) 117 | true labels 118 | pred_labels list(int) 119 | predicted labels 120 | 121 | Returns 122 | ------- 123 | ndarray 124 | [fold number, accuracy, precision, recall, f1] 125 | """ 126 | res = calc_metrics(true_labels, pred_labels) 127 | return np.asarray([fold_no] + [r for r in res]) 128 | 129 | def imdb_cross_folds(filenames): 130 | """Get the docs for training and testing to be used in a 10-fold x 131 | validation. 132 | 133 | Parameters 134 | ---------- 135 | filenames : list(str) 136 | filenames of imdb docs; they contain the fold number 137 | 138 | Returns 139 | ------- 140 | list(str) 141 | names of training documents 142 | list(str) 143 | names of test documents 144 | """ 145 | 146 | for i in xrange(10): 147 | test = filter(lambda f: f.startswith(u'cv' + unicode(i)), filenames) 148 | training = filter(lambda f: not f.startswith(u'cv' + unicode(i)), filenames) 149 | yield (training, test) 150 | 151 | raise StopIteration() 152 | 153 | def load_data(corpus_dir, vocab, embs): 154 | """Load feature data and labels. 155 | 156 | Loads the documents from the imdb corpus and converts them into one feature 157 | vector per document by averaging the word representations of the text. 
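    Documents found under pos/ are labelled 1 and documents under neg/ are labelled 0.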
158 | 159 | Parameters 160 | ---------- 161 | corpus_dir : str 162 | location of the dataset 163 | vocab : dict(str, int) 164 | id vocabulary 165 | embs : ndarray(m*n) 166 | word embeddings 167 | 168 | Returns 169 | ------- 170 | features : dict(str, ndarray) 171 | map from a document name its document representations, which is the 172 | averaged word vectors 173 | labels : dict(str, int) 174 | map from a document name its label 175 | """ 176 | pos_docs = os.listdir(os.path.join(corpus_dir, u'pos')) 177 | num_pos_docs = len(pos_docs) 178 | pos_docs = [os.path.join(corpus_dir, u'pos/', d) for d in pos_docs] 179 | neg_docs = os.listdir(os.path.join(corpus_dir, u'neg')) 180 | neg_docs = [os.path.join(corpus_dir, u'neg/', d) for d in neg_docs] 181 | docs = pos_docs + neg_docs 182 | features = dict() 183 | labels = dict() 184 | 185 | for (count, d) in enumerate(docs): 186 | basename = os.path.basename(d) 187 | features[basename] = convert_doc(d, vocab, embs) 188 | labels[basename] = 1 if count < num_pos_docs else 0 189 | 190 | return features, labels 191 | 192 | def train_model(features, labels): 193 | """Create, train, and return a model using the given features and labels. 194 | 195 | Parameters 196 | ---------- 197 | features : list(ndarray) 198 | features of training instances 199 | labels : list(int) 200 | labels of training instances 201 | """ 202 | model = LinearSVC() 203 | model.fit(features, labels) 204 | return model 205 | 206 | def main(argv=None): 207 | 208 | if argv is None: 209 | argv = sys.argv[1:] 210 | 211 | args = parser.parse_args(argv) 212 | log.info('start parameters: ' + str(args)) 213 | 214 | log.info('loading embeddings') 215 | vocab = read_vocabulary_id_file(args.vocabulary) 216 | embs = np.loadtxt(args.embeddings) 217 | 218 | log.info('loading documents') 219 | features, labels = load_data(args.corpus_dir, vocab, embs) 220 | 221 | log.info('performing cross validation') 222 | single_predictions, classification_result, weight_vectors = \ 223 | do_cross_validation(features, labels) 224 | 225 | log.info('storing results') 226 | np.savetxt(os.path.join(args.output_dir, 'svm-weights.csv'), 227 | weight_vectors, '%f', ';', '\n') 228 | 229 | with utf8_file_open(os.path.join(args.output_dir, 'predictions.csv'), 'w') \ 230 | as pred_file: 231 | pred_file.write(u'fold_no;doc;true_label;pred_label\n') 232 | 233 | for sp in single_predictions: 234 | pred_file.write(u';'.join(map(unicode, sp)) + u'\n') 235 | 236 | all_true_labels = [sp[2] for sp in single_predictions] 237 | all_pred_labels = [sp[3] for sp in single_predictions] 238 | confusion = confusion_matrix(all_true_labels, all_pred_labels) 239 | 240 | np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'), 241 | confusion, '%d', ';', '\n') 242 | 243 | classification_result[NO_OF_FOLDS, :] = get_classification_result(-1, 244 | all_true_labels, all_pred_labels) 245 | 246 | header = u'fold_no;accuracy;precision;recall;f1' 247 | np.savetxt(os.path.join(args.output_dir, 'metrics.csv'), 248 | classification_result, '%f', u';', u'\n', header=header) 249 | 250 | log.info(classification_result) 251 | log.info('finished') 252 | 253 | if __name__ == '__main__': 254 | sys.exit(main()) 255 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/create_embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! 
/usr/bin/env python 3 | """ 4 | """ 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import logging 8 | import sys 9 | 10 | # from word2embeddings.apps import use_theano_development_version 11 | # use_theano_development_version() 12 | 13 | from cis.deep.utils import logger_config 14 | from word2embeddings.nn.trainer import HingeSentimentMiniBatchTrainer, \ 15 | HingeSentiment2MiniBatchTrainer, HingeMiniBatchTrainer, \ 16 | SimpleVLblNceTrainer, SimpleVLblNceSentimentTrainer, \ 17 | VLblNceTrainer, VLblNceSentimentTrainer, VLblNceDistributionalTrainer, \ 18 | NlblNceTrainer, NvLblNceTrainer, SLmNceTrainer, LblNceTrainer 19 | from word2embeddings.tools.util import debug 20 | 21 | log = getLogger(__name__) 22 | logger_config(log) 23 | 24 | parser = ArgumentParser() 25 | parser.add_argument('train_file', 26 | help='Document for training that contains tokenized text') 27 | 28 | parser.add_argument('--hidden-layers', dest='hidden_layers', 29 | help='Width of each hidden layer, comma separated. E.g., ' + 30 | '"28,64,32". This option only has an effect for mlp models and ' + 31 | 'for slm, where only one hidden layer is allowed.') 32 | 33 | parser.add_argument('vocabulary', 34 | help='Vocabulary file that contains list of tokens.\nCaution: Add ' + 35 | 'the special tokens , , , in this exact order at ' + 36 | 'the first positions in the vocabulary.') 37 | 38 | 39 | parser.add_argument('--sentiment-vocabulary', dest='sent_vocab', 40 | help='Vocabulary file that contains sentiment words') 41 | 42 | parser.add_argument('--predict-vocabulary', dest='pred_vocab', 43 | help='Vocabulary that contains the items that should be considered ' + 44 | 'during perplexity computation.\n' + 45 | 'Caution: Make sure this includes .\n' + 46 | 'Caution2: If this vocabulary does not contain a word that is seen ' + 47 | 'in prediction this word is not considered during perplexity ' + 48 | 'calculation.') 49 | 50 | 51 | parser.add_argument('--unigram', dest='unigram', 52 | help='file containing the unigram count (the probabilities are ' + 53 | 'calculated automatically given the counts\n ' + 54 | 'Caution: Add the ' + 55 | 'special tokens , , , in this exact order at the ' + 56 | 'first positions in the vocabulary.') 57 | parser.add_argument('--noise-samples', dest='noise_samples', type=int, 58 | help='number of noise samples per data sample') 59 | parser.add_argument('--nce-seed', dest='nce_seed', type=int, default=2345, 60 | help='seed for the noise sample generation in NCE') 61 | 62 | 63 | parser.add_argument('--validation-file', dest='validation_file', nargs='+', 64 | help='Files for validation that contains tokenized text. Multiple ' + 65 | 'files are supported, with the first file being the main validation ' + 66 | 'file, i.e., if --dump-best is active, then the performance on the ' + 67 | 'first file is considered.\n ' + 68 | 'Note: For all LBL based models the validation cost will be ' + 69 | 'different even if you provide the same validation file twice, ' + 70 | 'because the NCE cost computation involves a randomized process.') 71 | 72 | parser.add_argument('--perplexity', action='store_true', 73 | help='instead of calculating the error on the validation set, ' + 74 | 'additionally calculate the perplexity. Caution: does only work ' + 75 | 'for vLBL models. 
Note: using ppl in validation is slower.') 76 | 77 | 78 | parser.add_argument('--disable-padding', dest='disable_padding', 79 | action='store_true', default=False, 80 | help='Disable padding sentences while generating examples') 81 | 82 | parser.add_argument('--learn-eos', dest='learn_eos', 83 | action='store_true', default=False, 84 | help='Learn word embedding for the end-of-sentence token .') 85 | 86 | 87 | parser.add_argument('--load-model', dest='load_model', 88 | help='Proceed training with the given model file.') 89 | 90 | parser.add_argument('--model-type', dest='model_type', 91 | choices=['ColWes08', 'sent_1', 'sent_2', 'vlbl', 'nvlbl', 92 | 'vlbl_sent', 'simple_vlbl', 'simple_vlbl_sent', 'vlbl_dist', 93 | 'lbl', 'nlbl', 'slm'], 94 | default='ColWes08', 95 | help='Type of the model to use for training. All sentiment models ' + 96 | 'require a sentiment vocabulary.') 97 | 98 | parser.add_argument('--activation-func', dest='activation_func', default='rect', 99 | choices=['sigmoid', 'tanh', 'rect', 'softsign'], 100 | help='Activation function to use in non-linear models.') 101 | 102 | 103 | parser.add_argument('--left-context', dest='left_context', type=int, 104 | default=2, 105 | help='Left context window to be used measured from the current token') 106 | 107 | parser.add_argument('--right-context', dest='right_context', type=int, 108 | default=2, 109 | help='Right context window measured from the current token') 110 | 111 | parser.add_argument('--word-embedding-size', dest='word_embedding_size', 112 | type=int, default=64) 113 | 114 | 115 | # Argument for MiniBatchTrainer 116 | parser.add_argument('--epochs-limit', dest='epochs_limit', type=int, default=-1, 117 | help='maximal number of epochs to train (-1 for no limit)') 118 | 119 | parser.add_argument('--examples-limit', dest='examples_limit', type=int, 120 | default=-1, 121 | help='maximal number of examples to train (-1 for no limit)') 122 | 123 | parser.add_argument('--early-stopping', dest='early_stopping', type=int, 124 | default=-1, 125 | help='Stop the training when N consecutive validations resulted in ' + \ 126 | 'worse results than the validation before. -1 to deactivate this ' + \ 127 | 'feature.') 128 | 129 | 130 | parser.add_argument('--batch-size', dest='batch_size', type=int, default=16) 131 | 132 | 133 | parser.add_argument('--learning-rate', dest='learning_rate', 134 | default=0.1, 135 | help='Learning rate. If this parameter is a float value than the ' + 136 | 'learning rate is valid for all model parameters. Otherwise, it can ' + 137 | 'contain parameter specific learning rates in using the pattern ' + 138 | '"param_name1:param_learning_rate1,param_name2:param_learning_rate2\.' + 139 | 'You can also specify a learning rate for only some of your ' + 140 | 'parameters and assign the default learning rate for all other ' + 141 | 'parameters by specifying "default:default_learning_rate".') 142 | 143 | parser.add_argument('--lr-adaptation', dest='lr_adaptation_method', 144 | choices=['constant', 'linear', 'adagrad', 'MniTeh12'], 145 | default='constant', 146 | help='Sets the method that is used to reduce the learning rate. 
' + 147 | 'Supports "linear" (linear reduction) and "adagrad" (AdaGrad ' + 148 | 'algorithm), and "constant" (no reduction), "MniTeh12" (halves the ' + 149 | 'learning rate whenever the validation perplexity (if "--perplexity" ' + 150 | 'is given) or error (otherwise) goes up; for details see [MniTeh12])') 151 | 152 | parser.add_argument('--learning-method', dest='learning_method', 153 | choices=['fan_in', 'global'], default='global', 154 | help='Determine the method that learning rate is calculated. Two ' + 155 | 'options are available: {fan_in, global}') 156 | 157 | 158 | parser.add_argument('--l1-weight', dest='l1_weight', type=float, default=0.0, 159 | help='Weight of L1 regularization term. 0 to deactivate. ' + 160 | 'Only implemented for LBL models and SLM.') 161 | parser.add_argument('--l2-weight', dest='l2_weight', type=float, default=0.0, 162 | help='Weight of L2 regularization term. 0 to deactivate. ' + 163 | 'Only implemented for LBL models and SLM.') 164 | 165 | parser.add_argument('--dump-period', dest='dump_period', type=int, default=-1, 166 | help='A model will be dumped every x seconds/examples (-1 = no ' + 167 | 'dumping. Only the final model will be dumped.)') 168 | 169 | parser.add_argument('--load-params', dest='load_params', nargs=2, 170 | help='Load initial values from files. This parameter requires two ' + 171 | 'arguments: (i) and (ii) a comma separated list of ' + 172 | 'parameter names as specified by the individual model. Each parameter' + 173 | 'must be stored in csv file format in an own file. The single ' + 174 | 'parameter files are then expected to be named ' + 175 | '..\n ' + 176 | 'Example usage: ~/my_model "C,R" will load ~/my_model.C and ' + 177 | '~/my_model.R.\n ' + 178 | 'Gzip and bz2 files are supported.') 179 | 180 | parser.add_argument('--store-params', dest='store_params', 181 | help='Comma-separated list of parameter names that will be stored ' + 182 | 'each time the model is stored. The parameter names as specified by ' + 183 | 'the individual model. Each parameter is stored in a separate file, ' + 184 | 'e.g., paramter C is stored in .params.C.') 185 | 186 | parser.add_argument('--out-dir', dest='out_dir', default='.', 187 | help='directory where to store the output files') 188 | 189 | parser.add_argument('--dump-vocabulary', dest='dump_vocabulary', 190 | action='store_true', 191 | help='Dump the vocabulary after importing it to remove duplicates.') 192 | 193 | parser.add_argument('--dump-embeddings', dest='dump_embeddings', 194 | action='store_true', 195 | help='Dump the embeddings for every dumped model. Caution: might ' + 196 | 'be a big file.\n ' + 197 | 'Caution: This parameter is deprecated. It\'s not supported by the ' + 198 | 'new vLBL models. Use --store-params instead.') 199 | 200 | parser.add_argument('--validation-period', dest='validation_period', 201 | type=float, default=-1, 202 | help='A model will be evaluated every y seconds/examples. (-1 ' + 203 | 'for never). If a development file is given, the scores on the ' + 204 | 'training data and the validation data is computed, otherwise only ' + 205 | 'the former is computed.') 206 | 207 | parser.add_argument('--period-type', dest='period_type', default='examples', 208 | choices=['time', 'examples'], 209 | help='Set the period to be in seconds or number of examples ' + 210 | 'by setting the option to time or examples.') 211 | 212 | parser.add_argument('--dump-best', dest='dump_best', action='store_true', 213 | help='Save the best model every validation period. 
What "best" ' + \ 214 | 'means depends on the type of model. If "--perplexity" is given, ' + \ 215 | 'it\'s the model with the lowest perplexity. If not, it\'s the ' + \ 216 | 'model with the lowest training error.') 217 | 218 | parser.add_argument('--dump-each-epoch', dest='dump_each_epoch', 219 | action='store_true', help='Dump the model after each epoch') 220 | 221 | parser.add_argument('--dump-initial-model', dest='dump_initial_model', 222 | action='store_true', 223 | help='Dump the initial model before any training is done.') 224 | 225 | 226 | parser.add_argument('--error-function', dest='error_func', 227 | default='least_squares', choices=['cross_entropy', 'least_squares'], 228 | help='defines the used error function (default: least_squares); ' + 229 | 'This parameter is only valid for MLPs.') 230 | 231 | parser.add_argument('--count-examples', dest='count_examples', 232 | action='store_true', 233 | help='Only count the examples in the training file, don\'t train a ' + 234 | 'model.') 235 | 236 | 237 | parser.add_argument('--debug-host', dest='debug_host', 238 | help='Allow remote debugging at the given host IP. Make sure you ' + 239 | 'follow the instructions at ' + 240 | 'http://pydev.org/manual_adv_remote_debugger.html. Especially, the ' + 241 | 'pydevd source must be in the PYTHONPATH and ' + 242 | 'PATHS_FROM_ECLIPSE_TO_PYTHON in pydevd_file_utils.py must be adapted.') 243 | 244 | def main(argv=None): 245 | log.info('started application') 246 | 247 | log.warning('This script is obsolete. It will not be updated anymore and ' + 248 | 'will be deleted in the future. Use train_model.py instead.') 249 | 250 | if argv is None: 251 | argv = sys.argv[1:] 252 | 253 | args = parser.parse_args(argv) 254 | 255 | check_args(args) 256 | 257 | log.info('start parameters: ' + str(args)) 258 | 259 | if args.debug_host: 260 | import pydevd 261 | pydevd.settrace(host=args.debug_host, stdoutToServer=True, 262 | stderrToServer=True) 263 | 264 | if log.level == logging.DEBUG: 265 | sys.excepthook = debug 266 | 267 | log.info('creating trainer') 268 | 269 | if args.model_type == 'ColWes08': 270 | log.info('Using ColWes08 trainer') 271 | trainer = HingeMiniBatchTrainer() 272 | elif args.model_type == 'sent_1': 273 | log.info('Using sent_1 trainer') 274 | trainer = HingeSentimentMiniBatchTrainer() 275 | elif args.model_type == 'sent_2': 276 | log.info('Using sent_2 trainer') 277 | trainer = HingeSentiment2MiniBatchTrainer() 278 | elif args.model_type == 'simple_vlbl': 279 | log.info('Using simple LBL trainer that uses noise-contrastive estimation') 280 | trainer = SimpleVLblNceTrainer() 281 | elif args.model_type == 'simple_vlbl_sent': 282 | log.info('Using simple LBL trainer that uses noise-contrastive estimation to create sentiment embeddings') 283 | trainer = SimpleVLblNceSentimentTrainer() 284 | elif args.model_type == 'vlbl': 285 | log.info('Using LBL trainer that uses noise-contrastive estimation') 286 | trainer = VLblNceTrainer() 287 | elif args.model_type == 'vlbl_sent': 288 | log.info('Using LBL trainer that uses noise-contrastive estimation to create sentiment embeddings') 289 | trainer = VLblNceSentimentTrainer() 290 | elif args.model_type == 'nvlbl': 291 | log.info('Using non-linear vLBL NCE trainer') 292 | trainer = NvLblNceTrainer() 293 | elif args.model_type == 'lbl': 294 | log.info('Using linear LBL trainer that uses noise-contrastive estimation') 295 | trainer = LblNceTrainer() 296 | elif args.model_type == 'nlbl': 297 | log.info('Using non-linear LBL trainer that uses noise-contrastive 
estimation') 298 | trainer = NlblNceTrainer() 299 | elif args.model_type == 'vlbl_dist': 300 | log.info('Using LBL trainer that uses distributional representation of input') 301 | trainer = VLblNceDistributionalTrainer() 302 | elif args.model_type == 'slm': 303 | log.info('Using shallow neural network lm with NCE') 304 | trainer = SLmNceTrainer() 305 | else: 306 | raise ValueError('Unknown model type. Abort') 307 | 308 | if args.count_examples is True: 309 | log.info('counting examples') 310 | trainer.configure(args) 311 | count = trainer.count_examples(args.train_file) 312 | log.info('examples: %d' % count) 313 | else: 314 | trainer.prepare_usage(args) 315 | log.info('training is about to begin') 316 | trainer.run() 317 | 318 | log.info('finished') 319 | 320 | def check_args(args): 321 | 322 | 323 | 324 | # if args.epochs_limit == -1 and args.examples_limit == -1: 325 | # raise ValueError('Either epochs-limit or examples-limit must be given.') 326 | pass 327 | 328 | if __name__ == '__main__': 329 | sys.exit(main()) 330 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/extract_model_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains an application that extracts the vocabulary and embeddings 4 | from a given model file. 5 | """ 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import sys 9 | 10 | from cis.deep.utils import logger_config, load_object_from_file 11 | 12 | 13 | log = getLogger(__name__) 14 | logger_config(log) 15 | 16 | parser = ArgumentParser(description='extract parameters from a given model ' + 17 | 'file') 18 | parser.add_argument('model_file', help='model file') 19 | parser.add_argument('store_params', nargs=2, 20 | help='The first parameter is the filename, the second is a ' + 21 | 'comma-separated list of parameter names. For more information see ' + 22 | 'the documentation of the --load-params parameter in ' + 23 | 'create_embeddings.py.') 24 | 25 | parser.add_argument('-f', '--format', default='txt', choices=['txt', 'npy'], 26 | help='Format of the output files. txt = space separated csv format; ' + 27 | 'npy = binary numpy format') 28 | 29 | def main(argv=None): 30 | 31 | if argv is None: 32 | argv = sys.argv[1:] 33 | 34 | args = parser.parse_args(argv) 35 | log.info('start parameters: ' + str(args)) 36 | 37 | log.info('loading data') 38 | model = load_object_from_file(args.model_file) 39 | 40 | log.info('writing data') 41 | # trainer.dump_vocabulary(args.vocabulary_file) 42 | model.store_params(args.store_params[0], args.store_params[1], True, 43 | args.format) 44 | log.info('finished') 45 | 46 | if __name__ == '__main__': 47 | sys.exit(main()) 48 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/extract_words_with_we.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains an application that extracts the input and output embeddings 4 | from a given vlbl or vlbl_dist model file. 
5 | """ 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import sys 9 | import numpy as np 10 | 11 | from cis.deep.utils import logger_config, load_object_from_file, utf8_file_open, sort_dict_by_label 12 | from cis.deep.utils.embeddings import read_vocabulary_id_file 13 | 14 | log = getLogger(__name__) 15 | logger_config(log) 16 | 17 | parser = ArgumentParser(description='Extract input and output word embeddings ' + 18 | 'from a given vLBL or distributional vLBL model file. ' + 19 | 'Output format is "word space embedding". ' + 20 | 'In case of vLBL model, input embeddings are represented with R matrix; ' + 21 | 'output embeddings are represented with Q matrix. ' + 22 | 'In case of vLBL distributional model, input embeddings are ' + 23 | 'represented with D*R matrix, output embeddings are represented with ' + 24 | 'Q matrix.') 25 | parser.add_argument('model_file', help='vlbl or vlbl_dist model file') 26 | parser.add_argument('--model-type', dest='model_type', 27 | choices=['vlbl', 'vlbl_dist'], 28 | default='vlbl', 29 | help='Type of the model to use for embeddings extraction.') 30 | parser.add_argument("vocabulary", 31 | help="Vocabulary file that contains list of tokens.") 32 | parser.add_argument("result_file", 33 | help="Document to which the predictions will be written.") 34 | 35 | def main(argv=None): 36 | 37 | if argv is None: 38 | argv = sys.argv[1:] 39 | 40 | args = parser.parse_args(argv) 41 | log.info('start parameters: ' + str(args)) 42 | 43 | log.info('loading data') 44 | model = load_object_from_file(args.model_file) 45 | 46 | # read vocabulary from file 47 | vocab = sort_dict_by_label(read_vocabulary_id_file(args.vocabulary)) 48 | 49 | # get matrices from model 50 | r_matrix = model.R.get_value() 51 | q_matrix = model.Q.get_value() 52 | 53 | # get input embeddings 54 | if args.model_type == 'vlbl': 55 | in_we = r_matrix 56 | elif args.model_type == 'vlbl_dist': 57 | # this will not work with the old versions of models - because of sparsity 58 | d_matrix = model.D.get_value().todense() 59 | in_we = np.dot(d_matrix, r_matrix) 60 | # need to convert from numpy.matrix to numpy.ndarray 61 | in_we = in_we.view(type=np.ndarray) 62 | 63 | with utf8_file_open(args.result_file + ".in", 'w') as outfile: 64 | for (word, ind) in vocab: 65 | outfile.write(unicode(word) + u' ' + u' '.join(map(str, in_we[ind])) + u'\n') 66 | 67 | with utf8_file_open(args.result_file + ".out", 'w') as outfile: 68 | for (word, ind) in vocab: 69 | outfile.write(unicode(word) + u' ' + u' '.join(map(str, q_matrix[ind])) + u'\n') 70 | 71 | log.info('finished') 72 | 73 | if __name__ == "__main__": 74 | sys.exit(main()) 75 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/prepare_brown_file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | example usage: 4 | -r 5 | 18 6 | X:\sa\dictionary\Brown_clusters\brown-mod 7 | X:\sa\dictionary\Brown_clusters\brown-mod-fixed_length_right 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | from logging import getLogger 12 | import sys 13 | 14 | from cis.deep.utils import utf8_file_open, file_line_generator, logger_config 15 | from word2embeddings.tools.util import prepare_brown_signature 16 | 17 | 18 | log = getLogger(__name__) 19 | logger_config(log) 20 | 21 | parser = ArgumentParser(description='Prepare a given file that contains ' + 22 | 'Brown clustering signatures for words. 
Convert the variable length ' + 23 | 'signatures into fixed length signatures.') 24 | parser.add_argument('max_size', help='size of the fixed signatures', type=int) 25 | parser.add_argument('infile', help='input file with variable size signatures') 26 | parser.add_argument('outfile', help='output file with fixed size signatures') 27 | parser.add_argument('-r', '--right', default=False, 28 | action='store_true', 29 | help='pad the signatures to the right instead of to the left') 30 | 31 | def main(argv=None): 32 | 33 | if argv is None: 34 | argv = sys.argv[1:] 35 | 36 | args = parser.parse_args(argv) 37 | log.info('start parameters: ' + str(args)) 38 | 39 | log.info('transforming data') 40 | 41 | with utf8_file_open(args.outfile, 'w') as outfile: 42 | for line in file_line_generator(args.infile): 43 | token, signature = line.split(u'\t') 44 | outfile.write(u'%s\t%s\n' % (token, prepare_brown_signature( 45 | signature, args.max_size, args.right))) 46 | 47 | log.info('finished') 48 | 49 | if __name__ == '__main__': 50 | sys.exit(main()) 51 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/test_mlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! /usr/bin/env python 3 | """ 4 | Example usage: 5 | X:\sa\experiments\contextual_polarity\mlp\easy_test\1-1-features-predict 6 | X:\sa\experiments\contextual_polarity\mlp\easy_test\1-1-features-predict-out 7 | MultiLayerPerceptron_1_13-12-05_18-20-37.model 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | from logging import getLogger 12 | import logging 13 | import sys 14 | 15 | 16 | # CAUTION: remove the Theano path before importing any of my or Theano's 17 | # libraries 18 | 19 | # print '\n' 20 | # local 21 | # if 'C:\\Anaconda\\lib\\site-packages\\theano_test-current' in sys.path: 22 | # sys.path.remove('C:\\Anaconda\\lib\\site-packages\\theano_test-current') 23 | # print 'removed old theano_test path' 24 | # # Calculus 25 | # if '/usr/local/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg' in sys.path: 26 | # sys.path.remove('/usr/local/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg') 27 | # print 'removed old theano_test path' 28 | # Omega 29 | # if '/usr/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg' in sys.path: 30 | # sys.path.remove('/usr/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg') 31 | # print 'removed old theano_test path' 32 | # sys.path.insert(0, '/mounts/Users/cisintern/ebert/data/promotion/src/deep/src/word2embeddings/main/resources/theano_develop') 33 | # print '\n'.join(sys.path) 34 | # print '\n' 35 | # sys.path.remove('/usr/lib/python2.7/site-packages') 36 | # exit() 37 | 38 | from cis.deep.utils import utf8_file_open, logger_config 39 | from word2embeddings.nn.predictor import MlpPredictor 40 | from word2embeddings.tools.util import debug 41 | 42 | log = getLogger(__name__) 43 | logger_config(log) 44 | 45 | from theano import version 46 | print version.full_version 47 | 48 | parser = ArgumentParser() 49 | parser.add_argument('--disable-padding', dest='disable_padding', 50 | action='store_true', default=False, 51 | help='Disable padding sentences while generating examples') 52 | parser.add_argument('--binary', 53 | action='store_true', 54 | help='Predict binary values, i.e., round output values to {0, 1}') 55 | 56 | parser.add_argument('predict_file', 57 | help='Document with examples to predict the label of.') 58 | parser.add_argument('result_file', 59 | 
help='Document to which the predictions will be written.') 60 | parser.add_argument('load_model', 61 | help='Proceed training with the given model file.') 62 | 63 | # Argument for MiniBatchTrainer 64 | # parser.add_argument('--batch-size', dest='batch_size', type=int, default=16) 65 | 66 | 67 | def main(argv=None): 68 | log.info('started application') 69 | 70 | if argv is None: 71 | argv = sys.argv[1:] 72 | 73 | args = parser.parse_args() 74 | log.info('start parameters: ' + str(args)) 75 | 76 | if log.level == logging.DEBUG: 77 | sys.excepthook = debug 78 | 79 | log.info('creating predictor') 80 | predictor = MlpPredictor() 81 | predictor.prepare_usage(args) 82 | log.info('starting prediction') 83 | predictions = predictor.run() 84 | 85 | log.info('storing results') 86 | with utf8_file_open(args.result_file, 'w') as outfile: 87 | 88 | for p in predictions: 89 | 90 | if args.binary: 91 | outfile.write(unicode((p > 0.5).astype(int)) + u'\n') 92 | else: 93 | outfile.write(unicode(p) + u'\n') 94 | 95 | log.info('finished') 96 | 97 | if __name__ == '__main__': 98 | sys.exit(main()) 99 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/train_mlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! /usr/bin/env python 3 | """ 4 | Example usage: 5 | --dev-file X:\sa\experiments\contextual_polarity\mlp\easy_test\1-1-features-test 6 | --epochs-limit 10 7 | --batch-size 2 8 | --examples-limit 4 9 | --dump-period -1 10 | --validation-period 1000 11 | --error-function "cross_entropy" 12 | X:\sa\experiments\contextual_polarity\mlp\easy_test\1-1-features 13 | 1 14 | 1 15 | "1" 16 | """ 17 | 18 | from argparse import ArgumentParser 19 | from logging import getLogger 20 | import logging 21 | import sys 22 | 23 | from cis.deep.utils import logger_config 24 | from word2embeddings.nn.trainer import MlpTrainer 25 | from word2embeddings.tools.util import debug 26 | 27 | 28 | # import cProfile 29 | # CAUTION: remove the Theano path before importing any of my or Theanos 30 | # libraries 31 | # print '\n' 32 | if 'C:\\Anaconda\\lib\\site-packages\\theano-current' in sys.path: 33 | sys.path.remove('C:\\Anaconda\\lib\\site-packages\\theano-current') 34 | print 'removed old theano path' 35 | # Calculus 36 | if '/usr/local/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg' in sys.path: 37 | sys.path.remove('/usr/local/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg') 38 | print 'removed old theano path' 39 | # Omega 40 | # if '/usr/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg' in sys.path: 41 | # sys.path.remove('/usr/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg') 42 | # print 'removed old theano path' 43 | # print '\n'.join(sys.path) 44 | # print '\n' 45 | # exit() 46 | 47 | 48 | log = getLogger(__name__) 49 | logger_config(log) 50 | 51 | parser = ArgumentParser() 52 | parser.add_argument('train_file', 53 | help='Document for training that contains tokenized text') 54 | parser.add_argument('input_size', type=int, help='size of the input') 55 | parser.add_argument('output_size', type=int, help='size of the output') 56 | parser.add_argument('hidden_layers', default='32', 57 | help='Width of each hidden layer, comma separated. E.g., "128,64,32"') 58 | 59 | parser.add_argument('--dev-file', dest='dev_file', 60 | help='Document for dev that contains tokenized text. 
If no file ' + 61 | 'is given validation will only be performed on the training data.') 62 | 63 | parser.add_argument('--disable-padding', dest='disable_padding', 64 | action='store_true', default=False, 65 | help='Disable padding sentences while generating examples') 66 | 67 | parser.add_argument('--load-model', dest='load_model', 68 | help='Proceed training with the given model file.') 69 | 70 | # Argument for MiniBatchTrainer 71 | parser.add_argument('--epochs-limit', dest='epochs_limit', 72 | type=int, default=1) 73 | parser.add_argument('--batch-size', dest='batch_size', type=int, default=16) 74 | parser.add_argument('--learning-rate', dest='learning_rate', 75 | type=float, default=0.1) 76 | parser.add_argument('--decay-learning', dest='decay_learning', 77 | choices=['linear'], default='', help='Supports "linear" decay for now.') 78 | parser.add_argument('--learning-method', dest='learning_method', 79 | choices=['fan_in', 'global'], default='global', 80 | help='Determine the method that learning rate is calculated. Two ' + 81 | 'options are available: {fan_in, global}') 82 | parser.add_argument('--dump-period', dest='dump_period', type=int, 83 | default=1800, 84 | help='A model will be dumped every x seconds (-1 for never, i.e., ' + 85 | 'only the final and the best model after training will be dumped.)') 86 | parser.add_argument('--validation-period', dest='validation_period', 87 | type=float, default=5e5, 88 | help='A model will be evaluated every y seconds/examples. (-1 ' + 89 | 'for never). If a development file is given, the scores on the ' + 90 | 'training data and the validation data is computed, otherwise only ' + 91 | 'the former is computed.') 92 | parser.add_argument('--period-type', dest='period_type', default='examples', 93 | choices=['time', 'examples'], 94 | help='Set the period to be in seconds or number of examples ' + 95 | 'by setting the option to time or examples.') 96 | parser.add_argument('--save-best', dest='save_best', action='store_true', 97 | help='Save the best model every validation period.') 98 | parser.add_argument('--dump-each-epoch', dest='dump_each_epoch', 99 | action='store_true', help='Dump the model after each epoch') 100 | parser.add_argument('--examples-limit', dest='examples_limit', type=float, 101 | help='Size of example to be used', default=1e9) 102 | parser.add_argument('--error-function', dest='error_func', 103 | default='least_squares', choices=['cross_entropy', 'least_squares'], 104 | help='defines the used error function (default: least_squares)') 105 | 106 | def main(argv=None): 107 | log.info('started application') 108 | 109 | if argv is None: 110 | argv = sys.argv[1:] 111 | 112 | args = parser.parse_args() 113 | log.info('start parameters: ' + str(args)) 114 | 115 | if log.level == logging.DEBUG: 116 | sys.excepthook = debug 117 | 118 | log.info('creating trainer') 119 | trainer = MlpTrainer() 120 | trainer.prepare_usage(args) 121 | log.info('starting training') 122 | trainer.run() 123 | log.info('finished') 124 | 125 | if __name__ == '__main__': 126 | # cProfile.run('main()') 127 | 128 | sys.exit(main()) 129 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/train_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! 
/usr/bin/env python 3 | """ 4 | """ 5 | from argparse import ArgumentParser 6 | from logging import getLogger 7 | import logging 8 | import sys 9 | 10 | #from word2embeddings.apps import use_theano_development_version 11 | #use_theano_development_version() 12 | 13 | from cis.deep.utils import logger_config 14 | from word2embeddings.nn.trainer import HingeSentimentMiniBatchTrainer, \ 15 | HingeSentiment2MiniBatchTrainer, HingeMiniBatchTrainer, \ 16 | SimpleVLblNceTrainer, SimpleVLblNceSentimentTrainer, \ 17 | VLblNceTrainer, VLblNceSentimentTrainer, VLblNceDistributionalTrainer,\ 18 | NlblNceTrainer, NvLblNceTrainer, SLmNceTrainer, LblNceTrainer 19 | from word2embeddings.tools.util import debug 20 | 21 | log = getLogger(__name__) 22 | logger_config(log) 23 | 24 | parser = ArgumentParser() 25 | parser.add_argument('train_file', 26 | help='Document for training that contains tokenized text') 27 | 28 | parser.add_argument('--hidden-layers', dest='hidden_layers', 29 | help='Width of each hidden layer, comma separated. E.g., ' + 30 | '"28,64,32". This option only has an effect for mlp models and ' + 31 | 'for slm, where only one hidden layer is allowed.') 32 | 33 | parser.add_argument('vocabulary', 34 | help='Vocabulary file that contains list of tokens.\nCaution: Add ' + 35 | 'the special tokens , , , in this exact order at ' + 36 | 'the first positions in the vocabulary.') 37 | 38 | 39 | parser.add_argument('--sentiment-vocabulary', dest='sent_vocab', 40 | help='Vocabulary file that contains sentiment words') 41 | 42 | parser.add_argument('--predict-vocabulary', dest='pred_vocab', 43 | help='Vocabulary that contains the items that should be considered ' + 44 | 'during perplexity computation.\n' + 45 | 'Caution: Make sure this includes .\n' + 46 | 'Caution2: If this vocabulary does not contain a word that is seen ' + 47 | 'in prediction this word is not considered during perplexity ' + 48 | 'calculation.') 49 | 50 | 51 | parser.add_argument('--unigram', dest='unigram', 52 | help='file containing the unigram count (the probabilities are ' + 53 | 'calculated automatically given the counts\n ' + 54 | 'Caution: Add the ' + 55 | 'special tokens , , , in this exact order at the ' + 56 | 'first positions in the vocabulary.') 57 | parser.add_argument('--noise-samples', dest='noise_samples', type=int, 58 | help='number of noise samples per data sample') 59 | parser.add_argument('--nce-seed', dest='nce_seed', type=int, default=2345, 60 | help='seed for the noise sample generation in NCE') 61 | 62 | 63 | parser.add_argument('--validation-file', dest='validation_file', nargs='+', 64 | help='Files for validation that contains tokenized text. Multiple ' + 65 | 'files are supported, with the first file being the main validation ' + 66 | 'file, i.e., if --dump-best is active, then the performance on the ' + 67 | 'first file is considered.\n ' + 68 | 'Note: For all LBL based models the validation cost will be ' + 69 | 'different even if you provide the same validation file twice, ' + 70 | 'because the NCE cost computation involves a randomized process.') 71 | 72 | parser.add_argument('--perplexity', action='store_true', 73 | help='instead of calculating the error on the validation set, ' + 74 | 'additionally calculate the perplexity. Caution: does only work ' + 75 | 'for vLBL models. 
Note: using ppl in validation is slower.') 76 | 77 | 78 | parser.add_argument('--disable-padding', dest='disable_padding', 79 | action='store_true', default=False, 80 | help='Disable padding sentences while generating examples') 81 | 82 | parser.add_argument('--learn-eos', dest='learn_eos', 83 | action='store_true', default=False, 84 | help='Learn word embedding for the end-of-sentence token .') 85 | 86 | 87 | parser.add_argument('--load-model', dest='load_model', 88 | help='Proceed training with the given model file.') 89 | 90 | parser.add_argument('--model-type', dest='model_type', 91 | choices=['ColWes08', 'sent_1', 'sent_2', 'vlbl', 'nvlbl', 92 | 'vlbl_sent', 'simple_vlbl', 'simple_vlbl_sent', 'vlbl_dist', 93 | 'lbl', 'nlbl', 'slm'], 94 | default='ColWes08', 95 | help='Type of the model to use for training. All sentiment models ' + 96 | 'require a sentiment vocabulary.') 97 | 98 | parser.add_argument('--activation-func', dest='activation_func', default='rect', 99 | choices=['sigmoid', 'tanh', 'rect', 'softsign'], 100 | help='Activation function to use in non-linear models.') 101 | 102 | 103 | parser.add_argument('--left-context', dest='left_context', type=int, 104 | default=2, 105 | help='Left context window to be used measured from the current token') 106 | 107 | parser.add_argument('--right-context', dest='right_context', type=int, 108 | default=2, 109 | help='Right context window measured from the current token') 110 | 111 | parser.add_argument('--word-embedding-size', dest='word_embedding_size', 112 | type=int, default=64) 113 | 114 | 115 | # Argument for MiniBatchTrainer 116 | parser.add_argument('--epochs-limit', dest='epochs_limit', type=int, default=-1, 117 | help='maximal number of epochs to train (-1 for no limit)') 118 | 119 | parser.add_argument('--examples-limit', dest='examples_limit', type=int, 120 | default=-1, 121 | help='maximal number of examples to train (-1 for no limit)') 122 | 123 | parser.add_argument('--early-stopping', dest='early_stopping', type=int, 124 | default=-1, 125 | help='Stop the training when N consecutive validations resulted in ' + \ 126 | 'worse results than the validation before. -1 to deactivate this ' + \ 127 | 'feature.') 128 | 129 | 130 | parser.add_argument('--batch-size', dest='batch_size', type=int, default=16) 131 | 132 | 133 | parser.add_argument('--learning-rate', dest='learning_rate', 134 | default=0.1, 135 | help='Learning rate. If this parameter is a float value than the ' + 136 | 'learning rate is valid for all model parameters. Otherwise, it can ' + 137 | 'contain parameter specific learning rates in using the pattern ' + 138 | '"param_name1:param_learning_rate1,param_name2:param_learning_rate2\.' + 139 | 'You can also specify a learning rate for only some of your ' + 140 | 'parameters and assign the default learning rate for all other ' + 141 | 'parameters by specifying "default:default_learning_rate".') 142 | 143 | parser.add_argument('--lr-adaptation', dest='lr_adaptation_method', 144 | choices=['constant', 'linear', 'adagrad', 'MniTeh12'], 145 | default='constant', 146 | help='Sets the method that is used to reduce the learning rate. 
' + 147 | 'Supports "linear" (linear reduction) and "adagrad" (AdaGrad ' + 148 | 'algorithm), and "constant" (no reduction), "MniTeh12" (halves the ' + 149 | 'learning rate whenever the validation perplexity (if "--perplexity" ' + 150 | 'is given) or error (otherwise) goes up; for details see [MniTeh12])') 151 | 152 | parser.add_argument('--learning-method', dest='learning_method', 153 | choices=['fan_in', 'global'], default='global', 154 | help='Determine the method that learning rate is calculated. Two ' + 155 | 'options are available: {fan_in, global}') 156 | 157 | 158 | parser.add_argument('--l1-weight', dest='l1_weight', type=float, default=0.0, 159 | help='Weight of L1 regularization term. 0 to deactivate. ' + 160 | 'Only implemented for LBL models and SLM.') 161 | parser.add_argument('--l2-weight', dest='l2_weight', type=float, default=0.0, 162 | help='Weight of L2 regularization term. 0 to deactivate. ' + 163 | 'Only implemented for LBL models and SLM.') 164 | 165 | parser.add_argument('--dump-period', dest='dump_period', type=int, default=-1, 166 | help='A model will be dumped every x seconds/examples (-1 = no ' + 167 | 'dumping. Only the final model will be dumped.)') 168 | 169 | parser.add_argument('--load-params', dest='load_params', nargs=2, 170 | help='Load initial values from files. This parameter requires two ' + 171 | 'arguments: (i) and (ii) a comma separated list of ' + 172 | 'parameter names as specified by the individual model. Each parameter' + 173 | 'must be stored in csv file format in an own file. The single ' + 174 | 'parameter files are then expected to be named ' + 175 | '..\n ' + 176 | 'Example usage: ~/my_model "C,R" will load ~/my_model.C and ' + 177 | '~/my_model.R.\n ' + 178 | 'Gzip and bz2 files are supported.') 179 | 180 | parser.add_argument('--store-params', dest='store_params', 181 | help='Comma-separated list of parameter names that will be stored ' + 182 | 'each time the model is stored. The parameter names as specified by ' + 183 | 'the individual model. Each parameter is stored in a separate file, ' + 184 | 'e.g., paramter C is stored in .params.C.') 185 | 186 | parser.add_argument('--out-dir', dest='out_dir', default='.', 187 | help='directory where to store the output files') 188 | 189 | parser.add_argument('--dump-vocabulary', dest='dump_vocabulary', 190 | action='store_true', 191 | help='Dump the vocabulary after importing it to remove duplicates.') 192 | 193 | parser.add_argument('--dump-embeddings', dest='dump_embeddings', 194 | action='store_true', 195 | help='Dump the embeddings for every dumped model. Caution: might ' + 196 | 'be a big file.\n ' + 197 | 'Caution: This parameter is deprecated. It\'s not supported by the ' + 198 | 'new vLBL models. Use --store-params instead.') 199 | 200 | parser.add_argument('--validation-period', dest='validation_period', 201 | type=float, default=-1, 202 | help='A model will be evaluated every y seconds/examples. (-1 ' + 203 | 'for never). If a development file is given, the scores on the ' + 204 | 'training data and the validation data is computed, otherwise only ' + 205 | 'the former is computed.') 206 | 207 | parser.add_argument('--period-type', dest='period_type', default='examples', 208 | choices=['time', 'examples'], 209 | help='Set the period to be in seconds or number of examples ' + 210 | 'by setting the option to time or examples.') 211 | 212 | parser.add_argument('--dump-best', dest='dump_best', action='store_true', 213 | help='Save the best model every validation period. 
What "best" ' + \ 214 | 'means depends on the type of model. If "--perplexity" is given, ' + \ 215 | 'it\'s the model with the lowest perplexity. If not, it\'s the ' + \ 216 | 'model with the lowest training error.') 217 | 218 | parser.add_argument('--dump-each-epoch', dest='dump_each_epoch', 219 | action='store_true', help='Dump the model after each epoch') 220 | 221 | parser.add_argument('--dump-initial-model', dest='dump_initial_model', 222 | action='store_true', 223 | help='Dump the initial model before any training is done.') 224 | 225 | 226 | parser.add_argument('--error-function', dest='error_func', 227 | default='least_squares', choices=['cross_entropy', 'least_squares'], 228 | help='defines the used error function (default: least_squares); ' + 229 | 'This parameter is only valid for MLPs.') 230 | 231 | parser.add_argument('--count-examples', dest='count_examples', 232 | action='store_true', 233 | help='Only count the examples in the training file, don\'t train a ' + 234 | 'model.') 235 | 236 | 237 | parser.add_argument('--debug-host', dest='debug_host', 238 | help='Allow remote debugging at the given host IP. Make sure you ' + 239 | 'follow the instructions at ' + 240 | 'http://pydev.org/manual_adv_remote_debugger.html. Especially, the ' + 241 | 'pydevd source must be in the PYTHONPATH and ' + 242 | 'PATHS_FROM_ECLIPSE_TO_PYTHON in pydevd_file_utils.py must be adapted.') 243 | 244 | def main(argv=None): 245 | log.info('started application') 246 | 247 | if argv is None: 248 | argv = sys.argv[1:] 249 | 250 | args = parser.parse_args(argv) 251 | 252 | check_args(args) 253 | 254 | log.info('start parameters: ' + str(args)) 255 | 256 | if args.debug_host: 257 | import pydevd 258 | pydevd.settrace(host=args.debug_host, stdoutToServer=True, 259 | stderrToServer=True) 260 | 261 | if log.level == logging.DEBUG: 262 | sys.excepthook = debug 263 | 264 | log.info('creating trainer') 265 | 266 | if args.model_type == 'ColWes08': 267 | log.info('Using ColWes08 trainer') 268 | trainer = HingeMiniBatchTrainer() 269 | elif args.model_type == 'sent_1': 270 | log.info('Using sent_1 trainer') 271 | trainer = HingeSentimentMiniBatchTrainer() 272 | elif args.model_type == 'sent_2': 273 | log.info('Using sent_2 trainer') 274 | trainer = HingeSentiment2MiniBatchTrainer() 275 | elif args.model_type == 'simple_vlbl': 276 | log.info('Using simple LBL trainer that uses noise-contrastive estimation') 277 | trainer = SimpleVLblNceTrainer() 278 | elif args.model_type == 'simple_vlbl_sent': 279 | log.info('Using simple LBL trainer that uses noise-contrastive estimation to create sentiment embeddings') 280 | trainer = SimpleVLblNceSentimentTrainer() 281 | elif args.model_type == 'vlbl': 282 | log.info('Using LBL trainer that uses noise-contrastive estimation') 283 | trainer = VLblNceTrainer() 284 | elif args.model_type == 'vlbl_sent': 285 | log.info('Using LBL trainer that uses noise-contrastive estimation to create sentiment embeddings') 286 | trainer = VLblNceSentimentTrainer() 287 | elif args.model_type == 'nvlbl': 288 | log.info('Using non-linear vLBL NCE trainer') 289 | trainer = NvLblNceTrainer() 290 | elif args.model_type == 'lbl': 291 | log.info('Using linear LBL trainer that uses noise-contrastive estimation') 292 | trainer = LblNceTrainer() 293 | elif args.model_type == 'nlbl': 294 | log.info('Using non-linear LBL trainer that uses noise-contrastive estimation') 295 | trainer = NlblNceTrainer() 296 | elif args.model_type == 'vlbl_dist': 297 | log.info('Using LBL trainer that uses distributional 
representation of input') 298 | trainer = VLblNceDistributionalTrainer() 299 | elif args.model_type == 'slm': 300 | log.info('Using shallow neural network lm with NCE') 301 | trainer = SLmNceTrainer() 302 | else: 303 | raise ValueError('Unknown model type. Abort') 304 | 305 | if args.count_examples is True: 306 | log.info('counting examples') 307 | trainer.configure(args) 308 | count = trainer.count_examples(args.train_file) 309 | log.info('examples: %d' % count) 310 | else: 311 | trainer.prepare_usage(args) 312 | log.info('training is about to begin') 313 | trainer.run() 314 | 315 | log.info('finished') 316 | 317 | def check_args(args): 318 | 319 | 320 | 321 | # if args.epochs_limit == -1 and args.examples_limit == -1: 322 | # raise ValueError('Either epochs-limit or examples-limit must be given.') 323 | pass 324 | 325 | if __name__ == '__main__': 326 | sys.exit(main()) 327 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/use_lm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! /usr/bin/env python 3 | """ 4 | """ 5 | 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import logging 9 | import sys 10 | 11 | from cis.deep.utils import logger_config 12 | from word2embeddings.nn.predictor import vLblNCEPredictor 13 | from word2embeddings.tools.util import debug 14 | 15 | 16 | log = getLogger(__name__) 17 | logger_config(log) 18 | 19 | parser = ArgumentParser() 20 | parser.add_argument('predict_file', 21 | help='Document with examples to predict the label of.') 22 | 23 | parser.add_argument('result_file', 24 | help='Document to which the predictions will be written.') 25 | 26 | parser.add_argument('vocabulary', 27 | help='Vocabulary file that contains list of tokens.') 28 | 29 | parser.add_argument('load_model', 30 | help='Proceed training with the given model file.') 31 | 32 | 33 | parser.add_argument('--predict-vocabulary', dest='pred_vocab', 34 | help='Vocabulary that contains the items that should be considered ' + 35 | 'during perplexity computation.\n' + 36 | 'Caution: Make sure this includes .\n' + 37 | 'Caution2: If this vocabulary does not contain a word that is seen ' + 38 | 'in prediction this word is not considered during perplexity ' + 39 | 'calculation.') 40 | 41 | parser.add_argument('--batch-size', dest='batch_size', type=int, default=100) 42 | 43 | 44 | parser.add_argument('-a', '--store_argmax', action='store_true', 45 | help='Store the most likely vocabulary item.') 46 | 47 | parser.add_argument('-r', '--store_rank', action='store_true', 48 | help='Store the rank of each vocabulary entry according to the ' + 49 | 'softmax.') 50 | 51 | parser.add_argument('-sm', '--store_softmax', action='store_true', 52 | help='Store the whole softmax distributions. Caution: The vocabulary ' + 53 | 'size can be very high. Therefore, the softmax output, which is a ' + 54 | 'distribution over all vocabulary items, might become very large, too.') 55 | 56 | parser.add_argument('-nr', '--normalize_with_root', action='store_true', 57 | help='Compute the root of the sm distribution and normalize the ' + 58 | 'vectors to unit length. This only has an effect when -sm is given.') 59 | 60 | parser.add_argument('-ppl', '--perplexity', action='store_true', 61 | help='Instead of calculating only the other model outputs, e.g., ' + 62 | 'softmax, etc., also compute the perplexity on the given text. 
' + 63 | 'If this parameter is given, the predict_file parameter must point ' + 64 | 'to a text file that is iterate over just as in the training, i.e., ' + 65 | 'using a window approach. That means, it does not handle single ' + 66 | 'contexts per line anymore. Caution: does only work for vLBL models. ' + 67 | 'Note: using ppl in validation is slower.') 68 | 69 | parser.add_argument('-save-word', '--save_word', action='store_true', 70 | help='Works only with -ppl parameter. Store next to probability word ' + 71 | 'from prediction vocabulary. Used during post-processing for the '+ 72 | 'right interpolation.') 73 | 74 | parser.add_argument('-pr', '--predictions', action='store_true', 75 | help='Store predicted embeddings for each context.') 76 | 77 | parser.add_argument('-i', '--information', action='store_true', 78 | help='Store additional information for every prediction, e.g., (k ' + 79 | 'nearest neighboring words).') 80 | 81 | parser.add_argument('--debug-host', dest='debug_host', 82 | help='Allow remote debugging at the given host IP. Make sure you ' + 83 | 'follow the instructions at ' + 84 | 'http://pydev.org/manual_adv_remote_debugger.html. Especially, the ' + 85 | 'pydevd source must be in the PYTHONPATH and ' + 86 | 'PATHS_FROM_ECLIPSE_TO_PYTHON in pydevd_file_utils.py must be adapted.') 87 | 88 | 89 | def main(argv=None): 90 | log.info('started application') 91 | 92 | log.warning('This script is obsolete. It will not be updated anymore and ' + 93 | 'will be deleted in the future. Use use_model.py instead.') 94 | 95 | if argv is None: 96 | argv = sys.argv[1:] 97 | 98 | args = parser.parse_args() 99 | log.info('start parameters: ' + str(args)) 100 | 101 | if args.debug_host: 102 | import pydevd 103 | pydevd.settrace(host=args.debug_host, stdoutToServer=True, 104 | stderrToServer=True) 105 | 106 | if log.level == logging.DEBUG: 107 | sys.excepthook = debug 108 | 109 | log.info('creating predictor') 110 | predictor = vLblNCEPredictor() 111 | predictor.prepare_usage(args) 112 | log.info('starting prediction') 113 | predictor.run() 114 | log.info('finished') 115 | 116 | if __name__ == '__main__': 117 | sys.exit(main()) 118 | -------------------------------------------------------------------------------- /src/word2embeddings/apps/use_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #! 
/usr/bin/env python 3 | """ 4 | """ 5 | 6 | from argparse import ArgumentParser 7 | from logging import getLogger 8 | import logging 9 | import sys 10 | 11 | from cis.deep.utils import logger_config 12 | from word2embeddings.nn.predictor import vLblNCEPredictor 13 | from word2embeddings.tools.util import debug 14 | 15 | 16 | log = getLogger(__name__) 17 | logger_config(log) 18 | 19 | parser = ArgumentParser() 20 | parser.add_argument('predict_file', 21 | help='Document with examples to predict the label of.') 22 | 23 | parser.add_argument('result_file', 24 | help='Document to which the predictions will be written.') 25 | 26 | parser.add_argument('vocabulary', 27 | help='Vocabulary file that contains list of tokens.') 28 | 29 | parser.add_argument('load_model', 30 | help='Proceed training with the given model file.') 31 | 32 | 33 | parser.add_argument('--predict-vocabulary', dest='pred_vocab', 34 | help='Vocabulary that contains the items that should be considered ' + 35 | 'during perplexity computation.\n' + 36 | 'Caution: Make sure this includes .\n' + 37 | 'Caution2: If this vocabulary does not contain a word that is seen ' + 38 | 'in prediction this word is not considered during perplexity ' + 39 | 'calculation.') 40 | 41 | parser.add_argument('--batch-size', dest='batch_size', type=int, default=100) 42 | 43 | 44 | parser.add_argument('-a', '--store_argmax', action='store_true', 45 | help='Store the most likely vocabulary item.') 46 | 47 | parser.add_argument('-r', '--store_rank', action='store_true', 48 | help='Store the rank of each vocabulary entry according to the ' + 49 | 'softmax.') 50 | 51 | parser.add_argument('-sm', '--store_softmax', action='store_true', 52 | help='Store the whole softmax distributions. Caution: The vocabulary ' + 53 | 'size can be very high. Therefore, the softmax output, which is a ' + 54 | 'distribution over all vocabulary items, might become very large, too.') 55 | 56 | parser.add_argument('-nr', '--normalize_with_root', action='store_true', 57 | help='Compute the root of the sm distribution and normalize the ' + 58 | 'vectors to unit length. This only has an effect when -sm is given.') 59 | 60 | parser.add_argument('-ppl', '--perplexity', action='store_true', 61 | help='Instead of calculating only the other model outputs, e.g., ' + 62 | 'softmax, etc., also compute the perplexity on the given text. ' + 63 | 'If this parameter is given, the predict_file parameter must point ' + 64 | 'to a text file that is iterate over just as in the training, i.e., ' + 65 | 'using a window approach. That means, it does not handle single ' + 66 | 'contexts per line anymore. Caution: does only work for vLBL models. ' + 67 | 'Note: using ppl in validation is slower.') 68 | 69 | parser.add_argument('-save-word', '--save_word', action='store_true', 70 | help='Works only with -ppl parameter. Store next to probability word ' + 71 | 'from prediction vocabulary. Used during post-processing for the '+ 72 | 'right interpolation.') 73 | 74 | parser.add_argument('-pr', '--predictions', action='store_true', 75 | help='Store predicted embeddings for each context.') 76 | 77 | parser.add_argument('-i', '--information', action='store_true', 78 | help='Store additional information for every prediction, e.g., (k ' + 79 | 'nearest neighboring words).') 80 | 81 | parser.add_argument('--debug-host', dest='debug_host', 82 | help='Allow remote debugging at the given host IP. Make sure you ' + 83 | 'follow the instructions at ' + 84 | 'http://pydev.org/manual_adv_remote_debugger.html. 
Especially, the ' + 85 | 'pydevd source must be in the PYTHONPATH and ' + 86 | 'PATHS_FROM_ECLIPSE_TO_PYTHON in pydevd_file_utils.py must be adapted.') 87 | 88 | 89 | def main(argv=None): 90 | log.info('started application') 91 | 92 | if argv is None: 93 | argv = sys.argv[1:] 94 | 95 | args = parser.parse_args() 96 | log.info('start parameters: ' + str(args)) 97 | 98 | if args.debug_host: 99 | import pydevd 100 | pydevd.settrace(host=args.debug_host, stdoutToServer=True, 101 | stderrToServer=True) 102 | 103 | if log.level == logging.DEBUG: 104 | sys.excepthook = debug 105 | 106 | log.info('creating predictor') 107 | predictor = vLblNCEPredictor() 108 | predictor.prepare_usage(args) 109 | log.info('starting prediction') 110 | predictor.run() 111 | log.info('finished') 112 | 113 | if __name__ == '__main__': 114 | sys.exit(main()) 115 | -------------------------------------------------------------------------------- /src/word2embeddings/lm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/word2embeddings/lm/__init__.py -------------------------------------------------------------------------------- /src/word2embeddings/lm/networks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Language modeling networks used by the trainer.""" 5 | 6 | import theano 7 | 8 | from word2embeddings.nn.layers import EmbeddingLayer, HingeLayer 9 | from word2embeddings.nn.networks import Network, StackedBiasedHidden 10 | 11 | 12 | class WordPhraseNetwork(Network): 13 | """Model to distinguish between corrupted phrases and observed ones.""" 14 | 15 | def __init__(self, name='WordPhrase', emb_matrix_shape=None, no_of_tokens=1, 16 | hidden_layers=[1]): 17 | super(WordPhraseNetwork, self).__init__(name=name) 18 | _, word_size = emb_matrix_shape 19 | layers = [no_of_tokens * word_size] 20 | layers.extend(hidden_layers) 21 | layers.append(1) 22 | self.word_embedding = EmbeddingLayer(name='w_embedding', 23 | shape=emb_matrix_shape) 24 | self.hidden_stack = StackedBiasedHidden(name='w_stack', layers=layers) 25 | self.loss = HingeLayer(name='loss') 26 | 27 | self.layers = [self.word_embedding, self.hidden_stack, self.loss] 28 | 29 | def link(self, inputs): 30 | self.inputs = inputs 31 | observed_phrases = inputs[0] 32 | corrupted_phrases = inputs[1] 33 | observed_words = self.word_embedding.link([observed_phrases])[0] 34 | observed_scores = self.hidden_stack.link([observed_words])[0] 35 | corrupted_scores = theano.clone(observed_scores, 36 | {observed_phrases: corrupted_phrases}) 37 | self.outputs = self.loss.link([observed_scores, corrupted_scores]) 38 | return self.outputs 39 | 40 | def get_word_embeddings(self): 41 | return self.word_embedding.weights.get_value() 42 | -------------------------------------------------------------------------------- /src/word2embeddings/nn/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | /layers.pyc 3 | /util.pyc 4 | -------------------------------------------------------------------------------- /src/word2embeddings/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/word2embeddings/nn/__init__.py 
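The `WordPhraseNetwork` in `lm/networks.py` above scores an observed phrase and a corrupted copy of it (the same window with a replaced token) and passes both score vectors to a `HingeLayer`. That layer's code is not included in this listing; the sketch below only illustrates the pairwise hinge ranking loss such a layer presumably computes, in the spirit of the 'ColWes08' (Collobert & Weston, 2008) trainer. The function name, the margin of 1.0, and the NumPy formulation are assumptions for illustration, not the repository's implementation.

```python
# Illustrative sketch only -- not part of the repository.
# Assumed form of the pairwise hinge ranking loss behind HingeLayer:
# an observed window should score at least `margin` higher than its
# corrupted counterpart; violations contribute linearly to the loss.
import numpy as np

def hinge_ranking_loss(observed_scores, corrupted_scores, margin=1.0):
    """Mean of max(0, margin - s_observed + s_corrupted) over all pairs."""
    observed_scores = np.asarray(observed_scores, dtype=float)
    corrupted_scores = np.asarray(corrupted_scores, dtype=float)
    return np.mean(np.maximum(0.0, margin - observed_scores + corrupted_scores))

# First pair is ranked correctly with room to spare (no loss);
# second pair violates the margin and contributes 1.2, so the mean is 0.6.
print(hinge_ranking_loss([2.0, 0.1], [-1.0, 0.3]))
```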
-------------------------------------------------------------------------------- /src/word2embeddings/nn/predictor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file contains classes for predicting with trained models. 4 | """ 5 | from logging import getLogger 6 | 7 | from scipy.stats import rankdata 8 | from theano import tensor as T 9 | 10 | from cis.deep.utils import load_object_from_file, text_to_vocab_indices, \ 11 | file_line_generator, sort_dict_by_label, utf8_file_open, log_iterations, \ 12 | ndarray_to_string 13 | from cis.deep.utils import logger_config 14 | from cis.deep.utils.embeddings import read_vocabulary_id_file 15 | from cis.deep.utils.theano import debug_print 16 | import numpy as np 17 | from word2embeddings.nn.layers import floatX, intX 18 | from word2embeddings.nn.trainer import MiniBatchRunner 19 | from word2embeddings.tools.examples_generator import PlainExampleGenerator, \ 20 | PaddedWindowExamplesGenerator 21 | 22 | 23 | log = getLogger(__name__) 24 | logger_config(log) 25 | 26 | class MiniBatchPredictor(MiniBatchRunner): 27 | """Base class for predictors that work in mini batches.""" 28 | example_iterator_type = PlainExampleGenerator 29 | 30 | def __init__(self): 31 | super(MiniBatchPredictor, self).__init__() 32 | 33 | def configure(self, args): 34 | super(MiniBatchPredictor, self).configure(args) 35 | self.predict_file = args.predict_file 36 | 37 | def get_model(self): 38 | self.model = load_object_from_file(self.load_model) 39 | self.predictor_method = self.model.predictor 40 | 41 | def run(self): 42 | """Predict the output of the model on all test examples. 43 | 44 | Returns 45 | ------- 46 | list 47 | list of predictions 48 | """ 49 | predictions = [] 50 | 51 | for example in self.epoch_iter(self.predict_file): 52 | predictions.append(self.model.predictor([example])) 53 | 54 | return predictions 55 | 56 | def predict_single(self): 57 | """Predict the output of the model on all test examples, yielding one 58 | example at a time. 59 | 60 | Returns 61 | ------- 62 | list 63 | list of results for the current example 64 | """ 65 | for example in self.epoch_iter(self.predict_file): 66 | yield self.predictor_method([example]) 67 | 68 | 69 | class LblPredictor(MiniBatchPredictor): 70 | 71 | def __init__(self): 72 | super(LblPredictor, self).__init__() 73 | self.input_data = T.matrix('input_data', dtype=floatX) 74 | # self.label = T.matrix('label', dtype=intX) 75 | self.inputs = [self.input_data] 76 | 77 | def configure(self, args): 78 | super(LblPredictor, self).configure(args) 79 | self.vocab = read_vocabulary_id_file(args.vocabulary) 80 | self.vocab_size = len(self.vocab.keys()) 81 | self.effective_vocab_size = len(self.vocab.keys()) 82 | 83 | def process_example(self, example): 84 | """Convert the given example into usable data structures. 85 | 86 | Splits vectors into their single values and converts the labels into ints 87 | and the data into floats. 
88 | 89 | Returns 90 | ------- 91 | list(str) 92 | input text 93 | """ 94 | # return example.split(' ') 95 | return text_to_vocab_indices(self.vocab, example)[0] 96 | 97 | 98 | class vLblNCEPredictor(MiniBatchPredictor): 99 | def __init__(self): 100 | super(vLblNCEPredictor, self).__init__() 101 | self.h_indices = debug_print(T.imatrix('h'), 'h') 102 | self.inputs = [self.h_indices] 103 | 104 | def configure(self, args): 105 | super(vLblNCEPredictor, self).configure(args) 106 | self.vocab = read_vocabulary_id_file(args.vocabulary) 107 | self.vocab_size = len(self.vocab.keys()) 108 | self.effective_vocab_size = len(self.vocab.keys()) 109 | self.perplexity = args.perplexity 110 | self.save_word = args.save_word 111 | self.result_file = args.result_file 112 | self.store_rank = args.store_rank 113 | self.store_argmax = args.store_argmax 114 | self.store_softmax = args.store_softmax 115 | self.normalize_with_root = args.normalize_with_root 116 | self.information = args.information 117 | self.predictions = args.predictions 118 | 119 | # This code is taken from SimpleVLblNceTrainer 120 | if args.pred_vocab: 121 | # Element i contains the index of the i'th prediction vocabulary 122 | # token in the original vocabulary. 123 | self.vocab_mapping_list = list() 124 | 125 | # Mapping from the model vocabulary to the prediction vocabulary 126 | # indices 127 | self.vocab_mapping = dict() 128 | 129 | for i, token in enumerate(file_line_generator(args.pred_vocab)): 130 | 131 | if token not in self.vocab: 132 | raise ValueError(('Token "%s" in prediction vocabulary ' + 133 | 'does not exist in model vocabulary.') % token) 134 | 135 | self.vocab_mapping_list.append(self.vocab[token]) 136 | self.vocab_mapping[self.vocab[token]] = i 137 | else: 138 | self.vocab_mapping_list = range(len(self.vocab)) 139 | self.vocab_mapping = dict( 140 | zip(self.vocab_mapping_list, self.vocab_mapping_list)) 141 | 142 | if self.perplexity: 143 | self.example_iterator_type = PaddedWindowExamplesGenerator 144 | self.example_processor = self._process_example_full_text 145 | self.learn_eos = True # We need to set this, because otherwise PaddedWindowExamplesGenerator will ignore end-of-sentence tags () 146 | self.disable_padding = False 147 | self.w_indices = debug_print(T.imatrix('w'), 'w') 148 | self.inputs.append(self.w_indices) 149 | else: 150 | self.example_processor = self._process_example_context_per_line 151 | 152 | def get_model(self): 153 | super(vLblNCEPredictor, self).get_model() 154 | 155 | if self.perplexity: 156 | self.left_context = self.model.left_context 157 | self.right_context = self.model.right_context 158 | 159 | def predict_single(self): 160 | """Predict the output of the model on all test examples, yielding one 161 | example at a time. 162 | 163 | Returns 164 | ------- 165 | list 166 | list of results for the current example 167 | """ 168 | for example in self.epoch_iter(self.predict_file): 169 | example = [example] 170 | 171 | if self.perplexity: 172 | # Pass only the context, not the target word 173 | yield example, self.predictor_method(zip(*example)[0]) 174 | else: 175 | yield example, self.predictor_method(example) 176 | 177 | def process_example(self, example): 178 | """Convert the given example into usable data structures. 179 | 180 | Splits vectors into their single values and converts the labels into ints 181 | and the data into floats. 
182 | 183 | Returns 184 | ------- 185 | list(str) 186 | input text 187 | """ 188 | log.debug(example) 189 | res = self.example_processor(example) 190 | log.debug(res) 191 | return res[0] 192 | 193 | def _process_example_context_per_line(self, example): 194 | """Process the given example that contains only the context and not the 195 | target word. 196 | """ 197 | return text_to_vocab_indices(self.vocab, example) 198 | 199 | def _process_example_full_text(self, example): 200 | """Process the given example that contains context and target word. 201 | 202 | The implementation is taken from SimpleVLblNceTrainer.process_example. 203 | """ 204 | idx, example = text_to_vocab_indices(self.vocab, example) 205 | return (idx[:self.model.left_context] if self.model.right_context == 0 else 206 | idx[:self.model.left_context] + idx[self.model.left_context + 1:], 207 | idx[self.model.left_context]), example 208 | 209 | def run(self): 210 | vocab = dict(self.vocab) 211 | 212 | # Get a mapping from index to word 213 | vocab_entries = sort_dict_by_label(vocab) 214 | vocab_entries = zip(*vocab_entries)[0] 215 | log_probabs = 0. 216 | num_ppl_examples = 0 217 | num_examples = 0 218 | 219 | with utf8_file_open(self.result_file, 'w') as outfile: 220 | 221 | for batch, _ in self.next_batch(self.predict_file): 222 | # Handle each prediction 223 | # for (cur_count, (example, predictions)) in enumerate(self.predict_single()): 224 | 225 | log_iterations(log, num_examples, 10000) 226 | num_examples += len(batch) 227 | 228 | if self.perplexity: 229 | batch = zip(*batch) 230 | # Pass only the context, not the target word 231 | predictions = self.predictor_method(batch[0]) 232 | else: 233 | self.predictor_method(batch) 234 | 235 | if self.store_softmax or self.store_rank or self.store_argmax \ 236 | or self.information or self.perplexity: 237 | sm, probabs, cur_log_probabs, cur_num_ppl_examples = \ 238 | self._calc_probabilities_from_similarity(batch[1], predictions[1]) 239 | num_ppl_examples += cur_num_ppl_examples 240 | 241 | if self.store_rank or self.information: 242 | # rankdata sorts ascending, i.e., distances, but we have 243 | # similarities, hence, 1-sm 244 | ranks = rankdata(1 - sm, method='min').astype(int) 245 | 246 | if self.store_rank: 247 | outfile.write(ndarray_to_string(ranks)) 248 | 249 | if self.information: 250 | unique_ranks = set(ranks) 251 | hard_idx = vocab[u'hard'] 252 | sorted_unique_ranks = ' '.join(map(str, sorted(unique_ranks))) 253 | sorted_unique_ranks = '' 254 | top_ten_entries = ' '.join([vocab_entries[i] for i in np.argsort(1 - sm)[:10]]) 255 | print '#%d\t%s\t%s' % (ranks[hard_idx], 256 | sorted_unique_ranks, 257 | top_ten_entries) 258 | 259 | if self.store_argmax: 260 | maximum = np.argmax(sm) 261 | # outfile.write(vocab_entries[maximum] + u' (%d)\t' % maximum) 262 | outfile.write(vocab_entries[maximum]) 263 | 264 | if self.store_softmax: 265 | 266 | if self.normalize_with_root: 267 | sm = np.sqrt(sm) 268 | sm = sm / np.linalg.norm(sm, 2, axis=-1) 269 | 270 | outfile.write(ndarray_to_string(sm)) 271 | 272 | if self.perplexity: 273 | 274 | if self.save_word: 275 | indices_in_predict_vocab = [self.vocab_mapping[batch[1][i]] for i in range(len(batch[1]))] 276 | indices_in_original_vocab = [self.vocab_mapping_list[i] for i in indices_in_predict_vocab] 277 | words = [self.vocab.keys()[self.vocab.values().index(i)] for i in indices_in_original_vocab] 278 | 279 | outfile.write( u'\n'.join("%s %s" % (x, y) for x, y in zip(map(unicode, probabs), words)) ) 280 | else: 281 | 
outfile.write(u'\n'.join(map(unicode, probabs))) 282 | 283 | log_probabs += cur_log_probabs if cur_log_probabs is not np.nan else 0. 284 | 285 | if self.predictions: 286 | outfile.write(ndarray_to_string(predictions[0][0])) 287 | 288 | outfile.write(u'\n') 289 | 290 | # print all results 291 | # for predictions in predictions: 292 | # outfile.write(ndarray_to_string(predictions[0][0]) + u'\t') 293 | # 294 | # if args.store_softmax: 295 | # outfile.write(ndarray_to_string(predictions[1][0]) + u'\t') 296 | # 297 | # outfile.write(vocab_entries[predictions[2][0]] + u' (%d)' % predictions[2][0]) 298 | # outfile.write(u'\n') 299 | # # outfile.write(unicode(predictions) + u'\n') 300 | if self.perplexity: 301 | ppl = np.exp(-1. / (num_ppl_examples) * log_probabs) 302 | log.info('Perplexity on %d examples is %f', num_ppl_examples, ppl) 303 | 304 | 305 | class MlpPredictor(MiniBatchPredictor): 306 | 307 | def __init__(self): 308 | super(MlpPredictor, self).__init__() 309 | self.input_data = T.matrix('input_data', dtype=floatX) 310 | self.label = T.matrix('label', dtype=intX) 311 | self.inputs = [self.label, self.input_data] 312 | 313 | def process_example(self, example): 314 | """Convert the given example in handable data structures. 315 | 316 | Splits vectors in their single values and converts the labels into ints 317 | and the data into floats. 318 | 319 | Returns 320 | ------- 321 | list(float) 322 | values 323 | """ 324 | return map(float, example.split(' ')) 325 | -------------------------------------------------------------------------------- /src/word2embeddings/nn/tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """tools.py: Tools for dealing with Networks.""" 5 | 6 | from cis.deep.utils import file_line_generator 7 | import numpy as np 8 | 9 | 10 | LOG_FORMAT = '%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s' 11 | 12 | 13 | def read_unigram_distribution(filename): 14 | """Read the unigram distribution for all vocabulary items from the file. 15 | 16 | 1 probability per line. 17 | Caution: Don't forget to add the 4 special tokens, e.g., . Besides 18 | we don't want to draw them as noise, therefore they should have 19 | a count of 0. 20 | """ 21 | unigram_dist = read_unigram_frequencies(filename) 22 | 23 | # Note: use the same datatype as Theano's floatX here, to avoid problems. 24 | return np.asarray(unigram_dist, 'float32') / np.sum(unigram_dist) 25 | 26 | def read_unigram_frequencies(filename): 27 | """Read the unigram frequencies for all vocabulary items from the file. 28 | 29 | 1 frequency per line. 30 | Caution: Don't forget to add the 4 special tokens, e.g., . Besides 31 | we don't want to draw them as noise, therefore they should have 32 | a count of 0. 
33 | """ 34 | unigram_dist = [] 35 | 36 | for line in file_line_generator(filename): 37 | unigram_dist.append(int(line)) 38 | 39 | return unigram_dist 40 | -------------------------------------------------------------------------------- /src/word2embeddings/nn/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """util.py: Useful functions""" 5 | 6 | import numpy 7 | from theano import tensor as T 8 | 9 | 10 | def zero_value(shape, dtype): 11 | return numpy.zeros(shape, dtype=dtype) 12 | 13 | def random_value_GloBen10(shape, dtype, random_generator=None, no_of_units=None): 14 | """ 15 | Return a randomly initialized matrix using a uniform distribution. 16 | 17 | Returns a randomly initialized matrix using the method proposed in 18 | [GloBen10]. 19 | 20 | Parameters 21 | ---------- 22 | shape : (int, int) 23 | size of the matrix that needs to be initialized 24 | dtype : dtype 25 | datatype of the random values 26 | random_generator : numpy.random.RandomState 27 | random number generator; if None a new instance will automatically be 28 | created 29 | no_of_units : (int, int) 30 | number of input and output dimensions; if None it will be the same as 31 | shape 32 | """ 33 | # `W` is initialized with `W_values` which is uniformely sampled 34 | # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) 35 | # for tanh activation function 36 | # the output of uniform if converted using asarray to dtype 37 | # theano.config.floatX so that the code is runable on GPU 38 | # Note : optimal initialization of weights is dependent on the 39 | # activation function used (among other things). 40 | # For example, results presented in [Xavier10] suggest that you 41 | # should use 4 times larger initial weights for sigmoid 42 | # compared to tanh 43 | # We have no info for other function, so we use the same as 44 | # tanh. 45 | if not random_generator: 46 | random_generator = numpy.random.RandomState(1234) 47 | 48 | if no_of_units is None: 49 | total_dimensions = numpy.sum(shape) 50 | else: 51 | total_dimensions = numpy.sum(no_of_units) 52 | 53 | low = -numpy.sqrt(6. / total_dimensions) 54 | high = numpy.sqrt(6. / total_dimensions) 55 | random_values = random_generator.uniform(low=low, high=high, size=shape) 56 | W_values = numpy.asarray(random_values, dtype=dtype) 57 | return W_values 58 | 59 | def random_value_normal(shape, dtype, random_generator=None): 60 | """Return a randomly initialized matrix using a normal distribution. 61 | 62 | Returns random numbers from a zero-mean Gaussian with 0.01 std dev. This 63 | std dev value has been proposed by [Hin10]. 64 | 65 | Parameters 66 | ---------- 67 | shape : (int, int) 68 | size of the matrix that needs to be initialized 69 | dtype : dtype 70 | datatype of the random values 71 | random_generator : numpy.random.RandomState 72 | random number generator; if None a new instance will automatically be 73 | created 74 | """ 75 | 76 | if not random_generator: 77 | random_generator = numpy.random.RandomState(1234) 78 | 79 | random_values = random_generator.normal(scale=0.01, size=shape) 80 | W_values = numpy.asarray(random_values, dtype=dtype) 81 | return W_values 82 | 83 | def threshold(x): 84 | """An approximation of sigmoid. 85 | 86 | More approximate and faster than ultra_fast_sigmoid. 87 | 88 | Approx in 3 parts: 0, scaled linear, 1 89 | 90 | Removing the slope and shift does not make it faster. 
91 | 92 | """ 93 | # x = theano.printing.Print('x')(x) 94 | # gt = theano.printing.Print('gt')(T.gt(x, 0.5)) 95 | # return gt 96 | return T.gt(x, 0.5) 97 | 98 | -------------------------------------------------------------------------------- /src/word2embeddings/tools/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | /theano_extensions.pyc 3 | -------------------------------------------------------------------------------- /src/word2embeddings/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinwenpeng/SciTail/96d01a4568e9da109be691c37bb844e01ee63b30/src/word2embeddings/tools/__init__.py -------------------------------------------------------------------------------- /src/word2embeddings/tools/examples_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """examples_generator.py: Examples generator for language models training.""" 5 | 6 | from cis.deep.utils import file_line_generator 7 | from cis.deep.utils.embeddings import SpecialToken 8 | 9 | 10 | class Error(Exception): 11 | """ Base class to be used for other module's exceptions.""" 12 | 13 | 14 | class SpanNotUsedError(Error): 15 | """ Raised if the a token of a particular span is not picked.""" 16 | 17 | 18 | class ExampleGenerator(object): 19 | 20 | def configure(self, options): 21 | """Configure the example generator with the given options. 22 | 23 | Parameters 24 | ---------- 25 | options : dict 26 | options dictionary 27 | """ 28 | pass 29 | 30 | def example_iter(self, filename): 31 | """Iterate over the examples in the given file. 32 | 33 | Must be implemented by each sub class. 34 | 35 | Parameters 36 | ---------- 37 | filename : str 38 | name of the file containing the examples 39 | 40 | Yields 41 | ------ 42 | examples 43 | """ 44 | raise NotImplementedError 45 | 46 | 47 | class PlainExampleGenerator(ExampleGenerator): 48 | """Reads a file containing only examples. 49 | 50 | Each line is considered one example. 51 | """ 52 | def example_iter(self, filename): 53 | 54 | for example in file_line_generator(filename): 55 | yield example 56 | 57 | raise StopIteration 58 | 59 | 60 | class LabeledExampleGenerator(PlainExampleGenerator): 61 | """Reads a file containing label and data for every example. 62 | 63 | Each line in the file is an example. The first column corresponds to the 64 | label, the second column is the input of the classifier. Columns are tab- 65 | separated. Vector-based inputs or outputs are space separated. However, this 66 | generator does not convert the values in any way. Instead, it just returns 67 | the values as strings. 
68 | """ 69 | 70 | def example_iter(self, filename): 71 | 72 | for line in super(LabeledExampleGenerator, self).example_iter(filename): 73 | yield line.split('\t') 74 | 75 | raise StopIteration 76 | 77 | 78 | class PaddedWindowExamplesGenerator(PlainExampleGenerator): 79 | """ Generates sequence of fixed-width window of tokens.""" 80 | 81 | def configure(self, options): 82 | self.left_context = options.left_context 83 | self.right_context = options.right_context 84 | self.disable_padding = options.disable_padding 85 | self.learn_eos = options.learn_eos 86 | 87 | def example_iter(self, filename): 88 | 89 | for line in super(PaddedWindowExamplesGenerator, self).example_iter(filename): 90 | line = line.split() 91 | 92 | if not self.disable_padding: 93 | line = self.pad_sent(line) 94 | elif self.learn_eos: # add eos token if we need to learn it, but do not want to do padding 95 | line.append(SpecialToken.SENT_END.value) 96 | 97 | for example in self.sent_examples(line): 98 | yield example 99 | 100 | def is_valid_example(self, _): 101 | """Checks if the given example is a valid example to process. 102 | 103 | Every subclass can specify what is a valid example. 104 | """ 105 | return True 106 | 107 | def pad_sent(self, tokens): 108 | sent = [SpecialToken.SENT_START.value] 109 | sent.extend(tokens) 110 | sent.append(SpecialToken.SENT_END.value) 111 | return sent 112 | 113 | def sent_examples(self, sent): 114 | """Turns a sentence into a number of examples. 115 | An example is like {'sources': [list of feature vectors]} 116 | """ 117 | length = len(sent) 118 | 119 | # if the padding is disabled start pos from leftcontext+1 120 | start_offset = self.left_context if self.disable_padding else 1 121 | end_offset = self.right_context if self.disable_padding else 1 122 | 123 | # if we want to learn end-of-sentence during padding, 124 | # then move end_offset to let pos cover eos token 125 | if not self.disable_padding and self.learn_eos: 126 | end_offset -= 1 127 | 128 | for pos in range(start_offset, length - end_offset): 129 | left_context = sent[max(0, pos - self.left_context): pos] 130 | right_context = sent[pos + 1: pos + 1 + self.right_context] 131 | 132 | left_diff = self.left_context - len(left_context) 133 | 134 | if left_diff > 0: 135 | left_context = left_diff * [SpecialToken.PAD.value] + \ 136 | left_context 137 | 138 | right_diff = self.right_context - len(right_context) 139 | 140 | if right_diff > 0: 141 | right_context = right_context + right_diff * \ 142 | [SpecialToken.PAD.value] 143 | 144 | example = left_context + [sent[pos]] + right_context 145 | 146 | if not self.is_valid_example(example): 147 | continue 148 | 149 | yield example 150 | 151 | 152 | class SentimentExamplesGenerator(PaddedWindowExamplesGenerator): 153 | """Extract special sentiment training examples. 154 | 155 | Extracts positive instances from the text, having the requirement that 156 | the center word of the example is contained in a sentiment vocabulary. 
157 | 158 | Attributes 159 | ---------- 160 | vocab : dict 161 | sentiment vocabulary 162 | """ 163 | 164 | def configure(self, options): 165 | """ 166 | Parameters 167 | ---------- 168 | options.vocab : dict 169 | sentiment vocabulary 170 | 171 | """ 172 | super(SentimentExamplesGenerator, self).configure(options) 173 | self.sent_vocab = options.sent_vocab 174 | 175 | def is_valid_example(self, example): 176 | return example[self.left_context] in self.sent_vocab 177 | 178 | 179 | class SentimentAnywhereExamplesGenerator(SentimentExamplesGenerator): 180 | """Extract special sentiment training examples. 181 | 182 | Extracts examples from the text, having the requirement that 183 | at least one token of the example is contained in a sentiment vocabulary. 184 | """ 185 | 186 | def is_valid_example(self, example): 187 | return any((e in self.sent_vocab for e in example)) 188 | -------------------------------------------------------------------------------- /src/word2embeddings/tools/theano_extensions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy 3 | from theano import Op, Apply 4 | from theano.sandbox.rng_mrg import MRG_RandomStreams 5 | from theano.tensor.basic import as_tensor_variable 6 | 7 | import theano.tensor as T 8 | 9 | 10 | class MRG_RandomStreams2(MRG_RandomStreams): 11 | """Module component with similar interface to numpy.random 12 | (numpy.random.RandomState) 13 | """ 14 | 15 | def __init__(self, seed=12345, use_cuda=None): 16 | """ 17 | :type seed: int or list of 6 int. 18 | 19 | :param seed: a default seed to initialize the random state. 20 | If a single int is given, it will be replicated 6 times. 21 | The first 3 values of the seed must all be less than M1 = 2147483647, 22 | and not all 0; and the last 3 values must all be less than 23 | M2 = 2147462579, and not all 0. 24 | 25 | """ 26 | super(MRG_RandomStreams2, self).__init__(seed, use_cuda) 27 | 28 | def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int32', 29 | nstreams=None): 30 | """ 31 | Sample `n` (currently `n` needs to be 1) times from a multinomial 32 | distribution defined by probabilities pvals. 33 | 34 | Example : pvals = [[.98, .01, .01], [.01, .98, .01]] will 35 | probably result in [[1,0,0],[0,1,0]]. 36 | 37 | .. note:: 38 | -`size` and `ndim` are only there keep the same signature as other 39 | uniform, binomial, normal, etc. 40 | todo : adapt multinomial to take that into account 41 | 42 | -Does not do any value checking on pvals, i.e. there is no 43 | check that the elements are non-negative, less than 1, or 44 | sum to 1. 
passing pvals = [[-2., 2.]] will result in 45 | sampling [[0, 0]] 46 | """ 47 | if pvals is None: 48 | raise TypeError('You have to specify pvals') 49 | pvals = as_tensor_variable(pvals) 50 | if size is not None: 51 | if any([isinstance(i, int) and i <= 0 for i in size]): 52 | raise ValueError( 53 | 'The specified size contains a dimension with value <= 0', 54 | size) 55 | 56 | if n == 1 and pvals.ndim == 1: 57 | if ndim is not None: 58 | raise ValueError('Provided an ndim argument to ' + 59 | 'MRG_RandomStreams2.multinomial, which does not use ' + 60 | 'the ndim argument.') 61 | unis = self.uniform(size=size, ndim=2, nstreams=nstreams) 62 | op = MultinomialFromUniform2(dtype) 63 | return op(pvals, unis) 64 | else: 65 | raise NotImplementedError('MRG_RandomStreams2.multinomial only ' + 66 | ' implemented with n == 1 and pvals.ndim = 2') 67 | 68 | 69 | class MultinomialFromUniform2(Op): 70 | '''Converts samples from a uniform into sample from a multinomial. 71 | 72 | This random number generator is faster than the standard one of Theano, 73 | because it stops earlier and doesn't return matrices of zeros and ones, 74 | indicating which index was drawn. Instead it returns the index of the drawn 75 | element. 76 | ''' 77 | def __init__(self, odtype): 78 | self.odtype = odtype 79 | 80 | def __eq__(self, other): 81 | return type(self) == type(other) and self.odtype == other.odtype 82 | 83 | def __hash__(self): 84 | return hash((type(self), self.odtype)) 85 | 86 | def __str__(self): 87 | return '%s{%s}' % (self.__class__.__name__, self.odtype) 88 | 89 | def __setstate__(self, dct): 90 | self.__dict__.update(dct) 91 | try: 92 | self.odtype 93 | except AttributeError: 94 | self.odtype = 'auto' 95 | 96 | def make_node(self, pvals, unis): 97 | pvals = T.as_tensor_variable(pvals) 98 | unis = T.as_tensor_variable(unis) 99 | if pvals.ndim != 1: 100 | raise NotImplementedError('pvals ndim should be 1', pvals.ndim) 101 | if unis.ndim != 2: 102 | raise NotImplementedError('unis ndim should be 2', unis.ndim) 103 | if self.odtype == 'auto': 104 | odtype = pvals.dtype 105 | else: 106 | odtype = self.odtype 107 | out = T.tensor(dtype=odtype, broadcastable=unis.type.broadcastable) 108 | return Apply(self, [pvals, unis], [out]) 109 | 110 | def grad(self, ins, outgrads): 111 | pvals, unis = ins 112 | (gz,) = outgrads 113 | return [T.zeros_like(x) for x in ins] 114 | 115 | # def c_code_cache_version(self): 116 | # return (5,) 117 | 118 | def c_code(self, node, name, ins, outs, sub): 119 | (pvals, unis) = ins 120 | (z,) = outs 121 | 122 | fail = sub['fail'] 123 | return """ 124 | if (PyArray_NDIM(%(pvals)s) != 1) 125 | { 126 | PyErr_Format(PyExc_TypeError, "pvals wrong rank"); 127 | %(fail)s; 128 | } 129 | if (PyArray_NDIM(%(unis)s) != 2) 130 | { 131 | PyErr_Format(PyExc_TypeError, "unis wrong rank"); 132 | %(fail)s; 133 | } 134 | 135 | if ((NULL == %(z)s) 136 | || ((PyArray_DIMS(%(z)s))[0] != (PyArray_DIMS(%(unis)s))[0]) 137 | || ((PyArray_DIMS(%(z)s))[1] != (PyArray_DIMS(%(unis)s))[1]) 138 | ) 139 | { 140 | Py_XDECREF(%(z)s); 141 | %(z)s = (PyArrayObject*) PyArray_ZEROS(2, 142 | PyArray_DIMS(%(unis)s), 143 | type_num_%(z)s, 144 | 0); 145 | if (!%(z)s) 146 | { 147 | PyErr_SetString(PyExc_MemoryError, "failed to alloc z output"); 148 | %(fail)s; 149 | } 150 | } 151 | 152 | { // NESTED SCOPE 153 | 154 | const int nb_outcomes = PyArray_DIMS(%(pvals)s)[0]; 155 | const int nb_rows = PyArray_DIMS(%(unis)s)[0]; 156 | const int nb_cols = PyArray_DIMS(%(unis)s)[1]; 157 | 158 | // 159 | // For each multinomial, loop over 
each possible outcome 160 | // 161 | for (int row = 0; row < nb_rows; ++row) 162 | { 163 | for (int col = 0; col < nb_cols; ++col) { 164 | // std::cout << row << 'x' << col << std::endl; 165 | 166 | dtype_%(pvals)s cummul = 0.; 167 | const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR2(%(unis)s, row, col); 168 | dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, row, col); 169 | *z_nm = -1; 170 | 171 | // std::cout << "unis " << (int)(*unis_n * 100) << std::endl; 172 | // std::cout << "z_nm " << (int)(*z_nm * 100) << std::endl; 173 | 174 | for (int m = 0; m < nb_outcomes; ++m) 175 | { 176 | const dtype_%(pvals)s* pvals_m = (dtype_%(pvals)s*)PyArray_GETPTR1(%(pvals)s, m); 177 | cummul += *pvals_m; 178 | // std::cout << "cummul " << (int)(cummul * 100) << std::endl; 179 | 180 | if (cummul > *unis_n) 181 | { 182 | *z_nm = m; 183 | // *z_nm = 17; 184 | break; 185 | } 186 | 187 | } 188 | 189 | // If we reached the end, use the last value. 190 | // If we have a real distribution [0,1], than this should never 191 | // happen, right? I got a segmentation fault when removing it. 192 | // 2014-04-08 193 | // This might happen due to rounding errors. 2014-05-01 194 | if (*z_nm == -1) { 195 | *z_nm = nb_outcomes - 1; 196 | } 197 | } 198 | } 199 | } // END NESTED SCOPE 200 | """ % locals() 201 | 202 | def perform(self, node, ins, outs): 203 | (pvals, unis) = ins 204 | (z,) = outs 205 | 206 | if z[0] is None or z[0].shape != numpy.sum(unis.shape): 207 | z[0] = numpy.zeros(unis.shape, dtype=node.outputs[0].dtype) 208 | 209 | z[0][:, :] = -1 210 | 211 | nb_outcomes = pvals.shape[0] 212 | 213 | for row in xrange(unis.shape[0]): 214 | for col in xrange(unis.shape[1]): 215 | cummul = 0 216 | unis_n = unis[row, col] 217 | 218 | for m in range(nb_outcomes): 219 | cummul += pvals[m] 220 | 221 | if cummul > unis_n: 222 | z[0][row, col] = m 223 | # z[0][row, col] = 13 224 | break 225 | 226 | # If we reached the end, use the last value. 227 | # If we have a real distribution [0,1], than this should never 228 | # happen, right? I got a segmentation fault when removing it. 229 | # 2014-04-08 230 | # This might happen due to rounding errors. 2014-05-01 231 | if z[0][row, col] == -1: 232 | z[0][row, col] = nb_outcomes - 1; 233 | -------------------------------------------------------------------------------- /src/word2embeddings/tools/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """util.py: Collection of useful utilities.""" 5 | 6 | from itertools import islice, izip_longest 7 | import re 8 | import sys 9 | 10 | from cis.deep.utils import file_line_generator 11 | 12 | 13 | LOG_FORMAT = '%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s' 14 | 15 | def Enum(**enums): 16 | """An enumeration factory class.""" 17 | obj = type('Enum', (), enums) 18 | obj.named_value = dict([(a, v) for a, v in vars(obj).items() if not a.startswith('__')]) 19 | obj.value_named = dict([(v, a) for a, v in obj.named_value.items()]) 20 | return obj 21 | 22 | def debug(type_, value, tb): 23 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 24 | # we are in interactive mode or we don't have a tty-like 25 | # device, so we call the default hook 26 | sys.__excepthook__(type_, value, tb) 27 | else: 28 | import traceback, pdb 29 | # we are NOT in interactive mode, print the exception... 30 | traceback.print_exception(type_, value, tb) 31 | print('\n') 32 | # ...then start the debugger in post-mortem mode. 
33 | pdb.pm() 34 | 35 | def extract_results_from_logfile(logfile, result='train_error', fmt='new', 36 | no_of_val_files=1): 37 | """Extract results from a given logfile and returns them as ndarray. 38 | 39 | Parameters 40 | ---------- 41 | logfile : str 42 | path of the logfile 43 | result : str 44 | type of the result to be extracted; one of 'train_error', 45 | 'val_error', 'val_ppl' 46 | format : str 47 | 'new' or 'old', new format allows several validation files; old format 48 | only allowed 1 validation file. 49 | no_of_val_files : int 50 | number of validation files used in the logfile; is only matters if 51 | result = 'val_error' or 'val_perplexity' 52 | 53 | Returns 54 | ------- 55 | ndarray 56 | contains all results in an array 57 | """ 58 | 59 | if fmt == 'old': 60 | val_method_name = 'validate' 61 | else: 62 | val_method_name = '_validate_single_file' 63 | 64 | 65 | if result == 'train_error': 66 | pattern = re.compile(r'run\tAverage loss on .*? training set is (.*)', 67 | re.UNICODE) 68 | elif result == 'val_error': 69 | pattern = re.compile( 70 | r'%s\tAverage loss on .*? validation set is (.*)' % val_method_name, 71 | re.UNICODE) 72 | elif result == 'val_ppl': 73 | pattern = re.compile( 74 | r'%s\tPerplexity on .*? validation set is (.*)' % val_method_name, 75 | re.UNICODE) 76 | else: 77 | raise ValueError('Unknown result type to be extracted from logfile: %s' 78 | % result) 79 | 80 | values = list() 81 | 82 | for line in file_line_generator(logfile): 83 | match = re.search(pattern, line) 84 | 85 | if not match: 86 | continue 87 | 88 | values.append(float(match.group(1))) 89 | 90 | # Converts the 1d list of results into one list per validation file. 91 | if (result == 'val_error' or result == 'val_ppl') and no_of_val_files != 1: 92 | values = list(grouper_recipes(values, no_of_val_files)) 93 | values = zip(*values) 94 | 95 | return values 96 | 97 | def grouper(iterable, n): 98 | """Group n items from the iterable into a group. 99 | 100 | Parameters 101 | ---------- 102 | iterable : any 103 | iterator to get the items from 104 | n : int 105 | number of items to form one group 106 | 107 | Returns 108 | ------- 109 | tuple(items) 110 | tuple of n items taken from the iterator 111 | """ 112 | chunk = tuple(islice(iterable, n)) 113 | 114 | if not chunk: 115 | return 116 | yield chunk 117 | 118 | def grouper_recipes(iterable, n, fillvalue=None): 119 | """Collect data into fixed-length chunks or blocks. 120 | Grouper taken from https://docs.python.org/2/library/itertools.html. 121 | """ 122 | # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx 123 | args = [iter(iterable)] * n 124 | return izip_longest(fillvalue=fillvalue, *args) 125 | 126 | def prepare_brown_signature(signature, max_size, add_right=False): 127 | """Convert variable length signatures into fixed length ones. 128 | 129 | Prepends zeros to the front of the signature. 
130 | 131 | Parameters 132 | ---------- 133 | signature : str 134 | brown signature a string (space separated) 135 | max_size : int 136 | size of the fixed signature 137 | add_right : bool 138 | indicates whether to add the padding zeros to the right of the signature 139 | instead of the left 140 | 141 | Returns 142 | ------- 143 | str 144 | fixed length brown signature 145 | 146 | Example 147 | ------- 148 | >>> prepare_brown_signature(u'1 1', 4) 149 | u'0 0 1 1' 150 | 151 | >>> prepare_brown_signature(u'1 1 1 1', 4) 152 | u'1 1 1 1' 153 | 154 | >>> prepare_brown_signature(u'1 1', 4, True) 155 | u'1 1 0 0' 156 | 157 | >>> prepare_brown_signature(u'1 1 1 1', 4, True) 158 | u'1 1 1 1' 159 | """ 160 | sig_len = len(signature.split()) 161 | needed_padding = max_size - sig_len 162 | 163 | if needed_padding == 0: 164 | return signature 165 | 166 | padding = u' '.join([u'0' for _ in xrange(needed_padding)]) 167 | return padding + u' ' + signature \ 168 | if not add_right else signature + u' ' + padding 169 | --------------------------------------------------------------------------------
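As a closing illustration, here is a small self-contained sketch (not part of the repository) that mirrors the fixed-width window extraction implemented by PaddedWindowExamplesGenerator in src/word2embeddings/tools/examples_generator.py above, with padding enabled and learn_eos=True as configured by vLblNCEPredictor. The literal tokens '<s>', '</s>' and '<pad>' are stand-ins for the SpecialToken values from cis.deep.utils.embeddings, which are not shown in this dump.

def window_examples(tokens, left_context=2, right_context=1):
    # pad_sent(): wrap the sentence in sentence-start / sentence-end markers
    sent = ['<s>'] + tokens + ['</s>']
    # learn_eos=True: also yield a window centered on the end-of-sentence token
    for pos in range(1, len(sent)):
        left = sent[max(0, pos - left_context):pos]
        right = sent[pos + 1:pos + 1 + right_context]
        # pad short contexts at the sentence borders
        left = ['<pad>'] * (left_context - len(left)) + left
        right = right + ['<pad>'] * (right_context - len(right))
        yield left + [sent[pos]] + right

print(list(window_examples('the cat sat'.split())))
# [['<pad>', '<s>', 'the', 'cat'], ['<s>', 'the', 'cat', 'sat'],
#  ['the', 'cat', 'sat', '</s>'], ['cat', 'sat', '</s>', '<pad>']]

The center word of each window sits at index left_context, which is exactly the position the sentiment example generators check against their sentiment vocabulary.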