├── .gitignore ├── README.md ├── convolution ├── __init__.py └── cnn.py ├── embeddings ├── __init__.py └── text_embeddings.py ├── eval.py ├── evaluation ├── __init__.py └── confusion_matrix.py ├── graphs ├── __init__.py └── graph.py ├── helpers ├── __init__.py ├── data_helper.py ├── data_shaper.py └── io_helper.py ├── ml ├── __init__.py ├── batcher.py ├── loss_functions.py └── trainer.py ├── nlp.py ├── scaler.py ├── sts ├── __init__.py └── simple_sts.py ├── supervised-scaler.py ├── wfcode ├── __init__.py ├── corpus.py └── scaler.py └── wordfish.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SemScale 2 | An easy-to-use tool for semantic scaling of political text, based on word embeddings. Check out the working draft of our [political science article](https://arxiv.org/pdf/1904.06217.pdf) (plus its [online appendix](https://umanlp.github.io/semantic-scaling/)) and the [original NLP paper](https://ub-madoc.bib.uni-mannheim.de/42002/1/E17-2109.pdf). 3 | 4 | ## How to use it 5 | 6 | Clone or download the project, then go into the SemScale directory. The script scaler.py needs just the following inputs: 7 | 8 | __datadir__ -> A path to the directory containing the input text 9 | files for scaling (one score will be assigned per 10 | file). 11 | 12 | __embs__ -> A path to the file containing pre-trained word 13 | embeddings 14 | 15 | __output__ -> A file path to which to store the scaling results. 16 | 17 | 18 | optional arguments: 19 | 20 | -h, --help -> show this help message and exit 21 | 22 | --stopwords STOPWORDS -> A path to the file containing stopwords 23 | 24 | --emb_cutoff EMB_CUTOFF -> A cutoff on the vocabulary size of the embeddings.
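For example, to pass both optional arguments along with the required ones, the command from the Run it! section below can be extended as follows (the stopword file name and the cutoff value here are purely illustrative):

``
python scaler.py path-to-embeddings-file path-to-input-folder output.txt --stopwords path-to-stopwords-file --emb_cutoff 200000
``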
25 | 26 | ### Data directory 27 | 28 | The expected input is in the one-text-per-file format. Each text file in the referenced directory should contain a language code (e.g., "en") in the first line, i.e., the format should be "*language*\n*text of the file*". 29 | 30 | ### (Multilingual) Word Embeddings 31 | 32 | For an easy set-up, we provide pre-trained FastText embeddings in a single file for the following five languages: English, French, German, Italian, and Spanish; the file can be obtained from [here](https://drive.google.com/file/d/1Oy61TV0DpruUXOK9qO3IFsvL5DMvwGwD/view?usp=sharing). 33 | 34 | Nonetheless, you can easily use the tool for texts in other languages or with different word embeddings, as long as you: 35 | 36 | 1) provide a language-prefixed word embedding file, formatted as follows: each entry consists of the language abbreviation, a double underscore, the word, and then the word embedding (for instance, each word in a Bulgarian word embeddings file would be prefixed with "bg__"); 37 | 38 | 2) if you employ embeddings in a language other than the five listed above, update the list of supported languages at the beginning of the code file *nlp.py* and at the beginning of the task script you're using (e.g., *scaler.py*). 39 | 40 | ### Output File 41 | 42 | A simple .txt file, which will contain one line per input file with the filename and its positional score. 43 | 44 | ### (Optional) Stopwords 45 | 46 | Stopwords can be automatically excluded by providing this input file (one stopword per line). 47 | 48 | ### Prerequisites 49 | 50 | The script requires basic libraries from the Python scientific stack: *numpy* (tested with version 1.12.1), *scipy* (tested with version 0.19.0), and *nltk* (tested with version 3.2.3). 51 | 52 | ## Run it! 53 | 54 | In the SemScale folder, just run the following command: 55 | 56 | `` 57 | python scaler.py path-to-embeddings-file path-to-input-folder output.txt 58 | `` 59 | 60 | ## Other functionalities 61 | 62 | To use the supervised scaling version of our approach (dubbed __SemScores__), just run: 63 | 64 | `` 65 | python supervised-scaler.py 66 | `` 67 | 68 | and add as final arguments the two pivot texts to be used. 69 | 70 | We also offer a Python implementation of the well-known Wordfish algorithm for text scaling. To see how to use it, just run: 71 | 72 | `` 73 | python wordfish.py -h 74 | `` 75 | 76 | Additional functionalities (classification, topical-scaling) are available in the [main branch](https://github.com/codogogo/topfish) of this project. 77 | 78 | ## License 79 | 80 | Creative Commons License
This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 81 | 82 | ## Referencing 83 | 84 | If you're using this tool, please cite the following paper: 85 | 86 | ``` 87 | @InProceedings{glavavs-nanni-ponzetto:2017:EACLshort, 88 | author = {Glava\v{s}, Goran and Nanni, Federico and Ponzetto, Simone Paolo}, 89 | title = {Unsupervised Cross-Lingual Scaling of Political Texts}, 90 | booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers}, 91 | month = {April}, 92 | year = {2017}, 93 | address = {Valencia, Spain}, 94 | publisher = {Association for Computational Linguistics}, 95 | pages = {688--693}, 96 | url = {http://www.aclweb.org/anthology/E17-2109} 97 | } 98 | ``` 99 | -------------------------------------------------------------------------------- /convolution/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codogogo/topfish/6b3f5723029616cb430d6226bc59c013fe79eb78/convolution/__init__.py -------------------------------------------------------------------------------- /convolution/cnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from helpers import io_helper 4 | 5 | def load_labels_and_max_length(path): 6 | parameters, model = io_helper.deserialize(path) 7 | return parameters["dist_labels"], parameters["max_text_length"] 8 | 9 | def load_model(path, embeddings, loss_function, just_predict = True): 10 | parameters, model = io_helper.deserialize(path) 11 | 12 | print("Defining and initializing model...") 13 | classifier = CNN(embeddings = (parameters["embedding_size"], embeddings), num_conv_layers = parameters["num_convolutions"], filters = parameters["filters"], k_max_pools = parameters["k_max_pools"], manual_features_size = parameters["manual_features_size"]) 14 | classifier.define_model(parameters["max_text_length"], parameters["num_classes"], loss_function, -1, l2_reg_factor = parameters["reg_factor"], update_embeddings = parameters["upd_embs"]) 15 | if not just_predict: 16 | classifier.define_optimization(learning_rate = parameters["learning_rate"]) 17 | 18 | print("Initializing session...", flush = True) 19 | session = tf.InteractiveSession() 20 | session.run(tf.global_variables_initializer()) 21 | 22 | classifier.set_variable_values(session, model) 23 | classifier.set_distinct_labels(parameters["dist_labels"]) 24 | 25 | return classifier, session 26 | 27 | class CNN(object): 28 | """ 29 | A general convolutional neural network for text classification. 30 | The CNN is highly customizable, the user may determine the number of convolutional and pooling layers and all other parameters of the network (e.g., the number of filters and filter sizes) 31 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 
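Illustrative usage (the embedding matrix, its dimensionality, and all values below are placeholders, not recommended settings):
classifier = CNN(embeddings = (300, emb_matrix), num_conv_layers = 1, filters = [[(3, 64), (4, 128), (5, 64)]], k_max_pools = [1])
classifier.define_model(max_text_length, num_classes, loss_function, -1)
classifier.define_optimization(learning_rate = 1e-3)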
32 | """ 33 | 34 | def __init__(self, embeddings = (100, None), num_conv_layers = 1, filters = [[(3, 64), (4, 128), (5, 64)]], k_max_pools = [1], manual_features_size = 0): 35 | self.emb_size = embeddings[0] 36 | self.embs = embeddings[1] 37 | self.num_convolutions = num_conv_layers 38 | self.filters = filters 39 | self.k_max_pools = k_max_pools 40 | 41 | self.variable_memory = {} 42 | self.manual_features_size = manual_features_size 43 | 44 | def define_model(self, max_text_length, num_classes, loss_function, vocab_size, l2_reg_factor = 0.0, update_embeddings = False): 45 | self.update_embeddings = update_embeddings 46 | self.reg_factor = l2_reg_factor 47 | self.max_text_length = max_text_length 48 | self.num_classes = num_classes 49 | self.loss_function = loss_function 50 | 51 | self.input_x = tf.placeholder(tf.int32, [None, max_text_length], name="input_x") 52 | if self.manual_features_size > 0: 53 | self.manual_features = tf.placeholder(tf.float32, [None, self.manual_features_size], name="man_feats") 54 | self.dropout = tf.placeholder(tf.float32, name="dropout") 55 | 56 | if self.embs is None: 57 | self.W_embeddings = tf.Variable(tf.random_uniform([vocab_size, self.emb_size], -1.0, 1.0), name="W_embeddings") 58 | elif update_embeddings: 59 | self.W_embeddings = tf.Variable(self.embs, dtype = tf.float32, name="W_embeddings") 60 | else: 61 | self.W_embeddings = tf.constant(self.embs, dtype = tf.float32, name="W_embeddings") 62 | 63 | self.mb_embeddings = tf.expand_dims(tf.nn.embedding_lookup(self.W_embeddings, self.input_x), -1) 64 | 65 | for i in range(self.num_convolutions): 66 | current_filters = self.filters[i] 67 | current_max_pool_size = self.k_max_pools[i] 68 | 69 | if i > 0: 70 | pooled = tf.reshape(pooled, [-1, self.k_max_pools[i - 1], sum_filt, 1]) 71 | 72 | input = self.mb_embeddings if i == 0 else pooled 73 | input_dim = self.emb_size if i == 0 else sum_filt 74 | num_units = max_text_length if i == 0 else self.k_max_pools[i - 1] 75 | 76 | sum_filt = 0 77 | for filter_size, num_filters in current_filters: 78 | filter_shape = [filter_size, input_dim, 1, num_filters] 79 | 80 | W_conv = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1, dtype = tf.float32), name="W_conv_" + str(i) + "_" + str(filter_size)) 81 | self.variable_memory["W_conv_" + str(i) + "_" + str(filter_size)] = W_conv 82 | 83 | b_conv = tf.Variable(tf.constant(0.1, shape=[num_filters], dtype = tf.float32), name="b_" + str(i) + "_" + str(filter_size)) 84 | self.variable_memory["b_" + str(i) + "_" + str(filter_size)] = b_conv 85 | 86 | conv = tf.nn.conv2d(input, W_conv, strides=[1, 1, 1, 1], padding="VALID", name="conv_" + str(i) + "_" + str(filter_size)) 87 | h = tf.nn.relu(tf.nn.bias_add(conv, b_conv), name="relu" + str(i) + "_" + str(filter_size)) 88 | 89 | if sum_filt == 0: 90 | pooled = tf.nn.max_pool(h, ksize=[1, (num_units - filter_size + 1) - current_max_pool_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name="pool_" + str(i) + "_" + str(filter_size)) 91 | 92 | else: 93 | new_pool = tf.nn.max_pool(h, ksize=[1, (num_units - filter_size + 1) - current_max_pool_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name="pool_" + str(i) + "_" + str(filter_size)) 94 | pooled = tf.concat(axis=3, values=[pooled, new_pool]) 95 | 96 | sum_filt += num_filters 97 | 98 | self.pooled_flat = tf.reshape(pooled, [-1, self.k_max_pools[-1] * sum_filt]) 99 | self.pooled_dropout = tf.nn.dropout(self.pooled_flat, self.dropout) 100 | 101 | W_softmax = tf.get_variable("W_softmax", shape=[self.k_max_pools[-1] * 
sum_filt + self.manual_features_size, num_classes], initializer=tf.contrib.layers.xavier_initializer(), dtype = tf.float32) 102 | self.variable_memory["W_softmax"] = W_softmax 103 | 104 | b_softmax = tf.Variable(tf.constant(0.1, shape=[num_classes], dtype = tf.float32), name="b_softmax") 105 | self.variable_memory["b_softmax"] = b_softmax 106 | 107 | self.final_features = tf.concat(axis=1, values=[self.pooled_dropout, self.manual_features]) if self.manual_features_size > 0 else self.pooled_dropout 108 | self.preds = tf.nn.xw_plus_b(self.final_features, W_softmax, b_softmax, name="scores") 109 | #self.preds_sftmx = tf.nn.softmax(self.preds) 110 | 111 | self.l2_loss = tf.constant(0.0) 112 | self.l2_loss += tf.nn.l2_loss(W_softmax) 113 | self.l2_loss += tf.nn.l2_loss(b_softmax) 114 | 115 | 116 | def define_optimization(self, learning_rate = 1e-3): 117 | self.input_y = tf.placeholder(tf.float32, [None, self.num_classes], name="input_y") 118 | self.pure_loss = self.loss_function(self.preds, self.input_y) 119 | self.loss = self.pure_loss + self.reg_factor * self.l2_loss 120 | 121 | self.learning_rate = learning_rate 122 | self.train_step = tf.train.RMSPropOptimizer(learning_rate).minimize(self.loss) 123 | 124 | def set_distinct_labels(self, dist_labels): 125 | self.dist_labels = dist_labels 126 | 127 | def get_feed_dict(self, input_data, labels, dropout, manual_feats = None): 128 | fd_mine = { self.input_x : input_data, self.dropout : dropout } 129 | if labels is not None: 130 | fd_mine.update({self.input_y : labels}) 131 | if manual_feats is not None: 132 | fd_mine.update({self.manual_features : manual_feats}) 133 | return fd_mine 134 | 135 | def get_variable_values(self, session): 136 | variables = {} 137 | for v in self.variable_memory: 138 | value = self.variable_memory[v].eval(session = session) 139 | variables[v] = value 140 | return variables 141 | 142 | def set_variable_values(self, session, var_values): 143 | for v in var_values: 144 | session.run(self.variable_memory[v].assign(var_values[v])) 145 | 146 | def get_hyperparameters(self): 147 | params = { "embedding_size" : self.emb_size, 148 | "num_convolutions" : self.num_convolutions, 149 | "filters" : self.filters, 150 | "k_max_pools" : self.k_max_pools, 151 | "upd_embs" : self.update_embeddings, 152 | "reg_factor" : self.reg_factor, 153 | "learning_rate" : self.learning_rate, 154 | "manual_features_size" : self.manual_features_size, 155 | "max_text_length" : self.max_text_length, 156 | "num_classes" : self.num_classes, 157 | "dist_labels" : self.dist_labels } 158 | return params 159 | 160 | def get_model(self, session): 161 | return [self.get_hyperparameters(), self.get_variable_values(session)] 162 | 163 | def serialize(self, session, path): 164 | variables = self.get_variable_values(session) 165 | to_serialize = [self.get_hyperparameters(), self.get_variable_values(session)] 166 | io_helper.serialize(to_serialize, path) 167 | 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /embeddings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codogogo/topfish/6b3f5723029616cb430d6226bc59c013fe79eb78/embeddings/__init__.py -------------------------------------------------------------------------------- /embeddings/text_embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from helpers import io_helper as ioh 3 | import codecs 4 | from helpers 
import io_helper 5 | 6 | def aggregate_phrase_embedding(words, stopwords, embs, emb_size, l2_norm_vec = True, lang = 'en'): 7 | vec_res = np.zeros(emb_size) 8 | fit_words = [w.lower() for w in words if w.lower() not in stopwords and w.lower() in embs.lang_vocabularies[lang]] 9 | if len(fit_words) == 0: 10 | return None 11 | 12 | for w in fit_words: 13 | vec_res += embs.get_vector(lang, w) 14 | res = np.multiply(1.0 / (float(len(fit_words))), vec_res) 15 | if l2_norm_vec: 16 | res = np.multiply(1.0 / np.linalg.norm(res), res) 17 | return res 18 | 19 | 20 | class Embeddings(object): 21 | """Captures functionality to load and store textual embeddings""" 22 | 23 | def __init__(self, cache_similarities = False): 24 | self.lang_embeddings = {} 25 | self.lang_emb_norms = {} 26 | self.lang_vocabularies = {} 27 | self.emb_sizes = {} 28 | self.cache = {} 29 | self.do_cache = cache_similarities 30 | 31 | def inverse_vocabularies(self): 32 | self.inverse_vocabularies = {} 33 | for l in self.lang_vocabularies: 34 | self.inverse_vocabularies[l] = {v: k for k, v in self.lang_vocabularies[l].items()} 35 | 36 | def get_word_from_index(self, index, lang = 'en'): 37 | if index in self.inverse_vocabularies[lang]: 38 | return self.inverse_vocabularies[lang][index] 39 | else: 40 | return None 41 | 42 | def get_vector(self, lang, word): 43 | if word in self.lang_vocabularies[lang]: 44 | return self.lang_embeddings[lang][self.lang_vocabularies[lang][word]] 45 | else: 46 | return None 47 | 48 | def set_vector(self, lang, word, vector): 49 | if word in self.lang_vocabularies[lang]: 50 | self.lang_embeddings[lang][self.lang_vocabularies[lang][word]] = vector 51 | 52 | def get_norm(self, lang, word): 53 | if word in self.lang_vocabularies[lang]: 54 | return self.lang_emb_norms[lang][self.lang_vocabularies[lang][word]] 55 | else: 56 | return None 57 | 58 | def set_norm(self, lang, word, norm): 59 | if word in self.lang_vocabularies[lang]: 60 | self.lang_emb_norms[lang][self.lang_vocabularies[lang][word]] = norm 61 | 62 | def add_word(self, lang, word, vector = None): 63 | if word not in self.lang_vocabularies[lang]: 64 | self.lang_vocabularies[lang][word] = len(self.lang_vocabularies[lang]) 65 | rvec = np.random.uniform(-1.0, 1.0, size = [self.emb_sizes[lang]]) if vector is None else vector 66 | rnrm = np.linalg.norm(rvec, 2) 67 | self.lang_embeddings[lang] = np.vstack((self.lang_embeddings[lang], rvec)) 68 | self.lang_emb_norms[lang] = np.concatenate((self.lang_emb_norms[lang], [rnrm])) 69 | 70 | def remove_word(self, lang, word): 71 | self.lang_vocabularies[lang].pop(word, None) 72 | 73 | def load_embeddings(self, filepath, limit, language = 'en', print_loading = "False", skip_first_line = False, min_one_letter = False, special_tokens = None): 74 | vocabulary, embs, norms = ioh.load_embeddings_dict_with_norms(filepath, limit = limit, special_tokens = special_tokens, print_load_progress = print_loading, skip_first_line = skip_first_line, min_one_letter = min_one_letter) 75 | self.lang_embeddings[language] = embs 76 | self.lang_emb_norms[language] = norms 77 | self.emb_sizes[language] = embs.shape[1] 78 | self.lang_vocabularies[language] = vocabulary 79 | 80 | 81 | def word_similarity(self, first_word, second_word, first_language = 'en', second_language = 'en'): 82 | if self.do_cache: 83 | cache_str = min(first_word, second_word) + "-" + max(first_word, second_word) 84 | if (first_language + "-" + second_language) in self.cache and cache_str in self.cache[first_language + "-" + second_language]: 85 | return 
self.cache[first_language + "-" + second_language][cache_str] 86 | elif (first_word not in self.lang_vocabularies[first_language] and first_word.lower() not in self.lang_vocabularies[first_language]) or (second_word not in self.lang_vocabularies[second_language] and second_word.lower() not in self.lang_vocabularies[second_language]): 87 | if ((first_word in second_word or second_word in first_word) and first_language == second_language) or first_word == second_word: 88 | return 1 89 | else: 90 | return 0 91 | 92 | index_first = self.lang_vocabularies[first_language][first_word] if first_word in self.lang_vocabularies[first_language] else (self.lang_vocabularies[first_language][first_word.lower()] if first_word.lower() in self.lang_vocabularies[first_language] else -1) 93 | index_second = self.lang_vocabularies[second_language][second_word] if second_word in self.lang_vocabularies[second_language] else (self.lang_vocabularies[second_language][second_word.lower()] if second_word.lower() in self.lang_vocabularies[second_language] else -1) 94 | 95 | if index_first >= 0 and index_second >= 0: 96 | first_emb = self.lang_embeddings[first_language][index_first] 97 | second_emb = self.lang_embeddings[second_language][index_second] 98 | 99 | first_norm = self.lang_emb_norms[first_language][index_first] 100 | second_norm = self.lang_emb_norms[second_language][index_second] 101 | 102 | score = np.dot(first_emb, second_emb) / (first_norm * second_norm) 103 | else: 104 | score = 0 105 | 106 | if self.do_cache: 107 | if (first_language + "-" + second_language) not in self.cache: 108 | self.cache[first_language + "-" + second_language] = {} 109 | if cache_str not in self.cache[first_language + "-" + second_language]: 110 | self.cache[first_language + "-" + second_language][cache_str] = score 111 | return score 112 | 113 | def most_similar(self, embedding, target_lang, num, similarity = True): 114 | ms = [] 115 | for w in self.lang_vocabularies[target_lang]: 116 | targ_w_emb = self.get_vector(target_lang, w) 117 | if len(embedding) != len(targ_w_emb): 118 | print("Unaligned embedding length: " + w) 119 | else: 120 | if similarity: 121 | nrm = np.linalg.norm(embedding, 2) 122 | trg_nrm = self.get_norm(target_lang, w) 123 | sim = np.dot(embedding, targ_w_emb) / (nrm * trg_nrm) 124 | if (len(ms) < num) or (sim > ms[-1][1]): 125 | ms.append((w, sim)) 126 | ms.sort(key = lambda x: x[1], reverse = True) 127 | else: 128 | dist = np.linalg.norm(embedding - targ_w_emb) 129 | if (len(ms) < num) or (dist < ms[-1][1]): 130 | ms.append((w, dist)) 131 | ms.sort(key = lambda x: x[1]) 132 | if len(ms) > num: 133 | ms.pop() 134 | return [ws for ws in ms] 135 | 136 | def merge_embedding_spaces(self, languages, emb_size, merge_name = 'merge', lang_prefix_delimiter = '__', special_tokens = None): 137 | print("Merging embedding spaces...") 138 | merge_vocabulary = {} 139 | merge_embs = [] 140 | merge_norms = [] 141 | 142 | for lang in languages: 143 | print("For language: " + lang) 144 | norms =[] 145 | embs = [] 146 | for word in self.lang_vocabularies[lang]: 147 | if special_tokens is None or word not in special_tokens: 148 | merge_vocabulary[lang + lang_prefix_delimiter + word] = len(merge_vocabulary) 149 | else: 150 | merge_vocabulary[word] = len(merge_vocabulary) 151 | embs.append(self.get_vector(lang, word)) 152 | norms.append(self.get_norm(lang, word)) 153 | merge_embs = np.copy(embs) if len(merge_embs) == 0 else np.vstack((merge_embs, embs)) 154 | merge_norms = np.copy(norms) if len(merge_norms) == 0 else 
np.concatenate((merge_norms, norms)) 155 | 156 | self.lang_vocabularies[merge_name] = merge_vocabulary 157 | self.lang_embeddings[merge_name] = merge_embs 158 | self.lang_emb_norms[merge_name] = merge_norms 159 | self.emb_sizes[merge_name] = emb_size 160 | 161 | def store_embeddings(self, path, language): 162 | io_helper.store_embeddings(path, self, language) -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | from scipy import stats 3 | import os 4 | import numpy as np 5 | import sys 6 | 7 | def pairwise_accuracy(golds, preds): 8 | count_good = 0.0 9 | count_all = 0.0 10 | for i in range(len(golds) - 1): 11 | for j in range(i+1, len(golds)): 12 | count_all += 1.0 13 | diff_gold = golds[i] - golds[j] 14 | diff_pred = preds[i] - preds[j] 15 | if (diff_gold * diff_pred >= 0): 16 | count_good += 1.0 17 | return count_good / count_all 18 | 19 | 20 | def evaluate(gold_path, predicted_path): 21 | golds = [(x.split()[0].strip(), float(x.split()[1].strip())) for x in list(codecs.open(gold_path, "r", "utf-8").readlines())] 22 | predicts = [(x.split()[0].strip(), float(x.split()[1].strip())) for x in list(codecs.open(predicted_path, "r", "utf-8").readlines())] 23 | 24 | gold_scores = [x[1] for x in golds] 25 | gold_min = min(gold_scores) 26 | gold_max = max(gold_scores) 27 | 28 | predict_scores = [x[1] for x in predicts] 29 | preds_min = min(predict_scores) 30 | preds_max = max(predict_scores) 31 | 32 | golds_norm = {x[0] : (x[1] - gold_min) / (gold_max - gold_min) for x in golds } 33 | preds_norm = {x[0] : (x[1] - preds_min) / (preds_max - preds_min) for x in predicts } 34 | preds_inv_norm = {key : 1.0 - preds_norm[key] for key in preds_norm} 35 | 36 | g_last = [] 37 | p_last = [] 38 | pinv_last = [] 39 | for k in golds_norm: 40 | g_last.append(golds_norm[k]) 41 | p_last.append(preds_norm[k]) 42 | pinv_last.append(preds_inv_norm[k]) 43 | 44 | pearson = stats.pearsonr(g_last, p_last)[0] 45 | spearman = stats.spearmanr(g_last, p_last)[0] 46 | pa = pairwise_accuracy(g_last, p_last) 47 | 48 | pearson_inv = stats.pearsonr(g_last, pinv_last)[0] 49 | spearman_inv = stats.spearmanr(g_last, pinv_last)[0] 50 | pa_inv = pairwise_accuracy(g_last, pinv_last) 51 | 52 | return max(pearson, pearson_inv), max(spearman, spearman_inv), max(pa, pa_inv) 53 | 54 | gold_path = sys.argv[1] 55 | pred_path = sys.argv[2] 56 | pears, spear, pa = evaluate(gold_path, pred_path) 57 | print("Pearson coefficient: " + str(pears)) 58 | print("Spearman coefficient: " + str(spear)) 59 | print("Pairwise accuracy: " + str(pa)) -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codogogo/topfish/6b3f5723029616cb430d6226bc59c013fe79eb78/evaluation/__init__.py -------------------------------------------------------------------------------- /evaluation/confusion_matrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def merge_confusion_matrices(conf_mats): 4 | res_mat = ConfusionMatrix(conf_mats[0].labels) 5 | for cm in conf_mats: 6 | res_mat.matrix = np.add(res_mat.matrix, cm.matrix) 7 | res_mat.compute_all_scores() 8 | return res_mat 9 | 10 | class ConfusionMatrix(object): 11 | """ 12 | Confusion matrix for evaluating classification tasks. 
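Illustrative usage (label names and predictions below are placeholders):
cm = ConfusionMatrix(labels = ["pos", "neg"], predictions = ["pos", "neg", "neg"], gold = ["pos", "pos", "neg"])
cm.print_results()
Rows of the matrix correspond to gold labels and columns to predicted labels.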
13 | """ 14 | 15 | def __init__(self, labels = [], predictions = [], gold = [], one_hot_encoding = False, class_indices = False): 16 | # rows are true labels, columns predictions 17 | self.matrix = np.zeros(shape = (len(labels), len(labels))) 18 | self.labels = labels 19 | 20 | if len(predictions) != len(gold): 21 | raise ValueError("Predictions and gold labels do not have the same count.") 22 | for i in range(len(predictions)): 23 | index_pred = np.argmax(predictions[i]) if one_hot_encoding else (predictions[i] if class_indices else labels.index(predictions[i])) 24 | index_gold = np.argmax(gold[i]) if one_hot_encoding else (gold[i] if class_indices else labels.index(gold[i])) 25 | self.matrix[index_gold][index_pred] += 1 26 | 27 | if len(predictions) > 0: 28 | self.compute_all_scores() 29 | 30 | def compute_all_scores(self): 31 | self.class_performances = {} 32 | self.counts = {} 33 | for i in range(len(self.labels)): 34 | tp = np.float32(self.matrix[i][i]) 35 | fp_plus_tp = np.float32(np.sum(self.matrix, axis = 0)[i]) 36 | fn_plus_tp = np.float32(np.sum(self.matrix, axis = 1)[i]) 37 | p = tp / fp_plus_tp 38 | r = tp / fn_plus_tp 39 | self.class_performances[self.labels[i]] = (p, r, 2*p*r/(p+r)) 40 | self.counts[self.labels[i]] = (tp, fp_plus_tp - tp, fn_plus_tp - tp) 41 | 42 | self.microf1 = np.float32(np.trace(self.matrix)) / np.sum(self.matrix) 43 | self.macrof1 = float(sum([x[2] for x in self.class_performances.values()])) / float(len(self.labels)) 44 | self.macroP = float(sum([x[0] for x in self.class_performances.values()])) / float(len(self.labels)) 45 | self.macroR = float(sum([x[1] for x in self.class_performances.values()])) / float(len(self.labels)) 46 | self.accuracy = float(sum([self.matrix[i, i] for i in range(len(self.labels))])) / float(np.sum(self.matrix)) 47 | 48 | 49 | def print_results(self): 50 | for l in self.labels: 51 | print(l + ": " + str(self.get_class_performance(l))) 52 | print("Micro avg: " + str(self.accuracy)) 53 | print("Macro avg: " + str(self.macrof1)) 54 | 55 | def get_class_performance(self, label): 56 | if label in self.labels: 57 | return self.class_performances[label] 58 | else: 59 | raise ValueException("Unknown label") 60 | 61 | def aggregate_class_performance(self, classes): 62 | true_sum = 0.0 63 | fp_sum = 0.0 64 | fn_sum = 0.0 65 | for l in classes: 66 | tp, fp, fn = self.counts[l] 67 | true_sum += tp 68 | fp_sum += fp 69 | fn_sum += fn 70 | p = true_sum / (fp_sum + true_sum) 71 | r = true_sum / (fn_sum + true_sum) 72 | f = (2 * r * p) / (r + p) 73 | return p, r, f 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /graphs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codogogo/topfish/6b3f5723029616cb430d6226bc59c013fe79eb78/graphs/__init__.py -------------------------------------------------------------------------------- /graphs/graph.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Graph(object): 4 | """description of class""" 5 | def __init__(self, nodes = [], edges = [], symmetric = True): 6 | self.nodes = nodes 7 | self.edges = [] 8 | for edge in edges: 9 | self.add_edge(edge) 10 | self.build_adjacency_matrix(symmetric) 11 | 12 | def add_node(self, node): 13 | self.nodes.append(node) 14 | 15 | def add_edge(self, edge): 16 | if len(edge) != 3: 17 | raise ValueError('An edge needs to have three values: starting node, ending node, and 
the weight (1 for unweighted graph)') 18 | if edge[0] not in self.nodes: 19 | raise ValueError('Starting node of the edge is unknown, i.e., not in the node list of the graph') 20 | if edge[1] not in self.nodes: 21 | raise ValueError('Ending node of the edge is unknown, i.e., not in the node list of the graph') 22 | self.edges.append((self.nodes.index(edge[0]), self.nodes.index(edge[1]), edge[2])) 23 | 24 | def build_adjacency_matrix(self, symmetric = True): 25 | self.adj_mat = np.zeros((len(self.nodes), len(self.nodes))) 26 | for edge in self.edges: 27 | self.adj_mat[edge[0]][edge[1]] = edge[2] 28 | if symmetric: 29 | self.adj_mat[edge[1]][edge[0]] = edge[2] 30 | 31 | def harmonic_function_label_propagation(self, fixed_indices_vals, rescale_extremes = True, normalize = True): 32 | self.wedeg_mat = np.zeros((len(self.nodes), len(self.nodes))) 33 | for i in range(len(self.nodes)): 34 | self.wedeg_mat[i][i] = sum(self.adj_mat[i]) 35 | 36 | lap_mat = np.subtract(self.wedeg_mat, self.adj_mat) 37 | lap_mat_uu = lap_mat[np.ix_([x for x in range(len(self.nodes)) if x not in [y[0] for y in fixed_indices_vals]], [x for x in range(len(self.nodes)) if x not in [y[0] for y in fixed_indices_vals]])] 38 | lap_mat_ul = lap_mat[np.ix_([x for x in range(len(self.nodes)) if x not in [y[0] for y in fixed_indices_vals]], [y[0] for y in fixed_indices_vals])] 39 | scores_l = np.expand_dims(np.array([y[1] for y in fixed_indices_vals]), axis = 0) 40 | 41 | scores_u = np.dot(np.dot(np.multiply(-1.0, np.linalg.inv(lap_mat_uu)), lap_mat_ul), scores_l.T) 42 | unlab_docs = [x for x in self.nodes if self.nodes.index(x) not in [y[0] for y in fixed_indices_vals]] 43 | all_scores = dict(zip(unlab_docs, scores_u.T[0])) 44 | 45 | for e in fixed_indices_vals: 46 | if not rescale_extremes: 47 | all_scores[self.nodes[e[0]]] = e[1] 48 | else: 49 | adj_row = self.adj_mat[e[0]] 50 | adj_row = np.multiply(1.0 / np.sum(adj_row), adj_row) 51 | all_scores[self.nodes[e[0]]] = sum([adj_row[i] * all_scores[self.nodes[i]] for i in range(len(self.nodes)) if i not in [y[0] for y in fixed_indices_vals]]) 52 | 53 | if normalize: 54 | min_score = min(all_scores.values()) 55 | max_score = max(all_scores.values()) 56 | for k in all_scores: 57 | all_scores[k] = (all_scores[k] - min_score) / (max_score - min_score) 58 | return all_scores 59 | 60 | 61 | def pagerank(self, alpha = 0.15, init_pr_vector = None, fixed_indices = None, rescale_extremes = True): 62 | #print("Running PageRank...") 63 | if init_pr_vector is None: 64 | init_pr_vector = np.expand_dims(np.full((len(self.nodes)), 1.0/((float)(len(self.nodes)))), axis = 0) 65 | 66 | # normalization and stochasticity adjustment of the adjacence matrix 67 | pr_mat = np.zeros((len(self.nodes), len(self.nodes))) 68 | for i in range(len(self.nodes)): 69 | if np.count_nonzero(self.adj_mat[i]) == 0: 70 | pr_mat[i][:] = np.full((len(self.nodes)), 1.0/((float)(len(self.nodes)))) 71 | else: 72 | pr_mat[i][:] = np.multiply(1.0 / np.sum(self.adj_mat[i]), self.adj_mat[i]) 73 | 74 | # primitivity adjustment 75 | pr_mat = np.multiply(1 - alpha, pr_mat) + np.multiply(alpha, np.full((len(self.nodes), len(self.nodes)), 1.0/((float)(len(self.nodes))))) 76 | 77 | # pagerank iterations 78 | diff = 1 79 | it = 1 80 | while diff > 0.001: 81 | old_vec = init_pr_vector 82 | init_pr_vector = np.dot(init_pr_vector, pr_mat) 83 | #init_pr_vector = np.multiply(1.0 / np.sum(init_pr_vector), init_pr_vector) 84 | 85 | if fixed_indices is not None: 86 | for ind in fixed_indices: 87 | init_pr_vector[0][ind] = fixed_indices[ind] 
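# scores of the fixed nodes are re-clamped to their prescribed values after every power-iteration step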
88 | 89 | diff = np.sum(np.abs(init_pr_vector - old_vec)) 90 | #print("PR iteration " + str(it) + ": " + str(init_pr_vector)) 91 | it += 1 92 | 93 | 94 | if fixed_indices is not None and rescale_extremes: 95 | for ind in fixed_indices: 96 | adj_row = self.adj_mat[ind] 97 | adj_row = np.multiply(1.0 / np.sum(adj_row), adj_row) 98 | init_pr_vector[0][ind] = sum([adj_row[i] * init_pr_vector[0][i] for i in range(len(self.nodes)) if i != ind]) 99 | 100 | return dict(zip(self.nodes, init_pr_vector[0])) 101 | -------------------------------------------------------------------------------- /helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codogogo/topfish/6b3f5723029616cb430d6226bc59c013fe79eb78/helpers/__init__.py -------------------------------------------------------------------------------- /helpers/data_helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | import itertools 4 | from collections import Counter 5 | import sys 6 | import codecs 7 | import random 8 | from embeddings import text_embeddings 9 | from sys import stdin 10 | 11 | def clean_str(string): 12 | """ 13 | Tokenization/string cleaning. 14 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 15 | """ 16 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 17 | string = re.sub(r"\'s", " \'s", string) 18 | string = re.sub(r"\'ve", " \'ve", string) 19 | string = re.sub(r"n\'t", " n\'t", string) 20 | string = re.sub(r"\'re", " \'re", string) 21 | string = re.sub(r"\'d", " \'d", string) 22 | string = re.sub(r"\'ll", " \'ll", string) 23 | string = re.sub(r",", " , ", string) 24 | string = re.sub(r"!", " ! ", string) 25 | string = re.sub(r"\(", " \( ", string) 26 | string = re.sub(r"\)", " \) ", string) 27 | string = re.sub(r"\?", " \? ", string) 28 | string = re.sub(r"\s{2,}", " ", string) 29 | return string.strip().lower() 30 | 31 | def load_text_and_labels(path, lowercase = True, multilingual = False, distinct_labels_index = None): 32 | """Loads text instances from files (one text one line), splits the data into words and generates labels (as one-hot vectors). 33 | Returns split sentences and labels. 
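Illustrative input line (tokens are placeholders): in the multilingual setting a line looks like "en token1 token2 ... label", i.e., the first token is the language code and the last token is the class label; in the monolingual setting the language token is omitted and only the last token is treated as the label.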
34 | """ 35 | # Load data from files 36 | lines = [(s.lower() if lowercase else s).strip().split() for s in list(codecs.open(path,'r',encoding='utf8', errors='replace').readlines())] 37 | x_instances = [l[1:-1] for l in lines] if multilingual else [l[:-1] for l in lines] 38 | 39 | if multilingual: 40 | langs = [l[0] for l in lines] 41 | labels = [l[-1] for l in lines] 42 | 43 | dist_labels = list(set(labels)) if distinct_labels_index is None else distinct_labels_index 44 | y_instances = [np.zeros(len(dist_labels)) for l in labels] 45 | for i in range(len(y_instances)): 46 | y_instances[i][dist_labels.index(labels[i])] = 1 47 | 48 | return [x_instances, y_instances, langs, dist_labels] if multilingual else [x_instances, y_instances, dist_labels] 49 | 50 | def build_text_and_labels(texts, class_labels, lowercase = True, multilingual = False, langs = None, distinct_labels_index = None): 51 | # Load data from files 52 | lines = [(text.lower() if lowercase else text).strip().split() for text in texts] 53 | x_instances = [l[1:-1] for l in lines] if multilingual else [l[:-1] for l in lines] 54 | 55 | dist_labels = list(set(class_labels)) if distinct_labels_index is None else distinct_labels_index 56 | y_instances = [np.zeros(len(dist_labels)) for l in class_labels] 57 | for i in range(len(y_instances)): 58 | y_instances[i][dist_labels.index(class_labels[i])] = 1 59 | 60 | return [x_instances, y_instances, langs, dist_labels] if multilingual else [x_instances, y_instances, dist_labels] 61 | 62 | def pad_texts(texts, padding_word="", max_length = None): 63 | """ 64 | Pads all sentences to the same length. The length is defined by the longest sentence. 65 | Returns padded sentences. 66 | """ 67 | sequence_length = max(len(x) for x in texts) if max_length is None else max_length 68 | padded_texts = [] 69 | for i in range(len(texts)): 70 | text = texts[i] 71 | num_padding = sequence_length - len(text) 72 | padded_text = text + [padding_word] * num_padding if num_padding >= 0 else text[ : sequence_length] 73 | padded_texts.append(padded_text) 74 | return padded_texts 75 | 76 | def build_vocab(texts): 77 | """ 78 | Builds a vocabulary mapping from word to index based on the sentences. 79 | Returns vocabulary mapping and inverse vocabulary mapping. 
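Example (illustrative): build_vocab([["good", "text"], ["good"]]) yields the mapping {"good": 0, "text": 1} (indices follow the alphabetical order of the words) together with its inverse mapping.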
80 | """ 81 | # Build vocabulary 82 | word_counts = Counter(itertools.chain(*texts)) 83 | # Mapping from index to word 84 | vocabulary_invariable = [x[0] for x in word_counts.most_common()] 85 | vocabulary_invariable = list(sorted(vocabulary_invariable)) 86 | # Mapping from word to index 87 | vocabulary = {x: i for i, x in enumerate(vocabulary_invariable)} 88 | inverse_vocabulary = {v: k for k, v in vocabulary.items()} 89 | return [vocabulary, inverse_vocabulary] 90 | 91 | def build_input_data(texts, labels, vocabulary, padding_token = "", langs = None, ignore_empty = True): 92 | x = [] 93 | y = [] 94 | if langs is not None: 95 | filt_langs = [] 96 | for i in range(len(texts)): 97 | num_not_pad = len([x for x in texts[i] if x != padding_token]) 98 | if (num_not_pad > 0 or (not ignore_empty)): 99 | x.append([vocabulary[t] for t in texts[i]]) 100 | y.append(labels[i]) 101 | if langs is not None: 102 | filt_langs.append(langs[i]) 103 | if langs is not None: 104 | return [np.array(x), np.array(y), langs] 105 | else: 106 | return [np.array(x), np.array(y)] 107 | 108 | def remove_stopwords(texts, langs, stopwords, lowercase = True, multilingual = False, lang_prefix_delimiter = '__'): 109 | for i in range(len(texts)): 110 | texts[i] = [x for x in texts[i] if (x.split('__')[1].strip() if multilingual else x).lower() not in (stopwords[langs[i]] if multilingual else stopwords)] 111 | 112 | def filter_against_vocabulary(texts, vocabulary, lowercase = False): 113 | return [[(t.lower() if lowercase else t) for t in s if (t.lower() if lowercase else t) in vocabulary] for s in texts] 114 | 115 | def load_data_build_vocabulary(path, stopwords = None, lowercase = True, multilingual = False, lang_prefix_delimiter = '__'): 116 | """ 117 | Loads and preprocesses data. 118 | Returns input vectors, labels, vocabulary, and inverse vocabulary. 119 | """ 120 | # Load and preprocess data 121 | if multilingual: 122 | texts, labels, langs, dist_labels = load_text_and_labels(path, lowercase = lowercase, multilingual = True) 123 | for i in range(len(texts)): 124 | texts[i] = langs[i].lower() + lang_prefix_delimiter + texts[i] 125 | else: 126 | texts, labels, dist_labels = load_text_and_labels(path, lowercase = lowercase, multilingual = False) 127 | 128 | if stopwords is not None: 129 | texts = remove_stopwords(texts, langs, stopwords, lowercase = lowercase) 130 | texts_padded = pad_texts(texts) 131 | 132 | vocabulary, vocabulary_inverse = build_vocab(texts_padded) 133 | x, y = build_input_data(texts_padded, labels, vocabulary) 134 | 135 | return [x, y, dist_labels, vocabulary, vocabulary_inverse] 136 | 137 | 138 | def load_data_given_vocabulary(path, vocabulary, stopwords = None, lowercase = False, multilingual = False, lang_prefix_delimiter = '__', max_length = None, split = None, ignore_empty = True, distinct_labels_index = None): 139 | """ 140 | Loads and preprocesses data given the vocabulary. 141 | Returns input vectors, labels, vocabulary, and inverse vocabulary. 
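If split is given (e.g., split = 1000, an illustrative value), the first 1000 instances are returned as the training portion and the remaining instances as the test portion; otherwise a single [x, y, dist_labels] list is returned.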
142 | """ 143 | # Load and preprocess data 144 | if multilingual: 145 | texts, labels, langs, dist_labels = load_text_and_labels(path, lowercase = lowercase, multilingual = True, distinct_labels_index = distinct_labels_index) 146 | for i in range(len(texts)): 147 | for j in range(len(texts[i])): 148 | texts[i][j] = langs[i].lower() + lang_prefix_delimiter + texts[i][j] 149 | else: 150 | texts, labels, dist_labels = load_text_and_labels(path, lowercase = lowercase, multilingual = False, distinct_labels_index = distinct_labels_index) 151 | 152 | if stopwords is not None: 153 | remove_stopwords(texts, langs if multilingual else None, stopwords, lowercase = lowercase, multilingual = multilingual) 154 | 155 | texts = filter_against_vocabulary(texts, vocabulary) 156 | texts_padded = pad_texts(texts, max_length = max_length) 157 | 158 | if multilingual: 159 | x, y, flangs = build_input_data(texts_padded, labels, vocabulary, langs = langs, ignore_empty = ignore_empty) 160 | dist_langs = set(flangs) 161 | for dl in dist_langs: 162 | num = len([l for l in flangs if l == dl]) 163 | print("Language: " + dl + ", num: " + str(num)) 164 | else: 165 | x, y = build_input_data(texts_padded, labels, vocabulary, ignore_empty = ignore_empty) 166 | if split is None: 167 | return [x, y, dist_labels] 168 | else: 169 | x_train = x[:split] 170 | y_train = y[:split] 171 | x_test = x[split:] 172 | y_test = y[split:] 173 | return [x_train, y_train, x_test, y_test, dist_labels] 174 | 175 | def build_data_given_vocabulary(data, class_labels, vocabulary, stopwords = None, lowercase = False, multilingual = False, lang_prefix_delimiter = '__', max_length = None, split = None, ignore_empty = True, distinct_labels_index = None): 176 | """ 177 | Loads and preprocesses data given the vocabulary. 178 | Returns input vectors, labels, vocabulary, and inverse vocabulary. 
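Passing distinct_labels_index (e.g., the label list obtained on the training data) fixes the label-to-index mapping so that the one-hot label vectors are consistent across separate calls.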
179 | """ 180 | # Load and preprocess data 181 | if multilingual: 182 | texts, labels, langs, dist_labels = load_text_and_labels(path, lowercase = lowercase, multilingual = True, distinct_labels_index = distinct_labels_index) 183 | for i in range(len(texts)): 184 | for j in range(len(texts[i])): 185 | texts[i][j] = langs[i].lower() + lang_prefix_delimiter + texts[i][j] 186 | else: 187 | texts, labels, dist_labels = load_text_and_labels(path, lowercase = lowercase, multilingual = False, distinct_labels_index = distinct_labels_index) 188 | 189 | if stopwords is not None: 190 | remove_stopwords(texts, langs if multilingual else None, stopwords, lowercase = lowercase, multilingual = multilingual) 191 | 192 | texts = filter_against_vocabulary(texts, vocabulary) 193 | texts_padded = pad_texts(texts, max_length = max_length) 194 | 195 | if multilingual: 196 | x, y, flangs = build_input_data(texts_padded, labels, vocabulary, langs = langs, ignore_empty = ignore_empty) 197 | dist_langs = set(flangs) 198 | for dl in dist_langs: 199 | num = len([l for l in flangs if l == dl]) 200 | print("Language: " + dl + ", num: " + str(num)) 201 | else: 202 | x, y = build_input_data(texts_padded, labels, vocabulary, ignore_empty = ignore_empty) 203 | if split is None: 204 | return [x, y, dist_labels] 205 | else: 206 | x_train = x[:split] 207 | y_train = y[:split] 208 | x_test = x[split:] 209 | y_test = y[split:] 210 | return [x_train, y_train, x_test, y_test, dist_labels] 211 | 212 | 213 | 214 | def load_vocabulary_embeddings(vocabulary_inv, embeddings, emb_size, padding = ""): 215 | voc_embs = [] 216 | for i in range(len(vocabulary_inv)): 217 | if i not in vocabulary_inv: 218 | raise Exception("Index not in index vocabulary!" + " Index: " + str(i)) 219 | word = vocabulary_inv[i] 220 | if word == padding: 221 | voc_embs.append(np.random.uniform(-1.0, 1.0, size = [emb_size])) 222 | elif word not in embeddings: 223 | raise Exception("Word not found in embeddings! 
" + word) 224 | else: 225 | voc_embs.append(embeddings[word]) 226 | return np.array(voc_embs, dtype = np.float32) 227 | 228 | def prepare_data_for_kb_embedding(data, prebuilt_dicts = None, valid_triples_dict = None, generate_corrupt = True, num_corrupt = 2): 229 | if valid_triples_dict is None: 230 | valid_triples_dict = {} 231 | 232 | if prebuilt_dicts is None: 233 | cnt_ent = 0 234 | cnt_rel = 0 235 | entity_dict = {} 236 | relations_dict = {} 237 | else: 238 | entity_dict = prebuilt_dicts[0] 239 | relations_dict = prebuilt_dicts[1] 240 | 241 | for d in data: 242 | if prebuilt_dicts is None: 243 | if d[0] not in entity_dict: 244 | entity_dict[d[0]] = cnt_ent 245 | cnt_ent += 1 246 | if d[2] not in entity_dict: 247 | entity_dict[d[2]] = cnt_ent 248 | cnt_ent += 1 249 | if d[1] not in relations_dict: 250 | relations_dict[d[1]] = cnt_rel 251 | cnt_rel += 1 252 | 253 | str_rep = str(entity_dict[d[0]]) + "_" + str(relations_dict[d[1]]) + "_" + str(entity_dict[d[2]]) 254 | valid_triples_dict[str_rep] = str_rep 255 | 256 | e1_indices = [] 257 | e2_indices = [] 258 | r_indices = [] 259 | y_vals = [] 260 | 261 | count_corrupt_valid = 0 262 | for d in data: 263 | e1_ind = entity_dict[d[0]] 264 | e2_ind = entity_dict[d[2]] 265 | r_ind = relations_dict[d[1]] 266 | 267 | e1_indices.append(e1_ind) 268 | e2_indices.append(e2_ind) 269 | r_indices.append(r_ind) 270 | y_vals.append(1) 271 | 272 | if generate_corrupt: 273 | for i in range(num_corrupt): 274 | corr_type = random.randint(1,3) 275 | fake_ind = random.randint(0, (len(entity_dict) if (corr_type == 1 or corr_type == 3) else len(relations_dict)) - 1) 276 | corr_triple_str_rep = (str(fake_ind) + "_" + str(r_ind) + "_" + str(e2_ind) if corr_type == 1 else (str(e1_ind) + "_" + str(r_ind) + "_" + str(fake_ind) if corr_type == 3 else str(e1_ind) + "_" + str(fake_ind) + "_" + str(e2_ind))) 277 | 278 | while corr_triple_str_rep in valid_triples_dict: 279 | fake_ind = random.randint(0, (len(entity_dict) if (corr_type == 1 or corr_type == 3) else len(relations_dict)) - 1) 280 | corr_triple_str_rep = (str(fake_ind) + "_" + str(r_ind) + "_" + str(e2_ind) if corr_type == 1 else (str(e1_ind) + "_" + str(r_ind) + "_" + str(fake_ind) if corr_type == 3 else str(e1_ind) + "_" + str(fake_ind) + "_" + str(e2_ind))) 281 | count_corrupt_valid += 1 282 | 283 | if corr_type == 1: 284 | e1_indices.append(fake_ind) 285 | e2_indices.append(e2_ind) 286 | r_indices.append(r_ind) 287 | elif corr_type == 2: 288 | e1_indices.append(e1_ind) 289 | e2_indices.append(e2_ind) 290 | r_indices.append(fake_ind) 291 | elif corr_type == 3: 292 | e1_indices.append(e1_ind) 293 | e2_indices.append(fake_ind) 294 | r_indices.append(r_ind) 295 | y_vals.append(-1) 296 | 297 | return [(entity_dict, relations_dict), valid_triples_dict, np.array(e1_indices, dtype = np.int32), np.array(e2_indices, dtype = np.int32), np.array(r_indices, dtype = np.int32), np.array(y_vals, dtype = np.float32) ] 298 | 299 | def prepare_wn_data(data, concept_dict, rel_string, rel_string_inv, prev_dict = None): 300 | data_out = [] 301 | if prev_dict is None: 302 | prev_dict = {} 303 | 304 | data = [x for x in data if x[1] == rel_string or x[1] == rel_string_inv] 305 | 306 | for i in range(len(data)): 307 | d = data[i] 308 | if d[1] == rel_string: 309 | rel_str = concept_dict[d[0]] + "_" + concept_dict[d[2]] 310 | if rel_str not in prev_dict: 311 | data_out.append((concept_dict[d[0]], concept_dict[d[2]], "1")) 312 | prev_dict[rel_str] = 1 313 | elif d[1] == rel_string_inv: 314 | rel_str = concept_dict[d[2]] + "_" + 
concept_dict[d[0]] 315 | if rel_str not in prev_dict: 316 | data_out.append((concept_dict[d[2]], concept_dict[d[0]], "1")) 317 | prev_dict[rel_str] = 1 318 | return data_out 319 | 320 | def create_corrupts(correct_train, correct_test, concept_dict, prev_dict, num_corrupt = 2, shuffle = True): 321 | concepts = list(concept_dict.values()) 322 | train_corrupt = [] 323 | test_corrupt = [] 324 | current_dict = {} 325 | 326 | merged = [] 327 | merged.extend(correct_train) 328 | merged.extend(correct_test) 329 | 330 | for i in range(len(merged)): 331 | rel_str = merged[i][1] + "_" + merged[i][0] 332 | if rel_str not in prev_dict and rel_str not in current_dict: 333 | (train_corrupt if i < len(correct_train) else test_corrupt).append((merged[i][1], merged[i][0], "0")) 334 | current_dict[rel_str] = 1 335 | 336 | for j in range(num_corrupt - 1): 337 | c1 = concepts[random.randint(0, len(concepts) - 1)] 338 | c2 = concepts[random.randint(0, len(concepts) - 1)] 339 | rel_str = c1 + "_" + c2 340 | while(rel_str in prev_dict or rel_str in current_dict): 341 | c1 = concepts[random.randint(0, len(concepts) - 1)] 342 | c2 = concepts[random.randint(0, len(concepts) - 1)] 343 | rel_str = c1 + "_" + c2 344 | (train_corrupt if i < len(correct_train) else test_corrupt).append((c1, c2, "0")) 345 | current_dict[rel_str] = 1 346 | 347 | fdata_train = [] 348 | fdata_train.extend(correct_train) 349 | fdata_train.extend(train_corrupt) 350 | 351 | fdata_test = [] 352 | fdata_test.extend(correct_test) 353 | fdata_test.extend(test_corrupt) 354 | 355 | if shuffle: 356 | random.shuffle(fdata_train) 357 | random.shuffle(fdata_test) 358 | 359 | return (fdata_train, fdata_test) 360 | 361 | def lexically_independent_train_set(data_train, data_test): 362 | ents_test = [x[0] for x in data_test] 363 | ents_test.extend([x[1] for x in data_test]) 364 | ents_test = set(ents_test) 365 | 366 | filtered_train = [x for x in data_train if x[0] not in ents_test and x[1] not in ents_test] 367 | return filtered_train 368 | 369 | def prepare_eval_semrel_emb(word_embeddings, stopwords, emb_size, data, y_direct = False, keep_words = False): 370 | left_mat = [] 371 | right_mat = [] 372 | gold_labels = [] 373 | words = [] 374 | 375 | for i in range(len(data)): 376 | first_word = data[i][0] 377 | emb1 = text_embeddings.aggregate_phrase_embedding(first_word.strip().split(), stopwords, word_embeddings, emb_size, l2_norm_vec = False) 378 | second_word = data[i][1] 379 | emb2 = text_embeddings.aggregate_phrase_embedding(second_word.strip().split(), stopwords, word_embeddings, emb_size, l2_norm_vec = False) 380 | 381 | if emb1 is not None and emb2 is not None: 382 | left_mat.append(emb1) 383 | right_mat.append(emb2) 384 | if keep_words: 385 | words.append(first_word + '\t' + second_word) 386 | if not y_direct: 387 | gold_labels.append(-1.0 if data[i][2] == "0" else 1.0) 388 | else: 389 | gold_labels.append(data[i][2]) 390 | 391 | if keep_words: 392 | return [np.array(left_mat), np.array(right_mat), gold_labels, words] 393 | else: 394 | return [np.array(left_mat), np.array(right_mat), gold_labels] 395 | 396 | def prepare_dataset_semrel_emb(entity_dict, selected_embeddings, stopwords, word_embeddings, emb_size, data, dict_examples): 397 | cnt_ent = len(entity_dict) 398 | e1_inds = [] 399 | e2_inds = [] 400 | y_vals = [] 401 | 402 | cnt_emb_fail = 0 403 | cnt_existing = 0 404 | 405 | for i in range(len(data)): 406 | first_word = data[i][0] 407 | if first_word not in entity_dict: 408 | emb = 
text_embeddings.aggregate_phrase_embedding(first_word.strip().split(), stopwords, word_embeddings, emb_size, l2_norm_vec = False) 409 | if emb is not None: 410 | selected_embeddings.append(emb) 411 | entity_dict[first_word] = cnt_ent 412 | cnt_ent += 1 413 | else: 414 | cnt_emb_fail += 1 415 | continue 416 | second_word = data[i][1] 417 | if second_word not in entity_dict: 418 | emb = text_embeddings.aggregate_phrase_embedding(second_word.strip().split(), stopwords, word_embeddings, emb_size, l2_norm_vec = False) 419 | if emb is not None: 420 | selected_embeddings.append(emb) 421 | entity_dict[second_word] = cnt_ent 422 | cnt_ent += 1 423 | else: 424 | cnt_emb_fail += 1 425 | continue 426 | 427 | e1i = entity_dict[first_word] 428 | e2i = entity_dict[second_word] 429 | stres = str(e1i) + "_" + str(e2i) 430 | if stres not in dict_examples: 431 | e1_inds.append(e1i) 432 | e2_inds.append(e2i) 433 | y_vals.append(-1.0 if data[i][2] == "0" else 1.0) 434 | dict_examples[stres] = stres 435 | else: 436 | #print("Example (pair of entities) already seen: "+ "\"" + first_word + "\" ; \"" + second_word + "\"") 437 | cnt_existing += 1 438 | 439 | return [list(zip(e1_inds, e2_inds, y_vals)), selected_embeddings] 440 | 441 | 442 | 443 | 444 | -------------------------------------------------------------------------------- /helpers/data_shaper.py: -------------------------------------------------------------------------------- 1 | from helpers import io_helper 2 | import numpy as np 3 | import re 4 | 5 | def punctuation(): 6 | return ['—', '-', '.', ',', ';', ':', '\'', '"', '{', '}', '(', ')', '[', ']'] 7 | 8 | def is_number(token): 9 | return re.match('^[\d]+[,]*.?\d*$', token) is not None 10 | 11 | def decode_predictions(labels, predictions, flatten = False): 12 | if len(predictions.shape) == 2: 13 | labs = [labels[np.nonzero(instance)[0][0]] if len(np.nonzero(instance)[0]) > 0 else '' for instance in predictions] 14 | elif len(predictions.shape) == 3: 15 | labs = [[labels[np.nonzero(instance)[0][0]] if len(np.nonzero(instance)[0]) > 0 else '' for instance in sequence] for sequence in predictions] 16 | if flatten: 17 | labs = [item for sublist in labs for item in sublist] 18 | else: 19 | raise ValueError("Not supported. 
Only list of single instances or list of sequences supported for decoding labels.") 20 | return labs 21 | 22 | def prep_labels_one_hot_encoding(labels, dist_labels = None, multilabel = False): 23 | if dist_labels is None: 24 | if multilabel: 25 | dist_labels = list(set([y for s in labels for y in s])) 26 | else: 27 | dist_labels = list(set(labels)) 28 | y = [] 29 | for i in range(len(labels)): 30 | lab_vec = [0] * len(dist_labels) 31 | if multilabel: 32 | for j in range(len(labels[i])): 33 | lab_vec[dist_labels.index(labels[i][j])] = 1.0 34 | else: 35 | lab_vec[dist_labels.index(labels[i])] = 1.0 36 | y.append(lab_vec) 37 | return np.array(y, dtype = np.float64), dist_labels 38 | 39 | def prep_word_tuples(word_lists, embeddings, embeddings_language, langs = None, labels = None,): 40 | examples = [] 41 | if labels: 42 | labs = [] 43 | for i in range(len(word_lists)): 44 | example = [] 45 | add_example = True 46 | for j in range(len(word_lists[i])): 47 | w = ("" if langs is None else langs[i] + "__") + word_lists[i][j] 48 | if w in embeddings.lang_vocabularies[embeddings_language]: 49 | example.append(embeddings.lang_vocabularies[embeddings_language][w]) 50 | elif w.lower() in embeddings.lang_vocabularies[embeddings_language]: 51 | example.append(embeddings.lang_vocabularies[embeddings_language][w.lower()]) 52 | else: 53 | add_example = False 54 | break 55 | if add_example: 56 | examples.append(example) 57 | if labels: 58 | labs.append(labels[i]) 59 | if labels: 60 | return examples, labs 61 | else: 62 | return examples 63 | 64 | def prep_sequence_labelling(texts, labels, embeddings, stopwords = None, embeddings_language = 'en', multilingual_langs = None, lowercase = False, pad = True, pad_token = '', numbers_token = None, punct_token = None, dist_labels = None, max_seq_len = None, add_missing_tokens = False): 65 | x = [] 66 | if labels: 67 | y = [] 68 | 69 | for i in range(len(texts)): 70 | if i % 100 == 0: 71 | print("Line: " + str(i) + " of " + str(len(texts))) 72 | tok_list = [] 73 | if labels: 74 | lab_list = [] 75 | language = embeddings_language if multilingual_langs is None else multilingual_langs[i] 76 | 77 | for j in range(len(texts[i])): 78 | token_clean = texts[i][j].lower() if lowercase else texts[i][j] 79 | token = token_clean if multilingual_langs is None else multilingual_langs[i] + "__" + token_clean 80 | 81 | if token_clean.strip() in punctuation() and punct_token is not None: 82 | token = punct_token 83 | if is_number(token_clean) and numbers_token is not None: 84 | token = numbers_token 85 | 86 | if stopwords is not None and (token_clean in stopwords[language] or token_clean.lower() in stopwords[language]): 87 | continue 88 | if token not in embeddings.lang_vocabularies[embeddings_language] and token.lower() not in embeddings.lang_vocabularies[embeddings_language]: 89 | if add_missing_tokens: 90 | embeddings.add_word(embeddings_language, token) 91 | else: 92 | continue 93 | 94 | tok_list.append(embeddings.lang_vocabularies[embeddings_language][token] if token in embeddings.lang_vocabularies[embeddings_language] else embeddings.lang_vocabularies[embeddings_language][token.lower()]) 95 | if labels: 96 | lab_list.append(labels[i][j]) 97 | x.append(tok_list) 98 | if labels: 99 | y.append(lab_list) 100 | 101 | if labels: 102 | y_clean = [] 103 | if dist_labels is None: 104 | dist_labels = list(set([l for txt_labs in y for l in txt_labs])) 105 | for i in range(len(y)): 106 | lab_list = [] 107 | for j in range(len(y[i])): 108 | lab_vec = [0] * len(dist_labels) 109 | 
lab_vec[dist_labels.index(y[i][j])] = 1.0 110 | lab_list.append(lab_vec) 111 | y_clean.append(lab_list) 112 | 113 | if pad: 114 | ind_pad = embeddings.lang_vocabularies[embeddings_language][pad_token] 115 | max_len = max([len(t) for t in x]) if max_seq_len is None else max_seq_len 116 | x = [t + [ind_pad] * (max_len - len(t)) for t in x] 117 | if labels: 118 | for r in y_clean: 119 | extension = [[0] * len(dist_labels)] * (max_len - len(r)) 120 | r.extend(extension) 121 | sent_lengths = [len([ind for ind in txt if ind != ind_pad]) for txt in x] 122 | else: 123 | sent_lengths = [len(txt) for txt in x] 124 | 125 | if labels: 126 | return np.array(x, dtype = np.int32), np.array(y_clean, dtype = np.float64), dist_labels, sent_lengths 127 | else: 128 | return np.array(x, dtype = np.int32), sent_lengths 129 | 130 | def prep_classification(texts, labels, embeddings, stopwords = None, embeddings_language = 'en', multilingual_langs = None, lowercase = False, pad = True, pad_token = '', numbers_token = None, punct_token = None, dist_labels = None, max_seq_len = None, add_out_of_vocabulary_terms = False): 131 | x = [] 132 | y = [] 133 | 134 | for i in range(len(texts)): 135 | tok_list = [] 136 | lab_list = [] 137 | language = embeddings_language if multilingual_langs is None else multilingual_langs[i] 138 | 139 | for j in range(len(texts[i])): 140 | token_clean = texts[i][j].lower() if lowercase else texts[i][j] 141 | token = token_clean if multilingual_langs is None else multilingual_langs[i] + "__" + token_clean 142 | 143 | if token_clean.strip() in punctuation() and punct_token is not None: 144 | token = punct_token 145 | if is_number(token_clean) and numbers_token is not None: 146 | token = numbers_token 147 | 148 | if stopwords is not None and (token_clean in stopwords[language] or token_clean.lower() in stopwords[language]): 149 | continue 150 | if token not in embeddings.lang_vocabularies[embeddings_language] and token.lower() not in embeddings.lang_vocabularies[embeddings_language]: 151 | if add_out_of_vocabulary_terms: 152 | embeddings.add_word(embeddings_language, token) 153 | else: 154 | continue 155 | if max_seq_len is None or len(tok_list) < max_seq_len: 156 | tok_list.append(embeddings.lang_vocabularies[embeddings_language][token] if token in embeddings.lang_vocabularies[embeddings_language] else embeddings.lang_vocabularies[embeddings_language][token.lower()]) 157 | else: 158 | break 159 | x.append(tok_list) 160 | 161 | if labels is not None: 162 | if dist_labels is None: 163 | dist_labels = list(set([l for txt_labs in labels for l in txt_labs])) 164 | for i in range(len(labels)): 165 | lab_vec = [0] * len(dist_labels) 166 | for j in range(len(labels[i])): 167 | lab_vec[dist_labels.index(labels[i][j])] = 1.0 168 | y.append(lab_vec) 169 | 170 | if pad: 171 | ind_pad = embeddings.lang_vocabularies[embeddings_language][pad_token] 172 | max_len = max([len(t) for t in x]) if max_seq_len is None else max_seq_len 173 | x = [t + [ind_pad] * (max_len - len(t)) for t in x] 174 | 175 | if labels is not None: 176 | x_ret = np.array(x, dtype = np.int32) 177 | y_ret = np.array(y, dtype = np.float64) 178 | return x_ret, y_ret, dist_labels 179 | else: 180 | return np.array(x, dtype = np.int32) 181 | 182 | def prepare_contrastive_learning_examples(positives, negatives, num_negatives_per_positive): 183 | if len(negatives) != len(positives) * num_negatives_per_positive: 184 | raise ValueError("The number of negative examples (per positive examples) is incorrect!") 185 | examples = [] 186 | for i in 
len(positives): 187 | examples.append(positives[i]) 188 | examples.extend(negatives[i*num_negatives_per_positive : (i+1)*num_negatives_per_positive]) 189 | return examples 190 | -------------------------------------------------------------------------------- /helpers/io_helper.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import codecs 3 | from os import listdir 4 | from os.path import isfile, join 5 | import pickle 6 | import numpy as np 7 | from helpers import data_helper 8 | import re 9 | 10 | ################################################################################################################################ 11 | 12 | def serialize(item, path): 13 | pickle.dump(item, open(path, "wb" )) 14 | 15 | def deserialize(path): 16 | return pickle.load(open(path, "rb" )) 17 | 18 | def load_file(filepath): 19 | return (codecs.open(filepath, 'r', encoding = 'utf8', errors = 'replace')).read() 20 | 21 | def load_lines(filepath): 22 | return [l.strip() for l in list(codecs.open(filepath, "r", encoding = 'utf8', errors = 'replace').readlines())] 23 | 24 | def load_blocked_lines(filepath): 25 | lines = [l.strip() for l in list(codecs.open(filepath, "r", encoding = 'utf8', errors = 'replace').readlines())] 26 | blocks = [] 27 | block = [] 28 | for l in lines: 29 | if l == "": 30 | blocks.append(block) 31 | block = [] 32 | else: 33 | block.append(l) 34 | if len(block) > 0: 35 | blocks.append(block) 36 | return blocks 37 | 38 | def load_all_files(dirpath): 39 | files = [] 40 | for filename in listdir(dirpath): 41 | files.append((filename, load_file(dirpath + "/" + filename))) 42 | return files 43 | 44 | ################################################################################################################################ 45 | 46 | def store_embeddings(path, embeddings, language, print_progress = True): 47 | f = codecs.open(path,'w',encoding='utf8') 48 | vocab = embeddings.lang_vocabularies[language] 49 | embs = embeddings.lang_embeddings[language] 50 | 51 | cnt = 0 52 | for word in vocab: 53 | cnt += 1 54 | if print_progress and cnt % 1000 == 0: 55 | print("Storing embeddings " + str(cnt)) 56 | f.write(word + " ") 57 | for i in range(len(embs[vocab[word]])): 58 | f.write(str(embs[vocab[word]][i]) + " ") 59 | f.write("\n") 60 | f.close() 61 | 62 | def load_embeddings_dict_with_norms(filepath, limit = None, special_tokens = None, print_load_progress = False, min_one_letter = False, skip_first_line = False): 63 | norms = [] 64 | vocabulary = {} 65 | embeddings = [] 66 | cnt = 0 67 | cnt_dict = 0 68 | emb_size = -1 69 | 70 | with codecs.open(filepath,'r',encoding='utf8', errors='replace') as f: 71 | for line in f: 72 | try: 73 | cnt += 1 74 | if limit and cnt > limit: 75 | break 76 | if print_load_progress and (cnt % 1000 == 0): 77 | print("Loading embeddings: " + str(cnt)) 78 | if cnt > 1 or not skip_first_line: 79 | splt = line.split() 80 | word = splt[0] 81 | if min_one_letter and not any(c.isalpha() for c in word): 82 | continue 83 | 84 | vec = [np.float32(x) for x in splt[1:]] 85 | if emb_size < 0 and len(vec) > 10: 86 | emb_size = len(vec) 87 | 88 | if emb_size > 0 and len(vec) == emb_size: 89 | vocabulary[word] = cnt_dict 90 | cnt_dict += 1 91 | norms.append(np.linalg.norm(vec, 2)) 92 | embeddings.append(vec) 93 | except(ValueError,IndexError,UnicodeEncodeError): 94 | print("Incorrect format line!") 95 | 96 | if special_tokens is not None: 97 | for st in special_tokens: 98 | vocabulary[st] = cnt_dict 99 | 
cnt_dict += 1 100 | vec = np.array([0.1 * (special_tokens.index(st) + 1)] * emb_size) #np.random.uniform(-1.0, 1.0, size = [emb_size]) 101 | norms.append(np.linalg.norm(vec, 2)) 102 | embeddings.append(vec) 103 | 104 | return vocabulary, np.array(embeddings, dtype = np.float32), norms 105 | 106 | ############################################################################################################################ 107 | 108 | def load_whitespace_separated_data(filepath): 109 | lines = list(codecs.open(filepath,'r',encoding='utf8', errors='replace').readlines()) 110 | return [[x.strip() for x in l.strip().split()] for l in lines] 111 | 112 | def load_tab_separated_data(filepath): 113 | lines = list(codecs.open(filepath,'r',encoding='utf8', errors='replace').readlines()) 114 | return [[x.strip() for x in l.strip().split('\t')] for l in lines] 115 | 116 | def load_wn_concepts_dict(path): 117 | lines = list(codecs.open(path,'r',encoding='utf8', errors='replace').readlines()) 118 | lcols = {x[0] : ' '.join((x[1].split('_'))[2:-2]) for x in [l.strip().split() for l in lines]} 119 | return lcols 120 | 121 | def load_bless_dataset(path): 122 | lines = list(codecs.open(path,'r',encoding='utf8', errors='replace').readlines()) 123 | lcols = [(x[0].split('-')[0], x[3].split('-')[0], "1" if x[2] == "hyper" else "0") for x in [l.strip().split() for l in lines]] 124 | return lcols 125 | 126 | def write_list(path, list): 127 | f = codecs.open(path,'w',encoding='utf8') 128 | for l in list: 129 | f.write(l + "\n") 130 | f.close() 131 | 132 | def write_dictionary(path, dictionary, append = False): 133 | f = codecs.open(path,'a' if append else 'w',encoding='utf8') 134 | for k in dictionary: 135 | f.write(str(k) + "\t" + str(dictionary[k]) + "\n") 136 | f.close() 137 | 138 | def load_translation_pairs(filepath): 139 | lines = list(codecs.open(filepath,'r',encoding='utf8', errors='replace').readlines()) 140 | dataset = []; 141 | for line in lines: 142 | spl = line.split(',') 143 | srcword = spl[0].strip() 144 | trgword = spl[1].strip(); 145 | if (" " not in srcword.strip()) and (" " not in trgword.strip()): 146 | dataset.append((srcword, trgword)); 147 | return dataset 148 | 149 | def write_list_tuples_separated(path, list, delimiter = '\t'): 150 | f = codecs.open(path,'w',encoding='utf8') 151 | for i in range(len(list)): 152 | for j in range(len(list[i])): 153 | if j == len(list[i]) - 1: 154 | f.write(str(list[i][j]) + '\n') 155 | else: 156 | f.write(str(list[i][j]) + delimiter) 157 | f.close() 158 | 159 | def store_wordnet_rels(dirpath, relname, pos, lang, instances): 160 | f = codecs.open(dirpath + "/" + lang + "_" + relname + "_" + pos + ".txt",'w',encoding='utf8') 161 | for i in instances: 162 | splt = i.split('::') 163 | f.write(splt[0].replace("_", " ") + "\t" + splt[1].replace("_", " ") + "\t" + str(instances[i]) + "\n") 164 | f.close() 165 | 166 | def load_csv_lines(path, delimiter = ',', indices = None): 167 | f = codecs.open(path,'r',encoding='utf8') 168 | lines = [l.strip().split(delimiter) for l in f.readlines()] 169 | if indices is None: 170 | return lines 171 | else: 172 | return [sublist(l, indices) for l in lines] 173 | 174 | def load_csv_lines_line_by_line(path, delimiter = ',', indices = None, limit = None): 175 | lines = [] 176 | f = codecs.open(path,'r',encoding='utf8') 177 | line = f.readline().strip() 178 | cnt = 1 179 | while line is not '': 180 | lines.extend(sublist(line, indices) if indices is not None else line.split(delimiter)) 181 | line = f.readline().strip() 182 | cnt += 1 
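# cnt tracks how many physical lines have been read so far; the check below stops
# reading as soon as the optional `limit` is exceeded, so very large CSV files can
# be loaded only partially.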
183 | if limit is not None and cnt > limit: 184 | break 185 | return lines 186 | 187 | def sublist(list, indices): 188 | sublist = [] 189 | for i in indices: 190 | sublist.append(list[i]) 191 | return sublist 192 | 193 | 194 | ############################################################################################################################ 195 | 196 | def load_sequence_labelling_data(path, delimiter = '\t', indices = None, line_start_skip = None): 197 | f = codecs.open(path,'r',encoding='utf8') 198 | lines = [[t.strip() for t in l.split(delimiter)] for l in f.readlines()] 199 | instances = [] 200 | instance = [] 201 | for i in range(len(lines)): 202 | if line_start_skip is not None and lines[i][0].startswith(line_start_skip): 203 | continue 204 | if len(lines[i]) == 1 and lines[i][0] == "": 205 | instances.append(instance) 206 | instance = [] 207 | else: 208 | if indices is None: 209 | instance.append(lines[i]) 210 | else: 211 | instance.append(sublist(lines[i], indices)) 212 | if len(instance) > 0: 213 | instances.append(instance) 214 | return instances 215 | 216 | def load_classification_data(path, delimiter_text_labels = '\t', delimiter_labels = '\t', line_start_skip = None): 217 | f = codecs.open(path,'r',encoding='utf8') 218 | lines = [[t.strip() for t in l.split(delimiter_text_labels)] for l in f.readlines()] 219 | instances = [] 220 | for i in range(len(lines)): 221 | if line_start_skip is not None and lines[i][0].startswith(line_start_skip): 222 | continue 223 | text = data_helper.clean_str(lines[i][0].strip()).split() 224 | if delimiter_text_labels == delimiter_labels: 225 | labels = lines[i][1:] 226 | else: 227 | labels = lines[i][1].strip().split(delimiter_labels) 228 | instances.append((text, labels)) 229 | return instances 230 | 231 | ############################################################################################################################ 232 | # Applications specific loading 233 | ############################################################################################################################ 234 | 235 | def load_snli_data(path): 236 | l = load_csv_lines(path, delimiter = '\t', indices = [0, 5, 6]) 237 | l.pop(0) 238 | 239 | labels = [x[0] for x in l] 240 | premises = [x[1] for x in l] 241 | implications = [x[2] for x in l] 242 | 243 | return premises, implications, labels 244 | 245 | -------------------------------------------------------------------------------- /ml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codogogo/topfish/6b3f5723029616cb430d6226bc59c013fe79eb78/ml/__init__.py -------------------------------------------------------------------------------- /ml/batcher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | def batch_iter(data, batch_size, num_epochs, shuffle = True): 5 | """ 6 | Generates a batch iterator for a dataset. 
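    When `shuffle` is True, the data list is shuffled in place at the start of every
    epoch; batches are then yielded as consecutive slices of at most `batch_size`
    items, so the last batch of an epoch may be smaller than `batch_size` (and is
    empty when the dataset size is an exact multiple of `batch_size`).
    Hypothetical usage (x and y are placeholder arrays):
        for batch in batch_iter(list(zip(x, y)), 32, num_epochs=5): ...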
7 | """ 8 | #data = np.array(data, dtype = np.int32) 9 | data_size = len(data) 10 | 11 | num_batches_per_epoch = int(data_size/batch_size) + 1 12 | for epoch in range(num_epochs): 13 | # Shuffle the data at each epoch 14 | if shuffle: 15 | #shuffle_indices = np.random.permutation(np.arange(data_size)) 16 | #shuffled_data = data[shuffle_indices] 17 | random.shuffle(data) 18 | #else: 19 | # shuffled_data = data 20 | 21 | for batch_num in range(num_batches_per_epoch): 22 | start_index = batch_num * batch_size 23 | end_index = min((batch_num + 1) * batch_size, data_size) 24 | yield data[start_index:end_index] -------------------------------------------------------------------------------- /ml/loss_functions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def softmax_cross_entropy(predictions, golds): 4 | losses = tf.nn.softmax_cross_entropy_with_logits(logits=predictions, labels=golds) 5 | loss = tf.reduce_mean(losses) 6 | return loss 7 | 8 | def softmax_cross_entropy_micro_batches(predictions, golds, params): 9 | print("Defining micro-batched cross-entropy loss...") 10 | micro_batch_size, batch_size = params 11 | print("Micro-batch size: " + str(micro_batch_size)) 12 | 13 | preds_unstacked = tf.unstack(predictions, num = batch_size) 14 | golds_unstacked = tf.unstack(golds, num = batch_size) 15 | 16 | if (len(preds_unstacked) % micro_batch_size != 0 or len(preds_unstacked) != len(golds_unstacked)): 17 | raise ValueError("Unexpected batch size, must be a multiplier of number of contrastive examples or num golds and predictions doesn't match!") 18 | 19 | loss = 0 20 | k = 0 21 | while k*micro_batch_size < len(preds_unstacked): 22 | print("Micro-batch iteration: " + str(k+1)) 23 | preds_micro_batch = tf.nn.softmax(tf.stack(preds_unstacked[k*micro_batch_size : (k+1)*micro_batch_size])) 24 | golds_micro_batch = tf.nn.softmax(tf.stack(golds_unstacked[k*micro_batch_size : (k+1)*micro_batch_size])) 25 | loss += softmax_cross_entropy(preds_micro_batch, golds_micro_batch) 26 | k += 1 27 | return loss 28 | 29 | def margin_based_loss(predictions, golds): 30 | return tf.reduce_sum(tf.maximum(tf.subtract(tf.constant(1.0, dtype = tf.float64), tf.multiply(predictions, golds)), 0.0)) 31 | 32 | def mse_loss(predictions, golds): 33 | return tf.reduce_sum(tf.square(tf.subtract(predictions, golds))) 34 | 35 | def contrastive_loss(predictions, golds, params): 36 | print("Defining contrastive loss...") 37 | num_pos_pairs, num_neg_pairs, gamma = params 38 | preds_unstacked = tf.unstack(predictions) 39 | size = num_pos_pairs + num_neg_pairs 40 | if (len(preds_unstacked) % size != 0): 41 | raise ValueError("Unexpected batch size, must be a multiplier of number of contrastive examples!") 42 | 43 | loss = 0 44 | k = 0 45 | while k*size < len(preds_unstacked): 46 | pos_pairs = preds_unstacked[k*size : k*size + num_pos_pairs] 47 | print("Len of pos pair preds: " + str(len(pos_pairs))) 48 | neg_pairs = preds_unstacked[k*size + num_pos_pairs : (k+1) * size] 49 | print("Len of neg pair preds: " + str(len(neg_pairs))) 50 | for p in pos_pairs: 51 | for n in neg_pairs: 52 | loss += tf.maximum(tf.constant(0.0, dtype = tf.float64), gamma - (p - n)) 53 | k += 1 54 | return loss 55 | 56 | def contrastive_loss_nonbinary(predictions, golds, params): 57 | print("Defining contrastive loss...") 58 | num_pos_pairs, num_neg_pairs, mean_square_error, batch_size = params 59 | preds_unstacked = tf.unstack(predictions, num = batch_size) 60 | golds_unstacked = 
tf.unstack(golds, num = batch_size) 61 | 62 | size = num_pos_pairs + num_neg_pairs 63 | if (len(preds_unstacked) % size != 0 or len(preds_unstacked) != len(golds_unstacked)): 64 | raise ValueError("Unexpected batch size, must be a multiplier of number of contrastive examples or num golds and predictions doesn't match!") 65 | 66 | loss = 0 67 | k = 0 68 | while k*size < len(preds_unstacked): 69 | print("Micro-batch iteration: " + str(k+1)) 70 | pos_pairs = preds_unstacked[k*size : k*size + num_pos_pairs] 71 | pos_golds = golds_unstacked[k*size : k*size + num_pos_pairs] 72 | print("Len of pos pair preds: " + str(len(pos_pairs))) 73 | 74 | neg_pairs = preds_unstacked[k*size + num_pos_pairs : (k+1) * size] 75 | neg_golds = golds_unstacked[k*size + num_pos_pairs : (k+1) * size] 76 | print("Len of neg pair preds: " + str(len(neg_pairs))) 77 | 78 | for i in range(len(pos_pairs)): 79 | for j in range(len(neg_pairs)): 80 | if mean_square_error: 81 | if k == 0 and i == 0 and j == 0: 82 | print("MSE NCE loss for pair...") 83 | loss += tf.square((pos_golds[i] - neg_golds[j]) - (pos_pairs[i] - neg_pairs[j])) 84 | else: 85 | if k == 0 and i == 0 and j == 0: 86 | print("Hinge, margin loss for pair...") 87 | loss += tf.maximum(tf.constant(0.0, dtype = tf.float64), (pos_golds[i] - neg_golds[j]) - (pos_pairs[i] - neg_pairs[j])) 88 | k += 1 89 | return loss 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /ml/trainer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from evaluation import confusion_matrix 3 | from ml import batcher 4 | import random 5 | import copy 6 | import tensorflow as tf 7 | from sys import stdin 8 | 9 | class SimpleTrainer(object): 10 | def __init__(self, model, session, feed_dict_function, eval_func, configuration_func = None, labels = None, additional_results_func = None): 11 | self.model = model 12 | self.session = session 13 | self.feed_dict_function = feed_dict_function 14 | self.eval_func = eval_func 15 | self.config_func = configuration_func 16 | self.additional_results_function = additional_results_func 17 | self.labels = labels 18 | 19 | def train_model_single_iteration(self, feed_dict): 20 | self.model.train_step.run(session = self.session, feed_dict = feed_dict) 21 | 22 | def predict(self, feed_dict): 23 | return self.model.preds.eval(session = self.session, feed_dict = feed_dict) 24 | 25 | def evaluate(self, feed_dict, gold): 26 | preds = predict(self.model, self.session, feed_dict) 27 | return preds, self.eval_func(gold, preds) 28 | 29 | def test(self, test_data, batch_size, eval_params = None, print_batches = False, batch_size_irrelevant = True, compute_loss = False): 30 | if compute_loss: 31 | epoch_loss = 0 32 | batches_eval = batcher.batch_iter(test_data, batch_size, 1, shuffle = False) 33 | eval_batch_counter = 1 34 | 35 | for batch_eval in batches_eval: 36 | if (batch_size_irrelevant or len(batch_eval) == batch_size): 37 | feed_dict_eval, golds_batch_eval = self.feed_dict_function(self.model, batch_eval, None, predict = True) 38 | preds_batch_eval = self.predict(feed_dict_eval) 39 | if compute_loss: 40 | batch_eval_loss = self.model.loss.eval(session = self.session, feed_dict = feed_dict_eval) 41 | epoch_loss += batch_eval_loss 42 | 43 | if eval_batch_counter == 1: 44 | golds = golds_batch_eval 45 | preds = preds_batch_eval 46 | else: 47 | golds = np.concatenate((golds, golds_batch_eval), axis = 0) 48 | preds = np.concatenate((preds, 
preds_batch_eval), axis = 0) 49 | if print_batches: 50 | print("Eval batch counter: " + str(eval_batch_counter), flush=True) 51 | eval_batch_counter += 1 52 | 53 | if self.eval_func is not None: 54 | score = self.eval_func(golds, preds, eval_params) 55 | if compute_loss: 56 | return preds, score, epoch_loss 57 | else: 58 | return preds, score 59 | else: 60 | if compute_loss: 61 | return preds, epoch_loss 62 | else: 63 | return preds 64 | 65 | def train(self, train_data, batch_size, max_num_epochs, num_epochs_not_better_end = 5, epoch_diff_smaller_end = 1e-5, print_batch_losses = True, configuration = None, eval_params = None, shuffle_data = True, batch_size_irrelevant = True): 66 | batch_counter = 0 67 | epoch_counter = 0 68 | epoch_losses = [] 69 | epoch_loss = 0 70 | batches_in_epoch = int(len(train_data)/batch_size) + 1 71 | 72 | batches = batcher.batch_iter(train_data, batch_size, max_num_epochs, shuffle = shuffle_data) 73 | for batch in batches: 74 | batch_counter += 1 75 | 76 | if (batch_size_irrelevant or len(batch) == batch_size): 77 | feed_dict, gold_labels = self.feed_dict_function(self.model, batch, config = configuration, predict = False) 78 | self.train_model_single_iteration(feed_dict) 79 | batch_loss = self.model.loss.eval(session = self.session, feed_dict = feed_dict) 80 | if print_batch_losses: 81 | print("Batch " + str(batch_counter) + ": " + str(batch_loss), flush=True) 82 | 83 | if batch_counter % batches_in_epoch == 0: 84 | epoch_counter += 1 85 | print("Evaluating the epoch loss for epoch " + str(epoch_counter), flush=True) 86 | 87 | if self.eval_func: 88 | preds, score, epoch_loss = self.test(train_data, batch_size, eval_params, False, batch_size_irrelevant = batch_size_irrelevant, compute_loss = True) 89 | else: 90 | preds, epoch_loss = self.test(train_data, batch_size, None, False, batch_size_irrelevant = batch_size_irrelevant, compute_loss = True) 91 | 92 | print("Epoch " + str(epoch_counter) + ": " + str(epoch_loss), flush=True) 93 | if self.eval_func: 94 | print("Epoch (train) performance: " + str(score), flush=True) 95 | print("Previous epochs: " + str(epoch_losses), flush=True) 96 | 97 | if len(epoch_losses) == num_epochs_not_better_end and (epoch_losses[0] - epoch_loss < epoch_diff_smaller_end): 98 | break 99 | else: 100 | epoch_losses.append(epoch_loss) 101 | epoch_loss = 0 102 | if len(epoch_losses) > num_epochs_not_better_end: 103 | epoch_losses.pop(0) 104 | 105 | def train_dev(self, train_data, dev_data, batch_size, max_num_epochs, num_devs_not_better_end = 5, batch_dev_perf = 100, print_batch_losses = True, dev_score_maximize = True, configuration = None, print_training = False, shuffle_data = True): 106 | batch_counter = 0 107 | epoch_counter = 0 108 | epoch_losses = [] 109 | dev_performances = [] 110 | dev_losses = [] 111 | epoch_loss = 0 112 | 113 | best_model = None 114 | best_performance = -1 115 | best_preds_dev = None 116 | batches_in_epoch = int(len(train_data)/batch_size) + 1 117 | 118 | batches = batcher.batch_iter(train_data, batch_size, max_num_epochs, shuffle = shuffle_data) 119 | for batch in batches: 120 | batch_counter += 1 121 | 122 | if (len(batch) == batch_size): 123 | feed_dict, gold_labels = self.feed_dict_function(self.model, batch, configuration) 124 | self.train_model_single_iteration(feed_dict) 125 | 126 | batch_loss = self.model.pure_loss.eval(session = self.session, feed_dict = feed_dict) 127 | #batch_dist_loss = self.model.distance_loss.eval(session = self.session, feed_dict = feed_dict) 128 | epoch_loss += batch_loss 129 | 
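# epoch_loss accumulates the per-batch `pure_loss` values; it is reported and then
# reset further below, once batch_counter completes a full pass over the training
# data (every batches_in_epoch batches).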
130 | if print_training and print_batch_losses: 131 | print("Batch loss" + str(batch_counter) + ": " + str(batch_loss), flush=True) 132 | #print("Batch distance loss" + str(batch_counter) + ": " + str(batch_dist_loss)) 133 | 134 | if batch_counter % batches_in_epoch == 0: 135 | epoch_counter += 1 136 | if print_training: 137 | print("\nEpoch " + str(epoch_counter) + ": " + str(epoch_loss), flush=True) 138 | print("Previous epochs: " + str(epoch_losses) + "\n", flush=True) 139 | epoch_losses.append(epoch_loss) 140 | epoch_loss = 0 141 | if len(epoch_losses) > num_devs_not_better_end: 142 | epoch_losses.pop(0) 143 | 144 | if batch_counter % batch_dev_perf == 0: 145 | if print_training: 146 | print("\n### Evaluation of development set, after batch " + str(batch_counter) + " ###", flush=True) 147 | batches_dev = batcher.batch_iter(dev_data, batch_size, 1, shuffle = False) 148 | dev_batch_counter = 1 149 | dev_loss = 0 150 | for batch_dev in batches_dev: 151 | if (len(batch_dev) == batch_size): 152 | feed_dict_dev, golds_batch_dev = self.feed_dict_function(self.model, batch_dev, configuration, predict = True) 153 | dev_batch_loss = self.model.pure_loss.eval(session = self.session, feed_dict = feed_dict_dev) 154 | dev_loss += dev_batch_loss 155 | if print_training and print_batch_losses: 156 | print("Dev batch: " + str(dev_batch_counter) + ": " + str(dev_batch_loss), flush=True) 157 | preds_batch_dev = self.predict(feed_dict_dev) 158 | if dev_batch_counter == 1: 159 | golds = golds_batch_dev 160 | preds = preds_batch_dev 161 | else: 162 | golds = np.concatenate((golds, golds_batch_dev), axis = 0) 163 | preds = np.concatenate((preds, preds_batch_dev), axis = 0) 164 | dev_batch_counter += 1 165 | print("Development pure loss: " + str(dev_loss), flush=True) 166 | score = self.eval_func(golds, preds, self.labels) 167 | if self.additional_results_function: 168 | self.additional_results_function(self.model, self.session) 169 | if print_training: 170 | print("Peformance: " + str(score) + "\n", flush=True) 171 | print("Previous performances: " + str(dev_performances), flush=True) 172 | print("\nLoss: " + str(dev_loss) + "\n", flush=True) 173 | print("Previous losses: " + str(dev_losses), flush=True) 174 | if score > best_performance: 175 | best_model = self.model.get_model(self.session) 176 | best_preds_dev = preds 177 | best_performance = score 178 | 179 | #if len(dev_performances) == num_devs_not_better_end and ((dev_score_maximize and dev_performances[0] >= score) or (not dev_score_maximize and dev_performances[0] <= score)): 180 | if len(dev_losses) == num_devs_not_better_end and dev_losses[0] < dev_loss: 181 | break 182 | else: 183 | dev_performances.append(score) 184 | dev_losses.append(dev_loss) 185 | if len(dev_performances) > num_devs_not_better_end: 186 | dev_performances.pop(0) 187 | dev_losses.pop(0) 188 | return (best_model, best_performance, best_preds_dev, golds) 189 | 190 | def cross_validate(self, data, batch_size, max_num_epochs, num_folds = 5, num_devs_not_better_end = 5, batch_dev_perf = 100, print_batch_losses = True, dev_score_maximize = True, configuration = None, print_training = False, micro_performance = True, shuffle_data = True): 191 | folds = np.array_split(data, num_folds) 192 | results = {} 193 | 194 | for i in range(num_folds): 195 | train_data = [] 196 | for j in range(num_folds): 197 | if j != i: 198 | train_data.extend(folds[j]) 199 | dev_data = folds[i] 200 | 201 | print("Sizes: train " + str(len(train_data)) + "; dev " + str(len(dev_data)), flush=True) 202 | 
print("Fold " + str(i+1) + ", creating model...", flush=True) 203 | model, conf_str, session = self.config_func(configuration) 204 | self.model = model 205 | self.session = session 206 | print("Fold " + str(i+1) + ", training the model...", flush=True) 207 | results[conf_str + "__fold-" + str(i+1)] = self.train_dev(train_data, dev_data, batch_size, max_num_epochs, num_devs_not_better_end, batch_dev_perf, print_batch_losses, dev_score_maximize, configuration, print_training, shuffle_data = shuffle_data) 208 | 209 | print("Closing session, reseting the default graph (freeing memory)", flush=True) 210 | self.session.close() 211 | tf.reset_default_graph() 212 | print("Performance: " + str(results[conf_str + "__fold-" + str(i+1)][1]), flush=True) 213 | 214 | if micro_performance: 215 | print("Concatenating fold predictions for micro-performance computation", flush=True) 216 | cntr = 0 217 | for k in results: 218 | cntr += 1 219 | if cntr == 1: 220 | all_preds = results[k][2] 221 | all_golds = results[k][3] 222 | else: 223 | all_preds = np.concatenate((all_preds, results[k][2]), axis = 0) 224 | all_golds = np.concatenate((all_golds, results[k][3]), axis = 0) 225 | micro_perf = self.eval_func(all_golds, all_preds, self.labels) 226 | return results, micro_perf 227 | else: 228 | return results 229 | 230 | def grid_search(self, configurations, train_data, dev_data, batch_size, max_num_epochs, num_devs_not_better_end = 5, batch_dev_perf = 100, print_batch_losses = True, dev_score_maximize = True, cross_validate = False, cv_folds = None, print_training = False, micro_performance = False, shuffle_data = True): 231 | if self.config_func is None: 232 | raise ValueError("Function that creates a concrete model for a given hyperparameter configuration must be defined!") 233 | results = {} 234 | config_cnt = 0 235 | for config in configurations: 236 | config_cnt += 1 237 | print("Config: #" + str(config_cnt), flush=True) 238 | if cross_validate: 239 | results[str(config)] = self.cross_validate(train_data, batch_size, max_num_epochs, cv_folds, num_devs_not_better_end, batch_dev_perf, print_batch_losses, dev_score_maximize, config, print_training, micro_performance = micro_performance, shuffle_data = shuffle_data) 240 | if micro_performance: 241 | print("### Configuration performance: " + str(results[str(config)][1]), flush=True) 242 | else: 243 | model, conf_str, session = self.config_func(config) 244 | self.model = model 245 | self.session = session 246 | results[conf_str] = self.train_dev(train_data, dev_data, batch_size, max_num_epochs, num_devs_not_better_end, batch_dev_perf, print_batch_losses, dev_score_maximize, config, print_training, shuffle_data = shuffle_data) 247 | 248 | print("Closing session, reseting the default graph (freeing memory)", flush=True) 249 | self.session.close() 250 | tf.reset_default_graph() 251 | return results 252 | 253 | 254 | class Trainer(object): 255 | """ 256 | A wrapper around the classifiers, implementing functionality like cross-validation, batching, grid search, etc. 
257 | """ 258 | def __init__(self, classifier, one_hot_encoding_preds = False, class_indexes = True): 259 | self.classifier = classifier 260 | self.one_hot_encoding_preds = one_hot_encoding_preds 261 | self.class_indices = class_indexes 262 | 263 | def cross_validate(self, tf_session, class_labels, data_input, data_labels, num_folds, batch_size, num_epochs, model_reset_function = None, shuffle = False, fold_avg = 'micro', cl_perf = None, overall_perf = True, num_epochs_not_better_end = 2): 264 | conf_matrices = [] 265 | best_epochs = [] 266 | if shuffle: 267 | paired = list(zip(data_input, data_labels)) 268 | random.shuffle(paired) 269 | data_input, data_labels = zip(*paired) 270 | 271 | folds = self.cross_validation_fold(data_input, data_labels, num_folds) 272 | fold_counter = 1 273 | for fold in folds: 274 | print("Fold: " + str(fold_counter), flush=True) 275 | train_input = fold[0]; train_labels = fold[1]; dev_input = fold[2]; dev_labels = fold[3] 276 | model_reset_function(tf_session) 277 | conf_mat, epoch = self.train_and_test(tf_session, class_labels, train_input, train_labels, dev_input, dev_labels, batch_size, num_epochs, cl_perf, overall_perf, num_epochs_not_better_end = num_epochs_not_better_end) 278 | conf_matrices.append(conf_mat) 279 | best_epochs.append(epoch) 280 | fold_counter += 1 281 | if fold_avg == 'macro': 282 | return conf_matrices, best_epochs 283 | elif fold_avg == 'micro': 284 | return confusion_matrix.merge_confusion_matrices(conf_matrices), best_epochs 285 | else: 286 | raise ValueError("Unknown value for fold_avg") 287 | 288 | 289 | def cross_validation_fold(self, data_input, data_labels, num_folds): 290 | folds_x_train = np.array_split(data_input, num_folds) 291 | folds_y_train = np.array_split(data_labels, num_folds) 292 | for i in range(num_folds): 293 | train_set_x = [] 294 | train_set_y = [] 295 | for j in range(num_folds): 296 | if j != i: 297 | train_set_x.extend(folds_x_train[j]) 298 | train_set_y.extend(folds_y_train[j]) 299 | dev_set_x = folds_x_train[i] 300 | dev_set_y = folds_y_train[i] 301 | yield [np.array(train_set_x), np.array(train_set_y), dev_set_x, dev_set_y] 302 | 303 | def train_and_test(self, session, class_labels, x_train, y_train, x_test, y_test, batch_size, num_epochs, cl_perf = None, overall_perf = True, num_epochs_not_better_end = 10, manual_features = False): 304 | batch_counter = 0 305 | epoch_loss = 0 306 | epoch_counter = 0 307 | last_epoch_results = [] 308 | best_f = 0 309 | best_epoch = 0 310 | best_conf_mat = None 311 | best_predictions = [] 312 | 313 | num_batches_per_epoch = int((len(x_train) if not manual_features else len(x_train[0])) / batch_size) + 1 314 | 315 | batches = batcher.batch_iter(list(zip(x_train, y_train)), batch_size, num_epochs) if not manual_features else batcher.batch_iter(list(zip(x_train[0], x_train[1], y_train)), batch_size, num_epochs) 316 | for batch in batches: 317 | if manual_features: 318 | x_b, x_b_man, y_b = zip(*batch) 319 | batch_loss = self.classifier.train(session, x_b, y_b, man_feats = x_b_man) 320 | else: 321 | x_b, y_b = zip(*batch) 322 | x_b = np.array(x_b) 323 | y_b = np.array(y_b) 324 | batch_loss = self.classifier.train(session, x_b, y_b) 325 | epoch_loss += batch_loss 326 | 327 | batch_counter += 1 328 | 329 | #if batch_counter % 50 == 0: 330 | #print("Batch " + str(batch_counter) + " loss: " + str(batch_loss)) 331 | # evaluating current model's performance on test 332 | #preds, gold = self.classifier.predict(session, x_test, y_test) 333 | #self.evaluate_performance(class_labels, 
preds, gold, cl_perf, overall_perf, " (test set) ") 334 | 335 | if batch_counter % num_batches_per_epoch == 0: 336 | epoch_counter += 1 337 | print("Epoch " + str(epoch_counter) + " loss: " + str(epoch_loss), flush=True) 338 | last_epoch_results.append(epoch_loss) 339 | epoch_loss = 0 340 | 341 | if manual_features: 342 | x_test_text = x_test[0] 343 | x_test_manual = x_test[1] 344 | preds, gold = self.classifier.predict(session, x_test_text, y_test, man_feats = x_test_manual) 345 | 346 | else: 347 | preds, gold = self.classifier.predict(session, x_test, y_test) 348 | 349 | cm = self.evaluate_performance(class_labels, preds, gold, cl_perf, overall_perf, " (test set) ") 350 | 351 | fepoch = cm.accuracy # cm.get_class_performance("1")[2] 352 | if fepoch > best_f: 353 | best_f = fepoch 354 | best_epoch = epoch_counter 355 | best_conf_mat = cm 356 | best_predictions = preds 357 | 358 | if len(last_epoch_results) > num_epochs_not_better_end: 359 | last_epoch_results.pop(0) 360 | print("Last epochs: " + str(last_epoch_results), flush=True) 361 | 362 | if len(last_epoch_results) == num_epochs_not_better_end and last_epoch_results[0] < last_epoch_results[-1]: 363 | print("End condition satisfied, training finished. ", flush=True) 364 | break 365 | 366 | #preds, gold = self.classifier.predict(session, x_train, y_train) 367 | #self.evaluate_performance(class_labels, preds, gold, cl_perf, overall_perf, " (train set) ") 368 | 369 | #preds, gold = self.classifier.predict(session, x_test, y_test) 370 | #conf_mat = self.evaluate_performance(class_labels, preds, gold, cl_perf, overall_perf, " (test set) ") 371 | #return conf_mat 372 | return best_conf_mat, best_epoch, best_predictions 373 | 374 | def evaluate_performance(self, class_labels, preds, gold, cl_perf = None, overall_perf = True, desc = " () ", print_perf = True): 375 | conf_matrix = confusion_matrix.ConfusionMatrix(class_labels, preds, gold, self.one_hot_encoding_preds, self.class_indices) 376 | if print_perf: 377 | if cl_perf is not None: 378 | for cl in cl_perf: 379 | p, r, f = conf_matrix.get_class_performance(cl) 380 | print(desc + " Class: " + cl + "\nP: " + str(p) + "\nR: " + str(r) + "\nF: " + str(f) + "\n", flush=True) 381 | if overall_perf: 382 | print(desc + " Micro F1: " + str(conf_matrix.microf1) + "\nMacro F1: " + str(conf_matrix.macrof1) + "\n", flush=True) 383 | return conf_matrix -------------------------------------------------------------------------------- /nlp.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from embeddings import text_embeddings 4 | from helpers import io_helper 5 | from helpers import data_shaper 6 | from convolution import cnn 7 | from ml import loss_functions 8 | from evaluation import confusion_matrix 9 | from ml import trainer 10 | from helpers import data_helper 11 | import math 12 | import nltk 13 | from sts import simple_sts 14 | from graphs import graph 15 | import math 16 | import os 17 | from sys import stdin 18 | from datetime import datetime 19 | from scipy import spatial 20 | import codecs 21 | import pickle 22 | 23 | def map_lang(lang): 24 | if lang.lower() == 'english': 25 | return 'en' 26 | elif lang.lower() == 'french': 27 | return 'fr' 28 | elif lang.lower() == 'german': 29 | return 'de' 30 | elif lang.lower() == 'italian': 31 | return 'it' 32 | elif lang.lower() == 'spanish': 33 | return 'es' 34 | elif lang.lower() in ["en", "es", "de", "fr", "it"]: 35 | return lang.lower() 36 | else: 37 | return 
None 38 | 39 | def inverse_map_lang(lang): 40 | if lang.lower() == 'en': 41 | return 'english' 42 | elif lang.lower() == 'fr': 43 | return 'french' 44 | elif lang.lower() == 'de': 45 | return 'german' 46 | elif lang.lower() == 'it': 47 | return 'italian' 48 | elif lang.lower() == 'es': 49 | return 'spanish' 50 | elif lang.lower() in ["english", "spanish", "german", "french", "italian"]: 51 | return lang.lower() 52 | else: 53 | return None 54 | 55 | def load_embeddings(path): 56 | embeddings = text_embeddings.Embeddings() 57 | embeddings.load_embeddings(path, limit = None, language = 'default', print_loading = False) 58 | return embeddings 59 | 60 | def build_feed_dict_func(model, data, config = None, predict = False): 61 | x, y = zip(*data) 62 | fd = model.get_feed_dict(x, None if None in y else y, 1.0 if predict else 0.5) 63 | return fd, y 64 | 65 | def eval_func(golds, preds, params = None): 66 | gold_labs = np.argmax(golds, axis = 1) 67 | pred_labs = np.argmax(preds, axis = 1) 68 | 69 | conf_matrix = confusion_matrix.ConfusionMatrix(params["dist_labels"], pred_labs, gold_labs, False, class_indices = True) 70 | res = conf_matrix.accuracy 71 | return 0 if math.isnan(res) else res 72 | 73 | def get_prediction_labels(preds, dist_labels): 74 | pred_labs = [dist_labels[x] for x in np.argmax(preds, axis = 1)] 75 | return pred_labs 76 | 77 | def train_cnn(texts, languages, labels, embeddings, parameters, model_serialization_path, emb_lang = 'default'): 78 | # preparing texts 79 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Preparing texts...', flush=True) 80 | texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts] 81 | # encoding languages (full name to abbreviation) 82 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Encoding languages (full name to abbreviation)...', flush=True) 83 | langs = [map_lang(x) for x in languages] 84 | # preparing training examples 85 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Preparing training examples...', flush=True) 86 | x_train, y_train, dist_labels = data_shaper.prep_classification(texts_clean, labels, embeddings, embeddings_language = emb_lang, multilingual_langs = langs, numbers_token = '', punct_token = '', add_out_of_vocabulary_terms = False) 87 | 88 | # defining the CNN model 89 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Defining the CNN model...', flush=True) 90 | cnn_classifier = cnn.CNN(embeddings = (embeddings.emb_sizes[emb_lang], embeddings.lang_embeddings[emb_lang]), num_conv_layers = parameters["num_convolutions"], filters = parameters["filters"], k_max_pools = parameters["k_max_pools"], manual_features_size = 0) 91 | cnn_classifier.define_model(len(x_train[0]), len(dist_labels), loss_functions.softmax_cross_entropy, len(embeddings.lang_vocabularies[emb_lang]), l2_reg_factor = parameters["reg_factor"], update_embeddings = parameters["update_embeddings"]) 92 | cnn_classifier.define_optimization(learning_rate = parameters["learning_rate"]) 93 | cnn_classifier.set_distinct_labels(dist_labels) 94 | 95 | # initializing a Tensorflow session 96 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Initializing a Tensorflow session...', flush=True) 97 | session = tf.InteractiveSession() 98 | session.run(tf.global_variables_initializer()) 99 | 100 | # training the model 101 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Training the model...', flush=True) 102 | simp_trainer = trainer.SimpleTrainer(cnn_classifier, session, build_feed_dict_func, eval_func, configuration_func = None) 103 | 
simp_trainer.train(list(zip(x_train, y_train)), parameters["batch_size"], parameters["num_epochs"], num_epochs_not_better_end = 5, epoch_diff_smaller_end = parameters["epoch_diff_smaller_end"], print_batch_losses = True, eval_params = { "dist_labels" : dist_labels }) 104 | 105 | # storing the model 106 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Storing the model...', flush=True) 107 | cnn_classifier.serialize(session, model_serialization_path) 108 | session.close() 109 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Training model is done!', flush=True) 110 | 111 | def test_cnn(texts, languages, labels, embeddings, model_serialization_path, predictions_file_path, parameters, emb_lang = 'default'): 112 | # loading the serialized 113 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Loading the serialized model...', flush=True) 114 | cnn_classifier, session = cnn.load_model(model_serialization_path, embeddings.lang_embeddings[emb_lang], loss_functions.softmax_cross_entropy, just_predict = (labels is None)) 115 | 116 | # preparing/cleaning the texts 117 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Preparing/cleaning the texts...', flush=True) 118 | texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts] 119 | # encoding languages (full name to abbreviation) 120 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Encoding languages (full name to abbreviation)...', flush=True) 121 | langs = [map_lang(x) for x in languages] 122 | # preparing testing examples 123 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Preparing training examples...', flush=True) 124 | if labels: 125 | x_test, y_test, dist_labels = data_shaper.prep_classification(texts_clean, labels, embeddings, embeddings_language = emb_lang, multilingual_langs = langs, numbers_token = '', punct_token = '', add_out_of_vocabulary_terms = False, dist_labels = cnn_classifier.dist_labels, max_seq_len = cnn_classifier.max_text_length) 126 | else: 127 | x_test = data_shaper.prep_classification(texts_clean, labels, embeddings, embeddings_language = emb_lang, multilingual_langs = langs, numbers_token = '', punct_token = '', add_out_of_vocabulary_terms = False, dist_labels = cnn_classifier.dist_labels, max_seq_len = cnn_classifier.max_text_length) 128 | 129 | simp_trainer = trainer.SimpleTrainer(cnn_classifier, session, build_feed_dict_func, None if not labels else eval_func, configuration_func = None) 130 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Starting test...', flush=True) 131 | results = simp_trainer.test(list(zip(x_test, y_test if labels else [None] * len(x_test))), parameters["batch_size"], eval_params = { "dist_labels" : cnn_classifier.dist_labels }, batch_size_irrelevant = True) 132 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Getting prediction labels...', flush=True) 133 | pred_labs = get_prediction_labels(results[0] if labels else results, cnn_classifier.dist_labels) 134 | 135 | if labels is None: 136 | io_helper.write_list(predictions_file_path, pred_labs) 137 | else: 138 | list_pairs = list(zip(pred_labs, labels)) 139 | list_pairs.insert(0, ("Prediction", "Real label")) 140 | list_pairs.append(("Performance: ", str(results[1]))) 141 | io_helper.write_list_tuples_separated(predictions_file_path, list_pairs) 142 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Prediction is done!', flush=True) 143 | 144 | def scale_supervised(filenames, texts, languages, embeddings, predictions_file_path, pivot1, pivot2, stopwords = [], emb_lang = 'default'): 
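	# Supervised scaling pipeline, step by step: (1) tokenize every document with a
	# language prefix, (2) build tf/idf indices and aggregate a tf-weighted document
	# embedding per text, (3) compute pairwise cosine similarities between document
	# vectors, (4) min-max normalize those similarities, (5) pin the two user-supplied
	# pivot files to -1.0 and +1.0, and (6) propagate positional scores to the remaining
	# documents via harmonic-function label propagation over the similarity graph.
	# Document vectors and scores are also pickled to 'docs-embs.pickle'.
	# Hypothetical call (the pivot filenames and output path are placeholders):
	# scale_supervised(filenames, texts, languages, embeddings, "scores.txt",
	#                  "pivot_left.txt", "pivot_right.txt", stopwords=stopwords)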
145 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Tokenizing documents...", flush = True) 146 | texts_tokenized = [] 147 | for i in range(len(texts)): 148 | print("Document " + str(i + 1) + " of " + str(len(texts)), flush = True) 149 | texts_tokenized.append(simple_sts.simple_tokenize(texts[i], stopwords, lang_prefix = map_lang(languages[i]))) 150 | 151 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Building tf-idf indices for weighted aggregation...", flush = True) 152 | tf_index, idf_index = simple_sts.build_tf_idf_indices(texts_tokenized) 153 | agg_vecs = [] 154 | for i in range(len(texts_tokenized)): 155 | print("Aggregating vector of the document: " + str(i+1) + " of " + str(len(texts_tokenized)), flush = True) 156 | #agg_vec = simple_sts.aggregate_weighted_text_embedding(embeddings, tf_index[i], idf_index, emb_lang, weigh_idf = (len(set(languages)) == 1)) 157 | agg_vec = simple_sts.aggregate_weighted_text_embedding(embeddings, tf_index[i], idf_index, emb_lang, weigh_idf = False) 158 | agg_vecs.append(agg_vec) 159 | pairs = [] 160 | cntr = 0 161 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Computing pairwise similarities...", flush = True) 162 | for i in range(len(agg_vecs) - 1): 163 | for j in range(i+1, len(agg_vecs)): 164 | cntr += 1 165 | #print("Pair: " + filenames[i] + " - " + filenames[j] + " (" + str(cntr) + " of " + str((len(filenames) * (len(filenames) - 1)) / 2)) 166 | sim = 1.0 - spatial.distance.cosine(agg_vecs[i], agg_vecs[j]) 167 | print (sim) 168 | #print("Similarity: " + str(sim)) 169 | pairs.append((filenames[i], filenames[j], sim)) 170 | 171 | # rescale distances and produce similarities 172 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Normalizing pairwise similarities...", flush = True) 173 | max_sim = max([x[2] for x in pairs]) 174 | min_sim = min([x[2] for x in pairs]) 175 | pairs = [(x[0], x[1], (x[2] - min_sim) / (max_sim - min_sim)) for x in pairs] 176 | 177 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Fixing the pivot documents for scaling...", flush = True) 178 | min_sim_pair = [pivot1,pivot2,0.0] 179 | 180 | fixed = [(filenames.index(min_sim_pair[0]), -1.0), (filenames.index(min_sim_pair[1]), 1.0)] 181 | # fixed = [(pivot1, -1.0), (pivot2, 1.0)] 182 | 183 | # propagating position scores, i.e., scaling 184 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Running graph-based label propagation with pivot rescaling and score normalization...", flush = True) 185 | g = graph.Graph(nodes = filenames, edges = pairs) 186 | scores = g.harmonic_function_label_propagation(fixed, rescale_extremes = False, normalize = True) 187 | 188 | embs_to_store = {filenames[x]: [agg_vecs[x],scores[filenames[x]]] for x in range(len(agg_vecs))} 189 | print ("embs_to_store", len(embs_to_store)) 190 | 191 | with open('docs-embs.pickle', 'wb') as handle: 192 | pickle.dump(embs_to_store, handle, protocol=pickle.HIGHEST_PROTOCOL) 193 | 194 | if predictions_file_path: 195 | io_helper.write_dictionary(predictions_file_path, scores) 196 | 197 | return scores 198 | 199 | def scale_efficient(filenames, texts, languages, embeddings, predictions_file_path, parameters, emb_lang = 'default', stopwords = []): 200 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Tokenizing documents...", flush = True) 201 | texts_tokenized = [] 202 | for i in range(len(texts)): 203 | print("Document " + str(i + 1) + " of " + str(len(texts)), flush = True) 204 | texts_tokenized.append(simple_sts.simple_tokenize(texts[i], stopwords, lang_prefix = 
map_lang(languages[i]))) 205 | 206 | 207 | embs_to_store = {filenames[x]: [texts_tokenized[x]] for x in range(len(texts_tokenized))} 208 | print ("embs_to_store", len(embs_to_store)) 209 | 210 | with open('tok-text.pickle', 'wb') as handle: 211 | pickle.dump(embs_to_store, handle, protocol=pickle.HIGHEST_PROTOCOL) 212 | 213 | 214 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Building tf-idf indices for weighted aggregation...", flush = True) 215 | tf_index, idf_index = simple_sts.build_tf_idf_indices(texts_tokenized) 216 | agg_vecs = [] 217 | for i in range(len(texts_tokenized)): 218 | print("Aggregating vector of the document: " + str(i+1) + " of " + str(len(texts_tokenized)), flush = True) 219 | #agg_vec = simple_sts.aggregate_weighted_text_embedding(embeddings, tf_index[i], idf_index, emb_lang, weigh_idf = (len(set(languages)) == 1)) 220 | agg_vec = simple_sts.aggregate_weighted_text_embedding(embeddings, tf_index[i], idf_index, emb_lang, weigh_idf = False) 221 | agg_vecs.append(agg_vec) 222 | 223 | 224 | 225 | pairs = [] 226 | cntr = 0 227 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Computing pairwise similarities...", flush = True) 228 | for i in range(len(agg_vecs) - 1): 229 | for j in range(i+1, len(agg_vecs)): 230 | cntr += 1 231 | #print("Pair: " + filenames[i] + " - " + filenames[j] + " (" + str(cntr) + " of " + str((len(filenames) * (len(filenames) - 1)) / 2)) 232 | sim = 1.0 - spatial.distance.cosine(agg_vecs[i], agg_vecs[j]) 233 | print (sim) 234 | #print("Similarity: " + str(sim)) 235 | pairs.append((filenames[i], filenames[j], sim)) 236 | 237 | # rescale distances and produce similarities 238 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Normalizing pairwise similarities...", flush = True) 239 | max_sim = max([x[2] for x in pairs]) 240 | min_sim = min([x[2] for x in pairs]) 241 | pairs = [(x[0], x[1], (x[2] - min_sim) / (max_sim - min_sim)) for x in pairs] 242 | 243 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Fixing the pivot documents for scaling...", flush = True) 244 | min_sim_pair = [x for x in pairs if x[2] == 0][0] 245 | fixed = [(filenames.index(min_sim_pair[0]), -1.0), (filenames.index(min_sim_pair[1]), 1.0)] 246 | 247 | # propagating position scores, i.e., scaling 248 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Running graph-based label propagation with pivot rescaling and score normalization...", flush = True) 249 | g = graph.Graph(nodes = filenames, edges = pairs) 250 | scores = g.harmonic_function_label_propagation(fixed, rescale_extremes = True, normalize = True) 251 | 252 | embs_to_store = {filenames[x]: [agg_vecs[x],scores[filenames[x]]] for x in range(len(agg_vecs))} 253 | print ("embs_to_store", len(embs_to_store)) 254 | 255 | with open('docs-embs.pickle', 'wb') as handle: 256 | pickle.dump(embs_to_store, handle, protocol=pickle.HIGHEST_PROTOCOL) 257 | 258 | if predictions_file_path: 259 | io_helper.write_dictionary(predictions_file_path, scores) 260 | return scores 261 | 262 | def scale(filenames, texts, languages, embeddings, predictions_file_path, parameters, emb_lang = 'default'): 263 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+" Tokenizing documents...", flush=True) 264 | texts_tokenized = [] 265 | for i in range(len(texts)): 266 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+" Document " + str(i + 1) + " of " + str(len(texts)), flush=True) 267 | texts_tokenized.append(simple_sts.simple_tokenize(texts[i], [], lang_prefix = map_lang(languages[i]))) 268 | 269 | doc_dicts = [] 270 
| cntr = 0 271 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+" Building vocabularies for documents...", flush=True) 272 | for x in texts_tokenized: 273 | cntr += 1 274 | print("Document " + str(cntr) + " of " + str(len(texts))) 275 | doc_dicts.append(simple_sts.build_vocab(x, count_treshold = 1)) 276 | 277 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+" Computing similarities between document pairs...", flush=True) 278 | items = list(zip(filenames, languages, doc_dicts)) 279 | pairs = [] 280 | cntr = 0 281 | for i in range(len(items) - 1): 282 | for j in range(i+1, len(items)): 283 | cntr += 1 284 | print("Pair: " + items[i][0] + " - " + items[j][0] + " (" + str(cntr) + " of " + str((len(items) * (len(items) - 1)) / 2), flush=True) 285 | sim = simple_sts.greedy_alignment_similarity(embeddings, items[i][2], items[j][2], lowest_sim = 0.01, length_factor = 0.01) 286 | print("Similarity: " + str(sim), flush=True) 287 | print("\n", flush=True) 288 | pairs.append((items[i][0], items[j][0], sim)) 289 | 290 | # rescale distances and produce similarities 291 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+" Normalizing pairwise similarities...", flush=True) 292 | max_sim = max([x[2] for x in pairs]) 293 | min_sim = min([x[2] for x in pairs]) 294 | pairs = [(x[0], x[1], (x[2] - min_sim) / (max_sim - min_sim)) for x in pairs] 295 | 296 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+" Fixing the pivot documents for scaling...", flush=True) 297 | min_sim_pair = [x for x in pairs if x[2] == 0][0] 298 | fixed = [(filenames.index(min_sim_pair[0]), -1.0), (filenames.index(min_sim_pair[1]), 1.0)] 299 | 300 | # propagating position scores, i.e., scaling 301 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+" Running graph-based label propagation with pivot rescaling and score normalization...", flush=True) 302 | g = graph.Graph(nodes = filenames, edges = pairs) 303 | scores = g.harmonic_function_label_propagation(fixed, rescale_extremes = True, normalize = True) 304 | if predictions_file_path: 305 | io_helper.write_dictionary(predictions_file_path, scores) 306 | return scores 307 | 308 | def topically_scale(filenames, texts, languages, embeddings, model_serialization_path, predictions_file_path, parameters, emb_lang = 'default', stopwords = []): 309 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Loading classifier...", flush=True) 310 | cnn_classifier, session = cnn.load_model(model_serialization_path, embeddings.lang_embeddings[emb_lang], loss_functions.softmax_cross_entropy, just_predict = True) 311 | simp_trainer = trainer.SimpleTrainer(cnn_classifier, session, build_feed_dict_func, None, configuration_func = None) 312 | 313 | classified_texts = {} 314 | items = list(zip(filenames, texts, [map_lang(x) for x in languages])) 315 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Topically classifying texts...", flush = True) 316 | for item in items: 317 | fn, text, lang = item 318 | print(fn, flush=True) 319 | # split text in sentences 320 | sentences = nltk.sent_tokenize(text) 321 | sents_clean = [data_helper.clean_str(s.strip()).split() for s in sentences] 322 | langs = [lang] * len(sentences) 323 | 324 | # preparing training examples 325 | x_test = data_shaper.prep_classification(sents_clean, None, embeddings, embeddings_language = emb_lang, multilingual_langs = langs, numbers_token = '', punct_token = '', add_out_of_vocabulary_terms = False, dist_labels = cnn_classifier.dist_labels, max_seq_len = cnn_classifier.max_text_length) 326 | 327 | results = 
simp_trainer.test(list(zip(x_test, [None]*len(x_test))), parameters["batch_size"], batch_size_irrelevant = True, print_batches = True) 328 | 329 | pred_labs = get_prediction_labels(results, cnn_classifier.dist_labels) 330 | print("Predictions: ", flush=True) 331 | print(pred_labs, flush=True) 332 | 333 | classified_texts[fn] = list(zip(sentences, pred_labs, langs)) 334 | 335 | print("Languages: " + str(langs), flush=True) 336 | print("Done with classifying: " + fn, flush=True) 337 | 338 | lines_to_write = [] 339 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+ " Topical scaling...", flush=True) 340 | for l in cnn_classifier.dist_labels: 341 | label_filtered = [(fn, classified_texts[fn][0][2], ' '.join([sent_label[0] for sent_label in classified_texts[fn] if sent_label[1] == l])) for fn in classified_texts] 342 | label_filtered = [x for x in label_filtered if len(x[2].strip()) > 50] 343 | if len(label_filtered) > 3: 344 | print("Topic: " + l, flush=True) 345 | fns = [x[0] for x in label_filtered] 346 | langs = [x[1] for x in label_filtered] 347 | filt_texts = [x[2] for x in label_filtered] 348 | 349 | for i in range(len(fns)): 350 | io_helper.write_list(os.path.dirname(predictions_file_path) + "/" + fns[i].split(".")[0] + "_" + l.replace(" ", "-") + ".txt", [filt_texts[i]]) 351 | 352 | label_scale = scale_efficient(fns, filt_texts, [inverse_map_lang(x) for x in langs], embeddings, None, parameters, emb_lang = emb_lang, stopwords = stopwords) 353 | lines_to_write.append("Scaling for class: " + l) 354 | lines_to_write.extend([k + " " + str(label_scale[k]) for k in label_scale]) 355 | lines_to_write.append("\n") 356 | else: 357 | lines_to_write.append("Topic: " + l + ": Insufficient number of files contains text of this topic (i.e., class) in order to allow for scaling for the topic.") 358 | print("Topic: " + l + ": Insufficient number of files contains text of this topic (i.e., class) in order to allow for scaling for the topic.", flush=True) 359 | 360 | io_helper.write_list(predictions_file_path, lines_to_write) 361 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' Topical Scaling is done!', flush=True) 362 | 363 | 364 | -------------------------------------------------------------------------------- /scaler.py: -------------------------------------------------------------------------------- 1 | from embeddings import text_embeddings 2 | import nlp 3 | from helpers import io_helper 4 | from sts import simple_sts 5 | from sys import stdin 6 | import argparse 7 | import os 8 | from datetime import datetime 9 | 10 | supported_lang_strings = {"en" : "english", "fr" : "french", "de" : "german", "es" : "spanish", "it" : "italian"} 11 | 12 | parser = argparse.ArgumentParser(description='Performs text scaling (assigns a score to each text on a linear scale).') 13 | parser.add_argument('datadir', help='A path to the directory containing the input text files for scaling (one score will be assigned per file).') 14 | parser.add_argument('embs', help='A path to the file containing pre-trained word embeddings') 15 | parser.add_argument('output', help='A file path to which to store the scaling results.') 16 | parser.add_argument('--emb_cutoff', help='Length of the embedding-dictionary to use.') 17 | parser.add_argument('--stopwords', help='A file to the path containing stopwords') 18 | 19 | args = parser.parse_args() 20 | 21 | if not os.path.isdir(os.path.dirname(args.datadir)): 22 | print("Error: Directory containing the input files not found.") 23 | exit(code = 1) 24 | 25 | if not 
os.path.isfile(args.embs): 26 | print("Error: File containing pre-trained word embeddings not found.") 27 | exit(code = 1) 28 | 29 | if not os.path.isdir(os.path.dirname(args.output)) and not os.path.dirname(args.output) == "": 30 | print("Error: Directory of the output file does not exist.") 31 | exit(code = 1) 32 | 33 | if not args.emb_cutoff: 34 | args.emb_cutoff = None 35 | print("Note: Number of embeddings-cutoff is not provided, so we consider the entire vocabulary size.") 36 | else: 37 | args.emb_cutoff = int(args.emb_cutoff) 38 | 39 | if args.stopwords and not os.path.isfile(args.stopwords): 40 | print("Error: File containing stopwords not found.") 41 | exit(code = 1) 42 | 43 | files = io_helper.load_all_files(args.datadir) 44 | if len(files) < 4: 45 | print("Error: There need to be at least 4 texts for a meaningful scaling.") 46 | exit(code = 1) 47 | 48 | filenames = [x[0] for x in files] 49 | texts = [x[1] for x in files] 50 | 51 | wrong_lang = False 52 | languages = [x.split("\n", 1)[0].strip().lower() for x in texts] 53 | texts = [x.split("\n", 1)[1].strip().lower() for x in texts] 54 | for i in range(len(languages)): 55 | if languages[i] not in supported_lang_strings.keys() and languages[i] not in supported_lang_strings.values(): 56 | print("The format of the file is incorrect, unspecified or unsupported language: " + str(filenames[i])) 57 | wrong_lang = True 58 | if wrong_lang: 59 | exit(code = 2) 60 | 61 | langs = [(l if l in supported_lang_strings.values() else supported_lang_strings[l]) for l in languages] 62 | 63 | if args.stopwords: 64 | stopwords = io_helper.load_lines(args.stopwords) 65 | else: 66 | stopwords = [] 67 | 68 | predictions_serialization_path = args.output 69 | 70 | embeddings = text_embeddings.Embeddings() 71 | embeddings.load_embeddings(args.embs, limit = args.emb_cutoff, language = 'default', print_loading = True, skip_first_line = True) 72 | nlp.scale_efficient(filenames, texts, langs, embeddings, predictions_serialization_path, stopwords) 73 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Scaling completed.", flush = True) -------------------------------------------------------------------------------- /sts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codogogo/topfish/6b3f5723029616cb430d6226bc59c013fe79eb78/sts/__init__.py -------------------------------------------------------------------------------- /sts/simple_sts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import nltk 4 | import math 5 | 6 | def build_tf_idf_indices(texts_tokenized): 7 | idf_index = {} 8 | tf_index = {} 9 | for i in range(len(texts_tokenized)): 10 | tf_index[i] = {} 11 | for j in range(len(texts_tokenized[i])): 12 | w = texts_tokenized[i][j] 13 | if w not in tf_index[i]: 14 | tf_index[i][w] = 1 15 | else: 16 | tf_index[i][w] += 1 17 | if w not in idf_index: 18 | idf_index[w] = {} 19 | if i not in idf_index[w]: 20 | idf_index[w][i] = 1 21 | max_word_freq = max([tf_index[i][x] for x in tf_index[i]]) 22 | print("Max word freq: " + str(max_word_freq)) 23 | for w in tf_index[i]: 24 | tf_index[i][w] = tf_index[i][w] / max_word_freq 25 | for w in idf_index: 26 | idf_index[w] = math.log(len(texts_tokenized) / len(idf_index[w])) 27 | return tf_index, idf_index 28 | 29 | def fix_tokenization(tokens): 30 | punctuation = [".", ",", "!", ":", "?", ";", "-", ")", "(", "[", "]", "{", "}", "...", "/", "\\", 
"''", "\"", "'"] 31 | for i in range(len(tokens)): 32 | pcs = [p for p in punctuation if tokens[i].endswith(p)] 33 | if (len(pcs) > 0): 34 | tokens[i] = tokens[i][:-1] 35 | pcs = [p for p in punctuation if tokens[i].startswith(p)] 36 | if (len(pcs) > 0): 37 | tokens[i] = tokens[i][1:] 38 | 39 | def build_vocab(tokens, count_treshold = 1): 40 | print("Building full vocabulary...") 41 | full_vocab = {} 42 | for t in tokens: 43 | if t in full_vocab: 44 | full_vocab[t] = full_vocab[t] + 1 45 | else: 46 | full_vocab[t] = 1 47 | 48 | print("Tresholding vocabulary...") 49 | vocab = [x for x in full_vocab if full_vocab[x] >= count_treshold] 50 | print("Vocabulary length: " + str(len(vocab))) 51 | print("Building index dicts...") 52 | dict = { x : vocab.index(x) for x in vocab } 53 | inv_dict = { vocab.index(x) : x for x in vocab } 54 | print("Building count dict...") 55 | counts = { vocab.index(x) : full_vocab[x] for x in vocab } 56 | 57 | return (dict, inv_dict, counts) 58 | 59 | def simple_tokenize(text, stopwords, lower = True, lang_prefix = None): 60 | print("Tokenizing text...") 61 | punctuation = [".", ",", "!", ":", "?", ";", "-", ")", "(", "[", "]", "{", "}", "...", "/", "\\", "''", "\"", "'"] 62 | toks = [(x.strip().lower() if lower else x.strip()) for x in nltk.word_tokenize(text) if x.strip().lower() not in stopwords and x.strip().lower() not in punctuation] 63 | #toks = [(x.strip().lower() if lower else x.strip()) for x in nltk.word_tokenize(text) if x.strip().lower() not in stopwords and x.strip().lower()] 64 | 65 | fix_tokenization(toks) 66 | 67 | if lang_prefix: 68 | toks = [lang_prefix + "__" + x for x in toks] 69 | return toks 70 | 71 | def aggregate_weighted_text_embedding(embeddings, tf_index, idf_index, lang = "default", weigh_idf = True): 72 | agg_vec = np.zeros(embeddings.emb_sizes[lang]) 73 | for t in tf_index: 74 | emb = embeddings.get_vector(lang, t) 75 | if emb is not None: 76 | if weigh_idf: 77 | weight = tf_index[t] * idf_index[t] 78 | else: 79 | weight = tf_index[t] 80 | agg_vec = np.add(agg_vec, np.multiply(weight, emb)) 81 | return agg_vec 82 | 83 | def word_movers_distance(embeddings, first_tokens, second_tokens): 84 | return embeddings.wmdistance(first_tokens, second_tokens) 85 | 86 | def greedy_alignment_similarity(embeddings, first_doc, second_doc, lowest_sim = 0.3, length_factor = 0.1): 87 | print("Greedy aligning...") 88 | first_vocab, first_vocab_inv, first_counts_cpy = first_doc 89 | second_vocab, second_vocab_inv, second_counts_cpy = second_doc 90 | 91 | if len(first_vocab) == 0 or len(second_vocab) == 0: 92 | return 0 93 | 94 | first_counts = {x : first_counts_cpy[x] for x in first_counts_cpy } 95 | second_counts = {x : second_counts_cpy[x] for x in second_counts_cpy} 96 | 97 | #print("Computing actual document lengths...") 98 | len_first = sum(first_counts_cpy.values()) 99 | len_second = sum(second_counts_cpy.values()) 100 | 101 | # similarity matrix computation 102 | matrix = np.zeros((len(first_vocab), len(second_vocab))) 103 | print("Computing the similarity matrix...") 104 | #print("Vocab. size first: " + str(len(first_vocab)) + "Vocab. 
size second: " + str(len(second_vocab))) 105 | cntr = 0 106 | for ft in first_vocab: 107 | cntr += 1 108 | #if cntr % 10 == 0: 109 | # print("First vocab, item: " + str(cntr)) 110 | first_index = first_vocab[ft] 111 | for st in second_vocab: 112 | second_index = second_vocab[st] 113 | sim = embeddings.word_similarity(ft, st, "default", "default") 114 | #print("Embedding similarity, " + ft + "::" + st + ": " + str(sim)) 115 | matrix[first_index, second_index] = sim 116 | 117 | # greedy alignment 118 | print("Computing the alignment...") 119 | greedy_align_sum = 0.0 120 | counter_left_first = len_first 121 | counter_left_second = len_second 122 | tok_to_align = min(counter_left_first, counter_left_second) 123 | while counter_left_first > 0 and counter_left_second > 0: 124 | new_tok_to_align = min(counter_left_first, counter_left_second) 125 | if new_tok_to_align == tok_to_align or (tok_to_align - new_tok_to_align > 10000): 126 | tok_to_align = new_tok_to_align 127 | print("Left tokens to align: " + str(tok_to_align)) 128 | ind = np.argmax(matrix.flatten()) 129 | ind_src = ind // matrix.shape[1] 130 | ind_trg = ind % matrix.shape[1] 131 | 132 | simil = matrix[ind_src, ind_trg] 133 | #print("Similarity: " + str(simil)) 134 | 135 | #print("Index src: " + str(ind_src) + ", word src: " + str(first_vocab_inv[ind_src].encode(encoding='UTF-8', errors='ignore'))) 136 | #print("Index trg: " + str(ind_trg) + ", word trg: " + str(second_vocab_inv[ind_trg].encode(encoding='UTF-8', errors='ignore'))) 137 | 138 | if simil < lowest_sim: 139 | break; 140 | 141 | min_freq = min(first_counts[ind_src], second_counts[ind_trg]) 142 | greedy_align_sum += simil * min_freq 143 | matrix[ind_src, ind_trg] = -2 144 | 145 | first_counts[ind_src] = first_counts[ind_src] - min_freq 146 | second_counts[ind_trg] = second_counts[ind_trg] - min_freq 147 | 148 | if first_counts[ind_src] == 0: 149 | matrix[ind_src, :] = -2 150 | if second_counts[ind_trg] == 0: 151 | matrix[:, ind_trg] = -2 152 | 153 | counter_left_first = counter_left_first - min_freq 154 | counter_left_second = counter_left_second - min_freq 155 | 156 | prec = greedy_align_sum / min(len_first, len_second) 157 | rec = greedy_align_sum / max(len_first, len_second) 158 | return (((1 - length_factor) * prec) + (length_factor * rec)) -------------------------------------------------------------------------------- /supervised-scaler.py: -------------------------------------------------------------------------------- 1 | from embeddings import text_embeddings 2 | import nlp 3 | from helpers import io_helper 4 | from sts import simple_sts 5 | from sys import stdin 6 | import argparse 7 | import os 8 | from datetime import datetime 9 | 10 | supported_lang_strings = {"en" : "english", "fr" : "french", "de" : "german", "es" : "spanish", "it" : "italian"} 11 | 12 | parser = argparse.ArgumentParser(description='Performs text scaling (assigns a score to each text on a linear scale).') 13 | parser.add_argument('datadir', help='A path to the directory containing the input text files for scaling (one score will be assigned per file).') 14 | parser.add_argument('embs', help='A path to the file containing pre-trained word embeddings') 15 | parser.add_argument('output', help='A file path to which to store the scaling results.') 16 | parser.add_argument('pivot1', help='First pivot') 17 | parser.add_argument('pivot2', help='Second pivot') 18 | parser.add_argument('--stopwords', help='A file to the path containing stopwords') 19 | 20 | args = parser.parse_args() 21 | 22 | if not 
os.path.isdir(os.path.dirname(args.datadir)): 23 | print("Error: Directory containing the input files not found.") 24 | exit(code = 1) 25 | 26 | if not os.path.isfile(args.embs): 27 | print("Error: File containing pre-trained word embeddings not found.") 28 | exit(code = 1) 29 | 30 | if not os.path.isdir(os.path.dirname(args.output)) and not os.path.dirname(args.output) == "": 31 | print("Error: Directory of the output file does not exist.") 32 | exit(code = 1) 33 | 34 | if not os.path.isdir(os.path.dirname(args.pivot1)) and not os.path.dirname(args.pivot1) == "": 35 | print("Error: pivot1 does not exist.") 36 | exit(code = 1) 37 | 38 | if not os.path.isdir(os.path.dirname(args.pivot2)) and not os.path.dirname(args.pivot2) == "": 39 | print("Error: pivot2 does not exist.") 40 | exit(code = 1) 41 | 42 | if args.stopwords and not os.path.isfile(args.stopwords): 43 | print("Error: File containing stopwords not found.") 44 | exit(code = 1) 45 | 46 | files = io_helper.load_all_files(args.datadir) 47 | if len(files) < 4: 48 | print("Error: There need to be at least 4 texts for a meaningful scaling.") 49 | exit(code = 1) 50 | 51 | filenames = [x[0] for x in files] 52 | texts = [x[1] for x in files] 53 | 54 | wrong_lang = False 55 | languages = [x.split("\n", 1)[0].strip().lower() for x in texts] 56 | texts = [x.split("\n", 1)[1].strip().lower() for x in texts] 57 | for i in range(len(languages)): 58 | if languages[i] not in supported_lang_strings.keys() and languages[i] not in supported_lang_strings.values(): 59 | print("The format of the file is incorrect, unspecified or unsupported language: " + str(filenames[i])) 60 | wrong_lang = True 61 | if wrong_lang: 62 | exit(code = 2) 63 | 64 | langs = [(l if l in supported_lang_strings.values() else supported_lang_strings[l]) for l in languages] 65 | 66 | if args.stopwords: 67 | stopwords = io_helper.load_lines(args.stopwords) 68 | else: 69 | stopwords = [] 70 | 71 | predictions_serialization_path = args.output 72 | 73 | pivot1 = args.pivot1 74 | pivot2 = args.pivot2 75 | 76 | embeddings = text_embeddings.Embeddings() 77 | embeddings.load_embeddings(args.embs, limit = 1000000, language = 'default', print_loading = True, skip_first_line = True) 78 | nlp.scale_supervised(filenames, texts, langs, embeddings, predictions_serialization_path,pivot1,pivot2, stopwords) 79 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Scaling completed.", flush = True) 80 | -------------------------------------------------------------------------------- /wfcode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codogogo/topfish/6b3f5723029616cb430d6226bc59c013fe79eb78/wfcode/__init__.py -------------------------------------------------------------------------------- /wfcode/corpus.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy as np 3 | import math 4 | from scipy import spatial 5 | import time 6 | from sys import stdin 7 | from datetime import datetime 8 | 9 | class Corpus(object): 10 | """description of class""" 11 | def __init__(self, documents, docpairs = None): 12 | print("Loading corpus, received: " + str(len(documents)) + " docs.") 13 | self.docs_raw = [d[1] for d in documents] 14 | self.docs_names = [d[0] for d in documents] 15 | self.punctuation = [".", ",", "!", ":", "?", ";", "-", ")", "(", "[", "]", "{", "}", "...", "/", "\\", u"``", "''", "\"", "'", "-", "$" ] 16 | self.doc_pairs = docpairs 17 | self.results = 
{} 18 | 19 | def tokenize(self, stopwords = None, freq_treshold = 5): 20 | self.stopwords = stopwords 21 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Preprocessing corpus...", flush = True) 22 | self.docs_tokens = [[tok.strip() for tok in nltk.word_tokenize(doc) if tok.strip() not in self.punctuation and len(tok.strip()) > 2] for doc in self.docs_raw] 23 | #self.docs_tokens = [[tok.strip() for tok in nltk.word_tokenize(doc)] for doc in self.docs_raw] 24 | 25 | 26 | self.freq_dicts = [] 27 | if self.stopwords is not None: 28 | for i in range(len(self.docs_tokens)): 29 | self.docs_tokens[i] = [tok.strip() for tok in self.docs_tokens[i] if tok.strip().lower() not in self.stopwords] 30 | 31 | def build_occurrences(self): 32 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Building vocabulary...", flush = True) 33 | self.vocabulary = {} 34 | for dt in self.docs_tokens: 35 | for t in dt: 36 | if t not in self.vocabulary: 37 | self.vocabulary[t] = len(self.vocabulary) 38 | 39 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Building coocurrence matrix...", flush = True) 40 | self.occurrences = np.ones((len(self.docs_tokens), len(self.vocabulary)), dtype = np.float32) 41 | cnt = 0 42 | for i in range(len(self.docs_tokens)): 43 | cnt += 1 44 | print(str(cnt) + "/" + str(len(self.docs_tokens))) 45 | for j in range(len(self.docs_tokens[i])): 46 | word = self.docs_tokens[i][j] 47 | self.occurrences[i][self.vocabulary[word]] += 1 48 | if np.isnan(self.occurrences).any(): 49 | raise ValueError("NaN in self.occurrences") 50 | 51 | def set_doc_positions(self, positions): 52 | for i in range(len(self.docs_names)): 53 | self.results[self.docs_names[i]] = positions[i] 54 | 55 | #def compute_semantic_similarities_aggregation(self, aggreg_sim_func, embeddings): 56 | # sims = [] 57 | # if self.doc_pairs is None: 58 | # for i in range(len(self.docs_names) - 1): 59 | # for j in range(i + 1, len(self.docs_names)): 60 | # score = aggreg_sim_func(self.freq_dicts[i], self.freq_dicts[j], embeddings, self.docs_langs[i], self.docs_langs[j]) 61 | # sims.append((self.docs_names[i], self.docs_names[j], score)) 62 | # else: 63 | # for dp in self.doc_pairs: 64 | # i = self.docs_names.index(dp[0]) 65 | # j = self.docs_names.index(dp[1]) 66 | # score = aggreg_sim_func(self.freq_dicts[i], self.freq_dicts[j], embeddings, self.docs_langs[i], self.docs_langs[j]) 67 | # sims.append((self.docs_names[i], self.docs_names[j], score)) 68 | 69 | # self.raw_sims = sims 70 | # print("\n Sorted semantic similarities, s1: ") 71 | # self.raw_sims.sort(key=lambda tup: tup[2]) 72 | # for s in self.raw_sims: 73 | # print(s[0], s[1], str(s[2])) 74 | 75 | # def compute_semantic_similarities(self, doc_similarity_function, sent_similarity_function, embedding_similarity_function): 76 | # sims = [] 77 | # #tasks = [] 78 | # start_time = time.time() 79 | # if self.doc_pairs is None: 80 | # for i in range(len(self.docs_names) - 1): 81 | # for j in range(i + 1, len(self.docs_names)): 82 | # #tasks.append((i, j)) 83 | # #print(self.docs_names[i], self.docs_names[j]) 84 | # score = doc_similarity_function(self.freq_dicts[i], self.freq_dicts[j], sent_similarity_function, embedding_similarity_function, self.docs_langs[i], self.docs_langs[j]) 85 | # #sst = SemSimThread(doc_similarity_function, sent_similarity_function, embedding_similarity_function, self.docs_tokens[i], self.docs_tokens[j], self.docs_langs[i], self.docs_langs[j], "Thread-" + self.docs_names[i] + "-" + self.docs_names[j], 1) 86 | # #sst.start() 87 | # #sst.join() 88 
| # sims.append((self.docs_names[i], self.docs_names[j], score[0], score[1])) 89 | # #print("Similarity: " + str(sst.result)) 90 | # else: 91 | # for dp in self.doc_pairs: 92 | # i = self.docs_names.index(dp[0]) 93 | # j = self.docs_names.index(dp[1]) 94 | # print("Measuring similarity: " + dp[0] + " :: " + dp[1]) 95 | # score = doc_similarity_function(self.freq_dicts[i], self.freq_dicts[j], sent_similarity_function, embedding_similarity_function, self.docs_langs[i], self.docs_langs[j]) 96 | # print("Score: " + str(score[0]) + "; " + str(score[1]) + "; " + str(score[2])) 97 | # sims.append((self.docs_names[i], self.docs_names[j], score)) 98 | 99 | # end_time = time.time() 100 | # print("Time elapsed: " + str(end_time-start_time)) 101 | 102 | # #num_parallel = 10 103 | # #num_batches = math.ceil((1.0*len(tasks)) / (1.0*num_parallel)) 104 | # #for i in range(num_batches): 105 | # # start_time = time.time() 106 | # # print("Batch: " + str(i+1) + "/" + str(num_batches)) 107 | # # start_range = i * num_parallel 108 | # # end_range = (i+1)*num_parallel if (i+1)*num_parallel < len(tasks) else len(tasks) 109 | # # threads = [SemSimThread(doc_similarity_function, sent_similarity_function, embedding_similarity_function, self.docs_tokens[task[0]], self.docs_tokens[task[1]], self.docs_langs[task[0]], self.docs_langs[task[1]], self.docs_names[task[0]], self.docs_names[task[1]]) for task in tasks[start_range:end_range]] 110 | # # for thr in threads: 111 | # # thr.start() 112 | # # for thr in threads: 113 | # # thr.join() 114 | # # print("Thread results: ") 115 | # # for thr in threads: 116 | # # print(thr.threadID + " " + str(thr.result)) 117 | # # sims.append((thr.first_name, thr.second_name, thr.result)) 118 | # # end_time = time.time() 119 | # # print("Time elapsed: " + str(end_time-start_time)) 120 | 121 | 122 | # #sim = parallel(delayed(doc_similarity_function)) doc_similarity_function(self.docs_tokens[i], self.docs_tokens[j], sent_similarity_function, embedding_similarity_function, self.docs_langs[i], self.docs_langs[j]) 123 | # #sims = [(self.docs_names[tasks[i][0]], self.docs_names[tasks[i][1]], threads[i].result) for i in range(len(tasks))] 124 | 125 | # #min_sim = min([x[2] for x in sims]) 126 | # #max_sim = max([x[2] for x in sims]) 127 | # self.raw_sims = sims 128 | # #self.similarities = [(x[0], x[1], (x[2] - min_sim)/(max_sim - min_sim)) for x in sims] 129 | 130 | # print("\n Sorted semantic similarities, s1: ") 131 | # #self.raw_sims.sort(key=lambda tup: tup[2][0]) 132 | # for s in self.raw_sims: 133 | # print(s[0], s[1], str(s[2][0]), str(s[2][1]), str(s[2][2])) 134 | 135 | # def compute_term_similarities(self): 136 | # sims = [] 137 | # for i in range(len(self.docs_names) - 1): 138 | # for j in range(i + 1, len(self.docs_names)): 139 | # print(self.docs_names[i], self.docs_names[j]) 140 | # sim = 1 - spatial.distance.cosine(self.tf_idf_vectors[i], self.tf_idf_vectors[j]) 141 | # print("Term-based similarity: " + str(sim)) 142 | # sims.append((self.docs_names[i], self.docs_names[j], sim)) 143 | # min_sim = min([x[2] for x in sims]) 144 | # max_sim = max([x[2] for x in sims]) 145 | # self.raw_sims = sims 146 | # self.similarities = [(x[0], x[1], (x[2] - min_sim)/(max_sim - min_sim)) for x in sims] 147 | 148 | # print("\n Sorted tf-idf similarities: ") 149 | # self.raw_sims.sort(key=lambda tup: tup[2]) 150 | # for s in self.raw_sims: 151 | # print(s[0], s[1], str(s[2])) 152 | 153 | #def most_dissimilar_vector(nodes, edges): 154 | # vec = [] 155 | # min_score = min([x[2] for x in edges]) 
156 | # min_pair = ([x for x in edges if x[2] == min_score])[0] 157 | # first_added = False 158 | # for i in range(len(nodes)): 159 | # if nodes[i] == min_pair[0] or nodes[i] == min_pair[1]: 160 | # vec.append(-1 if first_added else 1) 161 | # if not first_added: 162 | # first_added = True 163 | # else: 164 | # vec.append(0) 165 | # return vec 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /wfcode/scaler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | 4 | class LinearScaler(object): 5 | def __init__(self, items, sims): 6 | self.items = items 7 | self.sims = sims 8 | 9 | def scale(self): 10 | minsim = min([x[2] for x in self.sims]) 11 | minsim_edge = ([x for x in self.sims if x[2] == minsim])[0] 12 | ind_first = self.items.index(minsim_edge[0]) 13 | ind_second = self.items.index(minsim_edge[1]) 14 | 15 | # linear interpolation scaling 16 | scales = {} 17 | for i in range(len(self.items)): 18 | if i != ind_first and i != ind_second: 19 | sim_minus = ([x[2] for x in self.sims if (x[0] == minsim_edge[1] and x[1] == self.items[i]) or (x[1] == minsim_edge[1] and x[0] == self.items[i])])[0] 20 | sim_plus = ([x[2] for x in self.sims if (x[0] == minsim_edge[0] and x[1] == self.items[i]) or (x[1] == minsim_edge[0] and x[0] == self.items[i])])[0] 21 | scales[self.items[i]] = (-1 * sim_minus) / (sim_minus + sim_plus) + sim_plus / (sim_minus + sim_plus) 22 | elif i == ind_first: 23 | scales[self.items[i]] = 1 24 | elif i == ind_second: 25 | scales[self.items[i]] = -1 26 | return scales 27 | 28 | class WordfishScaler(object): 29 | """implementation of a WordFish-like scaling""" 30 | def __init__(self, corpus): 31 | self.corpus = corpus 32 | self.num_docs = len(self.corpus.docs_raw) 33 | self.num_words = len(self.corpus.vocabulary) 34 | 35 | self.alpha_docs = np.zeros(self.num_docs) 36 | self.theta_docs = np.zeros(self.num_docs) 37 | self.beta_words = np.zeros(self.num_words) 38 | self.psi_words = np.zeros(self.num_words) 39 | self.log_expectations = np.zeros((self.num_docs, self.num_words)) 40 | 41 | def initialize(self): 42 | print("Initializing...") 43 | # Setting initial values for word fixed effects (psi) 44 | self.psi_words = np.log(np.average(self.corpus.occurrences, axis = 0)) 45 | 46 | # Setting initial values for document fixed effects (Alphas) 47 | counts = np.sum(self.corpus.occurrences, axis = 1) 48 | self.alpha_docs = np.log(np.multiply(counts, 1.0 / counts[0])) 49 | print("Alpha docs: ") 50 | print(self.alpha_docs) 51 | 52 | # Setting initial values for betas and omegas 53 | matrix = np.log(np.transpose(self.corpus.occurrences)) - np.transpose(np.repeat(np.expand_dims(self.psi_words, 0), self.num_docs, axis = 0)) - np.repeat(np.expand_dims(self.alpha_docs, 0), self.num_words, axis = 0) 54 | u, s, v = np.linalg.svd(matrix, full_matrices = False, compute_uv = True) 55 | self.beta_words = u[:,0] 56 | self.theta_docs = v[0,:] 57 | 58 | def normalize_positions(self): 59 | self.alpha_docs[0] = 0 60 | self.theta_docs = np.divide((self.theta_docs - np.full((1, self.num_docs), np.mean(self.theta_docs))), np.full((1, self.num_docs), np.std(self.theta_docs))) 61 | self.theta_docs = self.theta_docs[0] 62 | 63 | def train(self, learning_rate, num_iters): 64 | print("Training...") 65 | # Computing the objective and also refreshing lambdas (log-likelihoods) for all pairs of word-document 66 | self.normalize_positions() 67 | obj_score = 
self.objective() 68 | print("Initial objective score: " + str(obj_score)) 69 | 70 | for i in range(num_iters): 71 | # Updating document parameters 72 | alpha_grads, theta_grads = self.gradients_docs() 73 | self.alpha_docs = self.alpha_docs - np.multiply(alpha_grads, learning_rate / self.num_words) 74 | self.theta_docs = self.theta_docs - np.multiply(theta_grads, learning_rate / self.num_words) 75 | 76 | self.normalize_positions() 77 | 78 | #obj_score = self.objective() 79 | #if i % 100 == 0: print("Iteration (primary) " + str(i+1) + ": " + str(obj_score)) 80 | 81 | # Updating word parameters 82 | beta_grads, psi_grads = self.gradients_words() 83 | self.beta_words = self.beta_words - np.multiply(beta_grads, learning_rate / self.num_docs) 84 | self.psi_words = self.psi_words - np.multiply(psi_grads, learning_rate / self.num_docs) 85 | 86 | obj_score = self.objective() 87 | if i % 100 == 0: 88 | print("Iteration (secondary) " + str(i+1) + ": " + str(obj_score)) 89 | 90 | self.normalize_positions() 91 | self.corpus.set_doc_positions(self.theta_docs) 92 | 93 | def objective(self): 94 | self.log_expectations = self.log_expectation() 95 | return -1 * np.sum(np.multiply(self.corpus.occurrences, self.log_expectations) - np.exp(self.log_expectations)) 96 | 97 | def log_expectation(self): 98 | return np.transpose(np.repeat(np.expand_dims(self.alpha_docs, 0), self.num_words, axis = 0)) + np.repeat(np.expand_dims(self.psi_words, 0), self.num_docs, axis = 0) + np.outer(self.theta_docs, self.beta_words) 99 | 100 | def gradients_words(self): 101 | psi_grads = np.sum(np.exp(self.log_expectations) - self.corpus.occurrences, axis = 0) 102 | beta_grads = np.sum(np.multiply(np.exp(self.log_expectations) - self.corpus.occurrences, np.transpose(np.repeat(np.expand_dims(self.theta_docs, 0), self.num_words, axis = 0))), axis = 0) 103 | return [beta_grads, psi_grads] 104 | 105 | def gradients_docs(self): 106 | alpha_grads = np.sum(np.exp(self.log_expectations) - self.corpus.occurrences, axis = 1) 107 | theta_grads = np.sum(np.multiply(np.exp(self.log_expectations) - self.corpus.occurrences, np.repeat(np.expand_dims(self.beta_words, 0), self.num_docs, axis = 0)), axis = 1) 108 | return [alpha_grads, theta_grads] -------------------------------------------------------------------------------- /wordfish.py: -------------------------------------------------------------------------------- 1 | from helpers import io_helper 2 | from wfcode import corpus 3 | from wfcode import scaler 4 | import argparse 5 | import os 6 | from datetime import datetime 7 | 8 | parser = argparse.ArgumentParser(description='Trains a model for classifying lexico-semantic relations.') 9 | parser.add_argument('datadir', help='A path to the directory containing the input text files for scaling (one score will be assigned per file).') 10 | parser.add_argument('output', help='A file path to which to store the scaling results.') 11 | parser.add_argument('--stopwords', help='A file to the path containing stopwords') 12 | parser.add_argument('-f', '--freqthold', type=int, help='A frequency threshold -- all words appearing less than -ft times will be ignored (default 2)') 13 | parser.add_argument('-l', '--learnrate', type=float, help='Learning rate value (default = 0.00001)') 14 | parser.add_argument('-t', '--trainiters', type=int, help='Number of optimization iterations (default = 5000)') 15 | 16 | args = parser.parse_args() 17 | 18 | if args.trainiters: 19 | niter = args.trainiters 20 | else: 21 | niter = 5000 22 | 23 | if args.learnrate: 24 | lr = 
args.learnrate 25 | else: 26 | lr = 0.00001 27 | 28 | if args.freqthold: 29 | ft = args.freqthold 30 | else: 31 | ft = 2 32 | 33 | if not os.path.isdir(os.path.dirname(args.datadir)): 34 | print("Error: Directory containing the input files not found.") 35 | exit(code = 1) 36 | 37 | if not os.path.isdir(os.path.dirname(args.output)) and not os.path.dirname(args.output) == "": 38 | print("Error: Directory of the output file does not exist.") 39 | exit(code = 1) 40 | 41 | if args.stopwords and not os.path.isfile(args.stopwords): 42 | print("Error: File containing stopwords not found.") 43 | exit(code = 1) 44 | 45 | if args.stopwords: 46 | stopwords = io_helper.load_file_lines(args.stopwords) 47 | else: 48 | stopwords = None 49 | 50 | files = io_helper.load_all_files(args.datadir) 51 | corp = corpus.Corpus(files) 52 | corp.tokenize(stopwords = stopwords, freq_treshold = ft) 53 | corp.build_occurrences() 54 | 55 | wf_scaler = scaler.WordfishScaler(corp) 56 | wf_scaler.initialize() 57 | wf_scaler.train(learning_rate = lr, num_iters = niter) 58 | 59 | print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " WordFish scaling completed.", flush = True) 60 | 61 | scale = [] 62 | for x in corp.results: 63 | scale.append(str(x) + "\t" + str(corp.results[x])) 64 | io_helper.write_list(args.output, scale) 65 | --------------------------------------------------------------------------------
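For reference, a typical invocation of the Wordfish script above, with its default hyper-parameter values written out explicitly (the folder and file names are placeholders):

``
python wordfish.py path-to-input-folder output.txt --stopwords stopwords.txt -f 2 -l 0.00001 -t 5000
``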
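To make the model behind `WordfishScaler` easier to follow: it fits a Poisson model in which the expected count of word j in document i is exp(alpha_i + psi_j + theta_i * beta_j), and `objective()` returns the negative Poisson log-likelihood (up to the constant log(y!) term). Below is a minimal NumPy sketch of that computation on toy data; the toy count matrix and the chosen parameter values are illustrative only and are not taken from the repository.

```python
import numpy as np

# Toy document-by-word count matrix (Corpus.build_occurrences starts from ones,
# so real counts are always >= 1).
occurrences = np.array([[4., 2., 1.],
                        [2., 5., 3.],
                        [1., 3., 6.]])
num_docs, num_words = occurrences.shape

alpha = np.zeros(num_docs)              # document fixed effects
psi = np.log(occurrences.mean(axis=0))  # word fixed effects (initialized as in WordfishScaler.initialize)
theta = np.array([-1., 0., 1.])         # document positions (the quantity Wordfish estimates)
beta = np.zeros(num_words)              # word discrimination parameters

# log lambda_ij = alpha_i + psi_j + theta_i * beta_j  (the same quantity as log_expectation)
log_lam = alpha[:, None] + psi[None, :] + np.outer(theta, beta)

# Negative Poisson log-likelihood, dropping the log(y_ij!) constant; matches objective()
neg_ll = -np.sum(occurrences * log_lam - np.exp(log_lam))
print(neg_ll)
```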