├── features
│   ├── __init__.py
│   └── features.py
├── preproc
│   ├── __init__.py
│   ├── error_ana.py
│   ├── log_utils.py
│   ├── plot_utils.py
│   ├── fnc_data_splits.py
│   ├── batch.py
│   ├── data_reader.py
│   ├── vocab.py
│   └── map.py
├── constants.py
├── README.md
├── data
│   └── download_data.sh
├── mtl
│   ├── tensoriser.py
│   ├── nn.py
│   └── training.py
└── main.py
/features/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/preproc/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/constants.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Constants shared across modules.
 3 | """
 4 | 
 5 | STANCE = "semeval2016-task6-stance"
 6 | FNC = "fakenewschallenge"
 7 | NLI = "multinli"
 8 | TOPIC = "topic-based"
 9 | TOPIC_5WAY = "topic-based-5way"
10 | LAPTOP = "absa-laptops"
11 | RESTAURANT = "absa-restaurants"
12 | TARGET = "target-dependent"
13 | TASKS = [STANCE, FNC, NLI, TOPIC, TOPIC_5WAY, LAPTOP, RESTAURANT, TARGET]
14 | RNN_CELL_TYPES = ["lstm", "phased_lstm", "layer_norm", "nas"] # LSTM, plus the RNN cell types in TensorFlow that are interchangeable with it
15 | 
16 | TASK_NAMES_SHORT = {"semeval2016-task6-stance": "STANCE", "fakenewschallenge": "FNC", "topic-based": "TOPIC", "multinli": "NLI",
17 |                     "topic-based-5way": "TOPIC_5WAY", "absa-laptops": "LAPTOP", "absa-restaurants": "RESTAURANT", "target-dependent": "TARGET"}
18 | 
19 | STANCE_LABELS = ['AGAINST', 'FAVOR', 'NONE']
20 | FNC_LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
21 | NLI_LABELS = ['contradiction', 'entailment', 'neutral']
22 | TOPIC_LABELS = ['negative', 'positive']
23 | TOPIC_5WAY_LABELS = [-2.0, -1.0, 0.0, 1.0, 2.0]
24 | ABSA_LABELS = ['negative', 'neutral', 'positive']
25 | TARGET_LABELS = ['-1', '0', '1']
26 | 
27 | SIM = 'similarity'
28 | DIV = 'diversity'
29 | NONE = 'predsonly'
30 | SIMILARITY_FEATURES = ['jensen-shannon', 'renyi', 'cosine', 'euclidean',
31 |                        'variational', 'bhattacharyya']
32 | DIVERSITY_FEATURES = ['num_word_types', 'type_token_ratio', 'entropy',
33 |                       'simpsons_index', 'renyi_entropy']
34 | # we don't use 'quadratic_entropy' at the moment, as it requires word vectors
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mtl-disparate
2 | Code for NAACL 2018 paper ["Multi-task Learning of Pairwise Sequence Classification Tasks Over Disparate Label Spaces"](https://arxiv.org/abs/1802.09913) by Isabelle Augenstein, Sebastian Ruder, Anders Søgaard
3 | 
4 | Note that this is research code and will not be maintained to, e.g., ensure compatibility with more recent library versions.
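
For orientation, an end-to-end run might look as follows. The working directories and flag values are illustrative only (the defaults shown already match the argparse defaults in `main.py`); see "Steps to run" below and `python main.py --help` for the full set of options:

```bash
# download and extract the datasets (run from inside data/ so they are placed under data/)
cd data && bash download_data.sh && cd ..
# split the FNC training data into a train and a dev set (the script uses paths relative to preproc/)
cd preproc && python fnc_data_splits.py && cd ..
# train, e.g., the label-transfer model with ABSA restaurants as the main task
python main.py --model_type label-transfer --main_task absa-restaurants
```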
5 | 6 | 7 | Requirements: 8 | 9 | - Tensorflow 1.5 10 | - Numpy 1.12.1 11 | - sklearn 0.18.1 12 | - scipy 13 | 14 | Steps to run: 15 | 16 | - run data/download_data.sh to download and extract data 17 | - preproc/data_reader.py tests if all the data readers work 18 | - preproc/fnc_data_splits.py to split the FNC training dataset into a training and dev set 19 | - main.py trains models 20 | 21 | # Datasets 22 | 23 | ## SemEval 2016 Task 6 Stance detection 24 | 25 | - [Task website](http://alt.qcri.org/semeval2016/task6/) 26 | 27 | ## Fake News Challenge (FNC) 28 | 29 | - [Task website](http://www.fakenewschallenge.org/) 30 | 31 | ## Multi-NLI 32 | 33 | - [Task website](http://www.nyu.edu/projects/bowman/multinli/) 34 | 35 | ## SemEval 2016 Task 4 Subtask B Topic-based Twitter sentiment analysis 36 | 37 | - [Task website](http://alt.qcri.org/semeval2016/task4/) 38 | - [Task description paper](https://aclweb.org/anthology/S/S16/S16-1001.pdf) 39 | - Note: Same dataset was used as rerun in [2017](http://alt.qcri.org/semeval2017/task4/) 40 | 41 | ## SemEval 2016 Task 5 Subtask 1 Slot 3 Aspect-based sentiment analysis 42 | 43 | - [Task website](http://alt.qcri.org/semeval2016/task5/) 44 | 45 | ## Clickbait Challenge 2017 46 | 47 | - [Task website](http://www.clickbait-challenge.org/) -------------------------------------------------------------------------------- /preproc/error_ana.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | 4 | def count_overlap(file): 5 | with open(file, "r") as indsf: 6 | indsmap = defaultdict(dict) 7 | for l in indsf: 8 | if len(l.split("\t")) == 3: 9 | task, model, inds = l.strip("\n").split("\t") 10 | indsmap[task][model] = inds.split(" ") 11 | else: 12 | task, model, iter, inds = l.strip("\n").split("\t") 13 | indsmap[task + "_" + iter][model] = inds.split(" ") 14 | for task, entries in indsmap.items(): 15 | main_correct = 0.0 16 | relabel_correct = 0.0 17 | both_correct = 0.0 18 | both_incorrect = 0.0 19 | len_gold = len(indsmap[task]["Gold"]) 20 | all = float(len_gold) 21 | for i in range(0, len_gold): 22 | if (indsmap[task]["Gold"][i] == indsmap[task]["Relabel model"][i]) and (indsmap[task]["Relabel model"][i] == indsmap[task]["Main model"][i]): 23 | both_correct += 1 24 | elif (indsmap[task]["Relabel model"][i] == indsmap[task]["Main model"][i]) and (indsmap[task]["Main model"][i] != indsmap[task]["Gold"][i]): 25 | both_incorrect += 1 26 | elif indsmap[task]["Gold"][i] == indsmap[task]["Relabel model"][i]: 27 | relabel_correct += 1 28 | else: 29 | main_correct += 1 30 | rate_both_correct = (both_correct/all) 31 | rate_both_incorect = (both_incorrect / all) 32 | rate_relab_correct = (relabel_correct / all) 33 | rate_main_correct = (main_correct / all) 34 | prop_main = rate_main_correct / (rate_both_correct + rate_relab_correct + rate_main_correct) 35 | prop_relab = rate_relab_correct / (rate_both_correct + rate_relab_correct + rate_main_correct) 36 | print(task, "Rate both correct", str(rate_both_correct)) 37 | print(task, "Rate both incorrect", str(rate_both_incorect)) 38 | print(task, "Rate only relabel correct", str(rate_relab_correct)) 39 | print(task, "Rate only main correct", str(rate_main_correct)) 40 | print(task, "Prop main", str(prop_main * 100)) 41 | print(task, "Prop relab", str(prop_relab * 100)) 42 | 43 | 44 | if __name__ == "__main__": 45 | #reformat_log_tabs() 46 | dirpath = "../" 47 | files = os.listdir(dirpath) 48 | for f in files: 49 | if 
f.endswith("_inds.txt"): 50 | if not "learningcurve" in f: 51 | continue 52 | if not "label-transfer" in f: 53 | continue 54 | if not "multi" in f: 55 | continue 56 | print("Reading file", f) 57 | count_overlap(os.path.join(dirpath, f)) 58 | print("") -------------------------------------------------------------------------------- /data/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Download the SemEval 2016 Task 6 Stance detection dataset 4 | mkdir semeval2016-task6-stance ; cd semeval2016-task6-stance 5 | wget http://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip 6 | wget http://alt.qcri.org/semeval2016/task6/data/uploads/semeval2016-task6-trialdata.txt 7 | curl -L "https://drive.google.com/uc?export=download&id=0B2Z1kbILu3YtenFDUzM5dGZEX2s" > downloaded_Donald_Trump.txt 8 | unzip stancedataset.zip -d . ; mv StanceDataset/* . 9 | rm stancedataset.zip ; rm -r StanceDataset __MACOSX 10 | cd .. 11 | 12 | # Download the Fake News Challenge datset 13 | mkdir fakenewschallenge ; cd fakenewschallenge 14 | wget https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/competition_test_stances.csv 15 | wget https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/competition_test_bodies.csv 16 | wget https://github.com/FakeNewsChallenge/fnc-1/archive/master.zip 17 | unzip master.zip -d . ; mv fnc-1-master/* . 18 | rm -r fnc-1-master ; rm master.zip 19 | cd .. 20 | 21 | # Download the Multi-NLI dataset 22 | mkdir multinli ; cd multinli 23 | wget http://www.nyu.edu/projects/bowman/multinli/multinli_0.9.zip 24 | unzip multinli_0.9.zip -d . ; mv multinli_0.9/* . 25 | rm multinli_0.9.zip ; rm -r multinli_0.9 26 | cd .. 27 | 28 | # Download the SemEval 2016 Task 4 Subtask B Topic-based Twitter sentiment analysis dataset 29 | mkdir semeval2016-task4b-topic-based-sentiment ; cd semeval2016-task4b-topic-based-sentiment 30 | curl -L "https://drive.google.com/uc?export=download&id=0B3emjZ5O5vDtSGpKcjQ3cnhldmc" > semeval2016_task4b_topic-based_sentiment.zip 31 | unzip semeval2016_task4b_topic-based_sentiment.zip -d . 32 | rm semeval2016_task4b_topic-based_sentiment.zip 33 | cd .. 34 | 35 | # Download the SemEval 2016 Task 4 Subtask C Topic-based 5-way Twitter sentiment analysis dataset 36 | mkdir semeval2016-task4c-topic-based-sentiment ; cd semeval2016-task4c-topic-based-sentiment 37 | curl -L "https://drive.google.com/uc?export=download&id=1eS67x5vedrzVVk-tcyKSrumigbJKuqH-" > semeval2016_task4c_topic-based_sentiment.zip 38 | unzip semeval2016_task4c_topic-based_sentiment.zip -d . 39 | rm semeval2016_task4c_topic-based_sentiment.zip 40 | cd .. 41 | 42 | # Download the SemEval 2016 Task 5 Aspect-based sentiment analysis dataset 43 | mkdir semeval2016-task5-absa-english ; cd semeval2016-task5-absa-english 44 | curl -L "https://drive.google.com/uc?export=download&id=0B3emjZ5O5vDtbTJnUHRIdFBULTg" > semeval2016_task5_absa_english.zip 45 | unzip semeval2016_task5_absa_english.zip -d . 46 | rm semeval2016_task5_absa_english.zip 47 | cd .. 48 | 49 | # Download the target-dependent sentiment analysis dataset of Dong et al. (2014): 50 | # Adaptive Recursive Neural Network for Target-dependent Twitter Sentiment Classification 51 | mkdir target-dependent ; cd target-dependent 52 | curl -L "https://drive.google.com/uc?export=download&id=0B3emjZ5O5vDtTW1SZjItWFlxUUU" > target_dependent.zip 53 | unzip target_dependent.zip -d . 54 | rm target_dependent.zip 55 | cd .. 
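# Optional sanity check (not part of the original pipeline): after the downloads
# above, each dataset directory created by this script should exist and be
# non-empty. Uncomment to verify:
# ls semeval2016-task6-stance fakenewschallenge multinli \
#    semeval2016-task4b-topic-based-sentiment semeval2016-task4c-topic-based-sentiment \
#    semeval2016-task5-absa-english target-dependent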
56 | -------------------------------------------------------------------------------- /preproc/log_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility methods for logging and analyzing results. 3 | """ 4 | 5 | from collections import defaultdict 6 | from datetime import datetime 7 | import numpy as np 8 | 9 | from sklearn.metrics import recall_score, mean_absolute_error, f1_score,\ 10 | accuracy_score 11 | 12 | from constants import * 13 | import os 14 | 15 | 16 | FORMAT = '%Y-%m-%d-%H%M%S' 17 | 18 | RECALL = 'recall' 19 | MAE = 'mae' 20 | TOPIC_BASED_SCORES = [RECALL, MAE] 21 | 22 | def stance_postproc_init(vocab): 23 | inds = [] 24 | for id, tok in vocab.id2sym.items(): 25 | if "trump" in tok or "donald" in tok: 26 | inds.append(id) 27 | return inds 28 | 29 | def postproc_stance(inds, placeholders, batch, p): 30 | for i, b in enumerate(batch[placeholders["seq1"]]): 31 | in_ind = False 32 | for ind in inds: 33 | if ind in batch[placeholders["seq1"]][i]: 34 | in_ind = True 35 | break 36 | # labels are always: AGAINST, FAVOR, NONE 37 | if in_ind: 38 | if p[i][0] > p[i][1]: 39 | p[i][0] = 1.0 40 | else: 41 | p[i][1] = 1.0 42 | return p 43 | 44 | 45 | def task2score(task, y_true, y_pred, topics): 46 | if task == STANCE: 47 | return macro_averaged_pos_neg_f1_score(y_true, y_pred) 48 | if task == TOPIC: 49 | return topic_based_macro_averaged_score(y_true, y_pred, topics, RECALL) 50 | if task == TOPIC_5WAY: 51 | return topic_based_macro_averaged_score(y_true, y_pred, topics, MAE) 52 | if task in [LAPTOP, RESTAURANT]: 53 | return accuracy_score(y_true, y_pred) 54 | if task in [TARGET]: 55 | return f1_score(y_true, y_pred, average='macro') 56 | return f1_score(y_true, y_pred, average='micro') 57 | 58 | 59 | def macro_averaged_pos_neg_f1_score(y_true, y_pred): 60 | """Compute the macro-average of the favor and against F1 scores for stance 61 | detection.""" 62 | # order of labels is AGAINST, FAVOR, NONE 63 | f1_scores = f1_score(y_true, y_pred, average=None) 64 | return np.mean([f1_scores[0], f1_scores[1]]) 65 | 66 | 67 | def topic_based_macro_averaged_score(y_true, y_pred, topics, score): 68 | """ 69 | Compute score macro-averaged across topics. Score is macro-averaged recall 70 | for subtask B and mean absolute error for subtask C. 71 | """ 72 | assert score in TOPIC_BASED_SCORES, 'Error: %s is not valid.' 
% score 73 | scores = [] 74 | topic2y_true = defaultdict(list) 75 | topic2y_pred = defaultdict(list) 76 | 77 | # aggregate the labels and predictions for each topic 78 | for y_t, y_p, topic in zip(y_true, y_pred, topics): 79 | topic2y_true[str(topic)].append(y_t) 80 | topic2y_pred[str(topic)].append(y_p) 81 | 82 | for topic in topic2y_true.keys(): 83 | y_true_topic = topic2y_true[str(topic)] 84 | y_pred_topic = topic2y_pred[str(topic)] 85 | if score == RECALL: 86 | score_value = recall_score(y_true_topic, y_pred_topic, average='macro') 87 | else: 88 | # for MAE, we have to manually perform macro-averaging 89 | # labels are 0-4 and correspond to original labels -2,-1,0,1,2 90 | temp_scores = [] 91 | for label_id in range(5): 92 | true_pred_pairs = [(y_t, y_p) for y_t, y_p in 93 | zip(y_true_topic, y_pred_topic) 94 | if y_t == label_id] 95 | if len(true_pred_pairs) == 0: 96 | # some topics do not appear with a certain label 97 | continue 98 | y_true_temp, y_pred_temp = zip(*true_pred_pairs) 99 | temp_scores.append(mean_absolute_error(y_true_temp, y_pred_temp)) 100 | score_value = np.mean(temp_scores) 101 | scores.append(score_value) 102 | return np.mean(scores) 103 | 104 | 105 | def log_results(options, task_score, f1_score, relabel_score, task): 106 | """ 107 | Log the results to a file. 108 | :param options: the options used as input to the script 109 | :param task_score: the task-specific score achieved on the test set 110 | :param f1_score: the micro-averaged f1 score achieved on the test set 111 | :param relabel_score: the score achieved by the relabeling function on the 112 | test set 113 | :param task: the task the model was evaluated on 114 | """ 115 | with open(options['log_file'], 'a') as f: 116 | print('Writing results to %s...' % options['log_file']) 117 | f.write('%s\t%s\t%.4f\t%.4f\t%.4f\t%s\n' % 118 | (datetime.now().strftime(FORMAT), task, task_score, f1_score, 119 | relabel_score, ' '.join(['%s=%s' % (opt, options[opt]) 120 | for opt in options.keys()]))) 121 | -------------------------------------------------------------------------------- /preproc/plot_utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import matplotlib.patches as mpatches 5 | from sklearn.manifold import TSNE 6 | from sklearn.decomposition import PCA 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from constants import FNC, STANCE, NLI, TOPIC, LAPTOP, RESTAURANT, TARGET,\ 11 | TOPIC_5WAY, STANCE_LABELS, FNC_LABELS, NLI_LABELS, TOPIC_LABELS, \ 12 | TOPIC_5WAY_LABELS, ABSA_LABELS, TARGET_LABELS 13 | 14 | 15 | def task2labels(task): 16 | if task == STANCE: 17 | return STANCE_LABELS 18 | if task == FNC: 19 | return FNC_LABELS 20 | if task == NLI: 21 | return NLI_LABELS 22 | if task == TOPIC: 23 | return TOPIC_LABELS 24 | if task == TOPIC_5WAY: 25 | return TOPIC_5WAY_LABELS 26 | if task in [LAPTOP, RESTAURANT]: 27 | return ABSA_LABELS 28 | if task == TARGET: 29 | return TARGET_LABELS 30 | raise ValueError('No labels available for task %s.' 
% task) 31 | 32 | 33 | def task2display_name(task): 34 | if task == STANCE: 35 | return 'Stance' 36 | if task == FNC: 37 | return 'FNC-1' 38 | if task == NLI: 39 | return 'MultiNLI' 40 | if task == TOPIC: 41 | return 'Topic-2' 42 | if task == TOPIC_5WAY: 43 | return 'Topic-5' 44 | if task == LAPTOP: 45 | return 'ABSA-L' 46 | if task == RESTAURANT: 47 | return 'ABSA-R' 48 | if task == TARGET: 49 | return 'Target' 50 | raise ValueError('%s is not a valid task.' % task) 51 | 52 | 53 | def task2color(task): 54 | if task == TOPIC: 55 | return 'forestgreen' 56 | if task == TOPIC_5WAY: 57 | return 'yellowgreen' 58 | if task == LAPTOP: 59 | return 'cornflowerblue' 60 | if task == RESTAURANT: 61 | return 'mediumblue' 62 | if task == STANCE: 63 | return 'midnightblue' 64 | if task == TARGET: 65 | return 'saddlebrown' 66 | if task == FNC: 67 | return 'darkgoldenrod' 68 | if task == NLI: 69 | return 'slategray' 70 | raise ValueError('%s is not available.' % task) 71 | 72 | 73 | def label2display_name(label): 74 | if label in ['AGAINST', 'FAVOR', 'NONE']: 75 | return label.lower() 76 | try: 77 | label = float(label) 78 | if label == 0: 79 | return 'neutral' 80 | if label == -1: 81 | return 'negative' 82 | if label == -2: 83 | return 'highly negative' 84 | if label == 1: 85 | return 'positive' 86 | if label == 2: 87 | return 'highly positive' 88 | except: 89 | return label 90 | return label 91 | 92 | 93 | def plot_label_embeddings(sess, tasks, label_vocab): 94 | var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "label_embeddings/label_embeddings") 95 | assert len(var_list) > 0, 'Error: Label embeddings have not been saved.' 96 | assert len(var_list) == 1 97 | 98 | label_embeddings = sess.run(var_list[0]) 99 | print('Loaded label embeddings of shape:', label_embeddings.shape) 100 | 101 | assert label_vocab is not None 102 | 103 | # remove the UNK label of the label embeddings 104 | label_embeddings = label_embeddings[1:, :] 105 | 106 | colors = ['red', 'blue', 'green', 'purple', 'orange', 'olive', 'cyan', 'brown'] 107 | 108 | # tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=3000) 109 | pca = PCA(n_components=2) 110 | 111 | label_embeddings_tsne = pca.fit_transform(label_embeddings) 112 | label_names = [] 113 | task_names = [] 114 | for i, task in enumerate(tasks): 115 | task_labels = task2labels(task) 116 | label_names += task_labels 117 | task_names += [task] * len(task_labels) 118 | # as a sanity check, make sure that the labels correspond with those in the 119 | # label vocab; +1 because the labels start at 1 (0 is UNK) 120 | for i in range(label_embeddings.shape[0]): 121 | label_id = "%s_%s" % (task_names[i], str(label_names[i])) 122 | # print(i+1, label_id, label_vocab.sym2id[label_id]) 123 | assert i+1 == label_vocab.sym2id[label_id],\ 124 | 'Error: Id %d != label id %d for %s.' 
% (i+1, label_id, task_names[i]) 125 | 126 | file_name = 'label_embeddings.png' 127 | plot_embedding(label_embeddings_tsne, label_names, task_names, file_name=file_name) 128 | 129 | 130 | def plot_embedding(X, y, tasks, title=None, file_name=None): 131 | """Plot an embedding X with the label y colored by colors.""" 132 | x_min, x_max = np.min(X, 0), np.max(X, 0) 133 | X = (X - x_min) / (x_max - x_min) 134 | 135 | # we can increase the resolution by increasing the figure size 136 | plt.figure(figsize=(5,5)) 137 | ax = plt.subplot(111) 138 | for i in range(X.shape[0]): 139 | if tasks[i] == STANCE: 140 | # skip stance and plot later 141 | continue 142 | plt.text(X[i, 0], X[i, 1], label2display_name(str(y[i])), 143 | color=task2color(tasks[i]), 144 | fontdict={'weight': 'bold', 'size': 9}) 145 | 146 | for i in range(X.shape[0]): 147 | if tasks[i] == STANCE: 148 | plt.text(X[i, 0], X[i, 1], label2display_name(str(y[i])), 149 | color=task2color(tasks[i]), 150 | fontdict={'weight': 'bold', 'size': 9}) 151 | 152 | # create patches for the legend 153 | patches = [] 154 | for task in sorted(list(set(tasks))): 155 | patches.append(mpatches.Patch(color=task2color(task), label=task2display_name(task))) 156 | lgd = plt.legend(handles=patches, loc='upper left', bbox_to_anchor=(1, 1), 157 | edgecolor='black') 158 | 159 | # plt.xticks([]), plt.yticks([]) 160 | if title is not None: 161 | plt.title(title) 162 | # plt.show() 163 | plt.savefig(file_name, bbox_extra_artists=(lgd,), bbox_inches='tight') 164 | -------------------------------------------------------------------------------- /preproc/fnc_data_splits.py: -------------------------------------------------------------------------------- 1 | import random 2 | from csv import DictReader 3 | from csv import DictWriter 4 | 5 | # Define data class 6 | class FNCData: 7 | 8 | """ 9 | Define class for Fake News Challenge data 10 | """ 11 | 12 | def __init__(self, file_instances, file_bodies): 13 | 14 | # Load data 15 | self.instances = self.read(file_instances) 16 | bodies = self.read(file_bodies) 17 | self.heads = {} 18 | self.bodies = {} 19 | 20 | # Process instances 21 | for instance in self.instances: 22 | if instance['Headline'] not in self.heads: 23 | head_id = len(self.heads) 24 | self.heads[instance['Headline']] = head_id 25 | instance['Body ID'] = int(instance['Body ID']) 26 | 27 | # Process bodies 28 | for body in bodies: 29 | self.bodies[int(body['Body ID'])] = body['articleBody'] 30 | 31 | def read(self, filename): 32 | 33 | """ 34 | Read Fake News Challenge data from CSV file 35 | Args: 36 | filename: str, filename + extension 37 | Returns: 38 | rows: list, of dict per instance 39 | """ 40 | 41 | # Initialise 42 | rows = [] 43 | 44 | # Process file 45 | with open(filename, "r", encoding='utf-8') as table: 46 | r = DictReader(table) 47 | for line in r: 48 | rows.append(line) 49 | 50 | return rows 51 | 52 | 53 | def split_seen(data, rand=False, prop_dev=0.2, rnd_sd=1489215): 54 | 55 | """ 56 | 57 | Split data into separate sets with overlapping headlines 58 | 59 | Args: 60 | data: FNCData object 61 | rand: bool, True: random split and False: use seed for official baseline split 62 | prop_dev: float, proportion of data for dev set 63 | rnd_sd: int, random seed to use for split 64 | 65 | Returns: 66 | train: list, of dict per instance 67 | dev: list, of dict per instance 68 | 69 | """ 70 | 71 | # Initialise 72 | list_bodies = [body for body in data.bodies] 73 | n_dev_bodies = round(len(list_bodies) * prop_dev) 74 | r = random.Random() 75 | if rand is 
False: 76 | r.seed(rnd_sd) 77 | train = [] 78 | dev = [] 79 | 80 | # Generate list of bodies for dev set 81 | r.shuffle(list_bodies) 82 | list_dev_bodies = list_bodies[-n_dev_bodies:] 83 | 84 | # Generate train and dev sets 85 | for stance in data.instances: 86 | if stance['Body ID'] not in list_dev_bodies: 87 | train.append(stance) 88 | else: 89 | dev.append(stance) 90 | 91 | return train, dev 92 | 93 | 94 | def split_unseen(data, rand=False, prop_dev=0.2, rnd_sd=1489215): 95 | 96 | """ 97 | 98 | Split data into completely separate sets (i.e. non-overlap of headlines and bodies) 99 | 100 | Args: 101 | data: FNCData object 102 | rand: bool, True: random split and False: constant split 103 | prop_dev: float, target proportion of data for dev set 104 | rnd_sd: int, random seed to use for split 105 | 106 | Returns: 107 | train: list, of dict per instance 108 | dev: list, of dict per instance 109 | 110 | """ 111 | 112 | # Initialise 113 | n = len(data.instances) 114 | n_dev = round(n * prop_dev) 115 | dev_ind = {} 116 | r = random.Random() 117 | if rand is False: 118 | r.seed(rnd_sd) 119 | train = [] 120 | dev = [] 121 | 122 | # Identify instances for dev set 123 | while len(dev_ind) < n_dev: 124 | rand_ind = r.randrange(n) 125 | if not data.instances[rand_ind]['Stance'] in ['agree', 'disagree', 'discuss']: 126 | continue 127 | if rand_ind not in dev_ind: 128 | rand_head = data.instances[rand_ind]['Headline'] 129 | rand_body_id = data.instances[rand_ind]['Body ID'] 130 | dev_ind[rand_ind] = 1 131 | track_heads = {} 132 | track_bodies = {} 133 | track_heads[rand_head] = 1 134 | track_bodies[rand_body_id] = 1 135 | pre_len_heads = len(track_heads) 136 | pre_len_bodies = len(track_bodies) 137 | post_len_heads = 0 138 | post_len_bodies = 0 139 | while pre_len_heads != post_len_heads and pre_len_bodies != post_len_bodies: 140 | pre_len_heads = len(track_heads) 141 | pre_len_bodies = len(track_bodies) 142 | for i, stance in enumerate(data.instances): 143 | if not data.instances[i]['Stance'] in ['agree', 'disagree', 'discuss']: 144 | continue 145 | if i != rand_ind and (stance['Headline'] in track_heads or stance['Body ID'] in track_bodies): 146 | track_heads[stance['Headline']] = 1 147 | track_bodies[stance['Body ID']] = 1 148 | post_len_heads = len(track_heads) 149 | post_len_bodies = len(track_bodies) 150 | 151 | for k, stance in enumerate(data.instances): 152 | if k != rand_ind and (stance['Headline'] in track_heads or stance['Body ID'] in track_bodies) and (stance['Stance'] in ['agree', 'disagree', 'discuss']): 153 | dev_ind[k] = 1 154 | 155 | # Generate train and dev sets 156 | for k, stance in enumerate(data.instances): 157 | if k in dev_ind: 158 | dev.append(stance) 159 | else: 160 | train.append(stance) 161 | 162 | return train, dev 163 | 164 | 165 | def save_csv(data_split, filepath): 166 | """ 167 | Save predictions to CSV file 168 | Args: 169 | pred: numpy array, of numeric predictions 170 | file: str, filename + extension 171 | """ 172 | 173 | with open(filepath, 'w', encoding='utf-8') as csvfile: 174 | fieldnames = ['Headline','Body ID','Stance'] 175 | writer = DictWriter(csvfile, fieldnames=fieldnames) 176 | 177 | writer.writeheader() 178 | for instance in data_split: 179 | writer.writerow({'Headline': instance["Headline"], 'Body ID': instance["Body ID"], 'Stance': instance["Stance"]}) 180 | 181 | 182 | if __name__ == "__main__": 183 | data = FNCData("../data/fakenewschallenge/train_stances.csv", "../data/fakenewschallenge/train_bodies.csv") 184 | train, dev = split_unseen(data) 185 | 
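    # Note: split_unseen keeps headlines and bodies disjoint between train and dev.
    # The overlapping-headline split used for the official baseline is also available
    # in this module; to use it instead, one could write:
    # train, dev = split_seen(data)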
save_csv(train, "../data/fakenewschallenge/trainsplit_stances.csv") 186 | save_csv(dev, "../data/fakenewschallenge/devsplit_stances.csv") -------------------------------------------------------------------------------- /mtl/tensoriser.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | from preproc.vocab import Vocab 4 | from preproc.batch import get_feed_dicts 5 | from preproc.map import numpify, tokenize, lower, deep_map, deep_seq_map, map_to_targets 6 | from preproc.data_reader import task2data_reader 7 | from features.features import create_features 8 | import numpy as np 9 | 10 | 11 | def load_data(placeholders, target_labels, target_sizes, **options): 12 | batch_size = options["batch_size"] 13 | prepared_data = defaultdict(dict) 14 | feature_data = {} 15 | if options['ltn_pred_type'] == 'hard': 16 | num_preds_ltn = len(options["tasks"]) - 2 # relevant for softmax / output layer 17 | else: 18 | num_preds_ltn = {} 19 | total_preds_ltn = 0 20 | 21 | if options["lab_emb_dim"] != 0: 22 | total_num_labels = 0 23 | for task in options["tasks"]: 24 | total_num_labels += target_sizes[task] 25 | 26 | vocab, label_vocab = None, None 27 | label_to_labelvocab = defaultdict(list) 28 | for task in options["tasks"]: 29 | read_data = task2data_reader(task) 30 | data_train, data_dev, data_test = read_data( 31 | debug=options["debug"], num_instances=options["num_instances"]) 32 | target_labels[task] = data_train["labels"] 33 | if options['ltn_pred_type'] == 'soft': 34 | total_preds_ltn += len(data_train["labels"]) 35 | 36 | # add data for creating data features 37 | feature_data[task] = data_train.get("seq2", []) +\ 38 | data_dev.get("seq2", []) +\ 39 | data_test.get("seq2", []) 40 | 41 | label_to_labelvocab_task = None 42 | if options["lab_emb_dim"] != 0: 43 | if label_vocab is None: 44 | label_vocab = Vocab() # unk is id 0 45 | label_to_labelvocab_i = [0] * total_num_labels 46 | for taskl in data_train["labels"]: 47 | labid = label_vocab(task + "_" + str(taskl)) 48 | label_to_labelvocab_i[labid-1] = (labid) # -1 because the first one is UNK 49 | label_to_labelvocab_task = label_to_labelvocab_i 50 | label_to_labelvocab[task] = label_to_labelvocab_task 51 | 52 | 53 | prepared_data[task]["train"], vocab, label_vocab = prepare_data(placeholders, 54 | data_train, vocab, label_vocab, label_to_labelvocab_task) 55 | prepared_data[task]["dev"], vocab, label_vocab = prepare_data(placeholders, 56 | data_dev, vocab, label_vocab, label_to_labelvocab_task) 57 | prepared_data[task]["test"], vocab, label_vocab = prepare_data(placeholders, 58 | data_test, vocab, label_vocab, label_to_labelvocab_task) 59 | 60 | 61 | vocab.freeze() # this makes sure that nothing further is added to the vocab, otherwise deep_map will extend it 62 | if label_vocab is not None: 63 | label_vocab.freeze() 64 | 65 | if options['model_type'] != 'hard-sharing' and options["feature_sets"] != "predsonly": 66 | # create a mapping of tasks to an array for each training example 67 | print("Creating features") 68 | task2features = create_features(options["feature_sets"], feature_data, 69 | vocab, options["features_path"]) 70 | 71 | if options['model_type'] == 'label-transfer': 72 | for task in options["tasks"]: 73 | num_preds_ltn[task] = total_preds_ltn 74 | if task != options['main_task']: 75 | num_preds_ltn[task] = total_preds_ltn - len(target_labels[task]) 76 | 77 | if options["lab_emb_dim"] > 0: 78 | num_preds_ltn[task] = len(label_vocab.id2sym.keys()) -1 79 | if 
task != options['main_task']: 80 | num_preds_ltn[task] = len(label_vocab.id2sym.keys()) - 1 - len(target_labels[task]) 81 | 82 | elif options['em_pred_type'] == 'soft': 83 | for task in options["tasks"]: 84 | num_preds_ltn[task] = total_preds_ltn - len(target_labels[task]) - len(target_labels[options['main_task']]) 85 | 86 | if options["lab_emb_dim"] > 0: 87 | num_preds_ltn[task] = len(label_vocab.id2sym.keys()) -1 - len(target_labels[task]) - len(target_labels[options['main_task']]) 88 | 89 | #print(num_preds_ltn) 90 | 91 | train_feed_dicts, dev_feed_dicts, test_feed_dicts = {}, {}, {} 92 | for task in options["tasks"]: 93 | 94 | # padding to same length and converting lists to numpy arrays 95 | train_data = numpify(prepared_data[task]["train"], pad=0) 96 | dev_data = numpify(prepared_data[task]["dev"], pad=0) 97 | test_data = numpify(prepared_data[task]["test"], pad=0) 98 | 99 | if options['model_type'] != 'hard-sharing': 100 | if options["feature_sets"] != "predsonly": 101 | # add the data features to the data splits 102 | train_size, dev_size, test_size = train_data['seq1'].shape[0], \ 103 | dev_data['seq1'].shape[0], test_data['seq1'].shape[0] 104 | train_data['features'] = task2features[task][0:train_size] 105 | dev_data['features'] = task2features[task][ 106 | train_size:(train_size+dev_size)] 107 | test_data['features'] = task2features[task][-test_size:] 108 | 109 | if options['ltn_pred_type'] == 'soft': 110 | num_pr_ltn = num_preds_ltn[task] 111 | else: 112 | num_pr_ltn = num_preds_ltn 113 | 114 | train_data['preds_for_ltn'] = np.zeros([len(train_data["seq1"]), num_pr_ltn], np.float32) 115 | dev_data['preds_for_ltn'] = np.zeros([len(dev_data["seq1"]), num_pr_ltn], np.float32) 116 | test_data['preds_for_ltn'] = np.zeros([len(test_data["seq1"]), num_pr_ltn], np.float32) 117 | 118 | if options["lab_emb_dim"] > 0 and options["lab_embs_for_ltn"] and options["relabel_with_ltn"]: 119 | # this is just so that we can get main task predictions from models for any task more easily, using the label emb representation 120 | targets_main_len = target_sizes[options["main_task"]] 121 | train_data["targets_main"] = np.zeros([len(train_data["seq1"]), targets_main_len], np.int32) 122 | dev_data["targets_main"] = np.zeros([len(dev_data["seq1"]), targets_main_len], np.int32) 123 | test_data["targets_main"] = np.zeros([len(test_data["seq1"]), targets_main_len], np.int32) 124 | 125 | train_data["label_vocab_inds_main"] = [label_to_labelvocab[options["main_task"]] for inst in train_data["targets"]] 126 | dev_data["label_vocab_inds_main"] = [label_to_labelvocab[options["main_task"]] for inst in dev_data["targets"]] 127 | test_data["label_vocab_inds_main"] = [label_to_labelvocab[options["main_task"]] for inst in test_data["targets"]] 128 | 129 | train_feed_dicts[task] = get_feed_dicts( 130 | train_data, placeholders, batch_size=batch_size, 131 | inst_length=len(train_data["seq1"])) 132 | dev_feed_dicts[task] = get_feed_dicts( 133 | dev_data, placeholders, batch_size=batch_size, 134 | inst_length=len(dev_data["seq1"])) 135 | test_feed_dicts[task] = get_feed_dicts( 136 | test_data, placeholders, batch_size=batch_size, 137 | inst_length=len(test_data["seq1"])) 138 | 139 | return train_feed_dicts, dev_feed_dicts, test_feed_dicts, vocab, label_vocab, num_preds_ltn, label_to_labelvocab 140 | 141 | 142 | def prepare_data(placeholders, data, vocab=None, label_vocab=None, label_to_labelvocab=None): 143 | data_tokenized = deep_map(data, tokenize, ['seq1', 'seq2']) 144 | data_lower = deep_seq_map(data_tokenized, 
lower, ['seq1', 'seq2']) 145 | data = deep_seq_map(data_lower, lambda xs: [""] + xs + [""], ["seq1", "seq2"]) 146 | if vocab is None: 147 | vocab = Vocab() 148 | for instance in data["seq1"] + data["seq2"]: 149 | for token in instance: 150 | vocab(token) 151 | 152 | data = map_to_targets(data, "labels", "stance") # map stance IDs to one-hot vectors, save in data["targets"] 153 | if label_vocab != None: # then we want label embeddings 154 | data["label_vocab_inds"] = [label_to_labelvocab for inst in data["targets"]] 155 | data_ids = deep_map(data, vocab, ["seq1", "seq2"]) 156 | data_ids = deep_seq_map(data_ids, lambda xs: len(xs), keys=['seq1', 'seq2'], fun_name='lengths', expand=True) 157 | 158 | # removing data that's not a placeholder 159 | popl = [] 160 | for k in data_ids.keys(): 161 | if not k in placeholders.keys(): 162 | popl.append(k) 163 | for p in popl: 164 | data_ids.pop(p, None) 165 | 166 | return data_ids, vocab, label_vocab 167 | -------------------------------------------------------------------------------- /preproc/batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from itertools import islice 3 | from preproc.map import numpify 4 | from numpy.random import choice 5 | #from jtr.util.rs import DefaultRandomState 6 | 7 | rs = np.random.RandomState(1337) 8 | #rs = DefaultRandomState(1337)#new seed ignored if set previously 9 | 10 | 11 | def get_buckets(data, order, structure): 12 | """ 13 | Generates mapping between data instances and bucket-ID's. 14 | 15 | `data`: dict of nested sequences in which each top-level sequence has the same length, 16 | and all inner sequences have the __len__ attribute. 17 | `order`: (None or) tuple with data keys used for bucketing 18 | For example: 19 | ```list(data.keys()) = ["sentences1", "lengths1", "sentences2", "lengths2", "targets"]``` 20 | and we want bucketing according to the lengths of inner sequences in "sentences1" and "sentences2": 21 | `order = ("sentences1", "sentences2")` performs bucketing on "sentences1", and within each bucket, 22 | again creates buckets according to "sentences2" 23 | (automatic bucketing will result in different "sentences2" bucket boundaries 24 | within each bucket according to "sentences1"). 25 | `order = ("sentences2", "sentences1")`: vice versa, with "sentences2" for highest-level buckets 26 | `structure`: (None or) sequence with same length as `order`, each element is an integer or a list of integers 27 | For each position: 28 | - integer: denotes number of buckets, to be determined automatically 29 | - list: determines bucket boundaries. E.g.: [10, 20, 30] will result in 4 buckets 30 | (1) lengths 0-10, (2) lengths 11-20, (3) lengths 21-30, (4) lengths > 30 31 | For example: 32 | `order` = ("sentences1", "sentences2") and `structure` = (3, [10]) generates 6 buckets: 33 | within each of 3 partitions based on "sentences1", 34 | there is a bucket with instances of "sentences2" with length 10 or less, 35 | and one for lengths > 10. 36 | 37 | Returns: 38 | buckets2ids, ids2buckets 39 | dicts that map instance-id (index along 1st dimension of values in data) to bucket-id, 40 | and vice versa. 
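    For illustration (hypothetical toy data with 4 instances, `order=("seq1",)`,
    `structure=(2,)`), the returned mappings could look like:
        buckets2ids = {'(0,)': [0, 2], '(1,)': [1, 3]}
        ids2buckets = {0: '(0,)', 2: '(0,)', 1: '(1,)', 3: '(1,)'}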
41 | """ 42 | assert isinstance(data, dict) 43 | 44 | n_tot = len(list(data.values())[0]) 45 | if order is None or structure is None: 46 | # all in 1 bucket, with id '(0)' 47 | buckets2ids = {'(0)': list(range(n_tot))} 48 | ids2buckets = dict(zip(list(range(n_tot)), ['(0)'] * n_tot)) 49 | return buckets2ids, ids2buckets 50 | 51 | def _chunk(it, size): 52 | """returns iterator of chunks (tuples) from it (input iterator), with given size (last one may be shorter)""" 53 | it = iter(it) 54 | return iter(lambda: tuple(islice(it, size)), ()) 55 | 56 | def _partition(_buckets2ids, _order, _structure): 57 | """update _buckets2ids according to _order and _structure""" 58 | # update all current buckets according to first item in _order and _structure 59 | buckets2ids_new = {} 60 | for bid, ids in sorted(_buckets2ids.items(), key=lambda x: x[0]): 61 | lengths = [len(data[_order[0]][id]) for id in ids] 62 | sorted_ids_lengths = sorted(zip(ids, lengths), key=lambda x: x[1]) 63 | if isinstance(_structure[0], int): # automatic bucketing 64 | size = len(lengths) // _structure[0] if len(lengths) % _structure[0] == 0 \ 65 | else 1 + (len(lengths) // _structure[0]) 66 | buckets = list(_chunk([tup[0] for tup in sorted_ids_lengths], size)) 67 | else: # structure_is sequence of ints 68 | struct = list(sorted(_structure[0])) + [np.inf] 69 | bin_max, struct = struct[0], struct[1:] 70 | buckets = [[]] 71 | for id, l in sorted_ids_lengths: 72 | if l > bin_max: # never happens when bin_max = np.inf 73 | bin_max, struct = struct[0], struct[1:] 74 | buckets.append([]) 75 | buckets[-1].append(id) 76 | buckets2ids_new.update({tuple(list(bid) + [i]): list(bucket) for i, bucket in enumerate(buckets)}) 77 | # call again if _order and _structure have more than 1 item 78 | if len(_order) > 1: 79 | buckets2ids_new = _partition(buckets2ids_new, _order[1:], _structure[1:]) 80 | 81 | buckets2ids_new = {bid: bucket for bid, bucket in buckets2ids_new.items() if len(bucket) > 0} 82 | return buckets2ids_new 83 | 84 | 85 | buckets2ids = _partition({(): list(range(n_tot))}, order, structure) 86 | buckets2ids = {str(bid): buckets2ids[bid] for bid in buckets2ids} # make bucket-ids strings (for random.choice) 87 | 88 | ids2buckets = {} 89 | for bid, bucket in buckets2ids.items(): 90 | ids2buckets.update({id: bid for id in bucket}) 91 | return buckets2ids, ids2buckets 92 | 93 | 94 | def get_batches(data, batch_size=32, pad=0, bucket_order=None, bucket_structure=None, exact_epoch=False): 95 | """ 96 | Creates generator that batches `data`. 97 | To avoid biases, it is advised to keep `bucket_order=None` and `bucket_structure=None` if computationally possible. 98 | (which will sample batches from all instances) 99 | 100 | Args: 101 | `data`: dict with (multi-dimensional) numpy arrays or (nested) lists; 102 | first inner dimension (`num_instances`) should be the same over all data values. 103 | `batch_size`: the desired batch size 104 | `pad`: padding symbol in case data contains lists of lists of different sizes 105 | `bucket_order`: argument `order` in get_buckets (list with keys); `None` if no bucketing 106 | `bucket_structure`: argument `structure` in get_buckets; `None` if no bucketing 107 | `exact_epoch`: if set to `True`, final batch per bucket may be smaller, but each instance will be seen exactly 108 | once during training. 
Default: `False`, to be certain during training 109 | that each instance per batch gets same weight in the total loss 110 | (but not all instances are observed per epoch if bucket sizes are no multiple of `batch_size`). 111 | 112 | Returns: 113 | a generator that generates a dict with same keys as `data`, and 114 | as values data batches consisting of `[batch_size x num_instances]` 2D numpy tensors 115 | (1st dimension is at most `batch_size` but may be smaller to cover all instances exactly once per epoch, 116 | if `exact_epoch=True`) 117 | """ 118 | assert isinstance(data, dict) 119 | 120 | data0 = list(data.values())[0] 121 | if not isinstance(data0, np.ndarray): 122 | data_np = numpify(data, pad) # still need original data for length-based bucketing 123 | else: 124 | data_np = data 125 | 126 | def get_bucket_probs(_buckets2instances): 127 | N = float(np.sum([len(ids) for ids in _buckets2instances.values()])) 128 | return {bid: len(ids) / N if N > 0. else 0. for bid, ids in _buckets2instances.items()} 129 | 130 | def shuffle_buckets(_buckets2instances): 131 | for bid in sorted(_buckets2instances.keys()): # sorted: to keep deterministic 132 | rs.shuffle(_buckets2instances[bid]) 133 | 134 | buckets2instances, _ = get_buckets(data, bucket_order, bucket_structure) 135 | n_buckets = len(buckets2instances) 136 | 137 | exact_epoch = True if len(data0) < n_buckets*batch_size else exact_epoch 138 | #if average instances/bucket smaller than batch_size: set exact_epoch = True 139 | #to avoid empty batches during debugging on small data samples 140 | 141 | def bucket_generator(): 142 | buckets2instances, _ = get_buckets(data, bucket_order, bucket_structure) 143 | shuffle_buckets(buckets2instances) 144 | all_seen = False 145 | while not all_seen: 146 | bids, probs = zip(*sorted(get_bucket_probs(buckets2instances).items(), key=lambda x: x[0])) 147 | # sorted keys: to keep deterministic 148 | if np.sum(probs) == 0.: 149 | all_seen = True 150 | else: 151 | bid = rs.choice(bids, replace=False, p=probs) # sample bucket according to remaining size 152 | batch_indices = buckets2instances[bid][:batch_size] 153 | buckets2instances[bid] = buckets2instances[bid][batch_size:] 154 | # if required by exact_epoch: also include last batch in bucket if too small 155 | if len(batch_indices) == batch_size or exact_epoch: 156 | yield {k: data_np[k][batch_indices] for k in data_np} 157 | 158 | return GeneratorWithRestart(bucket_generator) 159 | 160 | 161 | def get_feed_dicts(data_train_np, placeholders, batch_size, inst_length): 162 | data_train_batched = [] 163 | realsamp = int(inst_length/batch_size) 164 | additionsamp = inst_length%batch_size 165 | if additionsamp != 0: 166 | realsamp += 1 167 | ids1 = choice(range(0, inst_length), inst_length, replace=False) # sample without replacement so we get every sample once # -additionsamp 168 | ids2 = choice(range(0, inst_length), additionsamp, replace=True) # sample a few additional ones to fill up batch 169 | ids = np.append(ids1, ids2) 170 | 171 | start = 0 172 | for i in range(0, realsamp): 173 | batch_i = {} 174 | if i != 0: 175 | start = i * batch_size 176 | if i != realsamp: 177 | ids_sup = ids[start:((i+1)*batch_size)] 178 | else: 179 | ids_sup = ids[start:realsamp] 180 | #print(ids_sup) 181 | for key, value in data_train_np.items(): 182 | #print(key) 183 | #print(data_train_np[key]) 184 | batch_i[placeholders[key]] = [data_train_np[key][ii] for ii in ids_sup] 185 | 186 | data_train_batched.append(batch_i) 187 | 188 | return data_train_batched 189 | 190 | 191 | def 
batch_feed_dicts(data_train_np, batch_size, inst_length): 192 | data_train_batched = [] 193 | realsamp = int(inst_length/batch_size) 194 | additionsamp = inst_length%batch_size 195 | if additionsamp != 0: 196 | realsamp += 1 197 | ids1 = choice(range(0, inst_length), inst_length, replace=False) # sample without replacement so we get every sample once # -additionsamp 198 | ids2 = choice(range(0, inst_length), additionsamp, replace=True) # sample a few additional ones to fill up batch 199 | ids = np.append(ids1, ids2) 200 | 201 | start = 0 202 | for i in range(0, realsamp): 203 | batch_i = {} 204 | if i != 0: 205 | start = i * batch_size 206 | if i != realsamp: 207 | ids_sup = ids[start:((i+1)*batch_size)] 208 | else: 209 | ids_sup = ids[start:realsamp] 210 | for key, value in data_train_np.items(): 211 | batch_i[key] = [data_train_np[key][ii] for ii in ids_sup] 212 | 213 | data_train_batched.append(batch_i) 214 | 215 | return data_train_batched 216 | 217 | 218 | def get_feed_dicts_old(data, placeholders, batch_size=32, pad=0, bucket_order=None, bucket_structure=None, exact_epoch=False): 219 | """Creates feed dicts for all batches with a given batch size. 220 | 221 | Args: 222 | `data` (dict): The input data for the feed dicts. 223 | `placeholders` (dict): The TensorFlow placeholders for the data 224 | (placeholders.keys() must form a subset of data.keys()). 225 | `batch_size` (int): The batch size for the data. 226 | `pad` (int): Padding symbol index to pad lists of different sizes. 227 | `bucket_order`: argument `order` in get_buckets (list with keys); `None` if no bucketing 228 | `bucket_structure`: argument `structure` in get_buckets; `None` if no bucketing 229 | `exact_epoch`: if set to `True`, final batch per bucket may be smaller, but each instance will be seen exactly 230 | once during training. Default: `False`, to be certain during training 231 | that each instance per batch gets same weight in the total loss. 232 | 233 | Returns: 234 | GeneratorWithRestart: Generator that yields a feed_dict for each 235 | iteration. A feed dict consists of '{ placeholder : data-batch }` key-value pairs. 
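    A sketch of typical usage (names such as `train_op` are illustrative, not defined here):
        for feed_dict in get_feed_dicts_old(data, placeholders, batch_size=32):
            sess.run(train_op, feed_dict=feed_dict)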
236 | """ 237 | assert isinstance(data, dict) and isinstance(placeholders, dict) 238 | assert set(placeholders.keys()).issubset(set(data.keys())), \ 239 | 'data keys %s \nnot compatible with placeholder keys %s' % (set(placeholders.keys()), set(data.keys())) 240 | 241 | def generator(): 242 | batches = get_batches(data, batch_size, pad, bucket_order, bucket_structure, exact_epoch) 243 | # fixme: this is potentially inefficient as it might be called every time we retrieve a batch 244 | # todo: measure and fix if significant impact 245 | mapped = map(lambda xs: {placeholders[k]: xs[k] for k in placeholders}, batches) 246 | #for each key in placeholders dict, pair the placeholder with the corresponding batch dict value 247 | for x in mapped: 248 | yield x 249 | 250 | return GeneratorWithRestart(generator) 251 | 252 | 253 | class GeneratorWithRestart(object): 254 | def __init__(self, iterator): 255 | self.iterator = iterator 256 | 257 | def __iter__(self): 258 | return self.iterator() 259 | 260 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorflow as tf 3 | import os 4 | import numpy as np 5 | import argparse 6 | import copy 7 | from sklearn.metrics import classification_report 8 | from mtl.tensoriser import load_data 9 | from mtl.training import train, restore_trained_model, get_preds_for_ltn 10 | from preproc.log_utils import log_results, task2score 11 | from constants import FNC, STANCE, NLI, TOPIC, LAPTOP, RESTAURANT, TASKS,\ 12 | SIM, DIV, TARGET, RNN_CELL_TYPES, TOPIC_5WAY 13 | from preproc.plot_utils import plot_label_embeddings 14 | 15 | seq1 = tf.placeholder(tf.int32, [None, None], name="seq1") 16 | seq1_lengths = tf.placeholder(tf.int32, [None], name="seq1_lengths") 17 | seq2 = tf.placeholder(tf.int32, [None, None], name="seq2") 18 | seq2_lengths = tf.placeholder(tf.int32, [None], name="seq2_lengths") 19 | targets = tf.placeholder(tf.int32, [None, None], name="targets") 20 | targets_main = tf.placeholder(tf.int32, [None, None], name="targets_main") # targets for main task 21 | features = tf.placeholder(tf.float32, [None, None], name="features") 22 | preds_for_ltn = tf.placeholder(tf.float32, [None, None], name="preds_for_ltn") # this is set to 0 initially and constantly updated during training 23 | label_vocab_inds = tf.placeholder(tf.int32, [None, None], name="label_vocab_inds") 24 | label_vocab_inds_main = tf.placeholder(tf.int32, [None, None], name="label_vocab_inds_main") # label target for main task 25 | 26 | 27 | # This dictionary determines which tasks are used. By default, it contains 28 | # all existing tasks and is then modified during setup accordingly. 29 | target_sizes = {FNC: 4, STANCE: 3, NLI: 3, TOPIC: 2, LAPTOP: 3, RESTAURANT: 3, 30 | TARGET: 3, TOPIC_5WAY: 5} 31 | target_labels = {FNC: [], STANCE: [], NLI: [], TOPIC: [], LAPTOP: [], 32 | RESTAURANT: [], TARGET: [], TOPIC_5WAY: []} 33 | 34 | placeholders = {"seq1": seq1, "seq1_lengths": seq1_lengths, "seq2": seq2, 35 | "seq2_lengths": seq2_lengths, "targets": targets, "targets_main": targets_main, 36 | "features": features, "preds_for_ltn": preds_for_ltn, 37 | "label_vocab_inds": label_vocab_inds, "label_vocab_inds_main": label_vocab_inds_main} 38 | 39 | 40 | def main(**options): 41 | 42 | # create the log directory if it does not exist 43 | log_dir = os.path.dirname(args.log_file) 44 | if not os.path.exists(log_dir): 45 | print('Creating %s...' 
% log_dir) 46 | os.makedirs(log_dir) 47 | 48 | train_feed_dicts, dev_feed_dicts, test_feed_dicts, vocab, label_vocab, ltn_sizes, label_to_labelvocab = load_data(placeholders, target_labels, target_sizes, **options) 49 | 50 | # remove tasks from target_sizes if not used 51 | for task in copy.deepcopy(set(target_sizes.keys())): 52 | if not task in options["tasks"]: 53 | target_sizes.pop(task) 54 | 55 | print("Data loaded and tensorised. Training model with settings: " + str(options)) 56 | 57 | if options['model_type'] != 'hard-sharing' and options["feature_sets"] != "predsonly": 58 | ex1 = train_feed_dicts[options["main_task"]][0] 59 | ex1feats = ex1[placeholders["features"]] 60 | input_size_preds = len(ex1feats[0]) 61 | else: 62 | input_size_preds = 0 63 | 64 | if label_vocab == None: 65 | label_vocab_len = 0 66 | else: 67 | label_vocab_len = len(label_vocab) 68 | 69 | # Do not take up all the GPU memory all the time. 70 | sess_config = tf.ConfigProto() 71 | sess_config.gpu_options.allow_growth = True 72 | with tf.Session(config=sess_config) as sess: 73 | if options["plot_embeddings"]: 74 | print('Loading the model for plotting label embeddings...') 75 | _, _, _, _, _, _, _ = restore_trained_model( 76 | placeholders, target_sizes, train_feed_dicts, vocab, 77 | label_vocab_len, label_to_labelvocab, input_size_preds, ltn_sizes, sess=sess, 78 | **options) 79 | plot_label_embeddings(sess, args.tasks, label_vocab) 80 | sys.exit(0) 81 | elif options["apply_existing_model"] == False: 82 | logits, loss, preds, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict = train(placeholders, target_sizes, train_feed_dicts, dev_feed_dicts, vocab, label_vocab, input_size_preds, ltn_sizes, label_to_labelvocab, sess=sess, **options) 83 | else: 84 | logits, loss, preds, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict = restore_trained_model(placeholders, target_sizes, train_feed_dicts, vocab, label_vocab_len, label_to_labelvocab, input_size_preds, ltn_sizes, sess=sess, **options) 85 | print('============') 86 | # Test on test data 87 | for task in target_sizes.keys(): 88 | correct_test_all, total_test, correct_test_all_ltn = 0.0, 0.0, 0.0 89 | p_inds, g_inds, p_inds_ltn, topics = [], [], [], [] 90 | for j, batch in enumerate(test_feed_dicts[task]): 91 | p = sess.run(preds[task], feed_dict=batch) 92 | pred_inds = [np.argmax(pp) for pp in p] 93 | p_inds.extend(pred_inds) 94 | gold_inds = [np.argmax(batch[placeholders["targets"]][i]) for i, targ in enumerate(batch[placeholders["targets"]])] 95 | g_inds.extend(gold_inds) 96 | hits = [pp for i, pp in enumerate(p) if np.argmax(pp) == np.argmax(batch[placeholders["targets"]][i])] 97 | correct_test_all += len(hits) 98 | total_test += len(batch[placeholders["targets"]]) 99 | 100 | # keep track of the targets for topic-based scores 101 | topics += [t for t in batch[placeholders["seq1"]]] 102 | 103 | if options["model_type"] == "semi-supervised" or options["model_type"] == "label-transfer": 104 | 105 | batch_test = get_preds_for_ltn(sess, batch, placeholders, target_sizes, task, options["main_task"], preds, 106 | options["ltn_pred_type"], label_to_labelvocab, options["lab_emb_dim"], options["model_type"]) 107 | 108 | p_ltn = sess.run(preds_dict_ltn[task], feed_dict=batch_test) 109 | pred_inds_ltn = [np.argmax(pp_dev) for pp_dev in p_ltn] 110 | p_inds_ltn.extend(pred_inds_ltn) 111 | with open(options['log_file'].replace('.txt', '_inds.txt'), 'a') as f: 112 | f.write(task + "\tMain model\t" + str(p_inds).replace("[", "").replace("]", 
"").replace(",", "") + "\n") 113 | f.write(task + "\tRelabel model\t" + str(p_inds_ltn).replace("[", "").replace("]", "").replace(",", "") + "\n") 114 | f.write(task + "\tGold\t" + str(g_inds).replace("[", "").replace("]", "").replace(",", "") + "\n") 115 | 116 | acc_test = correct_test_all/total_test 117 | print('Test performance :', "Task: " + task, "Acc: ", acc_test) 118 | test_score = task2score(task, g_inds, p_inds, topics) 119 | print('Score on test set:', test_score) 120 | try: 121 | # labels for topic 5-way are floats, so convert to string 122 | print(classification_report(g_inds, p_inds, target_names=[str(l) for l in target_labels[task]])) 123 | except IndexError: 124 | print("Training labels inconsistent with testing labels") 125 | print(classification_report(g_inds, p_inds)) 126 | 127 | acc_test_ltn = 0. 128 | if options["model_type"] == "semi-supervised" or options["model_type"] == "label_transfer": 129 | acc_test_ltn = correct_test_all_ltn / total_test 130 | task_score_ltn = task2score(task, g_inds, p_inds_ltn, topics) 131 | print('Test performance LTN:', "Task: " + task, "Acc: ", acc_test_ltn, "Task score", task_score_ltn) 132 | try: 133 | print(classification_report(g_inds, p_inds_ltn, target_names=[str(l) for l in target_labels[task]])) 134 | except IndexError: 135 | print("Training labels inconsistent with testing labels") 136 | print(classification_report(g_inds, p_inds_ltn)) 137 | log_results(options, acc_test, test_score, task_score_ltn, task) 138 | else: 139 | log_results(options, acc_test, test_score, 0.0, task) 140 | 141 | 142 | if __name__ == "__main__": 143 | 144 | parser = argparse.ArgumentParser(description='Train and Evaluate a MTL model with incompatible outputs') 145 | parser.add_argument('--debug', default=True, action='store_true', help="Debug mode -- for this, only a small portion of the data is used to test code functionality") 146 | parser.add_argument('--dev_res_during_training', default=False, action='store_true', help="If true, computes results on dev set during training") 147 | parser.add_argument('--num_instances', type=int, default=128, help="What is the maximum number of instances to use per task") 148 | parser.add_argument('--apply_existing_model', default=False, action='store_true', help="If set to True, doesn't train model but only applies trained model to test data") 149 | parser.add_argument('--tasks', nargs='+', default=TASKS, help="Tasks to train on. If this is the same as the main task, a single-task model is trained. Options:" + str(TOPIC_5WAY)) 150 | parser.add_argument('--main_task', type=str, default=RESTAURANT, help="The main task.") 151 | parser.add_argument('--feature_sets', nargs='+', help='data feature sets. In the paper, only diversity features are tested.', default=DIV) 152 | parser.add_argument('--ltn_pred_type', type=str, help='Whether to use hard or soft predictions as input to LTN model. 
In the experiments described in the paper, only soft predictions are used.', default='soft') 153 | parser.add_argument('--main_num_layers', type=int, help='If > 1, number of hidden layer for main model.', default=1) 154 | parser.add_argument('--lel_hid_size', type=int, help='If > 0, size of hidden layer for label embedding layer, as described in Section 3.2 of the paper.', default=0) 155 | parser.add_argument('--model_type', default='label-transfer', choices={'hard-sharing', 'label-transfer', 'semi-supervised'}, help="What model variant to use: " 156 | "'hard-sharing' is the MTL with hard parameter sharing model (Section 3.1), " 157 | "'label-transfer' is the label transfer network (Section 3.3), " 158 | "'semi-supervised' is the semi-supervised MTL (Section 3.4)") 159 | parser.add_argument('--relabel_with_ltn', default=False, action='store_true', help="Only relevant for semi-supervised model: do we actually use it to relabel data or not. The latter can be used for debugging purposes. " 160 | "Otherwise, this is the semi-supervised variant of the LTN described in Section 3.4 of the paper") 161 | parser.add_argument('--task_specific_layer_size', type=int, default=1, help="If >0, adds a task-specific hidden layer with that size and skip-connections") 162 | parser.add_argument('--batch_size', type=int, default=16, help="What batch size should be used") 163 | parser.add_argument('--max_epochs', type=int, default=1, help="What is the maximum number of epochs to train main model for") 164 | parser.add_argument('--max_epochs_ltn', type=int, default=2, help="What is the maximum number of epochs to train LTN model for") 165 | parser.add_argument('--max_epochs_after_ltn', type=int, default=0, help="After we've trained the relabelling function, how many epochs should we train for with augmented data.") 166 | parser.add_argument('--early_stopping', type=float, default=1.0, help="Threshold for early stopping on dev set of main task. If 1.0, there is no early stopping.") 167 | parser.add_argument('--emb_dim', type=int, default=16, help="What embedding size should be used") 168 | parser.add_argument('--lab_emb_dim', type=int, default=16, help='What embedding size should be used for the label embeddings. If 0, no label embeddings are used.') 169 | parser.add_argument('--lab_embs_for_ltn', default=False, action='store_true', help='Whether to use label embeddings for relabelling function or not.') 170 | parser.add_argument('--skip_connections', default=False, action='store_true', help='Skip connections for the RNN or not') 171 | parser.add_argument('--learning_rate', type=float, default=0.01, help="What initial learning rate should be used") 172 | parser.add_argument('--dropout_rate', type=float, default=1.0, help="What rate of dropout should be used. 1.0 -> no dropout") 173 | parser.add_argument('--l1_rate_main', type=float, default=1.0, help="What rate of l1 regularisation should be used for main model. 1.0 -> no l1") 174 | parser.add_argument('--l2_rate_main', type=float, default=1.0, help="What rate of l2 regularisation should be used for main model. 1.0 -> no l2") 175 | parser.add_argument('--l1_rate_ltn', type=float, default=1.0, help="What rate of l1 regularisation should be used for em model. 1.0 -> no l1") 176 | parser.add_argument('--l2_rate_ltn', type=float, default=1.0, help="What rate of l2 regularisation should be used for em model. 1.0 -> no l2") 177 | parser.add_argument('--rnn_cell_type', type=str, help='RNN cell type. 
Options:' + str(RNN_CELL_TYPES), default="lstm") 178 | parser.add_argument('--attention', default=False, action='store_true', help='Word by word attention mechanism') 179 | parser.add_argument('--save_model', default=False, action='store_true', help="Save model after end of training") 180 | parser.add_argument('--exp_id', type=str, default="run1", help="Experiment ID. In case the same experiment with the same configurations needs to be run more than once.") 181 | parser.add_argument('--features-path', type=str, default='saved_features_new', help='the directory where the computed features are saved') 182 | parser.add_argument('--log_file', type=str, default="./log.txt", help='the path to which results should be logged') 183 | parser.add_argument('--alternate_batches', default=True, action='store_true', help='alternate tasks between batches instead of between epochs during training') 184 | parser.add_argument('--plot_embeddings', action='store_true', help='plot label embeddings of trained model') 185 | 186 | args = parser.parse_args() 187 | if args.debug: 188 | print('Debugging is switched on. Only a small portion of data is used.') 189 | if args.apply_existing_model: 190 | args.save_model = False 191 | if args.alternate_batches: 192 | print('Alternating tasks between batches...') 193 | else: 194 | print('Alternating tasks between epochs...') 195 | if args.feature_sets == 'predsonly' and args.model_type == 'semi-supervised': 196 | print("The model type 'label-transfer' needs to be used for this to work. Changing it to that setting.") 197 | args.model_type = 'label-transfer' 198 | main(**vars(args)) 199 | -------------------------------------------------------------------------------- /features/features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Methods to create the similarity and diversity features used in 3 | Ruder & Plank (2017). 4 | """ 5 | 6 | import sys 7 | import os 8 | import numpy as np 9 | np.seterr(all='raise') 10 | import scipy.stats 11 | import scipy.spatial 12 | 13 | from preproc.map import tokenize, lower, deep_map, deep_seq_map 14 | from constants import SIMILARITY_FEATURES, DIVERSITY_FEATURES, SIM, DIV 15 | 16 | # ------------ Feature methods ------------ 17 | 18 | 19 | def create_features(feature_sets, task2examples, vocab, save_path): 20 | """ 21 | Retrieve the feature representations of a list of examples. 22 | :param feature_sets: a list containing the names of features to be used 23 | :param task2examples: mapping of tasks to lists of untokenized texts 24 | :param vocab: the Vocabulary object 25 | :param save_path: the directory where the features should be stored 26 | :return: a mapping of tasks to feature representations of shape 27 | (num_examples, num_features); the features correspond to the order 28 | of the data; first training examples, then dev, then test 29 | """ 30 | # create the features for each example in each task 31 | task2features = {t: [] for t in task2examples.keys()} 32 | 33 | # get the feature names 34 | feature_names = [] 35 | if SIM in feature_sets: 36 | feature_names += SIMILARITY_FEATURES 37 | if DIV in feature_sets: 38 | feature_names += DIVERSITY_FEATURES 39 | 40 | print("Trying to find feature files in", save_path) 41 | if os.path.exists(save_path) and os.path.isdir(save_path) and len( 42 | os.listdir(save_path)) > 0: 43 | feature_dim = None 44 | for task in task2examples.keys(): 45 | assert task in os.listdir(save_path),\ 46 | 'Error: No saved features available for task %s in dir %s.' 
\ 47 | % (task, save_path) 48 | print("Task2features") 49 | print(task2features) 50 | print("Files in features folder") 51 | print(os.listdir(save_path)) 52 | for task in os.listdir(save_path): 53 | # then we don't need to load it 54 | if not task in task2features.keys(): 55 | continue 56 | with open(os.path.join(save_path, task), 'r') as f: 57 | for line in f: 58 | features = np.fromstring(line.strip('[]'), dtype=float, 59 | sep=' ') 60 | if feature_dim is None: 61 | feature_dim = len(features) 62 | assert feature_dim == len(feature_names),\ 63 | 'Error: # of loaded features %d != # of specified '\ 64 | 'features %d.' % (feature_dim, len(feature_names)) 65 | assert feature_dim == len(features),\ 66 | 'Error: Different # of features among examples, ' \ 67 | 'i.e. %d and %d.' % (feature_dim, len(features)) 68 | task2features[task].append(features) 69 | print('Loaded %d-d features for %s from %s...' 70 | % (feature_dim, task, save_path)) 71 | return task2features 72 | 73 | if not os.path.exists(save_path): 74 | os.makedirs(save_path) 75 | 76 | # tokenize and lower-case the documents 77 | for task, examples in task2examples.items(): 78 | examples = deep_map(examples, tokenize) 79 | examples = deep_seq_map(examples, lower) 80 | task2examples[task] = examples 81 | 82 | # get the term distribution of the data for each task (shape (vocab_size,) ) 83 | # and for each example (shape (num_examples, vocab_size) ) 84 | task2task_term_dist = {} 85 | for task, examples in task2examples.items(): 86 | task2task_term_dist[task] = get_term_dist(examples, vocab.sym2id) 87 | 88 | for task, examples in task2examples.items(): 89 | for i, example in enumerate(examples): 90 | term_dist = get_term_dist([example], vocab.sym2id) 91 | features = [] 92 | for f_name in feature_names: 93 | # check whether feature belongs to similarity-based features, 94 | # diversity-based features, etc. 95 | if f_name in SIMILARITY_FEATURES: 96 | # compute the similarity with regard to each task 97 | for target_task in task2examples.keys(): 98 | f = similarity_name2value( 99 | f_name, term_dist, task2task_term_dist[target_task]) 100 | if np.isnan(f).any() or np.isinf(f).any(): 101 | if type(f) != list: 102 | f = [0 for ff in f] 103 | elif type(f) == int: 104 | f = 0 105 | elif type(f) == float: 106 | f = 0.0 107 | features.append(f) 108 | elif f_name in DIVERSITY_FEATURES: 109 | f = diversity_feature_name2value( 110 | f_name, example, task2task_term_dist[task], 111 | vocab.sym2id) 112 | if np.isnan(f).any() or np.isinf(f).any(): 113 | if type(f) != list: 114 | f = [0 for ff in f] 115 | elif type(f) == int: 116 | f = 0 117 | elif type(f) == float: 118 | f = 0.0 119 | features.append(f) 120 | else: 121 | raise ValueError('%s is not a valid feature name.' % f_name) 122 | #assert not np.isnan(features).any(), 'Error: NAN value in array.' 123 | #assert not np.isinf(features).any(), 'Error: inf or -inf value.' 124 | task2features[task].append(features) 125 | if i % 100 == 0 and i > 0: 126 | print('%s. Created features for %d examples.' 
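The cached features are read back above with np.fromstring and written further below with np.array_str, one whitespace-separated vector per line. A minimal round-trip sketch of that on-disk format (the vector is made up):

```python
import sys
import numpy as np

example_features = np.array([0.12, -1.5, 3.0])                     # invented vector
line = np.array_str(example_features, max_line_width=sys.maxsize)  # e.g. '[ 0.12 -1.5   3.  ]'
recovered = np.fromstring(line.strip('[]'), dtype=float, sep=' ')
assert np.allclose(example_features, recovered)
```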
% (task, i)) 127 | task2features[task] = np.array(task2features[task]) 128 | 129 | # z-normalize the feature scores 130 | feature_values = scipy.stats.zscore(np.vstack([f for f in 131 | task2features.values()]), axis=0) 132 | start_idx = 0 133 | for task, features in task2features.items(): 134 | task2features[task] = feature_values[ 135 | start_idx:start_idx+features.shape[0], :] 136 | start_idx += features.shape[0] 137 | 138 | # write the features to the corresponding file 139 | file_path = os.path.join(save_path, task) 140 | with open(file_path, 'w') as f: 141 | for example_features in task2features[task]: 142 | # set max_line_width so that features don't wrap across lines 143 | f.write('%s\n' % np.array_str(example_features, 144 | max_line_width=sys.maxsize)) 145 | print('Wrote %s %d-d features to %s...' % (task, len(feature_names), 146 | file_path)) 147 | print('Created features.') 148 | return task2features 149 | 150 | 151 | def get_term_dist(docs, word2id, lowercase=True): 152 | """ 153 | Calculates the term distribution of a list of documents. 154 | :param docs: a list of tokenized docs; can also contain a single document 155 | :param word2id: the word-to-id mapping 156 | :param lowercase: lower-case the input data 157 | :return: the term distribution of the input documents, 158 | i.e. a numpy array of shape (vocab_size,) 159 | """ 160 | term_dist = np.zeros(len(word2id)) 161 | for doc in docs: 162 | for word in doc: 163 | if lowercase: 164 | word = word.lower() 165 | if word in word2id: 166 | term_dist[word2id[word]] += 1 167 | 168 | # normalize absolute freqs to obtain a relative frequency term distribution 169 | term_dist /= np.sum(term_dist) 170 | if np.isnan(np.sum(term_dist)): 171 | # the sum is nan if docs only contains one document and that document 172 | # has no words in the vocabulary 173 | term_dist = np.zeros(len(word2id)) 174 | return term_dist 175 | 176 | 177 | # ------------ Similarity features ------------ 178 | 179 | def jensen_shannon_divergence(repr1, repr2): 180 | """Calculates Jensen-Shannon divergence (https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence).""" 181 | avg_repr = 0.5 * (repr1 + repr2) 182 | sim = 1 - 0.5 * (scipy.stats.entropy(repr1, avg_repr) + scipy.stats.entropy(repr2, avg_repr)) 183 | if np.isinf(sim): 184 | # the similarity is -inf if no term in the document is in the vocabulary 185 | return 0 186 | return sim 187 | 188 | 189 | def renyi_divergence(repr1, repr2, alpha=0.99): 190 | """Calculates Renyi divergence (https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy#R.C3.A9nyi_divergence).""" 191 | try: 192 | log_sum = np.sum([np.power(p, alpha) / np.power(q, alpha-1) for (p, q) in zip(repr1, repr2)]) 193 | sim = 1 / (alpha - 1) * np.log(log_sum) 194 | except FloatingPointError: # division by 0 error 195 | return 0 196 | if np.isinf(sim): 197 | # the similarity is -inf if no term in the document is in the vocabulary 198 | return 0 199 | return sim 200 | 201 | 202 | def cosine_similarity(repr1, repr2): 203 | """Calculates cosine similarity (https://en.wikipedia.org/wiki/Cosine_similarity).""" 204 | if repr1 is None or repr2 is None: 205 | return 0 206 | assert not (np.isnan(repr2).any() or np.isinf(repr2).any()) 207 | assert not (np.isnan(repr1).any() or np.isinf(repr1).any()) 208 | sim = 1 - scipy.spatial.distance.cosine(repr1, repr2) 209 | if np.isnan(sim): 210 | # the similarity is nan if no term in the document is in the vocabulary 211 | return 0 212 | return sim 213 | 214 | 215 | def euclidean_distance(repr1, repr2): 216 | """Calculates 
Euclidean distance (https://en.wikipedia.org/wiki/Euclidean_distance).""" 217 | sim = np.sqrt(np.sum([np.power(p-q, 2) for (p, q) in zip(repr1, repr2)])) 218 | return sim 219 | 220 | 221 | def variational_distance(repr1, repr2): 222 | """Also known as L1 or Manhattan distance (https://en.wikipedia.org/wiki/Taxicab_geometry).""" 223 | sim = np.sum([np.abs(p-q) for (p, q) in zip(repr1, repr2)]) 224 | return sim 225 | 226 | 227 | def kl_divergence(repr1, repr2): 228 | """Calculates Kullback-Leibler divergence (https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).""" 229 | sim = scipy.stats.entropy(repr1, repr2) 230 | return sim 231 | 232 | 233 | def bhattacharyya_distance(repr1, repr2): 234 | """Calculates Bhattacharyya distance (https://en.wikipedia.org/wiki/Bhattacharyya_distance).""" 235 | try: 236 | sim = - np.log(np.sum([np.sqrt(p*q) for (p, q) in zip(repr1, repr2)])) 237 | except FloatingPointError: # division by 0 error 238 | return 0 239 | assert not np.isnan(sim), 'Error: Similarity is nan.' 240 | if np.isinf(sim): 241 | # the similarity is -inf if no term in the review is in the vocabulary 242 | return 0 243 | return sim 244 | 245 | 246 | def similarity_name2value(s_name, repr1, repr2): 247 | """Given a similarity function name, return the corresponding similarity function value.""" 248 | if s_name == 'jensen-shannon': 249 | return jensen_shannon_divergence(repr1, repr2) 250 | if s_name == 'renyi': 251 | return renyi_divergence(repr1, repr2) 252 | if s_name == 'cos' or s_name == 'cosine': 253 | return cosine_similarity(repr1, repr2) 254 | if s_name == 'euclidean': 255 | return euclidean_distance(repr1, repr2) 256 | if s_name == 'variational': 257 | return variational_distance(repr1, repr2) 258 | if s_name == 'kl': 259 | return kl_divergence(repr1, repr2) 260 | if s_name == 'bhattacharyya': 261 | return bhattacharyya_distance(repr1, repr2) 262 | raise ValueError('%s is not a valid feature name.' 
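A toy sanity check of the similarity measures defined above; the two term distributions are invented, and the snippet relies on similarity_name2value and the SIMILARITY_FEATURES list imported from constants at the top of this module:

```python
import numpy as np

dist_a = np.array([0.4, 0.3, 0.2, 0.1])   # invented term distributions
dist_b = np.array([0.1, 0.2, 0.3, 0.4])   # over a 4-word vocabulary

for name in SIMILARITY_FEATURES:
    print(name, similarity_name2value(name, dist_a, dist_b))

# Identical distributions score as maximally similar, e.g. Jensen-Shannon
# returns 1.0 while the Euclidean and variational distances return 0.0.
print(similarity_name2value('jensen-shannon', dist_a, dist_a))
```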
% s_name) 263 | 264 | 265 | # ------------ Diversity features ------------ 266 | 267 | def number_of_word_types(example): 268 | """Counts the number of word types of the example.""" 269 | return len(set(example)) 270 | 271 | 272 | def type_token_ratio(example): 273 | """Calculates the type-token ratio of the example.""" 274 | return number_of_word_types(example) / len(example) 275 | 276 | 277 | def entropy(example, train_term_dist, word2id): 278 | """Calculates Entropy (https://en.wikipedia.org/wiki/Entropy_(information_theory)).""" 279 | summed = 0 280 | for word in set(example): 281 | if word in word2id: 282 | p_word = train_term_dist[word2id[word]] 283 | summed += p_word * np.log(p_word) 284 | return - summed 285 | 286 | 287 | def simpsons_index(example, train_term_dist, word2id): 288 | """Calculates Simpson's Index (https://en.wikipedia.org/wiki/Diversity_index#Simpson_index).""" 289 | score = np.sum([np.power(train_term_dist[word2id[word]], 2) if word in word2id else 0 290 | for word in set(example)]) 291 | return score 292 | 293 | 294 | def quadratic_entropy(example, train_term_dist, word2id, word2vec): 295 | """Calculates Quadratic Entropy.""" 296 | assert word2vec is not None, ('Error: Word vector representations have to ' 297 | 'be available for quadratic entropy.') 298 | summed = 0 299 | for word_1 in set(example): 300 | if word_1 not in word2id or word_1 not in word2vec: 301 | continue # continue as the product will be 0 302 | for word_2 in set(example): 303 | if word_2 not in word2id or word_2 not in word2vec: 304 | continue # continue as the product will be 0 305 | p_1 = train_term_dist[word2id[word_1]] 306 | p_2 = train_term_dist[word2id[word_2]] 307 | vec_1 = word2vec[word_1] 308 | vec_2 = word2vec[word_2] 309 | sim = cosine_similarity(vec_1, vec_2) 310 | summed += sim * p_1 * p_2 311 | return summed 312 | 313 | 314 | def renyi_entropy(example, domain_term_dist, word2id): 315 | """Calculates Rényi Entropy (https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy).""" 316 | alpha = 0.99 317 | summed = np.sum([np.power(domain_term_dist[word2id[word]], alpha) if word in word2id else 0 for word in set(example)]) 318 | if summed == 0: 319 | # 0 if none of the words appear in the dictionary; 320 | # set to a small constant == low prob instead 321 | summed = 0.0001 322 | score = 1 / (1 - alpha) * np.log(summed) 323 | return score 324 | 325 | 326 | def diversity_feature_name2value(f_name, example, task_term_dist, word2id): 327 | """ 328 | Given a feature name, return the corresponding feature value. 329 | :param f_name: the name of the feature 330 | :param example: the tokenised example document 331 | :param task_term_dist: the term distribution of the task of the example 332 | :param word2id: the word-to-id mapping 333 | :param word2vec: a mapping of a word to its word vector representation (e.g. GloVe or word2vec) 334 | :return: the value of the corresponding feature 335 | """ 336 | if f_name == 'num_word_types': 337 | return number_of_word_types(example) 338 | if f_name == 'type_token_ratio': 339 | return type_token_ratio(example) 340 | if f_name == 'entropy': 341 | return entropy(example, task_term_dist, word2id) 342 | if f_name == 'simpsons_index': 343 | return simpsons_index(example, task_term_dist, word2id) 344 | # if f_name == 'quadratic_entropy': 345 | # return quadratic_entropy(example, train_term_dist, word2id, word2vec) 346 | if f_name == 'renyi_entropy': 347 | return renyi_entropy(example, task_term_dist, word2id) 348 | raise ValueError('%s is not a valid feature name.' 
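A toy walk-through of the diversity features defined above, using an invented vocabulary, term distribution and tokenised example, and relying on the diversity_feature_name2value dispatcher and DIVERSITY_FEATURES list from this module:

```python
import numpy as np

toy_word2id = {'the': 0, 'movie': 1, 'was': 2, 'great': 3}   # invented vocabulary
toy_term_dist = np.array([0.4, 0.2, 0.2, 0.2])               # invented task-level term distribution
toy_example = ['the', 'movie', 'was', 'great', 'the']        # tokenised document

for name in DIVERSITY_FEATURES:
    print(name, diversity_feature_name2value(name, toy_example,
                                             toy_term_dist, toy_word2id))
# e.g. num_word_types -> 4 and type_token_ratio -> 0.8 for this example
```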
% f_name) 349 | -------------------------------------------------------------------------------- /mtl/nn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def bicond_reader(placeholders, target_sizes, vocab_size, label_vocab_size, **options): 5 | emb_dim = options["emb_dim"] 6 | lab_emb_dim = options["lab_emb_dim"] 7 | 8 | # [batch_size, max_seq1_length] 9 | seq1 = placeholders['seq1'] 10 | 11 | # [batch_size, max_seq2_length] 12 | seq2 = placeholders['seq2'] 13 | 14 | # [batch_size, labels_size] 15 | targets = tf.to_float(placeholders['targets']) 16 | 17 | label_vocab_inds = placeholders['label_vocab_inds'] 18 | 19 | init = tf.contrib.layers.xavier_initializer(uniform=True) 20 | 21 | with tf.variable_scope("embeddings"): 22 | embeddings = tf.get_variable("word_embeddings", [vocab_size, emb_dim], dtype=tf.float32, initializer=init) 23 | 24 | with tf.variable_scope("embedders") as varscope: 25 | seq1_embedded = tf.nn.embedding_lookup(embeddings, seq1) 26 | varscope.reuse_variables() 27 | seq2_embedded = tf.nn.embedding_lookup(embeddings, seq2) 28 | 29 | with tf.variable_scope("conditional_reader_seq1") as varscope1: 30 | # seq1_states: (c_fw, h_fw), (c_bw, h_bw) 31 | _, seq1_states = reader(seq1_embedded, placeholders['seq1_lengths'], emb_dim, 32 | scope=varscope1, **options) 33 | 34 | with tf.variable_scope("conditional_reader_seq2") as varscope2: 35 | varscope1.reuse_variables() 36 | outputs, states = reader(seq2_embedded, placeholders['seq2_lengths'], emb_dim, seq1_states, scope=varscope2, **options) 37 | 38 | # shape output: [batch_size, 2*emb_dim] 39 | if options["main_num_layers"] == 1: 40 | # shape states: [2, 2] 41 | output = tf.concat([states[0][1], states[1][1]], 1) 42 | else: 43 | # shape states: [2, num_layers, 2] 44 | output = tf.concat([states[0][-1][1], states[1][-1][1]], 1) 45 | 46 | if lab_emb_dim != 0: 47 | with tf.variable_scope("label_embeddings"): 48 | label_embeddings = tf.get_variable("label_embeddings", [label_vocab_size, lab_emb_dim], dtype=tf.float32, initializer=init) 49 | 50 | with tf.variable_scope("bicond_preds"): 51 | # output of sequence encoders is projected into separate output layers, one for each task 52 | scores_dict, loss_dict, predict_dict = {}, {}, {} 53 | # iterate over the tasks 54 | for k in target_sizes.keys(): 55 | # use task name as variable scope 56 | with tf.variable_scope(k) as varscope_task: 57 | if options["task_specific_layer_size"] > 0: 58 | with tf.variable_scope(k + "_task_spec_layer") as task_spec_layer_scope: 59 | output = tf.contrib.layers.fully_connected(output, options["task_specific_layer_size"], 60 | weights_initializer=init, 61 | activation_fn=tf.tanh, scope=task_spec_layer_scope) 62 | if lab_emb_dim != 0: 63 | 64 | # placeholders['label_vocab_inds'] contain the index of the labels and 0 elsewhere, e.g. [0, 0, 0, 4, 5, 6, 0, 0, ...] 65 | # shape: [batch_size, num_tasks*num_labels, label_embed_dim] 66 | labels_embedded = tf.nn.embedding_lookup(label_embeddings, label_vocab_inds) 67 | 68 | output_dim = emb_dim*2 69 | if options["task_specific_layer_size"] > 0: 70 | output_dim = options["task_specific_layer_size"] 71 | 72 | output, labels_embedded = pad_output(output, labels_embedded, output_dim, lab_emb_dim) 73 | 74 | # get predictions with dot product between output and embedded labels. 
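The call below delegates to dotprod_with_lab_embs, defined further down in this file. A NumPy shape sketch of the same computation (sizes and values are made up): each label embedding is scored against the encoder output, and only the positions whose label_vocab_inds entry is non-zero, i.e. the labels of the current task, are kept.

```python
import numpy as np

batch_size, num_all_labels, dim = 2, 6, 4                     # invented sizes
output_np = np.random.rand(batch_size, dim)                   # encoder output
labels_embedded_np = np.random.rand(batch_size, num_all_labels, dim)
label_vocab_inds_np = np.array([[0, 0, 0, 4, 5, 6],           # non-zero entries mark the
                                [0, 0, 0, 4, 5, 6]])          # three labels of this task

all_scores = (output_np[:, None, :] * labels_embedded_np).sum(axis=2)        # [batch, num_all_labels]
task_scores = all_scores[label_vocab_inds_np != 0].reshape(batch_size, -1)   # [batch, 3]
```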
75 | scores = dotprod_with_lab_embs(output, labels_embedded, label_vocab_inds) 76 | 77 | # boolean_mask returns a 1-d tensor, so we need to reshape 78 | # works for all models since we compute the target sizes for all models 79 | scores = tf.reshape(scores, [-1, target_sizes[k]]) 80 | loss = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=targets) 81 | predict = tf.nn.softmax(scores) 82 | 83 | else: 84 | label_embeddings = None 85 | if options["l1_rate_main"] != 1.0 or options["l2_rate_main"] != 1.0: 86 | with tf.variable_scope(k + "_l1l2_reg") as l1l2scope: 87 | l1_l2 = tf.contrib.layers.l1_l2_regularizer(scale_l1=options["l1_rate_main"], scale_l2=options["l2_rate_main"], scope=l1l2scope) 88 | scores = tf.contrib.layers.fully_connected(output, target_sizes[k], weights_initializer=init, 89 | activation_fn=tf.tanh, scope=varscope_task, weights_regularizer=l1_l2) # target_size 90 | else: 91 | scores = tf.contrib.layers.fully_connected(output, target_sizes[k], weights_initializer=init, activation_fn=tf.tanh, scope=varscope_task) # target_size 92 | loss = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=targets) 93 | predict = tf.nn.softmax(scores) 94 | 95 | scores_dict[k] = scores 96 | loss_dict[k] = loss 97 | predict_dict[k] = predict 98 | 99 | return scores_dict, loss_dict, predict_dict, label_embeddings 100 | 101 | 102 | def relabel_model(placeholders, target_sizes, input_size_feats, input_size_preds, label_embeddings, **options): 103 | lab_emb_dim = options["lab_emb_dim"] 104 | 105 | soft_or_hard = options['ltn_pred_type'] 106 | hidd_layer_size = options['lel_hid_size'] 107 | 108 | # [batch_size, num_tasks - 2] 109 | ltn_preds = placeholders['preds_for_ltn'] 110 | 111 | if options["feature_sets"] != "predsonly": 112 | # [batch_size, num_features] 113 | features = placeholders['features'] 114 | 115 | # [batch_size, labels_size] 116 | targets = tf.to_float(placeholders['targets']) 117 | 118 | label_vocab_inds = placeholders['label_vocab_inds'] 119 | 120 | # for returning main task predictions for relabelling with EM 121 | targets_main = tf.to_float(placeholders['targets_main']) 122 | label_vocab_inds_main = placeholders['label_vocab_inds_main'] 123 | 124 | with tf.variable_scope("ltn_preds"): 125 | # output of sequence encoders is projected into separate output layers, one for each task 126 | init = tf.contrib.layers.xavier_initializer(uniform=True) 127 | scores_dict, loss_dict, predict_dict, predict_main_dict = {}, {}, {}, {} 128 | # iterate over the tasks 129 | for k in target_sizes.keys(): 130 | # use task name as variable scope 131 | with tf.variable_scope(k) as varscope_task: 132 | if options["feature_sets"] != "predsonly": 133 | # concatenate the predictions with the features 134 | if soft_or_hard == 'hard': 135 | emb_size = input_size_feats + input_size_preds 136 | else: 137 | emb_size = input_size_feats + input_size_preds[k] 138 | 139 | output = tf.reshape(tf.concat([ltn_preds, features], 1), [-1, emb_size]) 140 | else: 141 | if soft_or_hard == 'hard': 142 | emb_size = input_size_preds 143 | else: 144 | emb_size = input_size_preds[k] 145 | output = tf.reshape(ltn_preds, [-1, emb_size]) 146 | 147 | 148 | if options["l1_rate_ltn"] != 1.0 or options["l2_rate_ltn"] != 1.0: 149 | l1_l2 = tf.contrib.layers.l1_l2_regularizer(scale_l1=options["l1_rate_ltn"], scale_l2=options["l2_rate_ltn"]) 150 | 151 | output_dim = emb_size 152 | 153 | if hidd_layer_size != 0: 154 | if options["l1_rate_ltn"] != 1.0 or options["l2_rate_ltn"] != 1.0: 155 | with tf.variable_scope(k + 
"_relabel_hidd_layer") as task_spec_relabel_layer_scope: 156 | output = tf.contrib.layers.fully_connected(output, hidd_layer_size, weights_initializer=init, weights_regularizer=l1_l2, scope=task_spec_relabel_layer_scope) 157 | else: 158 | with tf.variable_scope(k + "_relabel_hidd_layer") as task_spec_relabel_layer_scope: 159 | output = tf.contrib.layers.fully_connected(output, hidd_layer_size, weights_initializer=init, scope=task_spec_relabel_layer_scope) 160 | 161 | output_dim = hidd_layer_size 162 | 163 | predict_main = None 164 | 165 | if options["lab_embs_for_ltn"]: 166 | 167 | # placeholders['label_vocab_inds'] contain the index of the labels and 0 elsewhere, e.g. [0, 0, 0, 4, 5, 6, 0, 0, ...] 168 | # shape: [batch_size, num_tasks*num_labels, label_embed_dim] 169 | labels_embedded = tf.nn.embedding_lookup(label_embeddings, label_vocab_inds) 170 | 171 | output_padded, labels_embedded = pad_output(output, labels_embedded, output_dim, lab_emb_dim) 172 | 173 | # get predictions with dot product between output and embedded labels. 174 | scores = dotprod_with_lab_embs(output_padded, labels_embedded, label_vocab_inds) 175 | 176 | # boolean_mask returns a 1-d tensor, so we need to reshape 177 | scores = tf.reshape(scores, tf.shape(targets)) 178 | 179 | # then we also want to return predictions for the main task 180 | if options["relabel_with_ltn"]: 181 | labels_embedded_main = tf.nn.embedding_lookup(label_embeddings, label_vocab_inds_main) 182 | 183 | output_padded_main, labels_embedded_main = pad_output(output, labels_embedded_main, output_dim, lab_emb_dim) 184 | 185 | # get predictions with dot product between output and embedded main task labels. 186 | scores_main = dotprod_with_lab_embs(output_padded_main, labels_embedded_main, label_vocab_inds_main) 187 | 188 | # boolean_mask returns a 1-d tensor, so we need to reshape 189 | scores_main = tf.reshape(scores_main, tf.shape(targets_main)) 190 | 191 | predict_main = tf.nn.softmax(scores_main) 192 | 193 | 194 | else: 195 | if options["l1_rate_ltn"] != 1.0 or options["l2_rate_ltn"] != 1.0: 196 | scores = tf.contrib.layers.fully_connected(output, target_sizes[k], weights_initializer=init, 197 | activation_fn=tf.tanh, scope=varscope_task, weights_regularizer=l1_l2) # target_size 198 | 199 | else: 200 | scores = tf.contrib.layers.fully_connected(output, target_sizes[k], weights_initializer=init, activation_fn=tf.tanh, scope=varscope_task) # target_size 201 | loss = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=targets) 202 | predict = tf.nn.softmax(scores) 203 | 204 | scores_dict[k] = scores 205 | loss_dict[k] = loss 206 | predict_dict[k] = predict 207 | predict_main_dict[k] = predict_main 208 | 209 | return scores_dict, loss_dict, predict_dict, predict_main_dict 210 | 211 | 212 | def dotprod_with_lab_embs(output, labels_embedded, label_vocab_inds): 213 | # dot product needs to happen with all the labels 214 | # shape output_expanded: [batch_size, 1, emb_dim*2] 215 | # shape labels_expanded: [batch_size, num_labels, emb_dim*2] 216 | # shape comb_repr: [batch_size, num_labels, emb_dim*2] 217 | output_expanded = tf.expand_dims(output, 1) 218 | comb_repr = tf.multiply(output_expanded, labels_embedded) 219 | 220 | # we remove the embedding dimension so we just have the scores 221 | #  shape: [batch_size, num_tasks*num_labels] 222 | reduced_output = tf.reduce_sum(comb_repr, 2) 223 | # now we want to mask it so only the labels for the task for which we have training data is taken into account for the loss 224 | # ... 
but this doesn't work yet 225 | #  a vector of zeros: [0, 0, 0, 0, 0, ...] 226 | zeroes = tf.zeros_like(label_vocab_inds) 227 | #  a vector indicating where label indices != 0 228 | #  [False, False, False, True, True, True, False, False, ...] 229 | mask = tf.not_equal(label_vocab_inds, zeroes) 230 | scores = tf.boolean_mask(reduced_output, mask) 231 | 232 | return scores 233 | 234 | 235 | def pad_output(output, labels_embedded, output_dim, lab_emb_dim): 236 | if output_dim > lab_emb_dim: 237 | howmany = output_dim - lab_emb_dim 238 | labels_embedded = tf.pad(labels_embedded, [[0, 0], [0, 0], [0, howmany]], constant_values=0) 239 | elif lab_emb_dim > output_dim: 240 | howmany = lab_emb_dim - output_dim 241 | output = tf.pad(output, [[0, 0], [0, howmany]], constant_values=0) 242 | return output, labels_embedded 243 | 244 | 245 | def reader(inputs, lengths, output_size, contexts=(None, None), scope=None, **options): 246 | """Dynamic bi-LSTM reader; can be conditioned with initial state of other rnn. 247 | 248 | Args: 249 | inputs (tensor): The inputs into the bi-LSTM 250 | lengths (tensor): The lengths of the sequences 251 | output_size (int): Size of the LSTM state of the reader. 252 | context (tensor=None, tensor=None): Tuple of initial (forward, backward) states 253 | for the LSTM 254 | scope (string): The TensorFlow scope for the reader. 255 | drop_keep_drop (float=1.0): The keep probability for dropout. 256 | 257 | Returns: 258 | Outputs (tensor): The outputs from the bi-LSTM. 259 | States (tensor): The cell states from the bi-LSTM. 260 | """ 261 | 262 | skip_connections = options["skip_connections"] 263 | attention = options["attention"] 264 | num_layers = options["main_num_layers"] 265 | drop_keep_prob = options["dropout_rate"] 266 | 267 | with tf.variable_scope(scope or "reader") as varscope: 268 | if options["rnn_cell_type"] == "layer_norm": 269 | cell_fw = tf.contrib.rnn.LayerNormBasicLSTMCell(output_size) 270 | cell_bw = tf.contrib.rnn.LayerNormBasicLSTMCell(output_size) 271 | elif options["rnn_cell_type"] == "nas": 272 | cell_fw = tf.contrib.rnn.NASCell(output_size) 273 | cell_bw = tf.contrib.rnn.NASCell(output_size) 274 | elif options["rnn_cell_type"] == "phasedlstm": 275 | cell_fw = tf.contrib.rnn.PhasedLSTMCell(output_size) 276 | cell_bw = tf.contrib.rnn.PhasedLSTMCell(output_size) 277 | else: #LSTM cell 278 | cell_fw = tf.contrib.rnn.LSTMCell(output_size, initializer=tf.contrib.layers.xavier_initializer()) 279 | cell_bw = tf.contrib.rnn.LSTMCell(output_size, initializer=tf.contrib.layers.xavier_initializer()) 280 | if num_layers > 1: 281 | cell_fw = tf.nn.rnn_cell.MultiRNNCell([cell_fw] * num_layers) 282 | cell_bw = tf.nn.rnn_cell.MultiRNNCell([cell_bw] * num_layers) 283 | 284 | if drop_keep_prob != 1.0: 285 | cell_fw = tf.contrib.rnn.DropoutWrapper(cell=cell_fw, output_keep_prob=drop_keep_prob) 286 | cell_bw = tf.contrib.rnn.DropoutWrapper(cell=cell_bw, output_keep_prob=drop_keep_prob) 287 | 288 | if skip_connections == True: 289 | cell_fw = tf.contrib.rnn.ResidualWrapper(cell_fw) 290 | cell_bw = tf.contrib.rnn.ResidualWrapper(cell_bw) 291 | 292 | if attention == True: 293 | cell_fw = tf.contrib.rnn.AttentionCellWrapper(cell_fw, attn_length=10) 294 | cell_bw = tf.contrib.rnn.AttentionCellWrapper(cell_bw, attn_length=10) 295 | 296 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 297 | cell_fw, 298 | cell_bw, 299 | inputs, 300 | sequence_length=lengths, 301 | initial_state_fw=contexts[0], 302 | initial_state_bw=contexts[1], 303 | dtype=tf.float32 304 | ) 305 | 306 | # ( 
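A NumPy analogue of pad_output above (sizes are invented): whichever of the encoder output and the label embeddings is narrower is zero-padded on the right, so that the element-wise product in dotprod_with_lab_embs is well-defined.

```python
import numpy as np

output_dim, lab_emb_dim = 32, 16                      # invented sizes
output_np = np.random.rand(2, output_dim)             # [batch, output_dim]
labels_np = np.random.rand(2, 6, lab_emb_dim)         # [batch, num_labels, lab_emb_dim]
labels_padded = np.pad(labels_np, ((0, 0), (0, 0), (0, output_dim - lab_emb_dim)),
                       mode='constant')
assert labels_padded.shape == (2, 6, output_dim)
```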
(outputs_fw,outputs_bw) , (output_state_fw,output_state_bw) ) 307 | # in case LSTMCell: output_state_fw = (c_fw,h_fw), and output_state_bw = (c_bw,h_bw) 308 | # each [batch_size x max_seq_length x output_size] 309 | return outputs, states 310 | -------------------------------------------------------------------------------- /preproc/data_reader.py: -------------------------------------------------------------------------------- 1 | import csv, os 2 | import xml.etree.ElementTree as ET 3 | from collections import defaultdict 4 | 5 | from sklearn.model_selection import train_test_split 6 | 7 | from constants import FNC, STANCE, NLI, TOPIC, LAPTOP, RESTAURANT, TARGET,\ 8 | TOPIC_5WAY, STANCE_LABELS, FNC_LABELS, NLI_LABELS, TOPIC_LABELS, \ 9 | TOPIC_5WAY_LABELS, ABSA_LABELS, TARGET_LABELS 10 | 11 | 12 | def task2data_reader(task): 13 | if task == STANCE: # all data available 14 | return readSemEval2016Task6 15 | if task == FNC: # all data available 16 | return readFakeNewsChallengeData 17 | if task == NLI: # test data not available - so we use every other dev example as test example 18 | return readMultinliData 19 | if task == TOPIC: # all data available 20 | return readTopicBased 21 | if task == TOPIC_5WAY: 22 | return readTopic5Way 23 | if task == LAPTOP: # all data available 24 | return read_absa_laptops 25 | if task == RESTAURANT: # all data available 26 | return read_absa_restaurants 27 | if task == TARGET: # all data available 28 | return read_target_dependent 29 | raise ValueError('No data reader available for %s.' % task) 30 | 31 | 32 | def readSemEval2016Task6(datafolder="./data/", debug=True, num_instances=20): 33 | data_train = {"seq1": [], "seq2": [], "stance": [], "opinion_towards": [], "sentiment": [], "labels": []} 34 | data_dev = {"seq1": [], "seq2": [], "stance": [], "opinion_towards": [], "sentiment": [], "labels": []} 35 | data_test = {"seq1": [], "seq2": [], "stance": [], "opinion_towards": [], "sentiment": [], "labels": []} 36 | data_train, data_dev = parse_semeval_csv(os.path.join(datafolder, 'semeval2016-task6-stance/train.csv'), data_train, data_dev, "train", debug, num_instances) 37 | data_test, data_dev = parse_semeval_csv(os.path.join(datafolder, 'semeval2016-task6-stance/test.csv'), data_test, data_dev, "test", False, num_instances) # setting debug to False to get all test instances 38 | 39 | # For the final task training, the dev set is used as part of the training set 40 | for i, inst in enumerate(data_dev["stance"]): 41 | data_train["seq1"].append(data_dev["seq1"][i]) 42 | data_train["seq2"].append(data_dev["seq2"][i]) 43 | data_train["stance"].append(data_dev["stance"][i]) 44 | data_train["opinion_towards"].append(data_dev["opinion_towards"][i]) 45 | data_train["sentiment"].append(data_dev["sentiment"][i]) 46 | 47 | # sort the labels so that they are always in the same order so that we can 48 | # compute averaged positive and negative F1 (AGAINST, FAVOR, NONE) 49 | labels = sorted(list(set(data_train["stance"]))) 50 | assert labels == STANCE_LABELS 51 | data_train["labels"] = labels 52 | data_dev["labels"] = labels 53 | data_test["labels"] = labels 54 | 55 | # we do not use the raw data ATM to correspond with the signature of the other data readers 56 | return data_train, data_dev, data_test 57 | 58 | 59 | def parse_semeval_csv(filepath, empty_dict_1, empty_dict_2, mode, debug=False, num_instances=20): 60 | with open(filepath, 'r', encoding="latin-1") as csvfile: 61 | csvreader = csv.reader(csvfile, delimiter=',') 62 | i = -1 63 | for row in csvreader: 64 | i += 
1 65 | if i == 0: 66 | continue 67 | if debug and i >= num_instances+1: 68 | continue 69 | tweet, target, stance, opinion_towards, sentiment = row 70 | dict_chosen = empty_dict_1 71 | if target == "Hillary Clinton": 72 | dict_chosen = empty_dict_2 73 | if mode == "train" or target == "Hillary Clinton" or (mode == "test" and target == "Donald Trump"): 74 | dict_chosen["seq1"].append(target) 75 | dict_chosen["seq2"].append(tweet) 76 | dict_chosen["stance"].append(stance) 77 | dict_chosen["opinion_towards"].append(opinion_towards) 78 | dict_chosen["sentiment"].append(sentiment) 79 | return empty_dict_1, empty_dict_2 80 | 81 | 82 | def readFakeNewsChallengeData(datafolder="./data/", debug=True, num_instances=20): 83 | data_train = {"seq1": [], "seq2": [], "stance": [], "labels": []} 84 | data_train = parseFakeNewsChallengeData(datafolder, "fakenewschallenge/train_bodies.csv", "fakenewschallenge/trainsplit_stances.csv", data_train, debug, num_instances) 85 | data_dev = {"seq1": [], "seq2": [], "stance": [], "labels": []} 86 | data_dev = parseFakeNewsChallengeData(datafolder, "fakenewschallenge/train_bodies.csv", "fakenewschallenge/devsplit_stances.csv", data_dev, debug, num_instances) 87 | data_test = {"seq1": [], "seq2": [], "stance": [], "labels": []} 88 | data_test = parseFakeNewsChallengeData(datafolder, "fakenewschallenge/competition_test_bodies.csv", "fakenewschallenge/competition_test_stances.csv", data_test, debug, num_instances) 89 | data_train["labels"] = sorted(data_train["labels"]) 90 | assert data_train["labels"] == FNC_LABELS 91 | data_dev["labels"] = data_train["labels"] 92 | data_test["labels"] = data_train["labels"] 93 | return data_train, data_dev, data_test 94 | 95 | 96 | def parseFakeNewsChallengeData(datafolder, datafile_bodies, datafile_stances, data_dict, debug, num_instances): 97 | id2body = {} 98 | with open(os.path.join(datafolder, datafile_bodies), 'r', encoding='utf-8') as csvfile: 99 | csvreader = csv.reader(csvfile, delimiter=',') 100 | i = -1 101 | for row in csvreader: 102 | i += 1 103 | if i == 0: 104 | continue 105 | body_id, body = row 106 | id2body[body_id] = body 107 | 108 | with open(os.path.join(datafolder, datafile_stances), 'r', encoding='utf-8') as csvfile: 109 | csvreader = csv.reader(csvfile, delimiter=',') 110 | i = -1 111 | for row in csvreader: 112 | i += 1 113 | if i == 0: 114 | continue 115 | if debug and i >= num_instances+1: 116 | continue 117 | headline, body_id, stance = row 118 | data_dict["seq1"].append(headline) 119 | data_dict["seq2"].append(id2body[body_id]) 120 | data_dict["stance"].append(stance) 121 | 122 | for lab in set(data_dict["stance"]): 123 | data_dict["labels"].append(lab) 124 | 125 | return data_dict 126 | 127 | 128 | def readMultinliData(datafolder="./data/", debug=True, num_instances=20): 129 | 130 | max_count = None 131 | if debug == True: 132 | max_count = num_instances+1 133 | 134 | data_train = {"seq1": [], "seq2": [], "stance": [], "genre": [], "labels": []} 135 | data_train, _ = parseMultinliFile(os.path.join(datafolder, 'multinli/multinli_0.9_train.txt'), data_train, {}, max_count, "train") 136 | data_dev = {"seq1": [], "seq2": [], "stance": [], "genre": [], "labels": []} 137 | data_test = {"seq1": [], "seq2": [], "stance": [], "genre": [], "labels": []} 138 | data_dev, data_test = parseMultinliFile(os.path.join(datafolder, 'multinli/multinli_0.9_dev_matched.txt'), data_dev, data_test, max_count, "test") 139 | 140 | return data_train, data_dev, data_test 141 | 142 | 143 | def parseMultinliFile(filepath, data_1, 
data_2, max_count, mode): 144 | reading_dataset = open(filepath, "r", encoding='utf-8') 145 | # The script reads into those lists. If IDs for questions, supports or targets are defined, those are ignored. 146 | count, counti = 0, 0 147 | 148 | for line in reading_dataset: 149 | if max_count is None or count < max_count: 150 | lspl = line.strip("\n").split("\t") 151 | if len(lspl) == 15: 152 | gold_label, _, _, _, _, sentence1, sentence2, promptID, pairID, genre, _, _, _, _, _ = lspl 153 | if gold_label == "gold_label" or gold_label == "-": 154 | continue 155 | data_dict = data_1 156 | if mode == "train" or (mode == "test" and count % 2 == 0): 157 | data_dict = data_1 158 | elif mode == "test": 159 | data_dict = data_2 160 | data_dict["seq1"].append(sentence1) 161 | data_dict["seq2"].append(sentence2) 162 | data_dict["stance"].append(gold_label) 163 | data_dict["genre"].append(genre) 164 | count += 1 165 | 166 | for lab in set(data_1["stance"]): 167 | data_1["labels"].append(lab) 168 | data_1["labels"] = sorted(data_1["labels"]) 169 | assert data_1["labels"] == NLI_LABELS 170 | 171 | if data_2 != {}: 172 | for lab in set(data_2["stance"]): 173 | data_2["labels"].append(lab) 174 | data_2["labels"] = sorted(data_2["labels"]) 175 | assert data_2["labels"] == NLI_LABELS 176 | 177 | return data_1, data_2 178 | 179 | 180 | def readTopicBased(datafolder="./data/", debug=True, num_instances=20): 181 | topic_based_path = os.path.join(datafolder, 'semeval2016-task4b-topic-based-sentiment') 182 | train_path = os.path.join(topic_based_path, '100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold_downloaded.tsv') 183 | dev1_path = os.path.join(topic_based_path, '100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold_downloaded.tsv') 184 | dev2_path = os.path.join(topic_based_path, '100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold_downloaded.tsv') 185 | test_data_path = os.path.join(topic_based_path, 'SemEval2016-task4-test.subtask-BD.txt') 186 | test_labels_path = os.path.join(topic_based_path, 'SemEval2016_task4_subtaskB_test_gold.txt') 187 | 188 | for path_ in [topic_based_path, train_path, dev1_path, dev2_path, test_data_path, test_labels_path]: 189 | assert os.path.exists(path_), 'Error: %s does not exist.' 
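All readers in this module return three dictionaries (train/dev/test) with the same basic layout; the instance below is invented and only illustrates the contract:

```python
example_split = {
    "seq1": ["Hillary Clinton"],                 # target / headline / premise
    "seq2": ["an invented example tweet"],       # tweet / article body / hypothesis
    "stance": ["FAVOR"],                         # one gold label per instance
    "labels": ["AGAINST", "FAVOR", "NONE"],      # sorted label inventory of the task
}
# some readers add task-specific keys such as "sentiment", "opinion_towards" or "genre"
```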
% path_ 190 | 191 | data_train = parse_topic_based(train_path, debug, num_instances) 192 | data_dev1 = parse_topic_based(dev1_path, debug, num_instances) 193 | data_dev2 = parse_topic_based(dev2_path, debug, num_instances) 194 | data_test = parse_topic_test_data(test_data_path, test_labels_path) 195 | assert data_train["labels"] == TOPIC_LABELS 196 | data_dev1["labels"] = data_train["labels"] 197 | data_test["labels"] = data_train["labels"] 198 | 199 | # add the second dev data to the train set 200 | data_train["seq1"] += data_dev2["seq1"] 201 | data_train["seq2"] += data_dev2["seq2"] 202 | data_train["stance"] += data_dev2["stance"] 203 | return data_train, data_dev1, data_test 204 | 205 | 206 | def readTopic5Way(datafolder="./data/", debug=True, num_instances=20): 207 | topic_based_path = os.path.join(datafolder, 'semeval2016-task4c-topic-based-sentiment') 208 | train_path = os.path.join(topic_based_path, '100_topics_100_tweets.topic-five-point.subtask-CE.train.gold_downloaded.tsv') 209 | dev1_path = os.path.join(topic_based_path, '100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold_downloaded.tsv') 210 | dev2_path = os.path.join(topic_based_path, '100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold_downloaded.tsv') 211 | test_data_path = os.path.join(topic_based_path, 'SemEval2016-task4-test.subtask-CE.txt') 212 | test_labels_path = os.path.join(topic_based_path, 'SemEval2016_task4_subtaskC_test_gold.txt') 213 | 214 | for path_ in [topic_based_path, train_path, dev1_path, dev2_path, 215 | test_data_path, test_labels_path]: 216 | assert os.path.exists(path_), 'Error: %s does not exist.' % path_ 217 | 218 | data_train = parse_topic_based(train_path, debug, num_instances) 219 | data_dev1 = parse_topic_based(dev1_path, debug, num_instances) 220 | data_dev2 = parse_topic_based(dev2_path, debug, num_instances) 221 | data_test = parse_topic_test_data(test_data_path, test_labels_path) 222 | assert data_train["labels"] == TOPIC_5WAY_LABELS 223 | data_dev1["labels"] = data_train["labels"] 224 | data_test["labels"] = data_train["labels"] 225 | 226 | # add the second dev data to the train set 227 | data_train["seq1"] += data_dev2["seq1"] 228 | data_train["seq2"] += data_dev2["seq2"] 229 | data_train["stance"] += data_dev2["stance"] 230 | return data_train, data_dev1, data_test 231 | 232 | 233 | def parse_topic_based(file_path, debug=False, num_instances=20): 234 | data = {"seq1": [], "seq2": [], "stance": []} 235 | with open(file_path) as f: 236 | for i, line in enumerate(f): 237 | id_, target, sentiment, tweet = line.split('\t') 238 | try: 239 | sentiment = float(sentiment) 240 | except ValueError: 241 | pass 242 | if debug and i >= num_instances+1: 243 | continue 244 | if tweet.strip() == 'Not Available': 245 | continue 246 | data["seq1"].append(target) 247 | data["seq2"].append(tweet) 248 | data["stance"].append(sentiment) 249 | 250 | # we have to sort the labels so that they're in the order 251 | # -2,-1,0,1,2 and are mapped to 0,1,2,3,4 (for subtask C) 252 | data["labels"] = sorted(list(set(data["stance"]))) 253 | return data 254 | 255 | 256 | def parse_topic_test_data(examples_path, labels_path): 257 | # Note: no debugging for the test data (20k tweets for subtask C) 258 | data = {"seq1": [], "seq2": [], "stance": []} 259 | with open(examples_path) as f_examples, open(labels_path) as f_labels: 260 | for i, (line_examples, line_labels) in enumerate(zip(f_examples, f_labels)): 261 | _, examples_target, _, *tweet = line_examples.strip().split('\t') 262 | # two lines contain a 
tweet, for some reason 263 | _, labels_target, sentiment, *_ = line_labels.strip().split('\t') 264 | # one test tweet contains a tab character 265 | if isinstance(tweet, list): 266 | tweet = '\t'.join(tweet) 267 | try: 268 | sentiment = float(sentiment) 269 | except ValueError: 270 | pass 271 | 272 | assert examples_target == labels_target,\ 273 | '%s != %s at line %d in files %s and %s.' % ( 274 | examples_target, labels_target, i, examples_path, labels_path) 275 | 276 | if tweet.strip() == 'Not Available': 277 | continue 278 | data["seq1"].append(examples_target) 279 | data["seq2"].append(tweet) 280 | data["stance"].append(sentiment) 281 | data["labels"] = sorted(list(set(data["stance"]))) 282 | return data 283 | 284 | 285 | def read_absa_laptops(datafolder="./data/", debug=True, num_instances=20): 286 | return read_absa('laptops', datafolder, debug, num_instances) 287 | 288 | 289 | def read_absa_restaurants(datafolder="./data/", debug=True, num_instances=20): 290 | return read_absa('restaurants', datafolder, debug, num_instances) 291 | 292 | 293 | def read_absa(domain, datafolder="./data/", debug=True, num_instances=20): 294 | assert domain in ['laptops', 'restaurants'], '%s is not a valid domain.' % domain 295 | absa_path = os.path.join(datafolder, 'semeval2016-task5-absa-english') 296 | train_path = os.path.join(absa_path, '%s_english_training.xml' % domain) 297 | test_path = os.path.join(absa_path, '%s_english_test.xml' % domain) 298 | for path_ in [absa_path, train_path, test_path]: 299 | assert os.path.exists(path_), 'Error: %s does not exist.' % path_ 300 | 301 | data_train = parse_absa(train_path, debug, num_instances) 302 | data_test = parse_absa(test_path) 303 | 304 | # trial data is a subset of training data; instead we split the train data 305 | data_train, data_dev = split_train_data(data_train) 306 | return data_train, data_dev, data_test 307 | 308 | 309 | def parse_absa(file_path, debug=False, num_instances=20): 310 | """ 311 | Extracts all reviews from an XML file and returns them as a list of Review objects. 312 | Adds a NONE aspect to all sentences with no aspect. 313 | :param file_path: the path of the XML file 314 | :return: a list of Review objects each containing a list of Sentence objects and other attributes 315 | """ 316 | data = {"seq1": [], "seq2": [], "stance": []} 317 | e = ET.parse(file_path).getroot() 318 | for i, review_e in enumerate(e): 319 | if debug and i >= num_instances+1: 320 | continue 321 | for sentence_e in review_e.find('sentences'): 322 | text = sentence_e.find('text').text 323 | # we do not care about sentences that do not contain an aspect 324 | if sentence_e.find('Opinions') is not None: 325 | for op in sentence_e.find('Opinions'): 326 | # the category is of the form ENTITY#ATTRIBUTE, e.g. LAPTOP#GENERAL 327 | target = ' '.join(op.get('category').split('#')) 328 | polarity = op.get('polarity') 329 | data['seq1'].append(target) 330 | data['seq2'].append(text) 331 | data['stance'].append(polarity) 332 | data["labels"] = sorted(list(set(data["stance"]))) 333 | assert data["labels"] == ABSA_LABELS 334 | return data 335 | 336 | 337 | def read_target_dependent(datafolder="./data/", debug=True, num_instances=20): 338 | target_dependent_path = os.path.join(datafolder, 'target-dependent') 339 | train_path = os.path.join(target_dependent_path, 'train.raw') 340 | test_path = os.path.join(target_dependent_path, 'test.raw') 341 | for path_ in [target_dependent_path, train_path, test_path]: 342 | assert os.path.exists(path_), 'Error: %s does not exist.' 
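A minimal illustration of the XML handled by parse_absa above; the review snippet is made up, but it follows the ENTITY#ATTRIBUTE category and polarity attributes the parser expects:

```python
import xml.etree.ElementTree as ET

toy_xml = """<Reviews><Review><sentences><sentence>
<text>The battery life is great.</text>
<Opinions><Opinion category="BATTERY#OPERATION_PERFORMANCE" polarity="positive"/></Opinions>
</sentence></sentences></Review></Reviews>"""

root = ET.fromstring(toy_xml)
for review in root:
    for sentence in review.find('sentences'):
        if sentence.find('Opinions') is not None:
            for op in sentence.find('Opinions'):
                target = ' '.join(op.get('category').split('#'))
                print(target, '|', sentence.find('text').text, '|', op.get('polarity'))
# -> BATTERY OPERATION_PERFORMANCE | The battery life is great. | positive
```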
% path_ 343 | 344 | data_train = parse_target_dependent(train_path, debug, num_instances) 345 | data_test = parse_target_dependent(test_path) 346 | data_train, data_dev = split_train_data(data_train) 347 | return data_train, data_dev, data_test 348 | 349 | 350 | def parse_target_dependent(file_path, debug=False, num_instances=20): 351 | data = {"seq1": [], "seq2": [], "stance": []} 352 | with open(file_path, encoding='utf-8') as f: 353 | for i, line in enumerate(f): 354 | if i % 3 == 0: # the tweet is always first 355 | data["seq2"].append(line.strip()) 356 | elif i % 3 == 1: # followed by the target 357 | data["seq1"].append(line.strip()) 358 | elif i % 3 == 2: # followed by the sentiment 359 | data["stance"].append(line.strip()) 360 | if debug and i >= num_instances+1: 361 | continue 362 | assert len(data["seq1"]) == len(data["seq2"]) == len(data["stance"]),\ 363 | 'Error: %d != %d != %d.' % (len(data["seq1"]), len(data["seq2"]), 364 | len(data["stance"])) 365 | 366 | # replace the placeholder $T$ in every tweet with the target 367 | for i in range(len(data["seq1"])): 368 | target = data["seq1"][i] 369 | data["seq2"][i] = data["seq2"][i].replace("$T$", target) 370 | data["labels"] = sorted(list(set(data["stance"]))) 371 | assert data["labels"] == TARGET_LABELS 372 | return data 373 | 374 | 375 | def split_train_data(data_train): 376 | """Split the train data into train and dev data.""" 377 | train_ids, _ = train_test_split(range(len(data_train['seq1'])), 378 | test_size=0.1, random_state=42) 379 | data_dev = defaultdict(list) 380 | new_data_train = defaultdict(list) 381 | for key, examples in data_train.items(): 382 | if key == 'labels': 383 | continue 384 | # no numpy indexing, so we iterate over the examples 385 | for i, example in enumerate(examples): 386 | if i in train_ids: 387 | new_data_train[key].append(example) 388 | else: 389 | data_dev[key].append(example) 390 | new_data_train['labels'] = data_train['labels'] 391 | data_dev['labels'] = data_train['labels'] 392 | return new_data_train, data_dev 393 | 394 | 395 | if __name__ == "__main__": 396 | readMultinliData(datafolder="../data/") 397 | -------------------------------------------------------------------------------- /preproc/vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import operator 3 | import sys 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | 9 | class Vocab(object): 10 | """ 11 | Vocab objects for use in jtr pipelines. 12 | 13 | Example: 14 | 15 | >>> # Test Vocab without pre-trained embeddings 16 | >>> vocab = Vocab() 17 | >>> print(vocab("blah")) 18 | 1 19 | >>> print(vocab("bluh")) 20 | 2 21 | >>> print(vocab("bleh")) 22 | 3 23 | >>> print(vocab("bluh")) 24 | 2 25 | >>> print(vocab("hello")) 26 | 4 27 | >>> print(vocab("world")) 28 | 5 29 | 30 | >>> # Sym2id before freezing: 31 | >>> for k in sorted(vocab.sym2id.keys()): 32 | ... print(k,' : ',vocab.sym2id[k]) 33 | : 0 34 | blah : 1 35 | bleh : 3 36 | bluh : 2 37 | hello : 4 38 | world : 5 39 | 40 | >>> # Sym2id after freezing (no difference, because no pre-trained embeddings used): 41 | >>> vocab.freeze() 42 | >>> for k in sorted(vocab.sym2id.keys()): 43 | ... print(k,' : ',vocab.sym2id[k]) 44 | : 0 45 | blah : 1 46 | bleh : 3 47 | bluh : 2 48 | hello : 4 49 | world : 5 50 | 51 | >>> # Test Vocab with pre-trained embeddings 52 | >>> def emb(w): 53 | ... v = {'blah':[1.7,0,.3],'bluh':[0,1.5,0.5],'bleh':[0,0,2]} 54 | ... 
return None if not w in v else v[w] 55 | >>> vocab = Vocab(emb=emb) 56 | >>> print(vocab("blah")) 57 | -1 58 | >>> print(vocab("bluh")) 59 | -2 60 | >>> print(vocab("bleh")) 61 | -3 62 | >>> print(vocab("bluh")) 63 | -2 64 | >>> print(vocab("hello")) 65 | 1 66 | >>> print(vocab("world")) 67 | 2 68 | 69 | >>> # Sym2id before freezing: 70 | >>> for k in sorted(vocab.sym2id.keys()): 71 | ... print(k,' : ',vocab.sym2id[k]) 72 | : 0 73 | blah : -1 74 | bleh : -3 75 | bluh : -2 76 | hello : 1 77 | world : 2 78 | 79 | >>> # Sym2id after freezing: normalized (positive) ids, also for pre-trained terms 80 | >>> vocab.freeze() 81 | >>> for k in sorted(vocab.sym2id.keys()): 82 | ... print(k,' : ',vocab.sym2id[k]) 83 | : 0 84 | blah : 3 85 | bleh : 5 86 | bluh : 4 87 | hello : 1 88 | world : 2 89 | 90 | >>> # Test pretrained and out-of-vocab id's before freezing 91 | >>> vocab.unfreeze() 92 | >>> vocab.get_ids_pretrained() 93 | [-1, -2, -3] 94 | >>> vocab.get_ids_oov() 95 | [0, 1, 2] 96 | 97 | >>> # Test pretrained and out-of-vocab id's after freezing 98 | >>> vocab.freeze() 99 | >>> vocab.get_ids_pretrained() 100 | [3, 4, 5] 101 | >>> vocab.get_ids_oov() 102 | [0, 1, 2] 103 | 104 | >>> # Test calling frozen Vocab object 105 | >>> vocab(['bluh','world','wake','up']) #last 2 are new words, hence unknown 106 | [4, 2, 0, 0] 107 | 108 | >>> # Test calling unfrozen Vocab object 109 | >>> vocab.unfreeze() 110 | >>> vocab(['bluh','world','wake','up']) #last 2 are new words, hence added to Vocab 111 | [-2, 2, 3, 4] 112 | 113 | >>> #Test sym2id after freezing again 114 | >>> vocab.freeze() 115 | >>> for k in sorted(vocab.sym2id.keys()): 116 | ... print(k,' : ',vocab.sym2id[k]) 117 | : 0 118 | blah : 5 119 | bleh : 7 120 | bluh : 6 121 | hello : 1 122 | up : 4 123 | wake : 3 124 | world : 2 125 | """ 126 | 127 | DEFAULT_UNK = "" 128 | 129 | def __init__(self, unk=DEFAULT_UNK, emb=None, init_from_embeddings=False): 130 | """ 131 | Creates Vocab object. 132 | 133 | Args: 134 | `unk`: symbol for unknown term (default: ""). 135 | If set to `None`, and `None` is not included as symbol while unfrozen, 136 | it will return `None` upon calling `get_id(None)` when frozen. 137 | `emb`: function handle; returns pre-trained embedding (fixed-size numerical list or ndarray) 138 | for a given symbol, and None for unknown symbols. 139 | """ 140 | self.next_neg = -1 141 | self.unk = unk 142 | self.emb = emb if emb is not None else lambda _ : None # if emb is None: same behavior as for o-o-v words 143 | 144 | if init_from_embeddings and emb is not None: 145 | self.sym2id = dict(emb.vocabulary.word2idx) 146 | self.id2sym = {v: k for k, v in emb.vocabulary.word2idx.items()} 147 | if unk is not None and unk not in self.sym2id: 148 | self.sym2id[unk] = len(self.sym2id) 149 | self.id2sym[len(self.id2sym)] = unk 150 | self.sym2freqs = {w: emb.vocabulary.get_word_count(w) for w in self.sym2id} 151 | self.frozen = True 152 | else: 153 | self.sym2id = {} 154 | # with pos and neg indices 155 | self.id2sym = {} 156 | self.next_pos = 0 157 | self.sym2freqs = {} 158 | if unk is not None: 159 | self.sym2id[unk] = 0 160 | # with pos and neg indices 161 | self.id2sym[0] = unk 162 | self.next_pos = 1 163 | self.sym2freqs[unk] = 0 164 | self.frozen = False 165 | 166 | if emb is not None and hasattr(emb, "lookup") and isinstance(emb.lookup, np.ndarray): 167 | self.emb_length = emb.lookup.shape[1] 168 | else: 169 | self.emb_length = None 170 | 171 | def freeze(self): 172 | """Freeze current Vocab object (set `self.frozen` to True). 
173 | To be used after loading symbols from a given corpus; 174 | transforms all internal symbol id's to positive indices (for use in tensors). 175 | 176 | - additional calls to the __call__ method will return the id for the unknown symbol 177 | - out-of-vocab id's are positive integers and do not change 178 | - id's of symbols with pre-trained embeddings are converted to positive integer id's, 179 | counting up from the number of out-of-vocab id's. 180 | """ 181 | # if any pretrained have been encountered 182 | if not self.frozen and self.next_neg < -1: 183 | sym2id = {sym: self._normalize(id) for sym, id in self.sym2id.items()} 184 | id2sym = {self._normalize(id): sym for id, sym in self.id2sym.items()} 185 | self.sym2id = sym2id 186 | self.id2sym = id2sym 187 | self.frozen = True 188 | 189 | def unfreeze(self): 190 | """Unfreeze current Vocab object (set `self.frozen` to False). 191 | Caution: use with care! Unfreezing a Vocab, adding new terms, and freezing it again 192 | will result in shifted id's for pre-trained symbols. 193 | 194 | - maps all normalized id's to the original internal id's. 195 | - additional calls to __call__ will allow adding new symbols to the vocabulary. 196 | """ 197 | if self.frozen and self.next_neg < -1: 198 | sym2id = {sym: self._denormalize(id) for sym, id in self.sym2id.items()} 199 | id2sym = {self._denormalize(id): sym for id, sym in self.id2sym.items()} 200 | self.sym2id = sym2id 201 | self.id2sym = id2sym 202 | self.frozen = False 203 | 204 | def get_id(self, sym, is_num=False): 205 | """ 206 | Returns the id of `sym`; different behavior depending on the state of the Vocab: 207 | 208 | - In case self.frozen==False (default): returns internal id, 209 | that is, positive for out-of-vocab symbol, negative for symbol 210 | found in `self.emb`. If `sym` is a new symbol, it is added to the Vocab. 211 | 212 | - In case self.frozen==True (after explicit call to 'freeze()', or after building a `NeuralVocab` with it): 213 | Returns normalized id (positive integer, also for symbols with pre-trained embedding) 214 | If `sym` is a new symbol, the id for unknown terms is returned, if available, 215 | and otherwise `None` (only possible when the input argument `unk` of `Vocab.__init__()` was set to `None`, e.g. 216 | for classification labels; it is assumed that the pipeline 217 | creating or calling the `Vocab` object handles `None` when it is encountered). 
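A small worked example (not part of the original docstring) of the id bookkeeping described above, assuming three out-of-vocab symbols and three pre-trained symbols; it mirrors the arithmetic in _normalize and _denormalize further down:

```python
next_pos = 3                              # three out-of-vocab symbols: ids 0, 1, 2
internal_ids = [0, 1, 2, -1, -2, -3]      # unfrozen ids; negatives are pre-trained symbols
normalized = [i if i >= 0 else next_pos - i - 1 for i in internal_ids]
assert normalized == [0, 1, 2, 3, 4, 5]   # what freeze() produces via _normalize
denormalized = [i if i < next_pos else -1 - (i - next_pos) for i in normalized]
assert denormalized == internal_ids       # what unfreeze() restores via _denormalize
```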
218 | 219 | Args: 220 | `sym`: symbol (e.g., token) 221 | """ 222 | if not self.frozen: 223 | vec = self.emb(sym) 224 | if self.emb_length is None and vec is not None: 225 | self.emb_length = len(vec) if isinstance(vec, list) else vec.shape[0] 226 | if sym not in self.sym2id: 227 | if vec is None: 228 | self.sym2id[sym] = self.next_pos 229 | self.id2sym[self.next_pos] = sym 230 | self.next_pos += 1 231 | else: 232 | self.sym2id[sym] = self.next_neg 233 | self.id2sym[self.next_neg] = sym 234 | self.next_neg -= 1 235 | self.sym2freqs[sym] = 1 236 | else: 237 | self.sym2freqs[sym] += 1 238 | if sym in self.sym2id: 239 | return self.sym2id[sym] 240 | else: 241 | if self.unk in self.sym2id: 242 | return self.sym2id[self.unk] 243 | # can happen for `Vocab` initialized with `unk` argument set to `None` 244 | else: 245 | return None 246 | 247 | def get_sym(self, id): 248 | """returns symbol for a given id (consistent with the `self.frozen` state), and None if not found.""" 249 | return None if not id in self.id2sym else self.id2sym[id] 250 | 251 | def __call__(self, *args, **kwargs): 252 | """ 253 | calls the `get_id` function for the provided symbol(s), which adds symbols to the Vocab if needed and allowed, 254 | and returns their id(s). 255 | 256 | Args: 257 | *args: a single symbol, a list of symbols, or multiple symbols 258 | """ 259 | symbols = args 260 | if len(args) == 1: 261 | if isinstance(args[0], list): 262 | symbols = args[0] 263 | else: 264 | return self.get_id(args[0]) 265 | return [self.get_id(sym) for sym in symbols] 266 | 267 | def __len__(self): 268 | """returns number of unique symbols (including the unknown symbol)""" 269 | return len(self.id2sym) 270 | 271 | def __contains__(self, sym): 272 | """checks if `sym` already in the Vocab object""" 273 | return sym in self.sym2id 274 | 275 | def _normalize(self, id): 276 | """map original (pos/neg) ids to normalized (non-neg) ids: first new symbols, then those in emb""" 277 | # e.g. -1 should be mapped to self.next_pos + 0 278 | # e.g. -3 should be mapped to self.next_pos + 2 279 | return id if id >=0 else self.next_pos - id - 1 280 | 281 | def _denormalize(self, id): 282 | # self.next_pos + i is mapped back to -1-i 283 | return id if id < self.next_pos else -1-(id-self.next_pos) 284 | 285 | def get_ids_pretrained(self): 286 | """return internal or normalized id's (depending on frozen/unfrozen state) 287 | for symbols that have an embedding in `self.emb` """ 288 | if self.frozen: 289 | return list(range(self.next_pos,self.next_pos+self.count_pretrained())) 290 | else: 291 | return list(range(-1,self.next_neg,-1)) 292 | 293 | def get_ids_oov(self): 294 | """return out-of-vocab id's (indep. 
of frozen/unfrozen state)""" 295 | return list(range(self.next_pos)) 296 | 297 | def count_pretrained(self): 298 | """equivalent to `len(get_ids_pretrained())`""" 299 | return -self.next_neg - 1 300 | 301 | def count_oov(self): 302 | """equivalent to `len(get_ids_oov())`""" 303 | return self.next_pos 304 | 305 | def prune(self, min_freq=5, max_size=sys.maxsize): 306 | """returns new Vocab object, pruned based on minimum symbol frequency""" 307 | pruned_vocab = Vocab(unk=self.unk, emb=self.emb) 308 | cnt = 0 309 | for sym, freq in sorted(self.sym2freqs.items(), key=operator.itemgetter(1), reverse=True): 310 | # for sym in self.sym2freqs: 311 | # freq = self.sym2freqs[sym] 312 | cnt += 1 313 | if freq >= min_freq and cnt < max_size: 314 | pruned_vocab(sym) 315 | pruned_vocab.sym2freqs[sym] = freq 316 | if self.frozen: 317 | # if original Vocab was frozen, freeze new one 318 | pruned_vocab.freeze() 319 | 320 | return pruned_vocab 321 | 322 | 323 | class NeuralVocab(Vocab): 324 | """ 325 | Wrapper around Vocab to go from indices to tensors. 326 | 327 | Example: 328 | >>> # Start from same Vocab as the doctest example in Vocab 329 | >>> def emb(w): 330 | ... v = {'blah':[1.7,0,.3],'bluh':[0,1.5,0.5],'bleh':[0,0,2]} 331 | ... return None if not w in v else v[w] 332 | >>> vocab = Vocab(emb=emb) 333 | >>> vocab("blah", "bluh", "bleh", "hello", "world") # symbols as multiple arguments 334 | [-1, -2, -3, 1, 2] 335 | >>> vocab(['bluh','world','wake','up']) # as list of symbols 336 | [-2, 2, 3, 4] 337 | 338 | 339 | >>> # Test NeuralVocab with pre-trained embeddings (case: input_size larger than pre-trained embeddings) 340 | >>> with tf.variable_scope('neural_test2'): 341 | ... for w in ['blah','bluh','bleh']: 342 | ... w, emb(w) 343 | ... nvocab = NeuralVocab(vocab, None, 4, unit_normalize=True, use_pretrained=True, train_pretrained=False) 344 | ('blah', [1.7, 0, 0.3]) 345 | ('bluh', [0, 1.5, 0.5]) 346 | ('bleh', [0, 0, 2]) 347 | 348 | Interpretation of number of trainable variables from neural_test2: 349 | out-of-vocab: 8 - 3 = 5 symbols, with each 4 dimensions = 20; 350 | for fixed pre-trained embeddings with length 3, three times 1 extra trainable dimension for total embedding length 4. 351 | Total is 23. 352 | """ 353 | 354 | def __init__(self, base_vocab, embedding_matrix=None, 355 | input_size=None, reduced_input_size=None, use_pretrained=True, train_pretrained=False, unit_normalize=True): 356 | """ 357 | Creates NeuralVocab object from a given Vocab object `base_vocab`. 358 | Pre-calculates embedding vector (as `Tensor` object) for each symbol in Vocab 359 | 360 | Args: 361 | `base_vocab`: 362 | `embedding_matrix`: tensor with shape (len_vocab, input_size). If provided, 363 | the arguments `input_size`, `use_trained`, `train_pretrained`, and `unit_normalize` are ignored. 364 | `input_size`: integer; embedding length in case embedding matrix not provided, else ignored. 365 | If shorter than pre-trained embeddings, only their first `input_size` dimensions are used. 366 | If longer, extra (Trainable) dimensions are added. 367 | `reduced_input_size`: integer; optional; ignored in case `None`. If set to positive integer, an additional 368 | linear layer is introduced to reduce (or extend) the embeddings to the indicated size. 369 | `use_pretrained`: boolean; True (default): use pre-trained if available through `base_vocab`. 370 | False: ignore pre-trained embeddings accessible through `base_vocab` 371 | `train_pretrained`: boolean; False (default): fix pretrained embeddings. True: continue training. 
372 |             Ignored if embedding_matrix is given.
373 |           `unit_normalize`: initialize pre-trained vectors with unit norm
374 |             (note: randomly initialized embeddings are always initialized with expected unit norm)
375 |         """
376 |         super(NeuralVocab, self).__init__(unk=base_vocab.unk, emb=base_vocab.emb)
377 |
378 |         assert embedding_matrix is not None or input_size is not None, "if no embedding_matrix is provided, define input_size"
379 |
380 |         self.freeze()  # has no actual functionality here
381 |         base_vocab.freeze()  # freeze if not frozen (to ensure fixed non-negative indices)
382 |
383 |         self.sym2id = base_vocab.sym2id
384 |         self.id2sym = base_vocab.id2sym
385 |         self.sym2freqs = base_vocab.sym2freqs
386 |         self.unit_normalize = unit_normalize
387 |
388 |         def np_normalize(v):
389 |             return v / np.sqrt(np.sum(np.square(v)))
390 |
391 |         if embedding_matrix is None:
392 |             # construct part oov
393 |             n_oov = base_vocab.count_oov()
394 |             n_pre = base_vocab.count_pretrained()
395 |             E_oov = tf.get_variable("embeddings_oov", [n_oov, input_size],
396 |                                     initializer=tf.random_normal_initializer(0, 1./np.sqrt(input_size)),
397 |                                     trainable=True, dtype="float32")
398 |             # stdev = 1/sqrt(length): then expected initial L2 norm is 1
399 |
400 |             # construct part pretrained
401 |             if use_pretrained and base_vocab.emb_length is not None:
402 |                 # load embeddings into numpy tensor with shape (count_pretrained, min(input_size, emb_length))
403 |                 np_E_pre = np.zeros([n_pre, min(input_size, base_vocab.emb_length)]).astype("float32")
404 |                 for id in base_vocab.get_ids_pretrained():
405 |                     sym = base_vocab.id2sym[id]
406 |                     i = id - n_oov  # shifted to start from 0
407 |                     np_E_pre[i, :] = base_vocab.emb(sym)[:min(input_size, base_vocab.emb_length)]
408 |                     if unit_normalize:
409 |                         np_E_pre[i, :] = np_normalize(np_E_pre[i, :])
410 |                 E_pre = tf.get_variable("embeddings_pretrained",
411 |                                         initializer=tf.identity(np_E_pre),
412 |                                         trainable=train_pretrained, dtype="float32")
413 |
414 |                 if input_size > base_vocab.emb_length:
415 |                     E_pre_ext = tf.get_variable("embeddings_extra", [n_pre, input_size - base_vocab.emb_length],
416 |                                                 initializer=tf.random_normal_initializer(0.0, 1. / np.sqrt(base_vocab.emb_length)), dtype="float32", trainable=True)
417 |                     # note: stdev = 1/sqrt(emb_length) means: elements from same normal distr. as normalized first part (in case normally distr.)
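                    # the concat along axis 1 below widens each pre-trained vector (length emb_length,
                    # trainable only if train_pretrained=True) with the extra trainable dimensions in
                    # E_pre_ext, so every row of E_pre ends up with length input_size; further down,
                    # E_oov and E_pre are stacked along axis 0, so row i of the final embedding matrix
                    # corresponds to normalized vocab id i (out-of-vocab id's 0..n_oov-1 first, then
                    # the pre-trained id's from n_oov onwards)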
418 | E_pre = tf.concat([E_pre, E_pre_ext], 1, name="embeddings_pretrained_extended") 419 | else: 420 | # initialize all randomly anyway 421 | E_pre = tf.get_variable("embeddings_not_pretrained", [n_pre, input_size], 422 | initializer=tf.random_normal_initializer(0., 1./np.sqrt(input_size)), 423 | trainable=True, dtype="float32") 424 | # again: initialize with expected unit norm 425 | 426 | # must be provided is embedding_matrix is None 427 | self.input_size = input_size 428 | self.embedding_matrix = tf.concat([E_oov, E_pre], 0, name="embeddings") 429 | 430 | else: 431 | # ignore input argument input_size 432 | self.input_size = embedding_matrix.get_shape()[1] 433 | self.embedding_matrix = embedding_matrix 434 | 435 | if isinstance(reduced_input_size, int) and reduced_input_size > 0: 436 | # uniform=False for truncated normal 437 | init = tf.contrib.layers.xavier_initializer(uniform=True) 438 | self.embedding_matrix = tf.contrib.layers.fully_connected(self.embedding_matrix, reduced_input_size, 439 | weights_initializer=init, activation_fn=None) 440 | 441 | # pre-assign embedding vectors to all ids 442 | # always OK if frozen 443 | self.id2vec = [tf.nn.embedding_lookup(self.embedding_matrix, idx) for idx in range(len(self))] 444 | 445 | def embed_symbol(self, ids): 446 | """returns embedded id's 447 | 448 | Args: 449 | `ids`: integer, ndarray with np.int32 integers, or tensor with tf.int32 integers. 450 | These integers correspond to (normalized) id's for symbols in `self.base_vocab`. 451 | 452 | Returns: 453 | tensor with id's embedded by numerical vectors (in last dimension) 454 | """ 455 | return tf.nn.embedding_lookup(self.embedding_matrix, ids) 456 | 457 | def __call__(self, *args, **kwargs): 458 | """ 459 | Calling the NeuralVocab object with symbol id's, 460 | returns a `Tensor` with corresponding embeddings. 461 | 462 | Args: 463 | `*args`: `Tensor` with integer indices 464 | (such as a placeholder, to be evaluated when run in a `tf.Session`), 465 | or list of integer id's, 466 | or just multiple integer ids as input arguments 467 | 468 | Returns: 469 | Embedded `Tensor` in case a `Tensor` was provided as input, 470 | and otherwise a list of embedded input id's under the form of fixed-length embeddings (`Tensor` objects). 471 | """ 472 | # tuple with length 1: then either list with ids, tensor with ids, or single id 473 | if len(args) == 1: 474 | if isinstance(args[0], list): 475 | ids = args[0] 476 | elif tf.contrib.framework.is_tensor(args[0]): 477 | # return embedded tensor 478 | return self.embed_symbol(args[0]) 479 | else: 480 | return self.id2vec[args[0]] 481 | else: # tuple with ids 482 | ids = args 483 | return [self.id2vec[id] for id in ids] 484 | 485 | def get_embedding_matrix(self): 486 | return self.embedding_matrix 487 | 488 | 489 | if __name__ == '__main__': 490 | import doctest 491 | tf.set_random_seed(1337) 492 | 493 | print(doctest.testmod()) 494 | -------------------------------------------------------------------------------- /preproc/map.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import re 3 | import numpy as np 4 | import pprint 5 | from preproc.vocab import Vocab 6 | #from jtr.util.rs import DefaultRandomState 7 | 8 | rs = np.random.RandomState(1337) 9 | 10 | #rs = DefaultRandomState(1337)#new seed ignored if set previously 11 | 12 | # sym (e.g. token, token id or class label) 13 | # seq (e.g. 
sequence of tokens) 14 | # seqs (sequence of sequences) 15 | # corpus (sequence of sequence of sequences) 16 | # e.g. hypotheses (sequence of sequences) 17 | # premises (sequence of sequences) 18 | # support (sequence of sequence of sequences) 19 | # labels (sequence of symbols) 20 | # corpus = [hypotheses, premises, support, labels] 21 | 22 | 23 | def tokenize(xs, pattern="([\s'\-\.\,\!])"): 24 | """Splits sentences into tokens by regex over punctuation: ( -.,!])[""" 25 | return [x for x in re.split(pattern, xs) 26 | if not re.match("\s", x) and x != ""] 27 | 28 | def notokenize(xs): 29 | """Embeds deepest itemns into a list""" 30 | return [xs] 31 | 32 | 33 | def lower(xs): 34 | """returns lowercase for sequence of strings""" 35 | # """performs lowercasing on string or sequence of strings""" 36 | # if isinstance(xs, str): 37 | # return xs.lower() 38 | return [x.lower() for x in xs] 39 | 40 | 41 | def deep_map(xs, fun, keys=None, fun_name='trf', expand=False, cache_fun=False): 42 | """Applies fun to a dict or list; adds the results in-place. 43 | 44 | Usage: Transform a corpus iteratively by applying functions like 45 | `tokenize`, `lower`, or vocabulary functions (word -> embedding id) to it. 46 | :: 47 | from jtr.sisyphos.vocab import Vocab 48 | vocab = Vocab() 49 | keys = ['question', 'support'] 50 | corpus = deep_map(corpus, lambda x: x.lower(), keys) 51 | corpus = deep_map(corpus, tokenize, keys) 52 | corpus = deep_map(corpus, vocab, keys) 53 | corpus = deep_map(corpus, vocab._normalize, keys=keys) 54 | 55 | From here we can create batches from the corpus and feed it into a model. 56 | 57 | In case `expand==False` each top-level entry of `xs` to be transformed 58 | replaces the original entry. 59 | `deep_map` supports `xs` to be a dictionary or a list/tuple: 60 | - In case `xs` is a dictionary, its transformed value is also a dictionary, and `keys` contains the keys of the 61 | values to be transformed. 62 | - In case `xs` is a list/tuple, `keys` contains the indices of the entries to be transformed 63 | The function `deep_map` is recursively applied to the values of `xs`, 64 | only at the deepest level, where the entries are no longer sequences/dicts, after which `fun` is applied. 65 | 66 | Args: 67 | `xs`: a sequence (list/tuple) of objects or sequences of objects. 68 | `fun`: a function to transform objects 69 | `keys`: seq with keys if `xs` is dict; seq with integer indices if `xs` is seq. 70 | For entries not in `keys`, the original `xs` value is retained. 71 | `fun_name`: default value 'trf'; string with function tag (e.g. 'lengths'), 72 | used if '''expand==True''' and '''isinstance(xs,dict)''' 73 | Say for example fun_name='lengths', and `keys` contains 'sentence', then the transformed dict would look like 74 | '''{'sentence':[sentences], 'sentence_lengths':[fun(sentences)] ...}''' 75 | `cache_fun`: should the function values for seen inputs be cached. Use with care, as it will affect functions with side effects. 76 | 77 | Returns: 78 | Transformed sequence or dictionary. 79 | 80 | Example: 81 | 82 | >>> #(1) Test with sequence of stuff 83 | >>> dave = [ 84 | ... "All work and no play makes Jack a dull boy", 85 | ... "All work and no play makes Jack a dull boy.", 86 | ... "All work and no play makes Jack a very dull boy!"] 87 | >>> jack = [ 88 | ... "I'm sorry Dave, I'm afraid I can't do that!", 89 | ... "I'm sorry Dave, I'm afraid I can't do that", 90 | ... "I'm sorry Dave, I'm afraid I cannot do that"] 91 | >>> support = [ 92 | ... 
["Play makes really dull", "really dull"], 93 | ... ["Dave is human"], 94 | ... ["All work", "all dull", "dull"]] 95 | >>> data1 = [dave, jack, support] 96 | >>> vocab1 = Vocab() 97 | >>> data1_lower = deep_map(data1, lambda s:s.lower()) 98 | >>> data1_tokenized = deep_map(data1_lower, tokenize) 99 | >>> data1_ids = deep_map(data1_tokenized, vocab1) 100 | >>> pprint.pprint(data1_ids) 101 | [[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 102 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 103 | [1, 2, 3, 4, 5, 6, 7, 8, 12, 9, 10, 13]], 104 | [[14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24, 13], 105 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24], 106 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 25, 23, 24]], 107 | [[[5, 6, 26, 9], [26, 9]], [[18, 27, 28]], [[1, 2], [1, 9], [9]]]] 108 | >>> data1_ids_with_lengths = deep_seq_map(data1_ids, lambda xs: len(xs), 109 | ... fun_name='lengths', expand=True) 110 | >>> pprint.pprint(data1_ids_with_lengths) 111 | [[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 112 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 113 | [1, 2, 3, 4, 5, 6, 7, 8, 12, 9, 10, 13]], 114 | [10, 11, 12], 115 | [[14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24, 13], 116 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24], 117 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 25, 23, 24]], 118 | [17, 16, 14], 119 | [[[5, 6, 26, 9], [26, 9]], [[18, 27, 28]], [[1, 2], [1, 9], [9]]], 120 | [[4, 2], [3], [2, 2, 1]]] 121 | 122 | 123 | >>> #(2) Test with data dictionary 124 | >>> data2 = {'dave': dave, 'jack': jack, 'support': support} 125 | >>> pprint.pprint(data2) 126 | {'dave': ['All work and no play makes Jack a dull boy', 127 | 'All work and no play makes Jack a dull boy.', 128 | 'All work and no play makes Jack a very dull boy!'], 129 | 'jack': ["I'm sorry Dave, I'm afraid I can't do that!", 130 | "I'm sorry Dave, I'm afraid I can't do that", 131 | "I'm sorry Dave, I'm afraid I cannot do that"], 132 | 'support': [['Play makes really dull', 'really dull'], 133 | ['Dave is human'], 134 | ['All work', 'all dull', 'dull']]} 135 | >>> data2_tokenized = deep_map(data2, tokenize) 136 | >>> pprint.pprint(data2_tokenized['support']) 137 | [[['Play', 'makes', 'really', 'dull'], ['really', 'dull']], 138 | [['Dave', 'is', 'human']], 139 | [['All', 'work'], ['all', 'dull'], ['dull']]] 140 | """ 141 | 142 | cache = {} 143 | 144 | def deep_map_recursion(inner_xs, keys=None): 145 | if cache_fun and id(inner_xs) in cache: 146 | return cache[id(inner_xs)] 147 | if isinstance(inner_xs, dict): 148 | xs_mapped = {} 149 | for k, x in sorted(inner_xs.items(), 150 | key=lambda it: it[0]): # to make deterministic (e.g. 
for consistent symbol id's) 151 | if keys is None or k in keys: 152 | if expand: 153 | xs_mapped[k] = x 154 | # if expand: create new key for transformed element, else use same key 155 | k = '%s_%s' % (str(k), str(fun_name)) 156 | if isinstance(x, list) or isinstance(x, dict): 157 | x_mapped = deep_map_recursion(x) 158 | else: 159 | x_mapped = fun(x) 160 | xs_mapped[k] = x_mapped 161 | else: 162 | xs_mapped[k] = x 163 | else: 164 | xs_mapped = [] 165 | for k, x in enumerate(inner_xs): 166 | if keys is None or k in keys: 167 | if expand: 168 | xs_mapped.append(x) 169 | if isinstance(x, list) or isinstance(x, dict): 170 | x_mapped = deep_map_recursion(x) #deep_map(x, fun, fun_name=fun_name) 171 | else: 172 | x_mapped = fun(x) 173 | xs_mapped.append(x_mapped) 174 | else: 175 | xs_mapped.append(x) 176 | if cache_fun: 177 | cache[id(inner_xs)] = xs_mapped 178 | return xs_mapped 179 | 180 | return deep_map_recursion(xs,keys) 181 | 182 | 183 | def deep_seq_map(xss, fun, keys=None, fun_name=None, expand=False): 184 | """Applies fun to list of or dict of lists; adds the results in-place. 185 | 186 | Usage: Transform a corpus iteratively by applying functions like 187 | `tokenize`, `lower`, or vocabulary functions (word -> embedding id) to it. 188 | 189 | from jtr.sisyphos.vocab import Vocab 190 | vocab = Vocab() 191 | keys = ['question', 'support'] 192 | 193 | corpus = deep_map(corpus, lambda x: x.lower(), keys) 194 | corpus = deep_map(corpus, tokenize, keys) 195 | corpus = deep_map(corpus, vocab, keys) 196 | corpus = deep_map(corpus, vocab._normalize, keys=keys) 197 | -> through tokenize we go from a dict of sentences to 198 | a dict of words (list of lists), thus we now apply deep_seq_map for 199 | processing to add start of and end of sentence tags: 200 | corpus = deep_seq_map(corpus, lambda xs: [""] + xs + 201 | [""], 202 | ['question', 'support']) 203 | 204 | -> From here we can create batches from the corpus and feed it into a model. 205 | 206 | In case `expand==False` each top-level entry of `xs` to be transformed 207 | replaces the original entry. 208 | `deep_map` supports `xs` to be a dictionary or a list/tuple: 209 | - In case `xs` is a dictionary, its transformed value is also a dictionary, and `keys` contains the keys of the 210 | values to be transformed. 211 | - In case `xs` is a list/tuple, `keys` contains the indices of the entries to be transformed 212 | The function `deep_map` is recursively applied to the values of `xs`; 213 | the function `fun` takes a sequence as input, and is applied at the one but deepest level, 214 | where the entries are sequences of objects (no longer sequences of sequences). 215 | This is the only difference with `deep_map` 216 | 217 | Args: 218 | `xs`: a sequence (list/tuple) of objects or sequences of objects. 219 | `fun`: a function to transform sequences 220 | `keys`: seq with keys if `xs` is dict; seq with integer indices if `xs` is seq. 221 | For entries not in `keys`, the original `xs` value is retained. 222 | `fun_name`: default value 'trf'; string with function tag (e.g. 'lengths'), 223 | used if '''expand==True''' and '''isinstance(xs,dict)''' 224 | Say for example fun_name='count', and `keys` contains 'sentence', then the transformed dict would look like 225 | '''{'sentence':[sentences], 'sentence_lengths':[fun(sentences)] ...}''' 226 | 227 | Returns: 228 | Transformed sequence or dictionary. 229 | 230 | Example: 231 | >>> dave = [ 232 | ... "All work and no play makes Jack a dull boy", 233 | ... 
"All work and no play makes Jack a dull boy.", 234 | ... "All work and no play makes Jack a very dull boy!"] 235 | >>> jack = [ 236 | ... "I'm sorry Dave, I'm afraid I can't do that!", 237 | ... "I'm sorry Dave, I'm afraid I can't do that", 238 | ... "I'm sorry Dave, I'm afraid I cannot do that"] 239 | >>> support = [ 240 | ... ["Play makes really dull", "really dull"], 241 | ... ["Dave is human"], 242 | ... ["All work", "all dull", "dull"]] 243 | >>> data2 = {'dave': dave, 'jack': jack, 'support': support} 244 | >>> vocab2 = Vocab() 245 | >>> data2_processed = deep_map(data2, lambda x: tokenize(x.lower())) 246 | >>> data2_ids = deep_map(data2_processed, vocab2) 247 | >>> data2_ids_with_lengths = deep_seq_map(data2_ids, lambda xs: len(xs), keys=['dave','jack','support'], 248 | ... fun_name='lengths', expand=True) 249 | >>> pprint.pprint(data2_ids_with_lengths) 250 | {'dave': [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 251 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 252 | [1, 2, 3, 4, 5, 6, 7, 8, 12, 9, 10, 13]], 253 | 'dave_lengths': [10, 11, 12], 254 | 'jack': [[14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24, 13], 255 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24], 256 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 25, 23, 24]], 257 | 'jack_lengths': [17, 16, 14], 258 | 'support': [[[5, 6, 26, 9], [26, 9]], [[18, 27, 28]], [[1, 2], [1, 9], [9]]], 259 | 'support_lengths': [[4, 2], [3], [2, 2, 1]]} 260 | """ 261 | 262 | if isinstance(xss, list) and all([not isinstance(xs, list) for xs in xss]): 263 | return fun(xss) 264 | else: 265 | if isinstance(xss, dict): 266 | xss_mapped = {} 267 | for k, xs in xss.items(): 268 | if keys is None or k in keys: 269 | if expand: 270 | xss_mapped[k] = xs 271 | k = '%s_%s' % (str(k), str(fun_name) if fun_name is not None else 'trf') 272 | if isinstance(xs, list) and all([not isinstance(x, list) for x in xs]): 273 | xss_mapped[k] = fun(xs) 274 | else: 275 | xss_mapped[k] = deep_seq_map(xs, fun) # fun_name not needed, because expand==False 276 | else: 277 | xss_mapped[k] = xs 278 | else: 279 | xss_mapped = [] 280 | for k, xs in enumerate(xss): 281 | if keys is None or k in keys: 282 | if expand: 283 | xss_mapped.append(xs) 284 | if isinstance(xs, list) and all([not isinstance(x, list) for x in xs]): 285 | xss_mapped.append(fun(xs)) 286 | else: 287 | xss_mapped.append(deep_seq_map(xs, fun)) 288 | else: 289 | xss_mapped.append(xs) 290 | return xss_mapped 291 | 292 | 293 | def dynamic_subsample(xs, candidate_key, answer_key, how_many=1, avoid=[]): 294 | """Replaces candidates by a mix of answers and random candidates. 295 | 296 | Creates negative samples by combining the true answers and some random 297 | deletion of entries in the candidates. Then replaces the candidates 298 | dictionary and returns it. 299 | 300 | Replace a list of lists with a list of dynamically subsampled lists. The dynamic list will 301 | always contain the elements from the `answer_key` list, and a subsample of size `how_many` from 302 | the corresponding `candidate_key` list 303 | Args: 304 | xs: a dictionary of keys to lists 305 | candidate_key: the key of the candidate list 306 | answer_key: the key of the answer list 307 | how_many: how many samples from the candidate list should we take 308 | avoid: list of candidates to be avoided 309 | (note: only those are avoided, any instances according to `answer_key` which are not 310 | in `avoid`, may still be sampled!) 311 | 312 | Returns: 313 | a new dictionary identical to `xs` for all but the `candidate_key`. 
For that key the value 314 | is a list of `DynamicSubsampledList` objects. 315 | 316 | Example: 317 | >>> data = {'answers':[[1,2],[3,4]], 'candidates': [range(0,100), range(0,100)]} 318 | >>> processed = dynamic_subsample(data, 'candidates', 'answers', 2) 319 | >>> " | ".join([" ".join([str(elem) for elem in elems]) for elems in processed['candidates']]) 320 | '1 2 89 39 | 3 4 90 82' 321 | >>> " | ".join([" ".join([str(elem) for elem in elems]) for elems in processed['candidates']]) 322 | '1 2 84 72 | 3 4 9 6' 323 | >>> " | ".join([" ".join([str(elem) for elem in elems]) for elems in processed['answers']]) 324 | '1 2 | 3 4' 325 | >>> processed = dynamic_subsample(data, 'candidates', 'answers', 5, avoid=range(91)) 326 | >>> " | ".join([" ".join([str(elem) for elem in elems]) for elems in processed['candidates']]) 327 | '1 2 93 91 91 95 97 | 3 4 93 99 92 98 93' 328 | """ 329 | candidate_dataset = xs[candidate_key] 330 | answer_dataset = xs[answer_key] 331 | new_candidates = [] 332 | assert (len(candidate_dataset) == len(answer_dataset)) 333 | for i in range(0, len(candidate_dataset)): 334 | candidates = candidate_dataset[i] 335 | answers = [answer_dataset[i]] if not hasattr(answer_dataset[i],'__len__') else answer_dataset[i] 336 | new_candidates.append(DynamicSubsampledList(answers, candidates, how_many, avoid=avoid, rand=rs)) 337 | result = {} 338 | result.update(xs) 339 | result[candidate_key] = new_candidates 340 | return result 341 | 342 | 343 | 344 | 345 | class DynamicSubsampledList: 346 | """ 347 | A container that produces different list subsamples on every call to `__iter__`. 348 | 349 | >>> dlist = DynamicSubsampledList([1,2], range(0,100),2, rand=rs) 350 | >>> print(" ".join([str(e) for e in dlist])) 351 | 1 2 23 61 352 | >>> print(" ".join([str(e) for e in dlist])) 353 | 1 2 92 39 354 | """ 355 | 356 | def __init__(self, always_in, to_sample_from, how_many, avoid=[], rand=rs): 357 | self.always_in = always_in 358 | self.to_sample_from = to_sample_from 359 | self.how_many = how_many 360 | self.avoid = set(avoid) 361 | self.random = rand 362 | 363 | def __iter__(self): 364 | result = [] 365 | result += self.always_in 366 | if len(self.avoid) == 0: 367 | result.extend(list(self.random.choice(self.to_sample_from, size=self.how_many, replace=True))) 368 | else: 369 | for _ in range(self.how_many): 370 | avoided = False 371 | trial, max_trial = 0, 50 372 | while (not avoided and trial < max_trial): 373 | samp = self.random.choice(self.to_sample_from) 374 | trial += 1 375 | avoided = False if samp in self.avoid else True 376 | result.append(samp) 377 | return result.__iter__() 378 | 379 | def __len__(self): 380 | return len(self.always_in)+self.how_many#number of items is the number of answers plus number of negative samples 381 | 382 | def __getitem__(self, key): 383 | #todo: verify 384 | return self.always_in[0] 385 | 386 | 387 | def get_list_shape(xs): 388 | if isinstance(xs,int): 389 | shape=[] 390 | else: 391 | shape = [len(xs)] 392 | for i, x in enumerate(xs): 393 | if isinstance(x, list) or isinstance(x, DynamicSubsampledList): 394 | if len(shape) == 1: 395 | shape.append(0) 396 | shape[1] = max(len(x), shape[1]) 397 | for j, y in enumerate(x): 398 | if isinstance(y, list) or isinstance(y, DynamicSubsampledList): 399 | if len(shape) == 2: 400 | shape.append(0) 401 | shape[2] = max(len(y), shape[2]) 402 | return shape 403 | 404 | 405 | def get_seq_depth(xs): 406 | return [n - 1 for n in get_list_shape(xs)] 407 | 408 | 409 | 410 | def get_entry_dims(corpus): 411 | """ 412 | get 
number of dimensions for each entry; needed for placeholder generation 413 | """ 414 | #todo: implement recursive form; now only OK for 'regular' (=most common type of) data structures 415 | if isinstance(corpus, dict): 416 | keys = list(corpus.keys()) 417 | dims = {key: 0 for key in keys} 418 | else: 419 | keys = range(len(corpus)) 420 | dims = [0 for i in range(len(corpus))] #scalars have dim 0 (but tensor version will have shape length 1) 421 | for key in keys: 422 | entry = corpus[key] 423 | try: 424 | while hasattr(entry, '__len__'): 425 | dims[key] += 1 426 | entry = entry[0] #will fail if entry is dict 427 | except: 428 | dims[key] = None 429 | return dims 430 | 431 | 432 | 433 | def numpify(xs, pad=0, keys=None, dtypes=None): 434 | """Converts a dict or list of Python data into a dict of numpy arrays.""" 435 | is_dict = isinstance(xs, dict) 436 | xs_np = {} if is_dict else [0] * len(xs) 437 | xs_iter = xs.items() if is_dict else enumerate(xs) 438 | 439 | for i, (key, x) in enumerate(xs_iter): 440 | if keys is None or key in keys: 441 | shape = get_list_shape(x) 442 | if dtypes is None: 443 | dtype = np.int64 444 | else: 445 | dtype = dtypes[i] 446 | x_np = np.full(shape, pad, dtype) 447 | dims = len(shape) 448 | if dims == 0: 449 | x_np=x 450 | elif dims == 1: 451 | x_np[0:shape[0]] = x 452 | elif dims == 2: 453 | for j, y in enumerate(x): 454 | x_np[j, 0:len(y)] = [ys for ys in y]#this comprehension turns DynamicSubsampledList into a list 455 | elif dims == 3: 456 | for j, ys in enumerate(x): 457 | for k, y in enumerate(ys): 458 | x_np[j, k, 0:len(y)] = y 459 | else: 460 | raise (NotImplementedError) 461 | # todo: extend to general case 462 | pass 463 | xs_np[key] = x_np 464 | else: 465 | xs_np[key] = x 466 | return xs_np 467 | 468 | 469 | def map_to_targets(xs, cands_name, ans_name): 470 | """ 471 | Create cand-length vector for each training instance with 1.0s for cands which are the correct answ and 0.0s for cands which are the wrong answ 472 | #@todo: integrate this function with the one below - the pipeline() method only works with this function 473 | """ 474 | targs = [] 475 | for i in range(len(xs[ans_name])): 476 | targ = [] 477 | for cand in xs[cands_name]: 478 | if xs[ans_name][i] == cand: 479 | targ.append(1.0) 480 | else: 481 | targ.append(0.0) 482 | targs.append(targ) 483 | xs["targets"] = targs 484 | return xs 485 | 486 | if __name__ == '__main__': 487 | import doctest 488 | 489 | print(doctest.testmod()) 490 | -------------------------------------------------------------------------------- /mtl/training.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from collections import defaultdict 3 | import numpy as np 4 | from mtl.nn import bicond_reader, relabel_model 5 | import os 6 | from sklearn.metrics import classification_report 7 | from preproc.log_utils import log_results, task2score, postproc_stance 8 | from sklearn.metrics import f1_score 9 | from copy import deepcopy 10 | from constants import TASK_NAMES_SHORT 11 | import copy 12 | from preproc import batch 13 | 14 | def alternate_epochs(target_sizes, max_iter, train_feed_dicts): 15 | """Return a batch generator that returns one epoch per batch and then 16 | switches tasks.""" 17 | for task in target_sizes.keys(): 18 | for j in range(0, max_iter): 19 | yield task, train_feed_dicts[task][j] 20 | 21 | 22 | def alternate_batches(target_sizes, max_iter, train_feed_dicts): 23 | """Return a batch generator that returns one batch per task and then 24 | 
switches tasks.""" 25 | for j in range(0, max_iter): 26 | for task in target_sizes.keys(): 27 | yield task, train_feed_dicts[task][j] 28 | 29 | 30 | def balanced_mtl_training_loop(placeholders, target_sizes, train_feed_dicts, dev_feed_dicts, max_iter, 31 | min_op, logits_dict, loss_dict, preds_dict, sess, **options): 32 | # trains a MTL model, samples equal amounts of training data from each task and weighs the updates equally 33 | # early stopping based on main task dev set 34 | 35 | max_epochs = options["max_epochs"] 36 | main_task = options["main_task"] 37 | early_stopping = options["early_stopping"] 38 | batch_iter = alternate_batches if options["alternate_batches"] else alternate_epochs 39 | 40 | main_task_dev_acc = [] 41 | stopping_criteron_reached = False 42 | 43 | for i in range(1, max_epochs + 1): 44 | task2loss_all, task2correct_all = defaultdict(list), defaultdict(float) 45 | task2total, task2correct_dev_all = defaultdict(float), defaultdict(float) 46 | task2total_dev = defaultdict(float) 47 | for task, batch in batch_iter(target_sizes, max_iter, train_feed_dicts): 48 | _, current_loss, p = sess.run([min_op[task], loss_dict[task], preds_dict[task]], feed_dict=batch) 49 | task2loss_all[task].extend(current_loss) 50 | hits = [pp for ii, pp in enumerate(p) if np.argmax(pp) == np.argmax(batch[placeholders["targets"]][ii])] 51 | task2correct_all[task] += len(hits) 52 | task2total[task] += len(batch[placeholders["targets"]]) 53 | 54 | for task in target_sizes.keys(): 55 | if options['dev_res_during_training']: 56 | p_inds_dev, g_inds_dev = [], [] 57 | for j, batch_dev in enumerate(dev_feed_dicts[task]): 58 | p_dev = sess.run(preds_dict[task], feed_dict=batch_dev) 59 | 60 | # this is for super detailed results -- maybe we don't want to print this every epoch later on 61 | if i % 1 == 0: 62 | pred_inds = [np.argmax(pp_dev) for pp_dev in p_dev] 63 | p_inds_dev.extend(pred_inds) 64 | gold_inds = [np.argmax(batch_dev[placeholders["targets"]][i_d]) for i_d, targ in 65 | enumerate(batch_dev[placeholders["targets"]])] 66 | g_inds_dev.extend(gold_inds) 67 | 68 | hits = [pp for k, pp in enumerate(p_dev) if 69 | np.argmax(pp) == np.argmax(batch_dev[placeholders["targets"]][k])] 70 | task2correct_dev_all[task] += len(hits) 71 | task2total_dev[task] += len(batch_dev[placeholders["targets"]]) 72 | 73 | # Randomise batch IDs, so that selection of batch is random 74 | np.random.shuffle(train_feed_dicts[task]) 75 | np.random.shuffle(dev_feed_dicts[task]) 76 | acc = task2correct_all[task] / task2total[task] 77 | acc_dev = 0 78 | if options['dev_res_during_training']: 79 | acc_dev = task2correct_dev_all[task] / task2total_dev[task] 80 | if task != main_task: 81 | print('Epoch %d :' % i, "Task: " + task, "Loss: ", np.mean(task2loss_all[task]), "Acc: ", acc, "Acc Dev: ", 82 | acc_dev) 83 | else: 84 | print('Epoch %d :' % i, "Task: " + task, "Loss: ", np.mean(task2loss_all[task]), "Acc: ", acc, "Acc Dev: ", 85 | acc_dev, "Previous Acc Dev: ", main_task_dev_acc) 86 | 87 | # too much information 88 | #if options['dev_res_during_training'] and i % 1 == 0: 89 | # print(classification_report(g_inds_dev, p_inds_dev, target_names=target_labels[task])) 90 | 91 | if task == main_task: 92 | if acc_dev >= early_stopping and len(main_task_dev_acc) >= 3 and acc_dev < main_task_dev_acc[-3]: 93 | print("Dev accuracy is smaller than 4 epochs ago, early stopping criteron reached.") 94 | stopping_criteron_reached = True 95 | break 96 | main_task_dev_acc.append(acc_dev) 97 | if stopping_criteron_reached == True: 98 | break 
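    # note: the early stopping above only kicks in once the main task's dev accuracy has reached the
    # `early_stopping` threshold and then drops below its value from three epochs earlier; it simply
    # stops training. No best-epoch checkpoint is restored here (saving happens later in train()),
    # and the learned parameters live in `sess`; the returned dicts are the same TF graph nodes.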
99 | 100 | return logits_dict, loss_dict, preds_dict 101 | 102 | 103 | 104 | 105 | def balanced_mtl_with_ltn_training_loop(placeholders, target_sizes, train_feed_dicts, dev_feed_dicts, max_iter, min_op, min_op_ltn, 106 | logits_dict, loss_dict, preds_dict, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict, label_to_labelvocab, sess, **options): 107 | 108 | 109 | max_epochs_ltn = options["max_epochs_ltn"] 110 | main_task = options["main_task"] 111 | early_stopping = options["early_stopping"] 112 | hard_or_soft = options['ltn_pred_type'] 113 | batch_iter = alternate_batches if options["alternate_batches"] else alternate_epochs 114 | 115 | main_task_dev_acc = [] 116 | stopping_criteron_reached = False 117 | 118 | augment_data_from_epoch = max_epochs_ltn 119 | if options["relabel_with_ltn"]: 120 | # extend the training loop - for post-LTN data augmentation 121 | augment_data_from_epoch = max_epochs_ltn 122 | max_epochs_ltn = max_epochs_ltn + options["max_epochs_after_ltn"] 123 | 124 | for i in range(1, max_epochs_ltn + 1): 125 | task2loss_all, task2correct_all = defaultdict(list), defaultdict(float) 126 | task2total, task2correct_dev_all = defaultdict(float), defaultdict(float) 127 | task2total_dev = defaultdict(float) 128 | task2loss_all_ltn, task2correct_all_ltn = defaultdict(list), defaultdict(float) 129 | task2total_ltn, task2correct_dev_all_ltn = defaultdict(float), defaultdict(float) 130 | 131 | # we collect these only if we want to relabel 132 | task2preds = defaultdict(list) 133 | batches_to_relab = [] # collect those we've covered in this round 134 | batch2task = defaultdict() # this one is for error analysis 135 | 136 | batch_id = 0 137 | for task, batch in batch_iter(target_sizes, max_iter, train_feed_dicts): 138 | 139 | # this is just the normal training step - we minimise the loss on the task's own training data here 140 | #batch = train_feed_dicts[task][j] 141 | _, current_loss, p = sess.run([min_op[task], loss_dict[task], preds_dict[task]], feed_dict=batch) 142 | task2loss_all[task].extend(current_loss) 143 | hits = [pp for ii, pp in enumerate(p) if np.argmax(pp) == np.argmax(batch[placeholders["targets"]][ii])] 144 | task2correct_all[task] += len(hits) 145 | task2total[task] += len(batch[placeholders["targets"]]) 146 | 147 | # now we apply the models for all the other tasks to the batch and collect the predictions 148 | # this is used as input to the LTN model, to determine which of the instances in the batch can 149 | # also be used as training data for any of the other tasks 150 | batch = get_preds_for_ltn(sess, batch, placeholders, target_sizes, task, main_task, preds_dict, 151 | hard_or_soft, label_to_labelvocab, options["lab_emb_dim"], options["model_type"]) 152 | 153 | if task != main_task: # then we want to have a relabelling model training step 154 | _, current_loss_ltn, p_ltn = sess.run([min_op_ltn[task], loss_dict_ltn[task], preds_dict_ltn[task]], feed_dict=batch) 155 | task2loss_all_ltn[task].extend(current_loss_ltn) 156 | hits_ltn = [pp for ii, pp in enumerate(p_ltn) if np.argmax(pp) == np.argmax(batch[placeholders["targets"]][ii])] 157 | task2correct_all_ltn[task] += len(hits_ltn) 158 | task2total_ltn[task] += len(batch[placeholders["targets"]]) 159 | 160 | if task == main_task and options["relabel_with_ltn"] and i >= augment_data_from_epoch: 161 | p_ltn = sess.run(predict_main_dict[task], feed_dict=batch) 162 | task2preds[task].append(p_ltn) 163 | batches_to_relab.append(batch) 164 | batch2task[batch_id] = task 165 | 166 | batch_id += 1 167 | 168 | 
if options["relabel_with_ltn"] and i >= augment_data_from_epoch: 169 | # we need to apply the relabelling function on the main task data first here, then pass the results on as train_feed_dicts 170 | 171 | # Before we reshuffle, see if we should augment the main task data 172 | train_data_additional = relabel_data_with_ltn_preds(batches_to_relab, task2preds, options["batch_size"]) 173 | train_feed_dicts[main_task].append(train_data_additional) 174 | 175 | for task in target_sizes.keys(): 176 | p_inds_dev, g_inds_dev, p_ids_ltn = [], [], [] 177 | if options['dev_res_during_training']: 178 | for batch_dev in dev_feed_dicts[task]: 179 | 180 | batch_dev = get_preds_for_ltn(sess, batch_dev, placeholders, target_sizes, task, main_task, preds_dict, 181 | hard_or_soft, label_to_labelvocab, options["lab_emb_dim"], options["model_type"]) 182 | 183 | p_dev = sess.run(preds_dict[task], feed_dict=batch_dev) 184 | hits = [pp for k, pp in enumerate(p_dev) if 185 | np.argmax(pp) == np.argmax(batch_dev[placeholders["targets"]][k])] 186 | task2correct_dev_all[task] += len(hits) 187 | task2total_dev[task] += len(batch_dev[placeholders["targets"]]) 188 | 189 | # this is for super detailed results -- maybe we don't want to print this every epoch later on 190 | if i % 1 == 0: 191 | pred_inds = [np.argmax(pp_dev) for pp_dev in p_dev] 192 | p_inds_dev.extend(pred_inds) 193 | gold_inds = [np.argmax(batch_dev[placeholders["targets"]][i_d]) for i_d, targ in 194 | enumerate(batch_dev[placeholders["targets"]])] 195 | g_inds_dev.extend(gold_inds) 196 | 197 | p_dev_ltn = sess.run(preds_dict_ltn[task], feed_dict=batch_dev) 198 | pred_inds_ltn = [np.argmax(pp_dev) for pp_dev in p_dev_ltn] 199 | p_ids_ltn.extend(pred_inds_ltn) 200 | 201 | 202 | # Randomise batch IDs, so that selection of batch is random 203 | np.random.shuffle(train_feed_dicts[task]) 204 | np.random.shuffle(dev_feed_dicts[task]) 205 | 206 | if options['dev_res_during_training']: 207 | acc, acc_dev = task2correct_all[task] / task2total[task], task2correct_dev_all[task] / task2total_dev[task] 208 | else: 209 | acc = task2correct_all[task] / task2total[task] 210 | acc_dev = 0.0 211 | 212 | try: 213 | acc_ltn_train = task2correct_all_ltn[task] / task2total_ltn[task] 214 | except ZeroDivisionError: 215 | acc_ltn_train = 0 216 | 217 | if options["model_type"] == "label-transfer" or (options["model_type"] == 'semi-supervised' and task != main_task): 218 | print('Epoch %d :' % i, "Task: " + task, "Loss: ", np.mean(task2loss_all[task]), "Acc: ", acc, "Acc Dev: ", 219 | acc_dev, "Acc LTN Train: ", acc_ltn_train) 220 | else: 221 | print('Epoch %d :' % i, "Task: " + task, "Loss: ", np.mean(task2loss_all[task]), "Acc: ", acc, "Acc Dev: ", 222 | acc_dev, "Previous Acc Dev: ", main_task_dev_acc, "Acc LTN Train: ", acc_ltn_train) 223 | 224 | 225 | if task == main_task: 226 | if acc_dev >= early_stopping and len(main_task_dev_acc) >= 3 and acc_dev < main_task_dev_acc[-3]: 227 | print("Dev accuracy is smaller than 4 epochs ago, early stopping criteron reached.") 228 | stopping_criteron_reached = True 229 | break 230 | main_task_dev_acc.append(acc_dev) 231 | if stopping_criteron_reached == True: 232 | break 233 | 234 | return logits_dict, loss_dict, preds_dict, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict 235 | 236 | 237 | def relabel_data_with_ltn_preds(all_batches, preds_ltn, batch_size): 238 | print("Executing the data augmentation function") 239 | 240 | # flatten batches for easier handling 241 | batches_flattenend = defaultdict(list) 242 | for key 
in all_batches[0].keys(): 243 | values_flat = [all_batches[i][key] for i in range(0, len(all_batches))] 244 | values_flatter = [ii for i in values_flat for ii in i] 245 | batches_flattenend[key] = np.stack(values_flatter, axis=0) 246 | print("number instances") 247 | print(len(batches_flattenend[key])) 248 | 249 | relabelled_insts = defaultdict(list) 250 | num_sampled = 0 251 | 252 | # for each task, get the predictions 253 | for task, preds in preds_ltn.items(): 254 | 255 | # flatten the preds batches 256 | preds = [pp for p in preds for pp in p] 257 | stacked = np.stack(preds, axis=1) 258 | 259 | # for each row (which are the probabilities for each label), sort the row by descending value and 260 | # store the index of the original array in the transformed array 261 | stacked_sorted = np.argsort(-stacked) 262 | 263 | label2inds = defaultdict(list) 264 | 265 | labelindex = 0 266 | for row in stacked_sorted: 267 | # the number of instances to sample are the top 10%, equally distributed across the labels 268 | num_inst_to_samp = int(len(row) * 0.1 * (1/len(stacked_sorted))) 269 | # determine how many batches this makes. We only want full batches so we might sometimes take slightly 270 | # less than the top 10% 271 | number_batches_to_samp = int(num_inst_to_samp/batch_size) 272 | num_inst_to_samp_final = number_batches_to_samp*batch_size 273 | label2inds[labelindex] = row[:num_inst_to_samp_final] 274 | 275 | num_sampled += num_inst_to_samp_final 276 | 277 | # store which batches are useful so we can iterate over these afterwards 278 | for instid in row[:num_inst_to_samp]: 279 | 280 | for key in all_batches[0].keys(): 281 | if key.name.startswith("label_vocab_inds:"): 282 | relabelled_insts[key].append(batches_flattenend[key][instid]) 283 | 284 | elif key.name.startswith("targets:"): 285 | # re-initialise the targets, then set the one for the predicted label to 1 286 | targets_here = np.zeros([len(batches_flattenend[key][instid])], np.int32) 287 | targets_here[labelindex] = 1 288 | relabelled_insts[key].append(targets_here) 289 | 290 | else: 291 | relabelled_insts[key].append(batches_flattenend[key][instid]) 292 | 293 | labelindex += 1 294 | 295 | # now all the relabelled data is in relabelled_insts and we need to change it to batch format again 296 | rebatched_instances = batch.batch_feed_dicts(relabelled_insts, batch_size, num_sampled) 297 | 298 | return rebatched_instances 299 | 300 | 301 | def get_preds_for_ltn(sess, batch, placeholders, target_sizes, task, main_task, preds_dict, hard_or_soft, label_to_labelvocab, lab_emb_dim, model_type): 302 | # get predictions on dev data for EM 303 | p_task_for_ltn = [] 304 | if lab_emb_dim > 0: 305 | # we don't want to modify the original batch 306 | batch_copy = dict.copy(batch) #copy.copy(batch) 307 | for taskjj in target_sizes.keys(): 308 | if model_type == 'semi-supervised' and (taskjj == task or taskjj == main_task): 309 | continue 310 | elif model_type == 'label-transfer' and (taskjj == task): 311 | if taskjj != main_task: 312 | continue 313 | if lab_emb_dim > 0: 314 | label_vocab_inds = np.array([label_to_labelvocab[taskjj] for i in range(0, len(batch[placeholders["seq1"]]))], np.int64) 315 | batch_copy[placeholders["label_vocab_inds"]] = label_vocab_inds 316 | p_jj = sess.run([preds_dict[taskjj]], feed_dict=batch_copy) 317 | else: 318 | p_jj = sess.run([preds_dict[taskjj]], feed_dict=batch) 319 | if hard_or_soft == 'hard': 320 | pred_inds = [np.argmax(pp, 1) for pp in p_jj] 321 | else: 322 | pred_inds = p_jj 323 | if p_task_for_ltn == []: 324 
| p_task_for_ltn = pred_inds 325 | else: 326 | p_task_for_ltn.extend(pred_inds) 327 | 328 | if model_type == 'label-transfer' or task != main_task: # then we want to have an LTN model training step 329 | # enter current predictions in feed_dicts so main model predictions can be used by LTN model 330 | if hard_or_soft == 'hard': 331 | preds_for_ltn = np.stack(p_task_for_ltn, 1) 332 | else: 333 | preds_for_ltn = np.concatenate(p_task_for_ltn, 1) 334 | batch[placeholders["preds_for_ltn"]] = preds_for_ltn 335 | 336 | return batch 337 | 338 | 339 | def train(placeholders, target_sizes, train_feed_dicts, dev_feed_dicts, vocab, label_vocab, input_size_preds, num_preds_ltn, label_to_labelvocab, sess=None, **options): 340 | 341 | max_num_batches = {} 342 | for task in target_sizes.keys(): 343 | max_num_batches[task] = len(train_feed_dicts[task]) 344 | if label_vocab == None: 345 | label_vocab_len = 0 346 | else: 347 | label_vocab_len = len(label_vocab) 348 | 349 | # create model 350 | logits_dict, loss_dict, preds_dict, label_embeddings = bicond_reader(placeholders, target_sizes, len(vocab), label_vocab_len, **options) # those return dicts where the keys are the task names 351 | 352 | optim = tf.train.RMSPropOptimizer(learning_rate=options["learning_rate"]) 353 | 354 | if options["model_type"] == "semi-supervised" or options["model_type"] == "label-transfer": 355 | # additional TF model needed for estimating relabelling function 356 | logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict = relabel_model(placeholders, target_sizes, input_size_preds, num_preds_ltn, label_embeddings, **options) # those return dicts where the keys are the task names 357 | min_op_ltn = {} 358 | for task in target_sizes.keys(): 359 | min_op_ltn[task] = optim.minimize(tf.reduce_mean(loss_dict_ltn[task])) 360 | 361 | min_op = {} 362 | for task in target_sizes.keys(): 363 | min_op[task] = optim.minimize(tf.reduce_mean(loss_dict[task])) 364 | # The maximum number of iterations should be based on the number of batches in the smallest training set 365 | max_iter = min(max_num_batches.values()) 366 | print("Max number batches for each task:", max_num_batches) 367 | print("Randomly sampling one from", str(max_iter), "batches for each task every training epoch") 368 | 369 | tf.global_variables_initializer().run(session=sess) 370 | 371 | if options["save_model"] == True: 372 | saver = tf.train.Saver(max_to_keep=100) 373 | 374 | if options["model_type"] == "hard-sharing": 375 | logits_dict, loss_dict, preds_dict = balanced_mtl_training_loop(placeholders, target_sizes, train_feed_dicts, 376 | dev_feed_dicts, max_iter, 377 | min_op, logits_dict, loss_dict, preds_dict, sess, **options) 378 | 379 | logits_dict_ltn, loss_dict_ltn, preds_dict_ltn = {}, {}, {} 380 | 381 | elif options["model_type"] == "semi-supervised" or options["model_type"] == "label-transfer": 382 | # load pre-trained mtl model 383 | print("Check if pre-trained MTL model exists...") 384 | save_path = get_save_path(create_path=False, **options) 385 | if not os.path.exists(save_path): 386 | print("Save path", save_path, "does not exist. Training MTL model first.") 387 | logits_dict, loss_dict, preds_dict = balanced_mtl_training_loop(placeholders, target_sizes, train_feed_dicts, 388 | dev_feed_dicts, max_iter, 389 | min_op, logits_dict, loss_dict, preds_dict, sess, **options) 390 | else: 391 | print("Model already exists. 
Restoring model.") 392 | saver = tf.train.Saver(max_to_keep=100) 393 | saver.restore(sess, save_path + "/model.ckpt") 394 | print("Model " + save_path + "/model.ckpt" + " restored.") 395 | 396 | print("\nStarting LTN training...") 397 | logits_dict, loss_dict, preds_dict, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict = balanced_mtl_with_ltn_training_loop(placeholders, target_sizes, train_feed_dicts, 398 | dev_feed_dicts, max_iter, min_op, min_op_ltn, logits_dict, loss_dict, preds_dict, 399 | logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict, label_to_labelvocab, sess, **options) 400 | 401 | if options["save_model"] == True: 402 | savepath = get_save_path(create_path=True, **options) 403 | print("Saving model at location:", savepath) 404 | saver.save(sess, savepath + "/model.ckpt") 405 | 406 | return logits_dict, loss_dict, preds_dict, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict 407 | 408 | 409 | def restore_trained_model(placeholders, target_sizes, train_feed_dicts, vocab, label_vocab_len, label_to_labelvocab, input_size_preds, num_preds_ltn, sess=None, **options): 410 | 411 | max_num_batches = {} 412 | for task in target_sizes.keys(): 413 | max_num_batches[task] = len(train_feed_dicts[task]) 414 | 415 | # create model 416 | logits_dict, loss_dict, preds_dict, label_embeddings = bicond_reader(placeholders, target_sizes, len(vocab), label_vocab_len, **options) # those return dicts where the keys are the task names 417 | 418 | logits_dict_ltn = loss_dict_ltn = preds_dict_ltn = None 419 | if options["model_type"] == "semi-supervised": 420 | logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict = relabel_model(placeholders, target_sizes, input_size_preds, num_preds_ltn, label_embeddings, **options) # those return dicts where the keys are the task names 421 | 422 | tf.global_variables_initializer().run(session=sess) 423 | 424 | saver = tf.train.Saver(max_to_keep=100) 425 | save_path = get_save_path(create_path=False, **options) 426 | if not os.path.exists(save_path): 427 | print("Save path", save_path, "does not exist. Model cannot be loaded. 
Aborting.")
428 |         return "", "", ""
429 |     saver.restore(sess, save_path + "/model.ckpt")
430 |     print("Model " + save_path + "/model.ckpt" + " restored.")
431 |
432 |     return logits_dict, loss_dict, preds_dict, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict
433 |
434 |
435 | def get_save_path(create_path=True, **options):
436 |
437 |     tasks = ",".join([TASK_NAMES_SHORT[tsk] for tsk in options["tasks"]])
438 |     if isinstance(options["feature_sets"], list):
439 |         features = ",".join(options["feature_sets"])
440 |     else:
441 |         features = options["feature_sets"]
442 |     debug = False
443 |     if options["debug"]:
444 |         debug = True
445 |     lab_embs_for_ltn = False
446 |     if options["lab_embs_for_ltn"]:
447 |         lab_embs_for_ltn = True
448 |     skip_connections = False
449 |     if options["skip_connections"]:
450 |         skip_connections = True
451 |     attention = False
452 |     if options["attention"]:
453 |         attention = True
454 |     alternate_batches = False
455 |     if options["alternate_batches"]:
456 |         alternate_batches = True
457 |     ltn_pred_type, lel_hid_size, max_ltn = "", "", ""
458 |     if options["model_type"] != "hard-sharing":
459 |         ltn_pred_type = options['ltn_pred_type']
460 |         lel_hid_size = str(options["lel_hid_size"])
461 |         max_ltn = str(options["max_epochs_ltn"])
462 |
463 |     save_model_dir = "_".join([options["model_type"], ltn_pred_type, options["main_task"], tasks, str(debug),
464 |                                str(options["num_instances"]), str(options["emb_dim"]), lel_hid_size,
465 |                                str(options["task_specific_layer_size"]), str(options["lab_emb_dim"]),
466 |                                str(skip_connections), features, str(options["main_num_layers"]),
467 |                                str(options["rnn_cell_type"]),
468 |                                str(lab_embs_for_ltn), str(attention), str(alternate_batches),
469 |                                str(options["batch_size"]), str(options["max_epochs"]), max_ltn,
470 |                                str(options["early_stopping"]), str(options["learning_rate"]),
471 |                                str(options["l1_rate_main"]), str(options["l2_rate_main"]),
472 |                                str(options["l1_rate_ltn"]), str(options["l2_rate_ltn"]),
473 |                                str(options["dropout_rate"]), str(options["exp_id"])])
474 |
475 |     save_path = os.path.abspath(os.path.join("./save/", save_model_dir))
476 |     if create_path and not os.path.exists(save_path):
477 |         os.makedirs(save_path)
478 |     return save_path
--------------------------------------------------------------------------------
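# Usage sketch (illustrative, assumed toy inputs): how the two batch schedulers in
# mtl/training.py interleave tasks. The task names "A"/"B" and the string "feed dicts"
# below are made up for this demo; the real training loop passes TensorFlow feed
# dictionaries and sets max_iter to the number of batches in the smallest training set.

from mtl.training import alternate_epochs, alternate_batches  # needs the repo's dependencies (TF 1.5 etc.)

target_sizes = {"A": 3, "B": 4}  # per-task label-space sizes; only the keys are used by the schedulers
feeds = {task: ["%s-batch-%d" % (task, j) for j in range(2)] for task in target_sizes}

# one full pass over a task's batches before switching to the next task
print([task for task, _ in alternate_epochs(target_sizes, 2, feeds)])   # ['A', 'A', 'B', 'B']
# one batch per task, round-robin over tasks
print([task for task, _ in alternate_batches(target_sizes, 2, feeds)])  # ['A', 'B', 'A', 'B']
# (printed order assumes insertion-ordered dicts, i.e. Python 3.7+)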