├── features
│   ├── __init__.py
│   └── features.py
├── preproc
│   ├── __init__.py
│   ├── error_ana.py
│   ├── log_utils.py
│   ├── plot_utils.py
│   ├── fnc_data_splits.py
│   ├── batch.py
│   ├── data_reader.py
│   ├── vocab.py
│   └── map.py
├── constants.py
├── README.md
├── data
│   └── download_data.sh
├── mtl
│   ├── tensoriser.py
│   ├── nn.py
│   └── training.py
└── main.py
/features/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/preproc/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/constants.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Constants shared across modules.
 3 | """
 4 | 
 5 | STANCE = "semeval2016-task6-stance"
 6 | FNC = "fakenewschallenge"
 7 | NLI = "multinli"
 8 | TOPIC = "topic-based"
 9 | TOPIC_5WAY = "topic-based-5way"
10 | LAPTOP = "absa-laptops"
11 | RESTAURANT = "absa-restaurants"
12 | TARGET = "target-dependent"
13 | TASKS = [STANCE, FNC, NLI, TOPIC, TOPIC_5WAY, LAPTOP, RESTAURANT, TARGET]
14 | RNN_CELL_TYPES = ["lstm", "phased_lstm", "layer_norm", "nas"] # LSTM, plus the RNN cell types in TensorFlow that are interchangeable with it
15 | 
16 | TASK_NAMES_SHORT = {"semeval2016-task6-stance": "STANCE", "fakenewschallenge": "FNC", "topic-based": "TOPIC", "multinli": "NLI",
17 |                     "topic-based-5way": "TOPIC_5WAY", "absa-laptops": "LAPTOP", "absa-restaurants": "RESTAURANT", "target-dependent": "TARGET"}
18 | 
19 | STANCE_LABELS = ['AGAINST', 'FAVOR', 'NONE']
20 | FNC_LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
21 | NLI_LABELS = ['contradiction', 'entailment', 'neutral']
22 | TOPIC_LABELS = ['negative', 'positive']
23 | TOPIC_5WAY_LABELS = [-2.0, -1.0, 0.0, 1.0, 2.0]
24 | ABSA_LABELS = ['negative', 'neutral', 'positive']
25 | TARGET_LABELS = ['-1', '0', '1']
26 | 
27 | SIM = 'similarity'
28 | DIV = 'diversity'
29 | NONE = 'predsonly'
30 | SIMILARITY_FEATURES = ['jensen-shannon', 'renyi', 'cosine', 'euclidean',
31 |                        'variational', 'bhattacharyya']
32 | DIVERSITY_FEATURES = ['num_word_types', 'type_token_ratio', 'entropy',
33 |                       'simpsons_index', 'renyi_entropy']
34 | # we don't use 'quadratic_entropy' at the moment, as it requires word vectors
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mtl-disparate
2 | Code for NAACL 2018 paper ["Multi-task Learning of Pairwise Sequence Classification Tasks Over Disparate Label Spaces"](https://arxiv.org/abs/1802.09913) by Isabelle Augenstein, Sebastian Ruder, Anders Søgaard
3 | 
4 | Note that this is research code and will not be maintained to, e.g., ensure compatibility with more recent library versions.
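
For orientation, an end-to-end run might look as follows. The working directories and flag values are illustrative only (the defaults shown already match the argparse defaults in `main.py`); see "Steps to run" below and `python main.py --help` for the full set of options:

```bash
# download and extract the datasets (run from inside data/ so they are placed under data/)
cd data && bash download_data.sh && cd ..
# split the FNC training data into a train and a dev set (the script uses paths relative to preproc/)
cd preproc && python fnc_data_splits.py && cd ..
# train, e.g., the label-transfer model with ABSA restaurants as the main task
python main.py --model_type label-transfer --main_task absa-restaurants
```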
5 | 6 | 7 | Requirements: 8 | 9 | - Tensorflow 1.5 10 | - Numpy 1.12.1 11 | - sklearn 0.18.1 12 | - scipy 13 | 14 | Steps to run: 15 | 16 | - run data/download_data.sh to download and extract data 17 | - preproc/data_reader.py tests if all the data readers work 18 | - preproc/fnc_data_splits.py to split the FNC training dataset into a training and dev set 19 | - main.py trains models 20 | 21 | # Datasets 22 | 23 | ## SemEval 2016 Task 6 Stance detection 24 | 25 | - [Task website](http://alt.qcri.org/semeval2016/task6/) 26 | 27 | ## Fake News Challenge (FNC) 28 | 29 | - [Task website](http://www.fakenewschallenge.org/) 30 | 31 | ## Multi-NLI 32 | 33 | - [Task website](http://www.nyu.edu/projects/bowman/multinli/) 34 | 35 | ## SemEval 2016 Task 4 Subtask B Topic-based Twitter sentiment analysis 36 | 37 | - [Task website](http://alt.qcri.org/semeval2016/task4/) 38 | - [Task description paper](https://aclweb.org/anthology/S/S16/S16-1001.pdf) 39 | - Note: Same dataset was used as rerun in [2017](http://alt.qcri.org/semeval2017/task4/) 40 | 41 | ## SemEval 2016 Task 5 Subtask 1 Slot 3 Aspect-based sentiment analysis 42 | 43 | - [Task website](http://alt.qcri.org/semeval2016/task5/) 44 | 45 | ## Clickbait Challenge 2017 46 | 47 | - [Task website](http://www.clickbait-challenge.org/) -------------------------------------------------------------------------------- /preproc/error_ana.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | 4 | def count_overlap(file): 5 | with open(file, "r") as indsf: 6 | indsmap = defaultdict(dict) 7 | for l in indsf: 8 | if len(l.split("\t")) == 3: 9 | task, model, inds = l.strip("\n").split("\t") 10 | indsmap[task][model] = inds.split(" ") 11 | else: 12 | task, model, iter, inds = l.strip("\n").split("\t") 13 | indsmap[task + "_" + iter][model] = inds.split(" ") 14 | for task, entries in indsmap.items(): 15 | main_correct = 0.0 16 | relabel_correct = 0.0 17 | both_correct = 0.0 18 | both_incorrect = 0.0 19 | len_gold = len(indsmap[task]["Gold"]) 20 | all = float(len_gold) 21 | for i in range(0, len_gold): 22 | if (indsmap[task]["Gold"][i] == indsmap[task]["Relabel model"][i]) and (indsmap[task]["Relabel model"][i] == indsmap[task]["Main model"][i]): 23 | both_correct += 1 24 | elif (indsmap[task]["Relabel model"][i] == indsmap[task]["Main model"][i]) and (indsmap[task]["Main model"][i] != indsmap[task]["Gold"][i]): 25 | both_incorrect += 1 26 | elif indsmap[task]["Gold"][i] == indsmap[task]["Relabel model"][i]: 27 | relabel_correct += 1 28 | else: 29 | main_correct += 1 30 | rate_both_correct = (both_correct/all) 31 | rate_both_incorect = (both_incorrect / all) 32 | rate_relab_correct = (relabel_correct / all) 33 | rate_main_correct = (main_correct / all) 34 | prop_main = rate_main_correct / (rate_both_correct + rate_relab_correct + rate_main_correct) 35 | prop_relab = rate_relab_correct / (rate_both_correct + rate_relab_correct + rate_main_correct) 36 | print(task, "Rate both correct", str(rate_both_correct)) 37 | print(task, "Rate both incorrect", str(rate_both_incorect)) 38 | print(task, "Rate only relabel correct", str(rate_relab_correct)) 39 | print(task, "Rate only main correct", str(rate_main_correct)) 40 | print(task, "Prop main", str(prop_main * 100)) 41 | print(task, "Prop relab", str(prop_relab * 100)) 42 | 43 | 44 | if __name__ == "__main__": 45 | #reformat_log_tabs() 46 | dirpath = "../" 47 | files = os.listdir(dirpath) 48 | for f in files: 49 | if 
f.endswith("_inds.txt"): 50 | if not "learningcurve" in f: 51 | continue 52 | if not "label-transfer" in f: 53 | continue 54 | if not "multi" in f: 55 | continue 56 | print("Reading file", f) 57 | count_overlap(os.path.join(dirpath, f)) 58 | print("") -------------------------------------------------------------------------------- /data/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Download the SemEval 2016 Task 6 Stance detection dataset 4 | mkdir semeval2016-task6-stance ; cd semeval2016-task6-stance 5 | wget http://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip 6 | wget http://alt.qcri.org/semeval2016/task6/data/uploads/semeval2016-task6-trialdata.txt 7 | curl -L "https://drive.google.com/uc?export=download&id=0B2Z1kbILu3YtenFDUzM5dGZEX2s" > downloaded_Donald_Trump.txt 8 | unzip stancedataset.zip -d . ; mv StanceDataset/* . 9 | rm stancedataset.zip ; rm -r StanceDataset __MACOSX 10 | cd .. 11 | 12 | # Download the Fake News Challenge datset 13 | mkdir fakenewschallenge ; cd fakenewschallenge 14 | wget https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/competition_test_stances.csv 15 | wget https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/competition_test_bodies.csv 16 | wget https://github.com/FakeNewsChallenge/fnc-1/archive/master.zip 17 | unzip master.zip -d . ; mv fnc-1-master/* . 18 | rm -r fnc-1-master ; rm master.zip 19 | cd .. 20 | 21 | # Download the Multi-NLI dataset 22 | mkdir multinli ; cd multinli 23 | wget http://www.nyu.edu/projects/bowman/multinli/multinli_0.9.zip 24 | unzip multinli_0.9.zip -d . ; mv multinli_0.9/* . 25 | rm multinli_0.9.zip ; rm -r multinli_0.9 26 | cd .. 27 | 28 | # Download the SemEval 2016 Task 4 Subtask B Topic-based Twitter sentiment analysis dataset 29 | mkdir semeval2016-task4b-topic-based-sentiment ; cd semeval2016-task4b-topic-based-sentiment 30 | curl -L "https://drive.google.com/uc?export=download&id=0B3emjZ5O5vDtSGpKcjQ3cnhldmc" > semeval2016_task4b_topic-based_sentiment.zip 31 | unzip semeval2016_task4b_topic-based_sentiment.zip -d . 32 | rm semeval2016_task4b_topic-based_sentiment.zip 33 | cd .. 34 | 35 | # Download the SemEval 2016 Task 4 Subtask C Topic-based 5-way Twitter sentiment analysis dataset 36 | mkdir semeval2016-task4c-topic-based-sentiment ; cd semeval2016-task4c-topic-based-sentiment 37 | curl -L "https://drive.google.com/uc?export=download&id=1eS67x5vedrzVVk-tcyKSrumigbJKuqH-" > semeval2016_task4c_topic-based_sentiment.zip 38 | unzip semeval2016_task4c_topic-based_sentiment.zip -d . 39 | rm semeval2016_task4c_topic-based_sentiment.zip 40 | cd .. 41 | 42 | # Download the SemEval 2016 Task 5 Aspect-based sentiment analysis dataset 43 | mkdir semeval2016-task5-absa-english ; cd semeval2016-task5-absa-english 44 | curl -L "https://drive.google.com/uc?export=download&id=0B3emjZ5O5vDtbTJnUHRIdFBULTg" > semeval2016_task5_absa_english.zip 45 | unzip semeval2016_task5_absa_english.zip -d . 46 | rm semeval2016_task5_absa_english.zip 47 | cd .. 48 | 49 | # Download the target-dependent sentiment analysis dataset of Dong et al. (2014): 50 | # Adaptive Recursive Neural Network for Target-dependent Twitter Sentiment Classification 51 | mkdir target-dependent ; cd target-dependent 52 | curl -L "https://drive.google.com/uc?export=download&id=0B3emjZ5O5vDtTW1SZjItWFlxUUU" > target_dependent.zip 53 | unzip target_dependent.zip -d . 54 | rm target_dependent.zip 55 | cd .. 
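# Optional sanity check (not part of the original pipeline): after the downloads
# above, each dataset directory created by this script should exist and be
# non-empty. Uncomment to verify:
# ls semeval2016-task6-stance fakenewschallenge multinli \
#    semeval2016-task4b-topic-based-sentiment semeval2016-task4c-topic-based-sentiment \
#    semeval2016-task5-absa-english target-dependent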
56 | -------------------------------------------------------------------------------- /preproc/log_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility methods for logging and analyzing results. 3 | """ 4 | 5 | from collections import defaultdict 6 | from datetime import datetime 7 | import numpy as np 8 | 9 | from sklearn.metrics import recall_score, mean_absolute_error, f1_score,\ 10 | accuracy_score 11 | 12 | from constants import * 13 | import os 14 | 15 | 16 | FORMAT = '%Y-%m-%d-%H%M%S' 17 | 18 | RECALL = 'recall' 19 | MAE = 'mae' 20 | TOPIC_BASED_SCORES = [RECALL, MAE] 21 | 22 | def stance_postproc_init(vocab): 23 | inds = [] 24 | for id, tok in vocab.id2sym.items(): 25 | if "trump" in tok or "donald" in tok: 26 | inds.append(id) 27 | return inds 28 | 29 | def postproc_stance(inds, placeholders, batch, p): 30 | for i, b in enumerate(batch[placeholders["seq1"]]): 31 | in_ind = False 32 | for ind in inds: 33 | if ind in batch[placeholders["seq1"]][i]: 34 | in_ind = True 35 | break 36 | # labels are always: AGAINST, FAVOR, NONE 37 | if in_ind: 38 | if p[i][0] > p[i][1]: 39 | p[i][0] = 1.0 40 | else: 41 | p[i][1] = 1.0 42 | return p 43 | 44 | 45 | def task2score(task, y_true, y_pred, topics): 46 | if task == STANCE: 47 | return macro_averaged_pos_neg_f1_score(y_true, y_pred) 48 | if task == TOPIC: 49 | return topic_based_macro_averaged_score(y_true, y_pred, topics, RECALL) 50 | if task == TOPIC_5WAY: 51 | return topic_based_macro_averaged_score(y_true, y_pred, topics, MAE) 52 | if task in [LAPTOP, RESTAURANT]: 53 | return accuracy_score(y_true, y_pred) 54 | if task in [TARGET]: 55 | return f1_score(y_true, y_pred, average='macro') 56 | return f1_score(y_true, y_pred, average='micro') 57 | 58 | 59 | def macro_averaged_pos_neg_f1_score(y_true, y_pred): 60 | """Compute the macro-average of the favor and against F1 scores for stance 61 | detection.""" 62 | # order of labels is AGAINST, FAVOR, NONE 63 | f1_scores = f1_score(y_true, y_pred, average=None) 64 | return np.mean([f1_scores[0], f1_scores[1]]) 65 | 66 | 67 | def topic_based_macro_averaged_score(y_true, y_pred, topics, score): 68 | """ 69 | Compute score macro-averaged across topics. Score is macro-averaged recall 70 | for subtask B and mean absolute error for subtask C. 71 | """ 72 | assert score in TOPIC_BASED_SCORES, 'Error: %s is not valid.' 
% score 73 | scores = [] 74 | topic2y_true = defaultdict(list) 75 | topic2y_pred = defaultdict(list) 76 | 77 | # aggregate the labels and predictions for each topic 78 | for y_t, y_p, topic in zip(y_true, y_pred, topics): 79 | topic2y_true[str(topic)].append(y_t) 80 | topic2y_pred[str(topic)].append(y_p) 81 | 82 | for topic in topic2y_true.keys(): 83 | y_true_topic = topic2y_true[str(topic)] 84 | y_pred_topic = topic2y_pred[str(topic)] 85 | if score == RECALL: 86 | score_value = recall_score(y_true_topic, y_pred_topic, average='macro') 87 | else: 88 | # for MAE, we have to manually perform macro-averaging 89 | # labels are 0-4 and correspond to original labels -2,-1,0,1,2 90 | temp_scores = [] 91 | for label_id in range(5): 92 | true_pred_pairs = [(y_t, y_p) for y_t, y_p in 93 | zip(y_true_topic, y_pred_topic) 94 | if y_t == label_id] 95 | if len(true_pred_pairs) == 0: 96 | # some topics do not appear with a certain label 97 | continue 98 | y_true_temp, y_pred_temp = zip(*true_pred_pairs) 99 | temp_scores.append(mean_absolute_error(y_true_temp, y_pred_temp)) 100 | score_value = np.mean(temp_scores) 101 | scores.append(score_value) 102 | return np.mean(scores) 103 | 104 | 105 | def log_results(options, task_score, f1_score, relabel_score, task): 106 | """ 107 | Log the results to a file. 108 | :param options: the options used as input to the script 109 | :param task_score: the task-specific score achieved on the test set 110 | :param f1_score: the micro-averaged f1 score achieved on the test set 111 | :param relabel_score: the score achieved by the relabeling function on the 112 | test set 113 | :param task: the task the model was evaluated on 114 | """ 115 | with open(options['log_file'], 'a') as f: 116 | print('Writing results to %s...' % options['log_file']) 117 | f.write('%s\t%s\t%.4f\t%.4f\t%.4f\t%s\n' % 118 | (datetime.now().strftime(FORMAT), task, task_score, f1_score, 119 | relabel_score, ' '.join(['%s=%s' % (opt, options[opt]) 120 | for opt in options.keys()]))) 121 | -------------------------------------------------------------------------------- /preproc/plot_utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import matplotlib.patches as mpatches 5 | from sklearn.manifold import TSNE 6 | from sklearn.decomposition import PCA 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from constants import FNC, STANCE, NLI, TOPIC, LAPTOP, RESTAURANT, TARGET,\ 11 | TOPIC_5WAY, STANCE_LABELS, FNC_LABELS, NLI_LABELS, TOPIC_LABELS, \ 12 | TOPIC_5WAY_LABELS, ABSA_LABELS, TARGET_LABELS 13 | 14 | 15 | def task2labels(task): 16 | if task == STANCE: 17 | return STANCE_LABELS 18 | if task == FNC: 19 | return FNC_LABELS 20 | if task == NLI: 21 | return NLI_LABELS 22 | if task == TOPIC: 23 | return TOPIC_LABELS 24 | if task == TOPIC_5WAY: 25 | return TOPIC_5WAY_LABELS 26 | if task in [LAPTOP, RESTAURANT]: 27 | return ABSA_LABELS 28 | if task == TARGET: 29 | return TARGET_LABELS 30 | raise ValueError('No labels available for task %s.' 
% task) 31 | 32 | 33 | def task2display_name(task): 34 | if task == STANCE: 35 | return 'Stance' 36 | if task == FNC: 37 | return 'FNC-1' 38 | if task == NLI: 39 | return 'MultiNLI' 40 | if task == TOPIC: 41 | return 'Topic-2' 42 | if task == TOPIC_5WAY: 43 | return 'Topic-5' 44 | if task == LAPTOP: 45 | return 'ABSA-L' 46 | if task == RESTAURANT: 47 | return 'ABSA-R' 48 | if task == TARGET: 49 | return 'Target' 50 | raise ValueError('%s is not a valid task.' % task) 51 | 52 | 53 | def task2color(task): 54 | if task == TOPIC: 55 | return 'forestgreen' 56 | if task == TOPIC_5WAY: 57 | return 'yellowgreen' 58 | if task == LAPTOP: 59 | return 'cornflowerblue' 60 | if task == RESTAURANT: 61 | return 'mediumblue' 62 | if task == STANCE: 63 | return 'midnightblue' 64 | if task == TARGET: 65 | return 'saddlebrown' 66 | if task == FNC: 67 | return 'darkgoldenrod' 68 | if task == NLI: 69 | return 'slategray' 70 | raise ValueError('%s is not available.' % task) 71 | 72 | 73 | def label2display_name(label): 74 | if label in ['AGAINST', 'FAVOR', 'NONE']: 75 | return label.lower() 76 | try: 77 | label = float(label) 78 | if label == 0: 79 | return 'neutral' 80 | if label == -1: 81 | return 'negative' 82 | if label == -2: 83 | return 'highly negative' 84 | if label == 1: 85 | return 'positive' 86 | if label == 2: 87 | return 'highly positive' 88 | except: 89 | return label 90 | return label 91 | 92 | 93 | def plot_label_embeddings(sess, tasks, label_vocab): 94 | var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "label_embeddings/label_embeddings") 95 | assert len(var_list) > 0, 'Error: Label embeddings have not been saved.' 96 | assert len(var_list) == 1 97 | 98 | label_embeddings = sess.run(var_list[0]) 99 | print('Loaded label embeddings of shape:', label_embeddings.shape) 100 | 101 | assert label_vocab is not None 102 | 103 | # remove the UNK label of the label embeddings 104 | label_embeddings = label_embeddings[1:, :] 105 | 106 | colors = ['red', 'blue', 'green', 'purple', 'orange', 'olive', 'cyan', 'brown'] 107 | 108 | # tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=3000) 109 | pca = PCA(n_components=2) 110 | 111 | label_embeddings_tsne = pca.fit_transform(label_embeddings) 112 | label_names = [] 113 | task_names = [] 114 | for i, task in enumerate(tasks): 115 | task_labels = task2labels(task) 116 | label_names += task_labels 117 | task_names += [task] * len(task_labels) 118 | # as a sanity check, make sure that the labels correspond with those in the 119 | # label vocab; +1 because the labels start at 1 (0 is UNK) 120 | for i in range(label_embeddings.shape[0]): 121 | label_id = "%s_%s" % (task_names[i], str(label_names[i])) 122 | # print(i+1, label_id, label_vocab.sym2id[label_id]) 123 | assert i+1 == label_vocab.sym2id[label_id],\ 124 | 'Error: Id %d != label id %d for %s.' 
% (i+1, label_id, task_names[i]) 125 | 126 | file_name = 'label_embeddings.png' 127 | plot_embedding(label_embeddings_tsne, label_names, task_names, file_name=file_name) 128 | 129 | 130 | def plot_embedding(X, y, tasks, title=None, file_name=None): 131 | """Plot an embedding X with the label y colored by colors.""" 132 | x_min, x_max = np.min(X, 0), np.max(X, 0) 133 | X = (X - x_min) / (x_max - x_min) 134 | 135 | # we can increase the resolution by increasing the figure size 136 | plt.figure(figsize=(5,5)) 137 | ax = plt.subplot(111) 138 | for i in range(X.shape[0]): 139 | if tasks[i] == STANCE: 140 | # skip stance and plot later 141 | continue 142 | plt.text(X[i, 0], X[i, 1], label2display_name(str(y[i])), 143 | color=task2color(tasks[i]), 144 | fontdict={'weight': 'bold', 'size': 9}) 145 | 146 | for i in range(X.shape[0]): 147 | if tasks[i] == STANCE: 148 | plt.text(X[i, 0], X[i, 1], label2display_name(str(y[i])), 149 | color=task2color(tasks[i]), 150 | fontdict={'weight': 'bold', 'size': 9}) 151 | 152 | # create patches for the legend 153 | patches = [] 154 | for task in sorted(list(set(tasks))): 155 | patches.append(mpatches.Patch(color=task2color(task), label=task2display_name(task))) 156 | lgd = plt.legend(handles=patches, loc='upper left', bbox_to_anchor=(1, 1), 157 | edgecolor='black') 158 | 159 | # plt.xticks([]), plt.yticks([]) 160 | if title is not None: 161 | plt.title(title) 162 | # plt.show() 163 | plt.savefig(file_name, bbox_extra_artists=(lgd,), bbox_inches='tight') 164 | -------------------------------------------------------------------------------- /preproc/fnc_data_splits.py: -------------------------------------------------------------------------------- 1 | import random 2 | from csv import DictReader 3 | from csv import DictWriter 4 | 5 | # Define data class 6 | class FNCData: 7 | 8 | """ 9 | Define class for Fake News Challenge data 10 | """ 11 | 12 | def __init__(self, file_instances, file_bodies): 13 | 14 | # Load data 15 | self.instances = self.read(file_instances) 16 | bodies = self.read(file_bodies) 17 | self.heads = {} 18 | self.bodies = {} 19 | 20 | # Process instances 21 | for instance in self.instances: 22 | if instance['Headline'] not in self.heads: 23 | head_id = len(self.heads) 24 | self.heads[instance['Headline']] = head_id 25 | instance['Body ID'] = int(instance['Body ID']) 26 | 27 | # Process bodies 28 | for body in bodies: 29 | self.bodies[int(body['Body ID'])] = body['articleBody'] 30 | 31 | def read(self, filename): 32 | 33 | """ 34 | Read Fake News Challenge data from CSV file 35 | Args: 36 | filename: str, filename + extension 37 | Returns: 38 | rows: list, of dict per instance 39 | """ 40 | 41 | # Initialise 42 | rows = [] 43 | 44 | # Process file 45 | with open(filename, "r", encoding='utf-8') as table: 46 | r = DictReader(table) 47 | for line in r: 48 | rows.append(line) 49 | 50 | return rows 51 | 52 | 53 | def split_seen(data, rand=False, prop_dev=0.2, rnd_sd=1489215): 54 | 55 | """ 56 | 57 | Split data into separate sets with overlapping headlines 58 | 59 | Args: 60 | data: FNCData object 61 | rand: bool, True: random split and False: use seed for official baseline split 62 | prop_dev: float, proportion of data for dev set 63 | rnd_sd: int, random seed to use for split 64 | 65 | Returns: 66 | train: list, of dict per instance 67 | dev: list, of dict per instance 68 | 69 | """ 70 | 71 | # Initialise 72 | list_bodies = [body for body in data.bodies] 73 | n_dev_bodies = round(len(list_bodies) * prop_dev) 74 | r = random.Random() 75 | if rand is 
False: 76 | r.seed(rnd_sd) 77 | train = [] 78 | dev = [] 79 | 80 | # Generate list of bodies for dev set 81 | r.shuffle(list_bodies) 82 | list_dev_bodies = list_bodies[-n_dev_bodies:] 83 | 84 | # Generate train and dev sets 85 | for stance in data.instances: 86 | if stance['Body ID'] not in list_dev_bodies: 87 | train.append(stance) 88 | else: 89 | dev.append(stance) 90 | 91 | return train, dev 92 | 93 | 94 | def split_unseen(data, rand=False, prop_dev=0.2, rnd_sd=1489215): 95 | 96 | """ 97 | 98 | Split data into completely separate sets (i.e. non-overlap of headlines and bodies) 99 | 100 | Args: 101 | data: FNCData object 102 | rand: bool, True: random split and False: constant split 103 | prop_dev: float, target proportion of data for dev set 104 | rnd_sd: int, random seed to use for split 105 | 106 | Returns: 107 | train: list, of dict per instance 108 | dev: list, of dict per instance 109 | 110 | """ 111 | 112 | # Initialise 113 | n = len(data.instances) 114 | n_dev = round(n * prop_dev) 115 | dev_ind = {} 116 | r = random.Random() 117 | if rand is False: 118 | r.seed(rnd_sd) 119 | train = [] 120 | dev = [] 121 | 122 | # Identify instances for dev set 123 | while len(dev_ind) < n_dev: 124 | rand_ind = r.randrange(n) 125 | if not data.instances[rand_ind]['Stance'] in ['agree', 'disagree', 'discuss']: 126 | continue 127 | if rand_ind not in dev_ind: 128 | rand_head = data.instances[rand_ind]['Headline'] 129 | rand_body_id = data.instances[rand_ind]['Body ID'] 130 | dev_ind[rand_ind] = 1 131 | track_heads = {} 132 | track_bodies = {} 133 | track_heads[rand_head] = 1 134 | track_bodies[rand_body_id] = 1 135 | pre_len_heads = len(track_heads) 136 | pre_len_bodies = len(track_bodies) 137 | post_len_heads = 0 138 | post_len_bodies = 0 139 | while pre_len_heads != post_len_heads and pre_len_bodies != post_len_bodies: 140 | pre_len_heads = len(track_heads) 141 | pre_len_bodies = len(track_bodies) 142 | for i, stance in enumerate(data.instances): 143 | if not data.instances[i]['Stance'] in ['agree', 'disagree', 'discuss']: 144 | continue 145 | if i != rand_ind and (stance['Headline'] in track_heads or stance['Body ID'] in track_bodies): 146 | track_heads[stance['Headline']] = 1 147 | track_bodies[stance['Body ID']] = 1 148 | post_len_heads = len(track_heads) 149 | post_len_bodies = len(track_bodies) 150 | 151 | for k, stance in enumerate(data.instances): 152 | if k != rand_ind and (stance['Headline'] in track_heads or stance['Body ID'] in track_bodies) and (stance['Stance'] in ['agree', 'disagree', 'discuss']): 153 | dev_ind[k] = 1 154 | 155 | # Generate train and dev sets 156 | for k, stance in enumerate(data.instances): 157 | if k in dev_ind: 158 | dev.append(stance) 159 | else: 160 | train.append(stance) 161 | 162 | return train, dev 163 | 164 | 165 | def save_csv(data_split, filepath): 166 | """ 167 | Save predictions to CSV file 168 | Args: 169 | pred: numpy array, of numeric predictions 170 | file: str, filename + extension 171 | """ 172 | 173 | with open(filepath, 'w', encoding='utf-8') as csvfile: 174 | fieldnames = ['Headline','Body ID','Stance'] 175 | writer = DictWriter(csvfile, fieldnames=fieldnames) 176 | 177 | writer.writeheader() 178 | for instance in data_split: 179 | writer.writerow({'Headline': instance["Headline"], 'Body ID': instance["Body ID"], 'Stance': instance["Stance"]}) 180 | 181 | 182 | if __name__ == "__main__": 183 | data = FNCData("../data/fakenewschallenge/train_stances.csv", "../data/fakenewschallenge/train_bodies.csv") 184 | train, dev = split_unseen(data) 185 | 
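    # Note: split_unseen keeps headlines and bodies disjoint between train and dev.
    # The overlapping-headline split used for the official baseline is also available
    # in this module; to use it instead, one could write:
    # train, dev = split_seen(data)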
save_csv(train, "../data/fakenewschallenge/trainsplit_stances.csv") 186 | save_csv(dev, "../data/fakenewschallenge/devsplit_stances.csv") -------------------------------------------------------------------------------- /mtl/tensoriser.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | from preproc.vocab import Vocab 4 | from preproc.batch import get_feed_dicts 5 | from preproc.map import numpify, tokenize, lower, deep_map, deep_seq_map, map_to_targets 6 | from preproc.data_reader import task2data_reader 7 | from features.features import create_features 8 | import numpy as np 9 | 10 | 11 | def load_data(placeholders, target_labels, target_sizes, **options): 12 | batch_size = options["batch_size"] 13 | prepared_data = defaultdict(dict) 14 | feature_data = {} 15 | if options['ltn_pred_type'] == 'hard': 16 | num_preds_ltn = len(options["tasks"]) - 2 # relevant for softmax / output layer 17 | else: 18 | num_preds_ltn = {} 19 | total_preds_ltn = 0 20 | 21 | if options["lab_emb_dim"] != 0: 22 | total_num_labels = 0 23 | for task in options["tasks"]: 24 | total_num_labels += target_sizes[task] 25 | 26 | vocab, label_vocab = None, None 27 | label_to_labelvocab = defaultdict(list) 28 | for task in options["tasks"]: 29 | read_data = task2data_reader(task) 30 | data_train, data_dev, data_test = read_data( 31 | debug=options["debug"], num_instances=options["num_instances"]) 32 | target_labels[task] = data_train["labels"] 33 | if options['ltn_pred_type'] == 'soft': 34 | total_preds_ltn += len(data_train["labels"]) 35 | 36 | # add data for creating data features 37 | feature_data[task] = data_train.get("seq2", []) +\ 38 | data_dev.get("seq2", []) +\ 39 | data_test.get("seq2", []) 40 | 41 | label_to_labelvocab_task = None 42 | if options["lab_emb_dim"] != 0: 43 | if label_vocab is None: 44 | label_vocab = Vocab() # unk is id 0 45 | label_to_labelvocab_i = [0] * total_num_labels 46 | for taskl in data_train["labels"]: 47 | labid = label_vocab(task + "_" + str(taskl)) 48 | label_to_labelvocab_i[labid-1] = (labid) # -1 because the first one is UNK 49 | label_to_labelvocab_task = label_to_labelvocab_i 50 | label_to_labelvocab[task] = label_to_labelvocab_task 51 | 52 | 53 | prepared_data[task]["train"], vocab, label_vocab = prepare_data(placeholders, 54 | data_train, vocab, label_vocab, label_to_labelvocab_task) 55 | prepared_data[task]["dev"], vocab, label_vocab = prepare_data(placeholders, 56 | data_dev, vocab, label_vocab, label_to_labelvocab_task) 57 | prepared_data[task]["test"], vocab, label_vocab = prepare_data(placeholders, 58 | data_test, vocab, label_vocab, label_to_labelvocab_task) 59 | 60 | 61 | vocab.freeze() # this makes sure that nothing further is added to the vocab, otherwise deep_map will extend it 62 | if label_vocab is not None: 63 | label_vocab.freeze() 64 | 65 | if options['model_type'] != 'hard-sharing' and options["feature_sets"] != "predsonly": 66 | # create a mapping of tasks to an array for each training example 67 | print("Creating features") 68 | task2features = create_features(options["feature_sets"], feature_data, 69 | vocab, options["features_path"]) 70 | 71 | if options['model_type'] == 'label-transfer': 72 | for task in options["tasks"]: 73 | num_preds_ltn[task] = total_preds_ltn 74 | if task != options['main_task']: 75 | num_preds_ltn[task] = total_preds_ltn - len(target_labels[task]) 76 | 77 | if options["lab_emb_dim"] > 0: 78 | num_preds_ltn[task] = len(label_vocab.id2sym.keys()) -1 79 | if 
task != options['main_task']: 80 | num_preds_ltn[task] = len(label_vocab.id2sym.keys()) - 1 - len(target_labels[task]) 81 | 82 | elif options['em_pred_type'] == 'soft': 83 | for task in options["tasks"]: 84 | num_preds_ltn[task] = total_preds_ltn - len(target_labels[task]) - len(target_labels[options['main_task']]) 85 | 86 | if options["lab_emb_dim"] > 0: 87 | num_preds_ltn[task] = len(label_vocab.id2sym.keys()) -1 - len(target_labels[task]) - len(target_labels[options['main_task']]) 88 | 89 | #print(num_preds_ltn) 90 | 91 | train_feed_dicts, dev_feed_dicts, test_feed_dicts = {}, {}, {} 92 | for task in options["tasks"]: 93 | 94 | # padding to same length and converting lists to numpy arrays 95 | train_data = numpify(prepared_data[task]["train"], pad=0) 96 | dev_data = numpify(prepared_data[task]["dev"], pad=0) 97 | test_data = numpify(prepared_data[task]["test"], pad=0) 98 | 99 | if options['model_type'] != 'hard-sharing': 100 | if options["feature_sets"] != "predsonly": 101 | # add the data features to the data splits 102 | train_size, dev_size, test_size = train_data['seq1'].shape[0], \ 103 | dev_data['seq1'].shape[0], test_data['seq1'].shape[0] 104 | train_data['features'] = task2features[task][0:train_size] 105 | dev_data['features'] = task2features[task][ 106 | train_size:(train_size+dev_size)] 107 | test_data['features'] = task2features[task][-test_size:] 108 | 109 | if options['ltn_pred_type'] == 'soft': 110 | num_pr_ltn = num_preds_ltn[task] 111 | else: 112 | num_pr_ltn = num_preds_ltn 113 | 114 | train_data['preds_for_ltn'] = np.zeros([len(train_data["seq1"]), num_pr_ltn], np.float32) 115 | dev_data['preds_for_ltn'] = np.zeros([len(dev_data["seq1"]), num_pr_ltn], np.float32) 116 | test_data['preds_for_ltn'] = np.zeros([len(test_data["seq1"]), num_pr_ltn], np.float32) 117 | 118 | if options["lab_emb_dim"] > 0 and options["lab_embs_for_ltn"] and options["relabel_with_ltn"]: 119 | # this is just so that we can get main task predictions from models for any task more easily, using the label emb representation 120 | targets_main_len = target_sizes[options["main_task"]] 121 | train_data["targets_main"] = np.zeros([len(train_data["seq1"]), targets_main_len], np.int32) 122 | dev_data["targets_main"] = np.zeros([len(dev_data["seq1"]), targets_main_len], np.int32) 123 | test_data["targets_main"] = np.zeros([len(test_data["seq1"]), targets_main_len], np.int32) 124 | 125 | train_data["label_vocab_inds_main"] = [label_to_labelvocab[options["main_task"]] for inst in train_data["targets"]] 126 | dev_data["label_vocab_inds_main"] = [label_to_labelvocab[options["main_task"]] for inst in dev_data["targets"]] 127 | test_data["label_vocab_inds_main"] = [label_to_labelvocab[options["main_task"]] for inst in test_data["targets"]] 128 | 129 | train_feed_dicts[task] = get_feed_dicts( 130 | train_data, placeholders, batch_size=batch_size, 131 | inst_length=len(train_data["seq1"])) 132 | dev_feed_dicts[task] = get_feed_dicts( 133 | dev_data, placeholders, batch_size=batch_size, 134 | inst_length=len(dev_data["seq1"])) 135 | test_feed_dicts[task] = get_feed_dicts( 136 | test_data, placeholders, batch_size=batch_size, 137 | inst_length=len(test_data["seq1"])) 138 | 139 | return train_feed_dicts, dev_feed_dicts, test_feed_dicts, vocab, label_vocab, num_preds_ltn, label_to_labelvocab 140 | 141 | 142 | def prepare_data(placeholders, data, vocab=None, label_vocab=None, label_to_labelvocab=None): 143 | data_tokenized = deep_map(data, tokenize, ['seq1', 'seq2']) 144 | data_lower = deep_seq_map(data_tokenized, 
lower, ['seq1', 'seq2']) 145 | data = deep_seq_map(data_lower, lambda xs: [""] + xs + [""], ["seq1", "seq2"]) 146 | if vocab is None: 147 | vocab = Vocab() 148 | for instance in data["seq1"] + data["seq2"]: 149 | for token in instance: 150 | vocab(token) 151 | 152 | data = map_to_targets(data, "labels", "stance") # map stance IDs to one-hot vectors, save in data["targets"] 153 | if label_vocab != None: # then we want label embeddings 154 | data["label_vocab_inds"] = [label_to_labelvocab for inst in data["targets"]] 155 | data_ids = deep_map(data, vocab, ["seq1", "seq2"]) 156 | data_ids = deep_seq_map(data_ids, lambda xs: len(xs), keys=['seq1', 'seq2'], fun_name='lengths', expand=True) 157 | 158 | # removing data that's not a placeholder 159 | popl = [] 160 | for k in data_ids.keys(): 161 | if not k in placeholders.keys(): 162 | popl.append(k) 163 | for p in popl: 164 | data_ids.pop(p, None) 165 | 166 | return data_ids, vocab, label_vocab 167 | -------------------------------------------------------------------------------- /preproc/batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from itertools import islice 3 | from preproc.map import numpify 4 | from numpy.random import choice 5 | #from jtr.util.rs import DefaultRandomState 6 | 7 | rs = np.random.RandomState(1337) 8 | #rs = DefaultRandomState(1337)#new seed ignored if set previously 9 | 10 | 11 | def get_buckets(data, order, structure): 12 | """ 13 | Generates mapping between data instances and bucket-ID's. 14 | 15 | `data`: dict of nested sequences in which each top-level sequence has the same length, 16 | and all inner sequences have the __len__ attribute. 17 | `order`: (None or) tuple with data keys used for bucketing 18 | For example: 19 | ```list(data.keys()) = ["sentences1", "lengths1", "sentences2", "lengths2", "targets"]``` 20 | and we want bucketing according to the lengths of inner sequences in "sentences1" and "sentences2": 21 | `order = ("sentences1", "sentences2")` performs bucketing on "sentences1", and within each bucket, 22 | again creates buckets according to "sentences2" 23 | (automatic bucketing will result in different "sentences2" bucket boundaries 24 | within each bucket according to "sentences1"). 25 | `order = ("sentences2", "sentences1")`: vice versa, with "sentences2" for highest-level buckets 26 | `structure`: (None or) sequence with same length as `order`, each element is an integer or a list of integers 27 | For each position: 28 | - integer: denotes number of buckets, to be determined automatically 29 | - list: determines bucket boundaries. E.g.: [10, 20, 30] will result in 4 buckets 30 | (1) lengths 0-10, (2) lengths 11-20, (3) lengths 21-30, (4) lengths > 30 31 | For example: 32 | `order` = ("sentences1", "sentences2") and `structure` = (3, [10]) generates 6 buckets: 33 | within each of 3 partitions based on "sentences1", 34 | there is a bucket with instances of "sentences2" with length 10 or less, 35 | and one for lengths > 10. 36 | 37 | Returns: 38 | buckets2ids, ids2buckets 39 | dicts that map instance-id (index along 1st dimension of values in data) to bucket-id, 40 | and vice versa. 
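    For illustration (hypothetical toy data with 4 instances, `order=("seq1",)`,
    `structure=(2,)`), the returned mappings could look like:
        buckets2ids = {'(0,)': [0, 2], '(1,)': [1, 3]}
        ids2buckets = {0: '(0,)', 2: '(0,)', 1: '(1,)', 3: '(1,)'}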
41 | """ 42 | assert isinstance(data, dict) 43 | 44 | n_tot = len(list(data.values())[0]) 45 | if order is None or structure is None: 46 | # all in 1 bucket, with id '(0)' 47 | buckets2ids = {'(0)': list(range(n_tot))} 48 | ids2buckets = dict(zip(list(range(n_tot)), ['(0)'] * n_tot)) 49 | return buckets2ids, ids2buckets 50 | 51 | def _chunk(it, size): 52 | """returns iterator of chunks (tuples) from it (input iterator), with given size (last one may be shorter)""" 53 | it = iter(it) 54 | return iter(lambda: tuple(islice(it, size)), ()) 55 | 56 | def _partition(_buckets2ids, _order, _structure): 57 | """update _buckets2ids according to _order and _structure""" 58 | # update all current buckets according to first item in _order and _structure 59 | buckets2ids_new = {} 60 | for bid, ids in sorted(_buckets2ids.items(), key=lambda x: x[0]): 61 | lengths = [len(data[_order[0]][id]) for id in ids] 62 | sorted_ids_lengths = sorted(zip(ids, lengths), key=lambda x: x[1]) 63 | if isinstance(_structure[0], int): # automatic bucketing 64 | size = len(lengths) // _structure[0] if len(lengths) % _structure[0] == 0 \ 65 | else 1 + (len(lengths) // _structure[0]) 66 | buckets = list(_chunk([tup[0] for tup in sorted_ids_lengths], size)) 67 | else: # structure_is sequence of ints 68 | struct = list(sorted(_structure[0])) + [np.inf] 69 | bin_max, struct = struct[0], struct[1:] 70 | buckets = [[]] 71 | for id, l in sorted_ids_lengths: 72 | if l > bin_max: # never happens when bin_max = np.inf 73 | bin_max, struct = struct[0], struct[1:] 74 | buckets.append([]) 75 | buckets[-1].append(id) 76 | buckets2ids_new.update({tuple(list(bid) + [i]): list(bucket) for i, bucket in enumerate(buckets)}) 77 | # call again if _order and _structure have more than 1 item 78 | if len(_order) > 1: 79 | buckets2ids_new = _partition(buckets2ids_new, _order[1:], _structure[1:]) 80 | 81 | buckets2ids_new = {bid: bucket for bid, bucket in buckets2ids_new.items() if len(bucket) > 0} 82 | return buckets2ids_new 83 | 84 | 85 | buckets2ids = _partition({(): list(range(n_tot))}, order, structure) 86 | buckets2ids = {str(bid): buckets2ids[bid] for bid in buckets2ids} # make bucket-ids strings (for random.choice) 87 | 88 | ids2buckets = {} 89 | for bid, bucket in buckets2ids.items(): 90 | ids2buckets.update({id: bid for id in bucket}) 91 | return buckets2ids, ids2buckets 92 | 93 | 94 | def get_batches(data, batch_size=32, pad=0, bucket_order=None, bucket_structure=None, exact_epoch=False): 95 | """ 96 | Creates generator that batches `data`. 97 | To avoid biases, it is advised to keep `bucket_order=None` and `bucket_structure=None` if computationally possible. 98 | (which will sample batches from all instances) 99 | 100 | Args: 101 | `data`: dict with (multi-dimensional) numpy arrays or (nested) lists; 102 | first inner dimension (`num_instances`) should be the same over all data values. 103 | `batch_size`: the desired batch size 104 | `pad`: padding symbol in case data contains lists of lists of different sizes 105 | `bucket_order`: argument `order` in get_buckets (list with keys); `None` if no bucketing 106 | `bucket_structure`: argument `structure` in get_buckets; `None` if no bucketing 107 | `exact_epoch`: if set to `True`, final batch per bucket may be smaller, but each instance will be seen exactly 108 | once during training. 
Default: `False`, to be certain during training 109 | that each instance per batch gets same weight in the total loss 110 | (but not all instances are observed per epoch if bucket sizes are no multiple of `batch_size`). 111 | 112 | Returns: 113 | a generator that generates a dict with same keys as `data`, and 114 | as values data batches consisting of `[batch_size x num_instances]` 2D numpy tensors 115 | (1st dimension is at most `batch_size` but may be smaller to cover all instances exactly once per epoch, 116 | if `exact_epoch=True`) 117 | """ 118 | assert isinstance(data, dict) 119 | 120 | data0 = list(data.values())[0] 121 | if not isinstance(data0, np.ndarray): 122 | data_np = numpify(data, pad) # still need original data for length-based bucketing 123 | else: 124 | data_np = data 125 | 126 | def get_bucket_probs(_buckets2instances): 127 | N = float(np.sum([len(ids) for ids in _buckets2instances.values()])) 128 | return {bid: len(ids) / N if N > 0. else 0. for bid, ids in _buckets2instances.items()} 129 | 130 | def shuffle_buckets(_buckets2instances): 131 | for bid in sorted(_buckets2instances.keys()): # sorted: to keep deterministic 132 | rs.shuffle(_buckets2instances[bid]) 133 | 134 | buckets2instances, _ = get_buckets(data, bucket_order, bucket_structure) 135 | n_buckets = len(buckets2instances) 136 | 137 | exact_epoch = True if len(data0) < n_buckets*batch_size else exact_epoch 138 | #if average instances/bucket smaller than batch_size: set exact_epoch = True 139 | #to avoid empty batches during debugging on small data samples 140 | 141 | def bucket_generator(): 142 | buckets2instances, _ = get_buckets(data, bucket_order, bucket_structure) 143 | shuffle_buckets(buckets2instances) 144 | all_seen = False 145 | while not all_seen: 146 | bids, probs = zip(*sorted(get_bucket_probs(buckets2instances).items(), key=lambda x: x[0])) 147 | # sorted keys: to keep deterministic 148 | if np.sum(probs) == 0.: 149 | all_seen = True 150 | else: 151 | bid = rs.choice(bids, replace=False, p=probs) # sample bucket according to remaining size 152 | batch_indices = buckets2instances[bid][:batch_size] 153 | buckets2instances[bid] = buckets2instances[bid][batch_size:] 154 | # if required by exact_epoch: also include last batch in bucket if too small 155 | if len(batch_indices) == batch_size or exact_epoch: 156 | yield {k: data_np[k][batch_indices] for k in data_np} 157 | 158 | return GeneratorWithRestart(bucket_generator) 159 | 160 | 161 | def get_feed_dicts(data_train_np, placeholders, batch_size, inst_length): 162 | data_train_batched = [] 163 | realsamp = int(inst_length/batch_size) 164 | additionsamp = inst_length%batch_size 165 | if additionsamp != 0: 166 | realsamp += 1 167 | ids1 = choice(range(0, inst_length), inst_length, replace=False) # sample without replacement so we get every sample once # -additionsamp 168 | ids2 = choice(range(0, inst_length), additionsamp, replace=True) # sample a few additional ones to fill up batch 169 | ids = np.append(ids1, ids2) 170 | 171 | start = 0 172 | for i in range(0, realsamp): 173 | batch_i = {} 174 | if i != 0: 175 | start = i * batch_size 176 | if i != realsamp: 177 | ids_sup = ids[start:((i+1)*batch_size)] 178 | else: 179 | ids_sup = ids[start:realsamp] 180 | #print(ids_sup) 181 | for key, value in data_train_np.items(): 182 | #print(key) 183 | #print(data_train_np[key]) 184 | batch_i[placeholders[key]] = [data_train_np[key][ii] for ii in ids_sup] 185 | 186 | data_train_batched.append(batch_i) 187 | 188 | return data_train_batched 189 | 190 | 191 | def 
batch_feed_dicts(data_train_np, batch_size, inst_length): 192 | data_train_batched = [] 193 | realsamp = int(inst_length/batch_size) 194 | additionsamp = inst_length%batch_size 195 | if additionsamp != 0: 196 | realsamp += 1 197 | ids1 = choice(range(0, inst_length), inst_length, replace=False) # sample without replacement so we get every sample once # -additionsamp 198 | ids2 = choice(range(0, inst_length), additionsamp, replace=True) # sample a few additional ones to fill up batch 199 | ids = np.append(ids1, ids2) 200 | 201 | start = 0 202 | for i in range(0, realsamp): 203 | batch_i = {} 204 | if i != 0: 205 | start = i * batch_size 206 | if i != realsamp: 207 | ids_sup = ids[start:((i+1)*batch_size)] 208 | else: 209 | ids_sup = ids[start:realsamp] 210 | for key, value in data_train_np.items(): 211 | batch_i[key] = [data_train_np[key][ii] for ii in ids_sup] 212 | 213 | data_train_batched.append(batch_i) 214 | 215 | return data_train_batched 216 | 217 | 218 | def get_feed_dicts_old(data, placeholders, batch_size=32, pad=0, bucket_order=None, bucket_structure=None, exact_epoch=False): 219 | """Creates feed dicts for all batches with a given batch size. 220 | 221 | Args: 222 | `data` (dict): The input data for the feed dicts. 223 | `placeholders` (dict): The TensorFlow placeholders for the data 224 | (placeholders.keys() must form a subset of data.keys()). 225 | `batch_size` (int): The batch size for the data. 226 | `pad` (int): Padding symbol index to pad lists of different sizes. 227 | `bucket_order`: argument `order` in get_buckets (list with keys); `None` if no bucketing 228 | `bucket_structure`: argument `structure` in get_buckets; `None` if no bucketing 229 | `exact_epoch`: if set to `True`, final batch per bucket may be smaller, but each instance will be seen exactly 230 | once during training. Default: `False`, to be certain during training 231 | that each instance per batch gets same weight in the total loss. 232 | 233 | Returns: 234 | GeneratorWithRestart: Generator that yields a feed_dict for each 235 | iteration. A feed dict consists of '{ placeholder : data-batch }` key-value pairs. 
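    A sketch of typical usage (names such as `train_op` are illustrative, not defined here):
        for feed_dict in get_feed_dicts_old(data, placeholders, batch_size=32):
            sess.run(train_op, feed_dict=feed_dict)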
236 | """ 237 | assert isinstance(data, dict) and isinstance(placeholders, dict) 238 | assert set(placeholders.keys()).issubset(set(data.keys())), \ 239 | 'data keys %s \nnot compatible with placeholder keys %s' % (set(placeholders.keys()), set(data.keys())) 240 | 241 | def generator(): 242 | batches = get_batches(data, batch_size, pad, bucket_order, bucket_structure, exact_epoch) 243 | # fixme: this is potentially inefficient as it might be called every time we retrieve a batch 244 | # todo: measure and fix if significant impact 245 | mapped = map(lambda xs: {placeholders[k]: xs[k] for k in placeholders}, batches) 246 | #for each key in placeholders dict, pair the placeholder with the corresponding batch dict value 247 | for x in mapped: 248 | yield x 249 | 250 | return GeneratorWithRestart(generator) 251 | 252 | 253 | class GeneratorWithRestart(object): 254 | def __init__(self, iterator): 255 | self.iterator = iterator 256 | 257 | def __iter__(self): 258 | return self.iterator() 259 | 260 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorflow as tf 3 | import os 4 | import numpy as np 5 | import argparse 6 | import copy 7 | from sklearn.metrics import classification_report 8 | from mtl.tensoriser import load_data 9 | from mtl.training import train, restore_trained_model, get_preds_for_ltn 10 | from preproc.log_utils import log_results, task2score 11 | from constants import FNC, STANCE, NLI, TOPIC, LAPTOP, RESTAURANT, TASKS,\ 12 | SIM, DIV, TARGET, RNN_CELL_TYPES, TOPIC_5WAY 13 | from preproc.plot_utils import plot_label_embeddings 14 | 15 | seq1 = tf.placeholder(tf.int32, [None, None], name="seq1") 16 | seq1_lengths = tf.placeholder(tf.int32, [None], name="seq1_lengths") 17 | seq2 = tf.placeholder(tf.int32, [None, None], name="seq2") 18 | seq2_lengths = tf.placeholder(tf.int32, [None], name="seq2_lengths") 19 | targets = tf.placeholder(tf.int32, [None, None], name="targets") 20 | targets_main = tf.placeholder(tf.int32, [None, None], name="targets_main") # targets for main task 21 | features = tf.placeholder(tf.float32, [None, None], name="features") 22 | preds_for_ltn = tf.placeholder(tf.float32, [None, None], name="preds_for_ltn") # this is set to 0 initially and constantly updated during training 23 | label_vocab_inds = tf.placeholder(tf.int32, [None, None], name="label_vocab_inds") 24 | label_vocab_inds_main = tf.placeholder(tf.int32, [None, None], name="label_vocab_inds_main") # label target for main task 25 | 26 | 27 | # This dictionary determines which tasks are used. By default, it contains 28 | # all existing tasks and is then modified during setup accordingly. 29 | target_sizes = {FNC: 4, STANCE: 3, NLI: 3, TOPIC: 2, LAPTOP: 3, RESTAURANT: 3, 30 | TARGET: 3, TOPIC_5WAY: 5} 31 | target_labels = {FNC: [], STANCE: [], NLI: [], TOPIC: [], LAPTOP: [], 32 | RESTAURANT: [], TARGET: [], TOPIC_5WAY: []} 33 | 34 | placeholders = {"seq1": seq1, "seq1_lengths": seq1_lengths, "seq2": seq2, 35 | "seq2_lengths": seq2_lengths, "targets": targets, "targets_main": targets_main, 36 | "features": features, "preds_for_ltn": preds_for_ltn, 37 | "label_vocab_inds": label_vocab_inds, "label_vocab_inds_main": label_vocab_inds_main} 38 | 39 | 40 | def main(**options): 41 | 42 | # create the log directory if it does not exist 43 | log_dir = os.path.dirname(args.log_file) 44 | if not os.path.exists(log_dir): 45 | print('Creating %s...' 
% log_dir) 46 | os.makedirs(log_dir) 47 | 48 | train_feed_dicts, dev_feed_dicts, test_feed_dicts, vocab, label_vocab, ltn_sizes, label_to_labelvocab = load_data(placeholders, target_labels, target_sizes, **options) 49 | 50 | # remove tasks from target_sizes if not used 51 | for task in copy.deepcopy(set(target_sizes.keys())): 52 | if not task in options["tasks"]: 53 | target_sizes.pop(task) 54 | 55 | print("Data loaded and tensorised. Training model with settings: " + str(options)) 56 | 57 | if options['model_type'] != 'hard-sharing' and options["feature_sets"] != "predsonly": 58 | ex1 = train_feed_dicts[options["main_task"]][0] 59 | ex1feats = ex1[placeholders["features"]] 60 | input_size_preds = len(ex1feats[0]) 61 | else: 62 | input_size_preds = 0 63 | 64 | if label_vocab == None: 65 | label_vocab_len = 0 66 | else: 67 | label_vocab_len = len(label_vocab) 68 | 69 | # Do not take up all the GPU memory all the time. 70 | sess_config = tf.ConfigProto() 71 | sess_config.gpu_options.allow_growth = True 72 | with tf.Session(config=sess_config) as sess: 73 | if options["plot_embeddings"]: 74 | print('Loading the model for plotting label embeddings...') 75 | _, _, _, _, _, _, _ = restore_trained_model( 76 | placeholders, target_sizes, train_feed_dicts, vocab, 77 | label_vocab_len, label_to_labelvocab, input_size_preds, ltn_sizes, sess=sess, 78 | **options) 79 | plot_label_embeddings(sess, args.tasks, label_vocab) 80 | sys.exit(0) 81 | elif options["apply_existing_model"] == False: 82 | logits, loss, preds, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict = train(placeholders, target_sizes, train_feed_dicts, dev_feed_dicts, vocab, label_vocab, input_size_preds, ltn_sizes, label_to_labelvocab, sess=sess, **options) 83 | else: 84 | logits, loss, preds, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict = restore_trained_model(placeholders, target_sizes, train_feed_dicts, vocab, label_vocab_len, label_to_labelvocab, input_size_preds, ltn_sizes, sess=sess, **options) 85 | print('============') 86 | # Test on test data 87 | for task in target_sizes.keys(): 88 | correct_test_all, total_test, correct_test_all_ltn = 0.0, 0.0, 0.0 89 | p_inds, g_inds, p_inds_ltn, topics = [], [], [], [] 90 | for j, batch in enumerate(test_feed_dicts[task]): 91 | p = sess.run(preds[task], feed_dict=batch) 92 | pred_inds = [np.argmax(pp) for pp in p] 93 | p_inds.extend(pred_inds) 94 | gold_inds = [np.argmax(batch[placeholders["targets"]][i]) for i, targ in enumerate(batch[placeholders["targets"]])] 95 | g_inds.extend(gold_inds) 96 | hits = [pp for i, pp in enumerate(p) if np.argmax(pp) == np.argmax(batch[placeholders["targets"]][i])] 97 | correct_test_all += len(hits) 98 | total_test += len(batch[placeholders["targets"]]) 99 | 100 | # keep track of the targets for topic-based scores 101 | topics += [t for t in batch[placeholders["seq1"]]] 102 | 103 | if options["model_type"] == "semi-supervised" or options["model_type"] == "label-transfer": 104 | 105 | batch_test = get_preds_for_ltn(sess, batch, placeholders, target_sizes, task, options["main_task"], preds, 106 | options["ltn_pred_type"], label_to_labelvocab, options["lab_emb_dim"], options["model_type"]) 107 | 108 | p_ltn = sess.run(preds_dict_ltn[task], feed_dict=batch_test) 109 | pred_inds_ltn = [np.argmax(pp_dev) for pp_dev in p_ltn] 110 | p_inds_ltn.extend(pred_inds_ltn) 111 | with open(options['log_file'].replace('.txt', '_inds.txt'), 'a') as f: 112 | f.write(task + "\tMain model\t" + str(p_inds).replace("[", "").replace("]", 
"").replace(",", "") + "\n") 113 | f.write(task + "\tRelabel model\t" + str(p_inds_ltn).replace("[", "").replace("]", "").replace(",", "") + "\n") 114 | f.write(task + "\tGold\t" + str(g_inds).replace("[", "").replace("]", "").replace(",", "") + "\n") 115 | 116 | acc_test = correct_test_all/total_test 117 | print('Test performance :', "Task: " + task, "Acc: ", acc_test) 118 | test_score = task2score(task, g_inds, p_inds, topics) 119 | print('Score on test set:', test_score) 120 | try: 121 | # labels for topic 5-way are floats, so convert to string 122 | print(classification_report(g_inds, p_inds, target_names=[str(l) for l in target_labels[task]])) 123 | except IndexError: 124 | print("Training labels inconsistent with testing labels") 125 | print(classification_report(g_inds, p_inds)) 126 | 127 | acc_test_ltn = 0. 128 | if options["model_type"] == "semi-supervised" or options["model_type"] == "label_transfer": 129 | acc_test_ltn = correct_test_all_ltn / total_test 130 | task_score_ltn = task2score(task, g_inds, p_inds_ltn, topics) 131 | print('Test performance LTN:', "Task: " + task, "Acc: ", acc_test_ltn, "Task score", task_score_ltn) 132 | try: 133 | print(classification_report(g_inds, p_inds_ltn, target_names=[str(l) for l in target_labels[task]])) 134 | except IndexError: 135 | print("Training labels inconsistent with testing labels") 136 | print(classification_report(g_inds, p_inds_ltn)) 137 | log_results(options, acc_test, test_score, task_score_ltn, task) 138 | else: 139 | log_results(options, acc_test, test_score, 0.0, task) 140 | 141 | 142 | if __name__ == "__main__": 143 | 144 | parser = argparse.ArgumentParser(description='Train and Evaluate a MTL model with incompatible outputs') 145 | parser.add_argument('--debug', default=True, action='store_true', help="Debug mode -- for this, only a small portion of the data is used to test code functionality") 146 | parser.add_argument('--dev_res_during_training', default=False, action='store_true', help="If true, computes results on dev set during training") 147 | parser.add_argument('--num_instances', type=int, default=128, help="What is the maximum number of instances to use per task") 148 | parser.add_argument('--apply_existing_model', default=False, action='store_true', help="If set to True, doesn't train model but only applies trained model to test data") 149 | parser.add_argument('--tasks', nargs='+', default=TASKS, help="Tasks to train on. If this is the same as the main task, a single-task model is trained. Options:" + str(TOPIC_5WAY)) 150 | parser.add_argument('--main_task', type=str, default=RESTAURANT, help="The main task.") 151 | parser.add_argument('--feature_sets', nargs='+', help='data feature sets. In the paper, only diversity features are tested.', default=DIV) 152 | parser.add_argument('--ltn_pred_type', type=str, help='Whether to use hard or soft predictions as input to LTN model. 
In the experiments described in the paper, only soft predictions are used.', default='soft') 153 | parser.add_argument('--main_num_layers', type=int, help='If > 1, number of hidden layer for main model.', default=1) 154 | parser.add_argument('--lel_hid_size', type=int, help='If > 0, size of hidden layer for label embedding layer, as described in Section 3.2 of the paper.', default=0) 155 | parser.add_argument('--model_type', default='label-transfer', choices={'hard-sharing', 'label-transfer', 'semi-supervised'}, help="What model variant to use: " 156 | "'hard-sharing' is the MTL with hard parameter sharing model (Section 3.1), " 157 | "'label-transfer' is the label transfer network (Section 3.3), " 158 | "'semi-supervised' is the semi-supervised MTL (Section 3.4)") 159 | parser.add_argument('--relabel_with_ltn', default=False, action='store_true', help="Only relevant for semi-supervised model: do we actually use it to relabel data or not. The latter can be used for debugging purposes. " 160 | "Otherwise, this is the semi-supervised variant of the LTN described in Section 3.4 of the paper") 161 | parser.add_argument('--task_specific_layer_size', type=int, default=1, help="If >0, adds a task-specific hidden layer with that size and skip-connections") 162 | parser.add_argument('--batch_size', type=int, default=16, help="What batch size should be used") 163 | parser.add_argument('--max_epochs', type=int, default=1, help="What is the maximum number of epochs to train main model for") 164 | parser.add_argument('--max_epochs_ltn', type=int, default=2, help="What is the maximum number of epochs to train LTN model for") 165 | parser.add_argument('--max_epochs_after_ltn', type=int, default=0, help="After we've trained the relabelling function, how many epochs should we train for with augmented data.") 166 | parser.add_argument('--early_stopping', type=float, default=1.0, help="Threshold for early stopping on dev set of main task. If 1.0, there is no early stopping.") 167 | parser.add_argument('--emb_dim', type=int, default=16, help="What embedding size should be used") 168 | parser.add_argument('--lab_emb_dim', type=int, default=16, help='What embedding size should be used for the label embeddings. If 0, no label embeddings are used.') 169 | parser.add_argument('--lab_embs_for_ltn', default=False, action='store_true', help='Whether to use label embeddings for relabelling function or not.') 170 | parser.add_argument('--skip_connections', default=False, action='store_true', help='Skip connections for the RNN or not') 171 | parser.add_argument('--learning_rate', type=float, default=0.01, help="What initial learning rate should be used") 172 | parser.add_argument('--dropout_rate', type=float, default=1.0, help="What rate of dropout should be used. 1.0 -> no dropout") 173 | parser.add_argument('--l1_rate_main', type=float, default=1.0, help="What rate of l1 regularisation should be used for main model. 1.0 -> no l1") 174 | parser.add_argument('--l2_rate_main', type=float, default=1.0, help="What rate of l2 regularisation should be used for main model. 1.0 -> no l2") 175 | parser.add_argument('--l1_rate_ltn', type=float, default=1.0, help="What rate of l1 regularisation should be used for em model. 1.0 -> no l1") 176 | parser.add_argument('--l2_rate_ltn', type=float, default=1.0, help="What rate of l2 regularisation should be used for em model. 1.0 -> no l2") 177 | parser.add_argument('--rnn_cell_type', type=str, help='RNN cell type. 
Options:' + str(RNN_CELL_TYPES), default="lstm") 178 | parser.add_argument('--attention', default=False, action='store_true', help='Word by word attention mechanism') 179 | parser.add_argument('--save_model', default=False, action='store_true', help="Save model after end of training") 180 | parser.add_argument('--exp_id', type=str, default="run1", help="Experiment ID. In case the same experiment with the same configurations needs to be run more than once.") 181 | parser.add_argument('--features-path', type=str, default='saved_features_new', help='the directory where the computed features are saved') 182 | parser.add_argument('--log_file', type=str, default="./log.txt", help='the path to which results should be logged') 183 | parser.add_argument('--alternate_batches', default=True, action='store_true', help='alternate tasks between batches instead of between epochs during training') 184 | parser.add_argument('--plot_embeddings', action='store_true', help='plot label embeddings of trained model') 185 | 186 | args = parser.parse_args() 187 | if args.debug: 188 | print('Debugging is switched on. Only a small portion of data is used.') 189 | if args.apply_existing_model: 190 | args.save_model = False 191 | if args.alternate_batches: 192 | print('Alternating tasks between batches...') 193 | else: 194 | print('Alternating tasks between epochs...') 195 | if args.feature_sets == 'predsonly' and args.model_type == 'semi-supervised': 196 | print("The model type 'label-transfer' needs to be used for this to work. Changing it to that setting.") 197 | args.model_type = 'label-transfer' 198 | main(**vars(args)) 199 | -------------------------------------------------------------------------------- /features/features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Methods to create the similarity and diversity features used in 3 | Ruder & Plank (2017). 4 | """ 5 | 6 | import sys 7 | import os 8 | import numpy as np 9 | np.seterr(all='raise') 10 | import scipy.stats 11 | import scipy.spatial 12 | 13 | from preproc.map import tokenize, lower, deep_map, deep_seq_map 14 | from constants import SIMILARITY_FEATURES, DIVERSITY_FEATURES, SIM, DIV 15 | 16 | # ------------ Feature methods ------------ 17 | 18 | 19 | def create_features(feature_sets, task2examples, vocab, save_path): 20 | """ 21 | Retrieve the feature representations of a list of examples. 22 | :param feature_sets: a list containing the names of features to be used 23 | :param task2examples: mapping of tasks to lists of untokenized texts 24 | :param vocab: the Vocabulary object 25 | :param save_path: the directory where the features should be stored 26 | :return: a mapping of tasks to feature representations of shape 27 | (num_examples, num_features); the features correspond to the order 28 | of the data; first training examples, then dev, then test 29 | """ 30 | # create the features for each example in each task 31 | task2features = {t: [] for t in task2examples.keys()} 32 | 33 | # get the feature names 34 | feature_names = [] 35 | if SIM in feature_sets: 36 | feature_names += SIMILARITY_FEATURES 37 | if DIV in feature_sets: 38 | feature_names += DIVERSITY_FEATURES 39 | 40 | print("Trying to find feature files in", save_path) 41 | if os.path.exists(save_path) and os.path.isdir(save_path) and len( 42 | os.listdir(save_path)) > 0: 43 | feature_dim = None 44 | for task in task2examples.keys(): 45 | assert task in os.listdir(save_path),\ 46 | 'Error: No saved features available for task %s in dir %s.' 
\ 47 | % (task, save_path) 48 | print("Task2features") 49 | print(task2features) 50 | print("Files in features folder") 51 | print(os.listdir(save_path)) 52 | for task in os.listdir(save_path): 53 | # then we don't need to load it 54 | if not task in task2features.keys(): 55 | continue 56 | with open(os.path.join(save_path, task), 'r') as f: 57 | for line in f: 58 | features = np.fromstring(line.strip('[]'), dtype=float, 59 | sep=' ') 60 | if feature_dim is None: 61 | feature_dim = len(features) 62 | assert feature_dim == len(feature_names),\ 63 | 'Error: # of loaded features %d != # of specified '\ 64 | 'features %d.' % (feature_dim, len(feature_names)) 65 | assert feature_dim == len(features),\ 66 | 'Error: Different # of features among examples, ' \ 67 | 'i.e. %d and %d.' % (feature_dim, len(features)) 68 | task2features[task].append(features) 69 | print('Loaded %d-d features for %s from %s...' 70 | % (feature_dim, task, save_path)) 71 | return task2features 72 | 73 | if not os.path.exists(save_path): 74 | os.makedirs(save_path) 75 | 76 | # tokenize and lower-case the documents 77 | for task, examples in task2examples.items(): 78 | examples = deep_map(examples, tokenize) 79 | examples = deep_seq_map(examples, lower) 80 | task2examples[task] = examples 81 | 82 | # get the term distribution of the data for each task (shape (vocab_size,) ) 83 | # and for each example (shape (num_examples, vocab_size) ) 84 | task2task_term_dist = {} 85 | for task, examples in task2examples.items(): 86 | task2task_term_dist[task] = get_term_dist(examples, vocab.sym2id) 87 | 88 | for task, examples in task2examples.items(): 89 | for i, example in enumerate(examples): 90 | term_dist = get_term_dist([example], vocab.sym2id) 91 | features = [] 92 | for f_name in feature_names: 93 | # check whether feature belongs to similarity-based features, 94 | # diversity-based features, etc. 95 | if f_name in SIMILARITY_FEATURES: 96 | # compute the similarity with regard to each task 97 | for target_task in task2examples.keys(): 98 | f = similarity_name2value( 99 | f_name, term_dist, task2task_term_dist[target_task]) 100 | if np.isnan(f).any() or np.isinf(f).any(): 101 | if type(f) != list: 102 | f = [0 for ff in f] 103 | elif type(f) == int: 104 | f = 0 105 | elif type(f) == float: 106 | f = 0.0 107 | features.append(f) 108 | elif f_name in DIVERSITY_FEATURES: 109 | f = diversity_feature_name2value( 110 | f_name, example, task2task_term_dist[task], 111 | vocab.sym2id) 112 | if np.isnan(f).any() or np.isinf(f).any(): 113 | if type(f) != list: 114 | f = [0 for ff in f] 115 | elif type(f) == int: 116 | f = 0 117 | elif type(f) == float: 118 | f = 0.0 119 | features.append(f) 120 | else: 121 | raise ValueError('%s is not a valid feature name.' % f_name) 122 | #assert not np.isnan(features).any(), 'Error: NAN value in array.' 123 | #assert not np.isinf(features).any(), 'Error: inf or -inf value.' 124 | task2features[task].append(features) 125 | if i % 100 == 0 and i > 0: 126 | print('%s. Created features for %d examples.' 
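The cached features are read back above with np.fromstring and written further below with np.array_str, one whitespace-separated vector per line. A minimal round-trip sketch of that on-disk format (the vector is made up):

```python
import sys
import numpy as np

example_features = np.array([0.12, -1.5, 3.0])                     # invented vector
line = np.array_str(example_features, max_line_width=sys.maxsize)  # e.g. '[ 0.12 -1.5   3.  ]'
recovered = np.fromstring(line.strip('[]'), dtype=float, sep=' ')
assert np.allclose(example_features, recovered)
```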
% (task, i)) 127 | task2features[task] = np.array(task2features[task]) 128 | 129 | # z-normalize the feature scores 130 | feature_values = scipy.stats.zscore(np.vstack([f for f in 131 | task2features.values()]), axis=0) 132 | start_idx = 0 133 | for task, features in task2features.items(): 134 | task2features[task] = feature_values[ 135 | start_idx:start_idx+features.shape[0], :] 136 | start_idx += features.shape[0] 137 | 138 | # write the features to the corresponding file 139 | file_path = os.path.join(save_path, task) 140 | with open(file_path, 'w') as f: 141 | for example_features in task2features[task]: 142 | # set max_line_width so that features don't wrap across lines 143 | f.write('%s\n' % np.array_str(example_features, 144 | max_line_width=sys.maxsize)) 145 | print('Wrote %s %d-d features to %s...' % (task, len(feature_names), 146 | file_path)) 147 | print('Created features.') 148 | return task2features 149 | 150 | 151 | def get_term_dist(docs, word2id, lowercase=True): 152 | """ 153 | Calculates the term distribution of a list of documents. 154 | :param docs: a list of tokenized docs; can also contain a single document 155 | :param word2id: the word-to-id mapping 156 | :param lowercase: lower-case the input data 157 | :return: the term distribution of the input documents, 158 | i.e. a numpy array of shape (vocab_size,) 159 | """ 160 | term_dist = np.zeros(len(word2id)) 161 | for doc in docs: 162 | for word in doc: 163 | if lowercase: 164 | word = word.lower() 165 | if word in word2id: 166 | term_dist[word2id[word]] += 1 167 | 168 | # normalize absolute freqs to obtain a relative frequency term distribution 169 | term_dist /= np.sum(term_dist) 170 | if np.isnan(np.sum(term_dist)): 171 | # the sum is nan if docs only contains one document and that document 172 | # has no words in the vocabulary 173 | term_dist = np.zeros(len(word2id)) 174 | return term_dist 175 | 176 | 177 | # ------------ Similarity features ------------ 178 | 179 | def jensen_shannon_divergence(repr1, repr2): 180 | """Calculates Jensen-Shannon divergence (https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence).""" 181 | avg_repr = 0.5 * (repr1 + repr2) 182 | sim = 1 - 0.5 * (scipy.stats.entropy(repr1, avg_repr) + scipy.stats.entropy(repr2, avg_repr)) 183 | if np.isinf(sim): 184 | # the similarity is -inf if no term in the document is in the vocabulary 185 | return 0 186 | return sim 187 | 188 | 189 | def renyi_divergence(repr1, repr2, alpha=0.99): 190 | """Calculates Renyi divergence (https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy#R.C3.A9nyi_divergence).""" 191 | try: 192 | log_sum = np.sum([np.power(p, alpha) / np.power(q, alpha-1) for (p, q) in zip(repr1, repr2)]) 193 | sim = 1 / (alpha - 1) * np.log(log_sum) 194 | except FloatingPointError: # division by 0 error 195 | return 0 196 | if np.isinf(sim): 197 | # the similarity is -inf if no term in the document is in the vocabulary 198 | return 0 199 | return sim 200 | 201 | 202 | def cosine_similarity(repr1, repr2): 203 | """Calculates cosine similarity (https://en.wikipedia.org/wiki/Cosine_similarity).""" 204 | if repr1 is None or repr2 is None: 205 | return 0 206 | assert not (np.isnan(repr2).any() or np.isinf(repr2).any()) 207 | assert not (np.isnan(repr1).any() or np.isinf(repr1).any()) 208 | sim = 1 - scipy.spatial.distance.cosine(repr1, repr2) 209 | if np.isnan(sim): 210 | # the similarity is nan if no term in the document is in the vocabulary 211 | return 0 212 | return sim 213 | 214 | 215 | def euclidean_distance(repr1, repr2): 216 | """Calculates 
Euclidean distance (https://en.wikipedia.org/wiki/Euclidean_distance).""" 217 | sim = np.sqrt(np.sum([np.power(p-q, 2) for (p, q) in zip(repr1, repr2)])) 218 | return sim 219 | 220 | 221 | def variational_distance(repr1, repr2): 222 | """Also known as L1 or Manhattan distance (https://en.wikipedia.org/wiki/Taxicab_geometry).""" 223 | sim = np.sum([np.abs(p-q) for (p, q) in zip(repr1, repr2)]) 224 | return sim 225 | 226 | 227 | def kl_divergence(repr1, repr2): 228 | """Calculates Kullback-Leibler divergence (https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).""" 229 | sim = scipy.stats.entropy(repr1, repr2) 230 | return sim 231 | 232 | 233 | def bhattacharyya_distance(repr1, repr2): 234 | """Calculates Bhattacharyya distance (https://en.wikipedia.org/wiki/Bhattacharyya_distance).""" 235 | try: 236 | sim = - np.log(np.sum([np.sqrt(p*q) for (p, q) in zip(repr1, repr2)])) 237 | except FloatingPointError: # division by 0 error 238 | return 0 239 | assert not np.isnan(sim), 'Error: Similarity is nan.' 240 | if np.isinf(sim): 241 | # the similarity is -inf if no term in the review is in the vocabulary 242 | return 0 243 | return sim 244 | 245 | 246 | def similarity_name2value(s_name, repr1, repr2): 247 | """Given a similarity function name, return the corresponding similarity function value.""" 248 | if s_name == 'jensen-shannon': 249 | return jensen_shannon_divergence(repr1, repr2) 250 | if s_name == 'renyi': 251 | return renyi_divergence(repr1, repr2) 252 | if s_name == 'cos' or s_name == 'cosine': 253 | return cosine_similarity(repr1, repr2) 254 | if s_name == 'euclidean': 255 | return euclidean_distance(repr1, repr2) 256 | if s_name == 'variational': 257 | return variational_distance(repr1, repr2) 258 | if s_name == 'kl': 259 | return kl_divergence(repr1, repr2) 260 | if s_name == 'bhattacharyya': 261 | return bhattacharyya_distance(repr1, repr2) 262 | raise ValueError('%s is not a valid feature name.' 
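A toy sanity check of the similarity measures defined above; the two term distributions are invented, and the snippet relies on similarity_name2value and the SIMILARITY_FEATURES list imported from constants at the top of this module:

```python
import numpy as np

dist_a = np.array([0.4, 0.3, 0.2, 0.1])   # invented term distributions
dist_b = np.array([0.1, 0.2, 0.3, 0.4])   # over a 4-word vocabulary

for name in SIMILARITY_FEATURES:
    print(name, similarity_name2value(name, dist_a, dist_b))

# Identical distributions score as maximally similar, e.g. Jensen-Shannon
# returns 1.0 while the Euclidean and variational distances return 0.0.
print(similarity_name2value('jensen-shannon', dist_a, dist_a))
```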
% s_name) 263 | 264 | 265 | # ------------ Diversity features ------------ 266 | 267 | def number_of_word_types(example): 268 | """Counts the number of word types of the example.""" 269 | return len(set(example)) 270 | 271 | 272 | def type_token_ratio(example): 273 | """Calculates the type-token ratio of the example.""" 274 | return number_of_word_types(example) / len(example) 275 | 276 | 277 | def entropy(example, train_term_dist, word2id): 278 | """Calculates Entropy (https://en.wikipedia.org/wiki/Entropy_(information_theory)).""" 279 | summed = 0 280 | for word in set(example): 281 | if word in word2id: 282 | p_word = train_term_dist[word2id[word]] 283 | summed += p_word * np.log(p_word) 284 | return - summed 285 | 286 | 287 | def simpsons_index(example, train_term_dist, word2id): 288 | """Calculates Simpson's Index (https://en.wikipedia.org/wiki/Diversity_index#Simpson_index).""" 289 | score = np.sum([np.power(train_term_dist[word2id[word]], 2) if word in word2id else 0 290 | for word in set(example)]) 291 | return score 292 | 293 | 294 | def quadratic_entropy(example, train_term_dist, word2id, word2vec): 295 | """Calculates Quadratic Entropy.""" 296 | assert word2vec is not None, ('Error: Word vector representations have to ' 297 | 'be available for quadratic entropy.') 298 | summed = 0 299 | for word_1 in set(example): 300 | if word_1 not in word2id or word_1 not in word2vec: 301 | continue # continue as the product will be 0 302 | for word_2 in set(example): 303 | if word_2 not in word2id or word_2 not in word2vec: 304 | continue # continue as the product will be 0 305 | p_1 = train_term_dist[word2id[word_1]] 306 | p_2 = train_term_dist[word2id[word_2]] 307 | vec_1 = word2vec[word_1] 308 | vec_2 = word2vec[word_2] 309 | sim = cosine_similarity(vec_1, vec_2) 310 | summed += sim * p_1 * p_2 311 | return summed 312 | 313 | 314 | def renyi_entropy(example, domain_term_dist, word2id): 315 | """Calculates Rényi Entropy (https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy).""" 316 | alpha = 0.99 317 | summed = np.sum([np.power(domain_term_dist[word2id[word]], alpha) if word in word2id else 0 for word in set(example)]) 318 | if summed == 0: 319 | # 0 if none of the words appear in the dictionary; 320 | # set to a small constant == low prob instead 321 | summed = 0.0001 322 | score = 1 / (1 - alpha) * np.log(summed) 323 | return score 324 | 325 | 326 | def diversity_feature_name2value(f_name, example, task_term_dist, word2id): 327 | """ 328 | Given a feature name, return the corresponding feature value. 329 | :param f_name: the name of the feature 330 | :param example: the tokenised example document 331 | :param task_term_dist: the term distribution of the task of the example 332 | :param word2id: the word-to-id mapping 333 | :param word2vec: a mapping of a word to its word vector representation (e.g. GloVe or word2vec) 334 | :return: the value of the corresponding feature 335 | """ 336 | if f_name == 'num_word_types': 337 | return number_of_word_types(example) 338 | if f_name == 'type_token_ratio': 339 | return type_token_ratio(example) 340 | if f_name == 'entropy': 341 | return entropy(example, task_term_dist, word2id) 342 | if f_name == 'simpsons_index': 343 | return simpsons_index(example, task_term_dist, word2id) 344 | # if f_name == 'quadratic_entropy': 345 | # return quadratic_entropy(example, train_term_dist, word2id, word2vec) 346 | if f_name == 'renyi_entropy': 347 | return renyi_entropy(example, task_term_dist, word2id) 348 | raise ValueError('%s is not a valid feature name.' 
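A toy walk-through of the diversity features defined above, using an invented vocabulary, term distribution and tokenised example, and relying on the diversity_feature_name2value dispatcher and DIVERSITY_FEATURES list from this module:

```python
import numpy as np

toy_word2id = {'the': 0, 'movie': 1, 'was': 2, 'great': 3}   # invented vocabulary
toy_term_dist = np.array([0.4, 0.2, 0.2, 0.2])               # invented task-level term distribution
toy_example = ['the', 'movie', 'was', 'great', 'the']        # tokenised document

for name in DIVERSITY_FEATURES:
    print(name, diversity_feature_name2value(name, toy_example,
                                             toy_term_dist, toy_word2id))
# e.g. num_word_types -> 4 and type_token_ratio -> 0.8 for this example
```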
% f_name) 349 | -------------------------------------------------------------------------------- /mtl/nn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def bicond_reader(placeholders, target_sizes, vocab_size, label_vocab_size, **options): 5 | emb_dim = options["emb_dim"] 6 | lab_emb_dim = options["lab_emb_dim"] 7 | 8 | # [batch_size, max_seq1_length] 9 | seq1 = placeholders['seq1'] 10 | 11 | # [batch_size, max_seq2_length] 12 | seq2 = placeholders['seq2'] 13 | 14 | # [batch_size, labels_size] 15 | targets = tf.to_float(placeholders['targets']) 16 | 17 | label_vocab_inds = placeholders['label_vocab_inds'] 18 | 19 | init = tf.contrib.layers.xavier_initializer(uniform=True) 20 | 21 | with tf.variable_scope("embeddings"): 22 | embeddings = tf.get_variable("word_embeddings", [vocab_size, emb_dim], dtype=tf.float32, initializer=init) 23 | 24 | with tf.variable_scope("embedders") as varscope: 25 | seq1_embedded = tf.nn.embedding_lookup(embeddings, seq1) 26 | varscope.reuse_variables() 27 | seq2_embedded = tf.nn.embedding_lookup(embeddings, seq2) 28 | 29 | with tf.variable_scope("conditional_reader_seq1") as varscope1: 30 | # seq1_states: (c_fw, h_fw), (c_bw, h_bw) 31 | _, seq1_states = reader(seq1_embedded, placeholders['seq1_lengths'], emb_dim, 32 | scope=varscope1, **options) 33 | 34 | with tf.variable_scope("conditional_reader_seq2") as varscope2: 35 | varscope1.reuse_variables() 36 | outputs, states = reader(seq2_embedded, placeholders['seq2_lengths'], emb_dim, seq1_states, scope=varscope2, **options) 37 | 38 | # shape output: [batch_size, 2*emb_dim] 39 | if options["main_num_layers"] == 1: 40 | # shape states: [2, 2] 41 | output = tf.concat([states[0][1], states[1][1]], 1) 42 | else: 43 | # shape states: [2, num_layers, 2] 44 | output = tf.concat([states[0][-1][1], states[1][-1][1]], 1) 45 | 46 | if lab_emb_dim != 0: 47 | with tf.variable_scope("label_embeddings"): 48 | label_embeddings = tf.get_variable("label_embeddings", [label_vocab_size, lab_emb_dim], dtype=tf.float32, initializer=init) 49 | 50 | with tf.variable_scope("bicond_preds"): 51 | # output of sequence encoders is projected into separate output layers, one for each task 52 | scores_dict, loss_dict, predict_dict = {}, {}, {} 53 | # iterate over the tasks 54 | for k in target_sizes.keys(): 55 | # use task name as variable scope 56 | with tf.variable_scope(k) as varscope_task: 57 | if options["task_specific_layer_size"] > 0: 58 | with tf.variable_scope(k + "_task_spec_layer") as task_spec_layer_scope: 59 | output = tf.contrib.layers.fully_connected(output, options["task_specific_layer_size"], 60 | weights_initializer=init, 61 | activation_fn=tf.tanh, scope=task_spec_layer_scope) 62 | if lab_emb_dim != 0: 63 | 64 | # placeholders['label_vocab_inds'] contain the index of the labels and 0 elsewhere, e.g. [0, 0, 0, 4, 5, 6, 0, 0, ...] 65 | # shape: [batch_size, num_tasks*num_labels, label_embed_dim] 66 | labels_embedded = tf.nn.embedding_lookup(label_embeddings, label_vocab_inds) 67 | 68 | output_dim = emb_dim*2 69 | if options["task_specific_layer_size"] > 0: 70 | output_dim = options["task_specific_layer_size"] 71 | 72 | output, labels_embedded = pad_output(output, labels_embedded, output_dim, lab_emb_dim) 73 | 74 | # get predictions with dot product between output and embedded labels. 
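The call below delegates to dotprod_with_lab_embs, defined further down in this file. A NumPy shape sketch of the same computation (sizes and values are made up): each label embedding is scored against the encoder output, and only the positions whose label_vocab_inds entry is non-zero, i.e. the labels of the current task, are kept.

```python
import numpy as np

batch_size, num_all_labels, dim = 2, 6, 4                     # invented sizes
output_np = np.random.rand(batch_size, dim)                   # encoder output
labels_embedded_np = np.random.rand(batch_size, num_all_labels, dim)
label_vocab_inds_np = np.array([[0, 0, 0, 4, 5, 6],           # non-zero entries mark the
                                [0, 0, 0, 4, 5, 6]])          # three labels of this task

all_scores = (output_np[:, None, :] * labels_embedded_np).sum(axis=2)        # [batch, num_all_labels]
task_scores = all_scores[label_vocab_inds_np != 0].reshape(batch_size, -1)   # [batch, 3]
```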
75 | scores = dotprod_with_lab_embs(output, labels_embedded, label_vocab_inds) 76 | 77 | # boolean_mask returns a 1-d tensor, so we need to reshape 78 | # works for all models since we compute the target sizes for all models 79 | scores = tf.reshape(scores, [-1, target_sizes[k]]) 80 | loss = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=targets) 81 | predict = tf.nn.softmax(scores) 82 | 83 | else: 84 | label_embeddings = None 85 | if options["l1_rate_main"] != 1.0 or options["l2_rate_main"] != 1.0: 86 | with tf.variable_scope(k + "_l1l2_reg") as l1l2scope: 87 | l1_l2 = tf.contrib.layers.l1_l2_regularizer(scale_l1=options["l1_rate_main"], scale_l2=options["l2_rate_main"], scope=l1l2scope) 88 | scores = tf.contrib.layers.fully_connected(output, target_sizes[k], weights_initializer=init, 89 | activation_fn=tf.tanh, scope=varscope_task, weights_regularizer=l1_l2) # target_size 90 | else: 91 | scores = tf.contrib.layers.fully_connected(output, target_sizes[k], weights_initializer=init, activation_fn=tf.tanh, scope=varscope_task) # target_size 92 | loss = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=targets) 93 | predict = tf.nn.softmax(scores) 94 | 95 | scores_dict[k] = scores 96 | loss_dict[k] = loss 97 | predict_dict[k] = predict 98 | 99 | return scores_dict, loss_dict, predict_dict, label_embeddings 100 | 101 | 102 | def relabel_model(placeholders, target_sizes, input_size_feats, input_size_preds, label_embeddings, **options): 103 | lab_emb_dim = options["lab_emb_dim"] 104 | 105 | soft_or_hard = options['ltn_pred_type'] 106 | hidd_layer_size = options['lel_hid_size'] 107 | 108 | # [batch_size, num_tasks - 2] 109 | ltn_preds = placeholders['preds_for_ltn'] 110 | 111 | if options["feature_sets"] != "predsonly": 112 | # [batch_size, num_features] 113 | features = placeholders['features'] 114 | 115 | # [batch_size, labels_size] 116 | targets = tf.to_float(placeholders['targets']) 117 | 118 | label_vocab_inds = placeholders['label_vocab_inds'] 119 | 120 | # for returning main task predictions for relabelling with EM 121 | targets_main = tf.to_float(placeholders['targets_main']) 122 | label_vocab_inds_main = placeholders['label_vocab_inds_main'] 123 | 124 | with tf.variable_scope("ltn_preds"): 125 | # output of sequence encoders is projected into separate output layers, one for each task 126 | init = tf.contrib.layers.xavier_initializer(uniform=True) 127 | scores_dict, loss_dict, predict_dict, predict_main_dict = {}, {}, {}, {} 128 | # iterate over the tasks 129 | for k in target_sizes.keys(): 130 | # use task name as variable scope 131 | with tf.variable_scope(k) as varscope_task: 132 | if options["feature_sets"] != "predsonly": 133 | # concatenate the predictions with the features 134 | if soft_or_hard == 'hard': 135 | emb_size = input_size_feats + input_size_preds 136 | else: 137 | emb_size = input_size_feats + input_size_preds[k] 138 | 139 | output = tf.reshape(tf.concat([ltn_preds, features], 1), [-1, emb_size]) 140 | else: 141 | if soft_or_hard == 'hard': 142 | emb_size = input_size_preds 143 | else: 144 | emb_size = input_size_preds[k] 145 | output = tf.reshape(ltn_preds, [-1, emb_size]) 146 | 147 | 148 | if options["l1_rate_ltn"] != 1.0 or options["l2_rate_ltn"] != 1.0: 149 | l1_l2 = tf.contrib.layers.l1_l2_regularizer(scale_l1=options["l1_rate_ltn"], scale_l2=options["l2_rate_ltn"]) 150 | 151 | output_dim = emb_size 152 | 153 | if hidd_layer_size != 0: 154 | if options["l1_rate_ltn"] != 1.0 or options["l2_rate_ltn"] != 1.0: 155 | with tf.variable_scope(k + 
"_relabel_hidd_layer") as task_spec_relabel_layer_scope: 156 | output = tf.contrib.layers.fully_connected(output, hidd_layer_size, weights_initializer=init, weights_regularizer=l1_l2, scope=task_spec_relabel_layer_scope) 157 | else: 158 | with tf.variable_scope(k + "_relabel_hidd_layer") as task_spec_relabel_layer_scope: 159 | output = tf.contrib.layers.fully_connected(output, hidd_layer_size, weights_initializer=init, scope=task_spec_relabel_layer_scope) 160 | 161 | output_dim = hidd_layer_size 162 | 163 | predict_main = None 164 | 165 | if options["lab_embs_for_ltn"]: 166 | 167 | # placeholders['label_vocab_inds'] contain the index of the labels and 0 elsewhere, e.g. [0, 0, 0, 4, 5, 6, 0, 0, ...] 168 | # shape: [batch_size, num_tasks*num_labels, label_embed_dim] 169 | labels_embedded = tf.nn.embedding_lookup(label_embeddings, label_vocab_inds) 170 | 171 | output_padded, labels_embedded = pad_output(output, labels_embedded, output_dim, lab_emb_dim) 172 | 173 | # get predictions with dot product between output and embedded labels. 174 | scores = dotprod_with_lab_embs(output_padded, labels_embedded, label_vocab_inds) 175 | 176 | # boolean_mask returns a 1-d tensor, so we need to reshape 177 | scores = tf.reshape(scores, tf.shape(targets)) 178 | 179 | # then we also want to return predictions for the main task 180 | if options["relabel_with_ltn"]: 181 | labels_embedded_main = tf.nn.embedding_lookup(label_embeddings, label_vocab_inds_main) 182 | 183 | output_padded_main, labels_embedded_main = pad_output(output, labels_embedded_main, output_dim, lab_emb_dim) 184 | 185 | # get predictions with dot product between output and embedded main task labels. 186 | scores_main = dotprod_with_lab_embs(output_padded_main, labels_embedded_main, label_vocab_inds_main) 187 | 188 | # boolean_mask returns a 1-d tensor, so we need to reshape 189 | scores_main = tf.reshape(scores_main, tf.shape(targets_main)) 190 | 191 | predict_main = tf.nn.softmax(scores_main) 192 | 193 | 194 | else: 195 | if options["l1_rate_ltn"] != 1.0 or options["l2_rate_ltn"] != 1.0: 196 | scores = tf.contrib.layers.fully_connected(output, target_sizes[k], weights_initializer=init, 197 | activation_fn=tf.tanh, scope=varscope_task, weights_regularizer=l1_l2) # target_size 198 | 199 | else: 200 | scores = tf.contrib.layers.fully_connected(output, target_sizes[k], weights_initializer=init, activation_fn=tf.tanh, scope=varscope_task) # target_size 201 | loss = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=targets) 202 | predict = tf.nn.softmax(scores) 203 | 204 | scores_dict[k] = scores 205 | loss_dict[k] = loss 206 | predict_dict[k] = predict 207 | predict_main_dict[k] = predict_main 208 | 209 | return scores_dict, loss_dict, predict_dict, predict_main_dict 210 | 211 | 212 | def dotprod_with_lab_embs(output, labels_embedded, label_vocab_inds): 213 | # dot product needs to happen with all the labels 214 | # shape output_expanded: [batch_size, 1, emb_dim*2] 215 | # shape labels_expanded: [batch_size, num_labels, emb_dim*2] 216 | # shape comb_repr: [batch_size, num_labels, emb_dim*2] 217 | output_expanded = tf.expand_dims(output, 1) 218 | comb_repr = tf.multiply(output_expanded, labels_embedded) 219 | 220 | # we remove the embedding dimension so we just have the scores 221 | #  shape: [batch_size, num_tasks*num_labels] 222 | reduced_output = tf.reduce_sum(comb_repr, 2) 223 | # now we want to mask it so only the labels for the task for which we have training data is taken into account for the loss 224 | # ... 
but this doesn't work yet 225 | #  a vector of zeros: [0, 0, 0, 0, 0, ...] 226 | zeroes = tf.zeros_like(label_vocab_inds) 227 | #  a vector indicating where label indices != 0 228 | #  [False, False, False, True, True, True, False, False, ...] 229 | mask = tf.not_equal(label_vocab_inds, zeroes) 230 | scores = tf.boolean_mask(reduced_output, mask) 231 | 232 | return scores 233 | 234 | 235 | def pad_output(output, labels_embedded, output_dim, lab_emb_dim): 236 | if output_dim > lab_emb_dim: 237 | howmany = output_dim - lab_emb_dim 238 | labels_embedded = tf.pad(labels_embedded, [[0, 0], [0, 0], [0, howmany]], constant_values=0) 239 | elif lab_emb_dim > output_dim: 240 | howmany = lab_emb_dim - output_dim 241 | output = tf.pad(output, [[0, 0], [0, howmany]], constant_values=0) 242 | return output, labels_embedded 243 | 244 | 245 | def reader(inputs, lengths, output_size, contexts=(None, None), scope=None, **options): 246 | """Dynamic bi-LSTM reader; can be conditioned with initial state of other rnn. 247 | 248 | Args: 249 | inputs (tensor): The inputs into the bi-LSTM 250 | lengths (tensor): The lengths of the sequences 251 | output_size (int): Size of the LSTM state of the reader. 252 | context (tensor=None, tensor=None): Tuple of initial (forward, backward) states 253 | for the LSTM 254 | scope (string): The TensorFlow scope for the reader. 255 | drop_keep_drop (float=1.0): The keep probability for dropout. 256 | 257 | Returns: 258 | Outputs (tensor): The outputs from the bi-LSTM. 259 | States (tensor): The cell states from the bi-LSTM. 260 | """ 261 | 262 | skip_connections = options["skip_connections"] 263 | attention = options["attention"] 264 | num_layers = options["main_num_layers"] 265 | drop_keep_prob = options["dropout_rate"] 266 | 267 | with tf.variable_scope(scope or "reader") as varscope: 268 | if options["rnn_cell_type"] == "layer_norm": 269 | cell_fw = tf.contrib.rnn.LayerNormBasicLSTMCell(output_size) 270 | cell_bw = tf.contrib.rnn.LayerNormBasicLSTMCell(output_size) 271 | elif options["rnn_cell_type"] == "nas": 272 | cell_fw = tf.contrib.rnn.NASCell(output_size) 273 | cell_bw = tf.contrib.rnn.NASCell(output_size) 274 | elif options["rnn_cell_type"] == "phasedlstm": 275 | cell_fw = tf.contrib.rnn.PhasedLSTMCell(output_size) 276 | cell_bw = tf.contrib.rnn.PhasedLSTMCell(output_size) 277 | else: #LSTM cell 278 | cell_fw = tf.contrib.rnn.LSTMCell(output_size, initializer=tf.contrib.layers.xavier_initializer()) 279 | cell_bw = tf.contrib.rnn.LSTMCell(output_size, initializer=tf.contrib.layers.xavier_initializer()) 280 | if num_layers > 1: 281 | cell_fw = tf.nn.rnn_cell.MultiRNNCell([cell_fw] * num_layers) 282 | cell_bw = tf.nn.rnn_cell.MultiRNNCell([cell_bw] * num_layers) 283 | 284 | if drop_keep_prob != 1.0: 285 | cell_fw = tf.contrib.rnn.DropoutWrapper(cell=cell_fw, output_keep_prob=drop_keep_prob) 286 | cell_bw = tf.contrib.rnn.DropoutWrapper(cell=cell_bw, output_keep_prob=drop_keep_prob) 287 | 288 | if skip_connections == True: 289 | cell_fw = tf.contrib.rnn.ResidualWrapper(cell_fw) 290 | cell_bw = tf.contrib.rnn.ResidualWrapper(cell_bw) 291 | 292 | if attention == True: 293 | cell_fw = tf.contrib.rnn.AttentionCellWrapper(cell_fw, attn_length=10) 294 | cell_bw = tf.contrib.rnn.AttentionCellWrapper(cell_bw, attn_length=10) 295 | 296 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 297 | cell_fw, 298 | cell_bw, 299 | inputs, 300 | sequence_length=lengths, 301 | initial_state_fw=contexts[0], 302 | initial_state_bw=contexts[1], 303 | dtype=tf.float32 304 | ) 305 | 306 | # ( 
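A NumPy analogue of pad_output above (sizes are invented): whichever of the encoder output and the label embeddings is narrower is zero-padded on the right, so that the element-wise product in dotprod_with_lab_embs is well-defined.

```python
import numpy as np

output_dim, lab_emb_dim = 32, 16                      # invented sizes
output_np = np.random.rand(2, output_dim)             # [batch, output_dim]
labels_np = np.random.rand(2, 6, lab_emb_dim)         # [batch, num_labels, lab_emb_dim]
labels_padded = np.pad(labels_np, ((0, 0), (0, 0), (0, output_dim - lab_emb_dim)),
                       mode='constant')
assert labels_padded.shape == (2, 6, output_dim)
```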
(outputs_fw,outputs_bw) , (output_state_fw,output_state_bw) ) 307 | # in case LSTMCell: output_state_fw = (c_fw,h_fw), and output_state_bw = (c_bw,h_bw) 308 | # each [batch_size x max_seq_length x output_size] 309 | return outputs, states 310 | -------------------------------------------------------------------------------- /preproc/data_reader.py: -------------------------------------------------------------------------------- 1 | import csv, os 2 | import xml.etree.ElementTree as ET 3 | from collections import defaultdict 4 | 5 | from sklearn.model_selection import train_test_split 6 | 7 | from constants import FNC, STANCE, NLI, TOPIC, LAPTOP, RESTAURANT, TARGET,\ 8 | TOPIC_5WAY, STANCE_LABELS, FNC_LABELS, NLI_LABELS, TOPIC_LABELS, \ 9 | TOPIC_5WAY_LABELS, ABSA_LABELS, TARGET_LABELS 10 | 11 | 12 | def task2data_reader(task): 13 | if task == STANCE: # all data available 14 | return readSemEval2016Task6 15 | if task == FNC: # all data available 16 | return readFakeNewsChallengeData 17 | if task == NLI: # test data not available - so we use every other dev example as test example 18 | return readMultinliData 19 | if task == TOPIC: # all data available 20 | return readTopicBased 21 | if task == TOPIC_5WAY: 22 | return readTopic5Way 23 | if task == LAPTOP: # all data available 24 | return read_absa_laptops 25 | if task == RESTAURANT: # all data available 26 | return read_absa_restaurants 27 | if task == TARGET: # all data available 28 | return read_target_dependent 29 | raise ValueError('No data reader available for %s.' % task) 30 | 31 | 32 | def readSemEval2016Task6(datafolder="./data/", debug=True, num_instances=20): 33 | data_train = {"seq1": [], "seq2": [], "stance": [], "opinion_towards": [], "sentiment": [], "labels": []} 34 | data_dev = {"seq1": [], "seq2": [], "stance": [], "opinion_towards": [], "sentiment": [], "labels": []} 35 | data_test = {"seq1": [], "seq2": [], "stance": [], "opinion_towards": [], "sentiment": [], "labels": []} 36 | data_train, data_dev = parse_semeval_csv(os.path.join(datafolder, 'semeval2016-task6-stance/train.csv'), data_train, data_dev, "train", debug, num_instances) 37 | data_test, data_dev = parse_semeval_csv(os.path.join(datafolder, 'semeval2016-task6-stance/test.csv'), data_test, data_dev, "test", False, num_instances) # setting debug to False to get all test instances 38 | 39 | # For the final task training, the dev set is used as part of the training set 40 | for i, inst in enumerate(data_dev["stance"]): 41 | data_train["seq1"].append(data_dev["seq1"][i]) 42 | data_train["seq2"].append(data_dev["seq2"][i]) 43 | data_train["stance"].append(data_dev["stance"][i]) 44 | data_train["opinion_towards"].append(data_dev["opinion_towards"][i]) 45 | data_train["sentiment"].append(data_dev["sentiment"][i]) 46 | 47 | # sort the labels so that they are always in the same order so that we can 48 | # compute averaged positive and negative F1 (AGAINST, FAVOR, NONE) 49 | labels = sorted(list(set(data_train["stance"]))) 50 | assert labels == STANCE_LABELS 51 | data_train["labels"] = labels 52 | data_dev["labels"] = labels 53 | data_test["labels"] = labels 54 | 55 | # we do not use the raw data ATM to correspond with the signature of the other data readers 56 | return data_train, data_dev, data_test 57 | 58 | 59 | def parse_semeval_csv(filepath, empty_dict_1, empty_dict_2, mode, debug=False, num_instances=20): 60 | with open(filepath, 'r', encoding="latin-1") as csvfile: 61 | csvreader = csv.reader(csvfile, delimiter=',') 62 | i = -1 63 | for row in csvreader: 64 | i += 
1 65 | if i == 0: 66 | continue 67 | if debug and i >= num_instances+1: 68 | continue 69 | tweet, target, stance, opinion_towards, sentiment = row 70 | dict_chosen = empty_dict_1 71 | if target == "Hillary Clinton": 72 | dict_chosen = empty_dict_2 73 | if mode == "train" or target == "Hillary Clinton" or (mode == "test" and target == "Donald Trump"): 74 | dict_chosen["seq1"].append(target) 75 | dict_chosen["seq2"].append(tweet) 76 | dict_chosen["stance"].append(stance) 77 | dict_chosen["opinion_towards"].append(opinion_towards) 78 | dict_chosen["sentiment"].append(sentiment) 79 | return empty_dict_1, empty_dict_2 80 | 81 | 82 | def readFakeNewsChallengeData(datafolder="./data/", debug=True, num_instances=20): 83 | data_train = {"seq1": [], "seq2": [], "stance": [], "labels": []} 84 | data_train = parseFakeNewsChallengeData(datafolder, "fakenewschallenge/train_bodies.csv", "fakenewschallenge/trainsplit_stances.csv", data_train, debug, num_instances) 85 | data_dev = {"seq1": [], "seq2": [], "stance": [], "labels": []} 86 | data_dev = parseFakeNewsChallengeData(datafolder, "fakenewschallenge/train_bodies.csv", "fakenewschallenge/devsplit_stances.csv", data_dev, debug, num_instances) 87 | data_test = {"seq1": [], "seq2": [], "stance": [], "labels": []} 88 | data_test = parseFakeNewsChallengeData(datafolder, "fakenewschallenge/competition_test_bodies.csv", "fakenewschallenge/competition_test_stances.csv", data_test, debug, num_instances) 89 | data_train["labels"] = sorted(data_train["labels"]) 90 | assert data_train["labels"] == FNC_LABELS 91 | data_dev["labels"] = data_train["labels"] 92 | data_test["labels"] = data_train["labels"] 93 | return data_train, data_dev, data_test 94 | 95 | 96 | def parseFakeNewsChallengeData(datafolder, datafile_bodies, datafile_stances, data_dict, debug, num_instances): 97 | id2body = {} 98 | with open(os.path.join(datafolder, datafile_bodies), 'r', encoding='utf-8') as csvfile: 99 | csvreader = csv.reader(csvfile, delimiter=',') 100 | i = -1 101 | for row in csvreader: 102 | i += 1 103 | if i == 0: 104 | continue 105 | body_id, body = row 106 | id2body[body_id] = body 107 | 108 | with open(os.path.join(datafolder, datafile_stances), 'r', encoding='utf-8') as csvfile: 109 | csvreader = csv.reader(csvfile, delimiter=',') 110 | i = -1 111 | for row in csvreader: 112 | i += 1 113 | if i == 0: 114 | continue 115 | if debug and i >= num_instances+1: 116 | continue 117 | headline, body_id, stance = row 118 | data_dict["seq1"].append(headline) 119 | data_dict["seq2"].append(id2body[body_id]) 120 | data_dict["stance"].append(stance) 121 | 122 | for lab in set(data_dict["stance"]): 123 | data_dict["labels"].append(lab) 124 | 125 | return data_dict 126 | 127 | 128 | def readMultinliData(datafolder="./data/", debug=True, num_instances=20): 129 | 130 | max_count = None 131 | if debug == True: 132 | max_count = num_instances+1 133 | 134 | data_train = {"seq1": [], "seq2": [], "stance": [], "genre": [], "labels": []} 135 | data_train, _ = parseMultinliFile(os.path.join(datafolder, 'multinli/multinli_0.9_train.txt'), data_train, {}, max_count, "train") 136 | data_dev = {"seq1": [], "seq2": [], "stance": [], "genre": [], "labels": []} 137 | data_test = {"seq1": [], "seq2": [], "stance": [], "genre": [], "labels": []} 138 | data_dev, data_test = parseMultinliFile(os.path.join(datafolder, 'multinli/multinli_0.9_dev_matched.txt'), data_dev, data_test, max_count, "test") 139 | 140 | return data_train, data_dev, data_test 141 | 142 | 143 | def parseMultinliFile(filepath, data_1, 
data_2, max_count, mode): 144 | reading_dataset = open(filepath, "r", encoding='utf-8') 145 | # The script reads into those lists. If IDs for questions, supports or targets are defined, those are ignored. 146 | count, counti = 0, 0 147 | 148 | for line in reading_dataset: 149 | if max_count is None or count < max_count: 150 | lspl = line.strip("\n").split("\t") 151 | if len(lspl) == 15: 152 | gold_label, _, _, _, _, sentence1, sentence2, promptID, pairID, genre, _, _, _, _, _ = lspl 153 | if gold_label == "gold_label" or gold_label == "-": 154 | continue 155 | data_dict = data_1 156 | if mode == "train" or (mode == "test" and count % 2 == 0): 157 | data_dict = data_1 158 | elif mode == "test": 159 | data_dict = data_2 160 | data_dict["seq1"].append(sentence1) 161 | data_dict["seq2"].append(sentence2) 162 | data_dict["stance"].append(gold_label) 163 | data_dict["genre"].append(genre) 164 | count += 1 165 | 166 | for lab in set(data_1["stance"]): 167 | data_1["labels"].append(lab) 168 | data_1["labels"] = sorted(data_1["labels"]) 169 | assert data_1["labels"] == NLI_LABELS 170 | 171 | if data_2 != {}: 172 | for lab in set(data_2["stance"]): 173 | data_2["labels"].append(lab) 174 | data_2["labels"] = sorted(data_2["labels"]) 175 | assert data_2["labels"] == NLI_LABELS 176 | 177 | return data_1, data_2 178 | 179 | 180 | def readTopicBased(datafolder="./data/", debug=True, num_instances=20): 181 | topic_based_path = os.path.join(datafolder, 'semeval2016-task4b-topic-based-sentiment') 182 | train_path = os.path.join(topic_based_path, '100_topics_XXX_tweets.topic-two-point.subtask-BD.train.gold_downloaded.tsv') 183 | dev1_path = os.path.join(topic_based_path, '100_topics_XXX_tweets.topic-two-point.subtask-BD.dev.gold_downloaded.tsv') 184 | dev2_path = os.path.join(topic_based_path, '100_topics_XXX_tweets.topic-two-point.subtask-BD.devtest.gold_downloaded.tsv') 185 | test_data_path = os.path.join(topic_based_path, 'SemEval2016-task4-test.subtask-BD.txt') 186 | test_labels_path = os.path.join(topic_based_path, 'SemEval2016_task4_subtaskB_test_gold.txt') 187 | 188 | for path_ in [topic_based_path, train_path, dev1_path, dev2_path, test_data_path, test_labels_path]: 189 | assert os.path.exists(path_), 'Error: %s does not exist.' 
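All readers in this module return three dictionaries (train/dev/test) with the same basic layout; the instance below is invented and only illustrates the contract:

```python
example_split = {
    "seq1": ["Hillary Clinton"],                 # target / headline / premise
    "seq2": ["an invented example tweet"],       # tweet / article body / hypothesis
    "stance": ["FAVOR"],                         # one gold label per instance
    "labels": ["AGAINST", "FAVOR", "NONE"],      # sorted label inventory of the task
}
# some readers add task-specific keys such as "sentiment", "opinion_towards" or "genre"
```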
% path_ 190 | 191 | data_train = parse_topic_based(train_path, debug, num_instances) 192 | data_dev1 = parse_topic_based(dev1_path, debug, num_instances) 193 | data_dev2 = parse_topic_based(dev2_path, debug, num_instances) 194 | data_test = parse_topic_test_data(test_data_path, test_labels_path) 195 | assert data_train["labels"] == TOPIC_LABELS 196 | data_dev1["labels"] = data_train["labels"] 197 | data_test["labels"] = data_train["labels"] 198 | 199 | # add the second dev data to the train set 200 | data_train["seq1"] += data_dev2["seq1"] 201 | data_train["seq2"] += data_dev2["seq2"] 202 | data_train["stance"] += data_dev2["stance"] 203 | return data_train, data_dev1, data_test 204 | 205 | 206 | def readTopic5Way(datafolder="./data/", debug=True, num_instances=20): 207 | topic_based_path = os.path.join(datafolder, 'semeval2016-task4c-topic-based-sentiment') 208 | train_path = os.path.join(topic_based_path, '100_topics_100_tweets.topic-five-point.subtask-CE.train.gold_downloaded.tsv') 209 | dev1_path = os.path.join(topic_based_path, '100_topics_100_tweets.topic-five-point.subtask-CE.dev.gold_downloaded.tsv') 210 | dev2_path = os.path.join(topic_based_path, '100_topics_100_tweets.topic-five-point.subtask-CE.devtest.gold_downloaded.tsv') 211 | test_data_path = os.path.join(topic_based_path, 'SemEval2016-task4-test.subtask-CE.txt') 212 | test_labels_path = os.path.join(topic_based_path, 'SemEval2016_task4_subtaskC_test_gold.txt') 213 | 214 | for path_ in [topic_based_path, train_path, dev1_path, dev2_path, 215 | test_data_path, test_labels_path]: 216 | assert os.path.exists(path_), 'Error: %s does not exist.' % path_ 217 | 218 | data_train = parse_topic_based(train_path, debug, num_instances) 219 | data_dev1 = parse_topic_based(dev1_path, debug, num_instances) 220 | data_dev2 = parse_topic_based(dev2_path, debug, num_instances) 221 | data_test = parse_topic_test_data(test_data_path, test_labels_path) 222 | assert data_train["labels"] == TOPIC_5WAY_LABELS 223 | data_dev1["labels"] = data_train["labels"] 224 | data_test["labels"] = data_train["labels"] 225 | 226 | # add the second dev data to the train set 227 | data_train["seq1"] += data_dev2["seq1"] 228 | data_train["seq2"] += data_dev2["seq2"] 229 | data_train["stance"] += data_dev2["stance"] 230 | return data_train, data_dev1, data_test 231 | 232 | 233 | def parse_topic_based(file_path, debug=False, num_instances=20): 234 | data = {"seq1": [], "seq2": [], "stance": []} 235 | with open(file_path) as f: 236 | for i, line in enumerate(f): 237 | id_, target, sentiment, tweet = line.split('\t') 238 | try: 239 | sentiment = float(sentiment) 240 | except ValueError: 241 | pass 242 | if debug and i >= num_instances+1: 243 | continue 244 | if tweet.strip() == 'Not Available': 245 | continue 246 | data["seq1"].append(target) 247 | data["seq2"].append(tweet) 248 | data["stance"].append(sentiment) 249 | 250 | # we have to sort the labels so that they're in the order 251 | # -2,-1,0,1,2 and are mapped to 0,1,2,3,4 (for subtask C) 252 | data["labels"] = sorted(list(set(data["stance"]))) 253 | return data 254 | 255 | 256 | def parse_topic_test_data(examples_path, labels_path): 257 | # Note: no debugging for the test data (20k tweets for subtask C) 258 | data = {"seq1": [], "seq2": [], "stance": []} 259 | with open(examples_path) as f_examples, open(labels_path) as f_labels: 260 | for i, (line_examples, line_labels) in enumerate(zip(f_examples, f_labels)): 261 | _, examples_target, _, *tweet = line_examples.strip().split('\t') 262 | # two lines contain a 
tweet, for some reason 263 | _, labels_target, sentiment, *_ = line_labels.strip().split('\t') 264 | # one test tweet contains a tab character 265 | if isinstance(tweet, list): 266 | tweet = '\t'.join(tweet) 267 | try: 268 | sentiment = float(sentiment) 269 | except ValueError: 270 | pass 271 | 272 | assert examples_target == labels_target,\ 273 | '%s != %s at line %d in files %s and %s.' % ( 274 | examples_target, labels_target, i, examples_path, labels_path) 275 | 276 | if tweet.strip() == 'Not Available': 277 | continue 278 | data["seq1"].append(examples_target) 279 | data["seq2"].append(tweet) 280 | data["stance"].append(sentiment) 281 | data["labels"] = sorted(list(set(data["stance"]))) 282 | return data 283 | 284 | 285 | def read_absa_laptops(datafolder="./data/", debug=True, num_instances=20): 286 | return read_absa('laptops', datafolder, debug, num_instances) 287 | 288 | 289 | def read_absa_restaurants(datafolder="./data/", debug=True, num_instances=20): 290 | return read_absa('restaurants', datafolder, debug, num_instances) 291 | 292 | 293 | def read_absa(domain, datafolder="./data/", debug=True, num_instances=20): 294 | assert domain in ['laptops', 'restaurants'], '%s is not a valid domain.' % domain 295 | absa_path = os.path.join(datafolder, 'semeval2016-task5-absa-english') 296 | train_path = os.path.join(absa_path, '%s_english_training.xml' % domain) 297 | test_path = os.path.join(absa_path, '%s_english_test.xml' % domain) 298 | for path_ in [absa_path, train_path, test_path]: 299 | assert os.path.exists(path_), 'Error: %s does not exist.' % path_ 300 | 301 | data_train = parse_absa(train_path, debug, num_instances) 302 | data_test = parse_absa(test_path) 303 | 304 | # trial data is a subset of training data; instead we split the train data 305 | data_train, data_dev = split_train_data(data_train) 306 | return data_train, data_dev, data_test 307 | 308 | 309 | def parse_absa(file_path, debug=False, num_instances=20): 310 | """ 311 | Extracts all reviews from an XML file and returns them as a list of Review objects. 312 | Adds a NONE aspect to all sentences with no aspect. 313 | :param file_path: the path of the XML file 314 | :return: a list of Review objects each containing a list of Sentence objects and other attributes 315 | """ 316 | data = {"seq1": [], "seq2": [], "stance": []} 317 | e = ET.parse(file_path).getroot() 318 | for i, review_e in enumerate(e): 319 | if debug and i >= num_instances+1: 320 | continue 321 | for sentence_e in review_e.find('sentences'): 322 | text = sentence_e.find('text').text 323 | # we do not care about sentences that do not contain an aspect 324 | if sentence_e.find('Opinions') is not None: 325 | for op in sentence_e.find('Opinions'): 326 | # the category is of the form ENTITY#ATTRIBUTE, e.g. LAPTOP#GENERAL 327 | target = ' '.join(op.get('category').split('#')) 328 | polarity = op.get('polarity') 329 | data['seq1'].append(target) 330 | data['seq2'].append(text) 331 | data['stance'].append(polarity) 332 | data["labels"] = sorted(list(set(data["stance"]))) 333 | assert data["labels"] == ABSA_LABELS 334 | return data 335 | 336 | 337 | def read_target_dependent(datafolder="./data/", debug=True, num_instances=20): 338 | target_dependent_path = os.path.join(datafolder, 'target-dependent') 339 | train_path = os.path.join(target_dependent_path, 'train.raw') 340 | test_path = os.path.join(target_dependent_path, 'test.raw') 341 | for path_ in [target_dependent_path, train_path, test_path]: 342 | assert os.path.exists(path_), 'Error: %s does not exist.' 
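A minimal illustration of the XML handled by parse_absa above; the review snippet is made up, but it follows the ENTITY#ATTRIBUTE category and polarity attributes the parser expects:

```python
import xml.etree.ElementTree as ET

toy_xml = """<Reviews><Review><sentences><sentence>
<text>The battery life is great.</text>
<Opinions><Opinion category="BATTERY#OPERATION_PERFORMANCE" polarity="positive"/></Opinions>
</sentence></sentences></Review></Reviews>"""

root = ET.fromstring(toy_xml)
for review in root:
    for sentence in review.find('sentences'):
        if sentence.find('Opinions') is not None:
            for op in sentence.find('Opinions'):
                target = ' '.join(op.get('category').split('#'))
                print(target, '|', sentence.find('text').text, '|', op.get('polarity'))
# -> BATTERY OPERATION_PERFORMANCE | The battery life is great. | positive
```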
% path_ 343 | 344 | data_train = parse_target_dependent(train_path, debug, num_instances) 345 | data_test = parse_target_dependent(test_path) 346 | data_train, data_dev = split_train_data(data_train) 347 | return data_train, data_dev, data_test 348 | 349 | 350 | def parse_target_dependent(file_path, debug=False, num_instances=20): 351 | data = {"seq1": [], "seq2": [], "stance": []} 352 | with open(file_path, encoding='utf-8') as f: 353 | for i, line in enumerate(f): 354 | if i % 3 == 0: # the tweet is always first 355 | data["seq2"].append(line.strip()) 356 | elif i % 3 == 1: # followed by the target 357 | data["seq1"].append(line.strip()) 358 | elif i % 3 == 2: # followed by the sentiment 359 | data["stance"].append(line.strip()) 360 | if debug and i >= num_instances+1: 361 | continue 362 | assert len(data["seq1"]) == len(data["seq2"]) == len(data["stance"]),\ 363 | 'Error: %d != %d != %d.' % (len(data["seq1"]), len(data["seq2"]), 364 | len(data["stance"])) 365 | 366 | # replace the placeholder $T$ in every tweet with the target 367 | for i in range(len(data["seq1"])): 368 | target = data["seq1"][i] 369 | data["seq2"][i] = data["seq2"][i].replace("$T$", target) 370 | data["labels"] = sorted(list(set(data["stance"]))) 371 | assert data["labels"] == TARGET_LABELS 372 | return data 373 | 374 | 375 | def split_train_data(data_train): 376 | """Split the train data into train and dev data.""" 377 | train_ids, _ = train_test_split(range(len(data_train['seq1'])), 378 | test_size=0.1, random_state=42) 379 | data_dev = defaultdict(list) 380 | new_data_train = defaultdict(list) 381 | for key, examples in data_train.items(): 382 | if key == 'labels': 383 | continue 384 | # no numpy indexing, so we iterate over the examples 385 | for i, example in enumerate(examples): 386 | if i in train_ids: 387 | new_data_train[key].append(example) 388 | else: 389 | data_dev[key].append(example) 390 | new_data_train['labels'] = data_train['labels'] 391 | data_dev['labels'] = data_train['labels'] 392 | return new_data_train, data_dev 393 | 394 | 395 | if __name__ == "__main__": 396 | readMultinliData(datafolder="../data/") 397 | -------------------------------------------------------------------------------- /preproc/vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import operator 3 | import sys 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | 9 | class Vocab(object): 10 | """ 11 | Vocab objects for use in jtr pipelines. 12 | 13 | Example: 14 | 15 | >>> # Test Vocab without pre-trained embeddings 16 | >>> vocab = Vocab() 17 | >>> print(vocab("blah")) 18 | 1 19 | >>> print(vocab("bluh")) 20 | 2 21 | >>> print(vocab("bleh")) 22 | 3 23 | >>> print(vocab("bluh")) 24 | 2 25 | >>> print(vocab("hello")) 26 | 4 27 | >>> print(vocab("world")) 28 | 5 29 | 30 | >>> # Sym2id before freezing: 31 | >>> for k in sorted(vocab.sym2id.keys()): 32 | ... print(k,' : ',vocab.sym2id[k]) 33 | : 0 34 | blah : 1 35 | bleh : 3 36 | bluh : 2 37 | hello : 4 38 | world : 5 39 | 40 | >>> # Sym2id after freezing (no difference, because no pre-trained embeddings used): 41 | >>> vocab.freeze() 42 | >>> for k in sorted(vocab.sym2id.keys()): 43 | ... print(k,' : ',vocab.sym2id[k]) 44 | : 0 45 | blah : 1 46 | bleh : 3 47 | bluh : 2 48 | hello : 4 49 | world : 5 50 | 51 | >>> # Test Vocab with pre-trained embeddings 52 | >>> def emb(w): 53 | ... v = {'blah':[1.7,0,.3],'bluh':[0,1.5,0.5],'bleh':[0,0,2]} 54 | ... 
return None if not w in v else v[w] 55 | >>> vocab = Vocab(emb=emb) 56 | >>> print(vocab("blah")) 57 | -1 58 | >>> print(vocab("bluh")) 59 | -2 60 | >>> print(vocab("bleh")) 61 | -3 62 | >>> print(vocab("bluh")) 63 | -2 64 | >>> print(vocab("hello")) 65 | 1 66 | >>> print(vocab("world")) 67 | 2 68 | 69 | >>> # Sym2id before freezing: 70 | >>> for k in sorted(vocab.sym2id.keys()): 71 | ... print(k,' : ',vocab.sym2id[k]) 72 | : 0 73 | blah : -1 74 | bleh : -3 75 | bluh : -2 76 | hello : 1 77 | world : 2 78 | 79 | >>> # Sym2id after freezing: normalized (positive) ids, also for pre-trained terms 80 | >>> vocab.freeze() 81 | >>> for k in sorted(vocab.sym2id.keys()): 82 | ... print(k,' : ',vocab.sym2id[k]) 83 | : 0 84 | blah : 3 85 | bleh : 5 86 | bluh : 4 87 | hello : 1 88 | world : 2 89 | 90 | >>> # Test pretrained and out-of-vocab id's before freezing 91 | >>> vocab.unfreeze() 92 | >>> vocab.get_ids_pretrained() 93 | [-1, -2, -3] 94 | >>> vocab.get_ids_oov() 95 | [0, 1, 2] 96 | 97 | >>> # Test pretrained and out-of-vocab id's after freezing 98 | >>> vocab.freeze() 99 | >>> vocab.get_ids_pretrained() 100 | [3, 4, 5] 101 | >>> vocab.get_ids_oov() 102 | [0, 1, 2] 103 | 104 | >>> # Test calling frozen Vocab object 105 | >>> vocab(['bluh','world','wake','up']) #last 2 are new words, hence unknown 106 | [4, 2, 0, 0] 107 | 108 | >>> # Test calling unfrozen Vocab object 109 | >>> vocab.unfreeze() 110 | >>> vocab(['bluh','world','wake','up']) #last 2 are new words, hence added to Vocab 111 | [-2, 2, 3, 4] 112 | 113 | >>> #Test sym2id after freezing again 114 | >>> vocab.freeze() 115 | >>> for k in sorted(vocab.sym2id.keys()): 116 | ... print(k,' : ',vocab.sym2id[k]) 117 | : 0 118 | blah : 5 119 | bleh : 7 120 | bluh : 6 121 | hello : 1 122 | up : 4 123 | wake : 3 124 | world : 2 125 | """ 126 | 127 | DEFAULT_UNK = "" 128 | 129 | def __init__(self, unk=DEFAULT_UNK, emb=None, init_from_embeddings=False): 130 | """ 131 | Creates Vocab object. 132 | 133 | Args: 134 | `unk`: symbol for unknown term (default: ""). 135 | If set to `None`, and `None` is not included as symbol while unfrozen, 136 | it will return `None` upon calling `get_id(None)` when frozen. 137 | `emb`: function handle; returns pre-trained embedding (fixed-size numerical list or ndarray) 138 | for a given symbol, and None for unknown symbols. 139 | """ 140 | self.next_neg = -1 141 | self.unk = unk 142 | self.emb = emb if emb is not None else lambda _ : None # if emb is None: same behavior as for o-o-v words 143 | 144 | if init_from_embeddings and emb is not None: 145 | self.sym2id = dict(emb.vocabulary.word2idx) 146 | self.id2sym = {v: k for k, v in emb.vocabulary.word2idx.items()} 147 | if unk is not None and unk not in self.sym2id: 148 | self.sym2id[unk] = len(self.sym2id) 149 | self.id2sym[len(self.id2sym)] = unk 150 | self.sym2freqs = {w: emb.vocabulary.get_word_count(w) for w in self.sym2id} 151 | self.frozen = True 152 | else: 153 | self.sym2id = {} 154 | # with pos and neg indices 155 | self.id2sym = {} 156 | self.next_pos = 0 157 | self.sym2freqs = {} 158 | if unk is not None: 159 | self.sym2id[unk] = 0 160 | # with pos and neg indices 161 | self.id2sym[0] = unk 162 | self.next_pos = 1 163 | self.sym2freqs[unk] = 0 164 | self.frozen = False 165 | 166 | if emb is not None and hasattr(emb, "lookup") and isinstance(emb.lookup, np.ndarray): 167 | self.emb_length = emb.lookup.shape[1] 168 | else: 169 | self.emb_length = None 170 | 171 | def freeze(self): 172 | """Freeze current Vocab object (set `self.frozen` to True). 
173 | To be used after loading symbols from a given corpus; 174 | transforms all internal symbol id's to positive indices (for use in tensors). 175 | 176 | - additional calls to the __call__ method will return the id for the unknown symbol 177 | - out-of-vocab id's are positive integers and do not change 178 | - id's of symbols with pre-trained embeddings are converted to positive integer id's, 179 | counting up from the number of out-of-vocab id's. 180 | """ 181 | # if any pretrained have been encountered 182 | if not self.frozen and self.next_neg < -1: 183 | sym2id = {sym: self._normalize(id) for sym, id in self.sym2id.items()} 184 | id2sym = {self._normalize(id): sym for id, sym in self.id2sym.items()} 185 | self.sym2id = sym2id 186 | self.id2sym = id2sym 187 | self.frozen = True 188 | 189 | def unfreeze(self): 190 | """Unfreeze current Vocab object (set `self.frozen` to False). 191 | Caution: use with care! Unfreezing a Vocab, adding new terms, and freezing it again 192 | will result in shifted id's for pre-trained symbols. 193 | 194 | - maps all normalized id's to the original internal id's. 195 | - additional calls to __call__ will allow adding new symbols to the vocabulary. 196 | """ 197 | if self.frozen and self.next_neg < -1: 198 | sym2id = {sym: self._denormalize(id) for sym, id in self.sym2id.items()} 199 | id2sym = {self._denormalize(id): sym for id, sym in self.id2sym.items()} 200 | self.sym2id = sym2id 201 | self.id2sym = id2sym 202 | self.frozen = False 203 | 204 | def get_id(self, sym, is_num=False): 205 | """ 206 | Returns the id of `sym`; different behavior depending on the state of the Vocab: 207 | 208 | - In case self.frozen==False (default): returns internal id, 209 | that is, positive for out-of-vocab symbol, negative for symbol 210 | found in `self.emb`. If `sym` is a new symbol, it is added to the Vocab. 211 | 212 | - In case self.frozen==True (after explicit call to 'freeze()', or after building a `NeuralVocab` with it): 213 | Returns normalized id (positive integer, also for symbols with pre-trained embedding) 214 | If `sym` is a new symbol, the id for unknown terms is returned, if available, 215 | and otherwise `None` (only possible when the input argument `unk` of `Vocab.__init__()` was set to `None`, e.g. 216 | for classification labels; it is assumed that the pipeline 217 | creating or calling the `Vocab` object handles `None` when it is encountered). 
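A small worked example (not part of the original docstring) of the id bookkeeping described above, assuming three out-of-vocab symbols and three pre-trained symbols; it mirrors the arithmetic in _normalize and _denormalize further down:

```python
next_pos = 3                              # three out-of-vocab symbols: ids 0, 1, 2
internal_ids = [0, 1, 2, -1, -2, -3]      # unfrozen ids; negatives are pre-trained symbols
normalized = [i if i >= 0 else next_pos - i - 1 for i in internal_ids]
assert normalized == [0, 1, 2, 3, 4, 5]   # what freeze() produces via _normalize
denormalized = [i if i < next_pos else -1 - (i - next_pos) for i in normalized]
assert denormalized == internal_ids       # what unfreeze() restores via _denormalize
```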
218 | 219 | Args: 220 | `sym`: symbol (e.g., token) 221 | """ 222 | if not self.frozen: 223 | vec = self.emb(sym) 224 | if self.emb_length is None and vec is not None: 225 | self.emb_length = len(vec) if isinstance(vec, list) else vec.shape[0] 226 | if sym not in self.sym2id: 227 | if vec is None: 228 | self.sym2id[sym] = self.next_pos 229 | self.id2sym[self.next_pos] = sym 230 | self.next_pos += 1 231 | else: 232 | self.sym2id[sym] = self.next_neg 233 | self.id2sym[self.next_neg] = sym 234 | self.next_neg -= 1 235 | self.sym2freqs[sym] = 1 236 | else: 237 | self.sym2freqs[sym] += 1 238 | if sym in self.sym2id: 239 | return self.sym2id[sym] 240 | else: 241 | if self.unk in self.sym2id: 242 | return self.sym2id[self.unk] 243 | # can happen for `Vocab` initialized with `unk` argument set to `None` 244 | else: 245 | return None 246 | 247 | def get_sym(self, id): 248 | """returns symbol for a given id (consistent with the `self.frozen` state), and None if not found.""" 249 | return None if not id in self.id2sym else self.id2sym[id] 250 | 251 | def __call__(self, *args, **kwargs): 252 | """ 253 | calls the `get_id` function for the provided symbol(s), which adds symbols to the Vocab if needed and allowed, 254 | and returns their id(s). 255 | 256 | Args: 257 | *args: a single symbol, a list of symbols, or multiple symbols 258 | """ 259 | symbols = args 260 | if len(args) == 1: 261 | if isinstance(args[0], list): 262 | symbols = args[0] 263 | else: 264 | return self.get_id(args[0]) 265 | return [self.get_id(sym) for sym in symbols] 266 | 267 | def __len__(self): 268 | """returns number of unique symbols (including the unknown symbol)""" 269 | return len(self.id2sym) 270 | 271 | def __contains__(self, sym): 272 | """checks if `sym` already in the Vocab object""" 273 | return sym in self.sym2id 274 | 275 | def _normalize(self, id): 276 | """map original (pos/neg) ids to normalized (non-neg) ids: first new symbols, then those in emb""" 277 | # e.g. -1 should be mapped to self.next_pos + 0 278 | # e.g. -3 should be mapped to self.next_pos + 2 279 | return id if id >=0 else self.next_pos - id - 1 280 | 281 | def _denormalize(self, id): 282 | # self.next_pos + i is mapped back to -1-i 283 | return id if id < self.next_pos else -1-(id-self.next_pos) 284 | 285 | def get_ids_pretrained(self): 286 | """return internal or normalized id's (depending on frozen/unfrozen state) 287 | for symbols that have an embedding in `self.emb` """ 288 | if self.frozen: 289 | return list(range(self.next_pos,self.next_pos+self.count_pretrained())) 290 | else: 291 | return list(range(-1,self.next_neg,-1)) 292 | 293 | def get_ids_oov(self): 294 | """return out-of-vocab id's (indep. 
of frozen/unfrozen state)""" 295 | return list(range(self.next_pos)) 296 | 297 | def count_pretrained(self): 298 | """equivalent to `len(get_ids_pretrained())`""" 299 | return -self.next_neg - 1 300 | 301 | def count_oov(self): 302 | """equivalent to `len(get_ids_oov())`""" 303 | return self.next_pos 304 | 305 | def prune(self, min_freq=5, max_size=sys.maxsize): 306 | """returns new Vocab object, pruned based on minimum symbol frequency""" 307 | pruned_vocab = Vocab(unk=self.unk, emb=self.emb) 308 | cnt = 0 309 | for sym, freq in sorted(self.sym2freqs.items(), key=operator.itemgetter(1), reverse=True): 310 | # for sym in self.sym2freqs: 311 | # freq = self.sym2freqs[sym] 312 | cnt += 1 313 | if freq >= min_freq and cnt < max_size: 314 | pruned_vocab(sym) 315 | pruned_vocab.sym2freqs[sym] = freq 316 | if self.frozen: 317 | # if original Vocab was frozen, freeze new one 318 | pruned_vocab.freeze() 319 | 320 | return pruned_vocab 321 | 322 | 323 | class NeuralVocab(Vocab): 324 | """ 325 | Wrapper around Vocab to go from indices to tensors. 326 | 327 | Example: 328 | >>> # Start from same Vocab as the doctest example in Vocab 329 | >>> def emb(w): 330 | ... v = {'blah':[1.7,0,.3],'bluh':[0,1.5,0.5],'bleh':[0,0,2]} 331 | ... return None if not w in v else v[w] 332 | >>> vocab = Vocab(emb=emb) 333 | >>> vocab("blah", "bluh", "bleh", "hello", "world") # symbols as multiple arguments 334 | [-1, -2, -3, 1, 2] 335 | >>> vocab(['bluh','world','wake','up']) # as list of symbols 336 | [-2, 2, 3, 4] 337 | 338 | 339 | >>> # Test NeuralVocab with pre-trained embeddings (case: input_size larger than pre-trained embeddings) 340 | >>> with tf.variable_scope('neural_test2'): 341 | ... for w in ['blah','bluh','bleh']: 342 | ... w, emb(w) 343 | ... nvocab = NeuralVocab(vocab, None, 4, unit_normalize=True, use_pretrained=True, train_pretrained=False) 344 | ('blah', [1.7, 0, 0.3]) 345 | ('bluh', [0, 1.5, 0.5]) 346 | ('bleh', [0, 0, 2]) 347 | 348 | Interpretation of number of trainable variables from neural_test2: 349 | out-of-vocab: 8 - 3 = 5 symbols, with each 4 dimensions = 20; 350 | for fixed pre-trained embeddings with length 3, three times 1 extra trainable dimension for total embedding length 4. 351 | Total is 23. 352 | """ 353 | 354 | def __init__(self, base_vocab, embedding_matrix=None, 355 | input_size=None, reduced_input_size=None, use_pretrained=True, train_pretrained=False, unit_normalize=True): 356 | """ 357 | Creates NeuralVocab object from a given Vocab object `base_vocab`. 358 | Pre-calculates embedding vector (as `Tensor` object) for each symbol in Vocab 359 | 360 | Args: 361 | `base_vocab`: 362 | `embedding_matrix`: tensor with shape (len_vocab, input_size). If provided, 363 | the arguments `input_size`, `use_trained`, `train_pretrained`, and `unit_normalize` are ignored. 364 | `input_size`: integer; embedding length in case embedding matrix not provided, else ignored. 365 | If shorter than pre-trained embeddings, only their first `input_size` dimensions are used. 366 | If longer, extra (Trainable) dimensions are added. 367 | `reduced_input_size`: integer; optional; ignored in case `None`. If set to positive integer, an additional 368 | linear layer is introduced to reduce (or extend) the embeddings to the indicated size. 369 | `use_pretrained`: boolean; True (default): use pre-trained if available through `base_vocab`. 370 | False: ignore pre-trained embeddings accessible through `base_vocab` 371 | `train_pretrained`: boolean; False (default): fix pretrained embeddings. True: continue training. 
372 |             Ignored if embedding_matrix is given.
373 |           `unit_normalize`: initialize pre-trained vectors with unit norm
374 |             (note: randomly initialized embeddings are always initialized with expected unit norm)
375 |         """
376 |         super(NeuralVocab, self).__init__(unk=base_vocab.unk, emb=base_vocab.emb)
377 |
378 |         assert embedding_matrix is not None or input_size is not None, "if no embedding_matrix is provided, define input_size"
379 |
380 |         self.freeze()  # has no actual functionality here
381 |         base_vocab.freeze()  # freeze if not frozen (to ensure fixed non-negative indices)
382 |
383 |         self.sym2id = base_vocab.sym2id
384 |         self.id2sym = base_vocab.id2sym
385 |         self.sym2freqs = base_vocab.sym2freqs
386 |         self.unit_normalize = unit_normalize
387 |
388 |         def np_normalize(v):
389 |             return v / np.sqrt(np.sum(np.square(v)))
390 |
391 |         if embedding_matrix is None:
392 |             # construct part oov
393 |             n_oov = base_vocab.count_oov()
394 |             n_pre = base_vocab.count_pretrained()
395 |             E_oov = tf.get_variable("embeddings_oov", [n_oov, input_size],
396 |                                     initializer=tf.random_normal_initializer(0, 1./np.sqrt(input_size)),
397 |                                     trainable=True, dtype="float32")
398 |             # stdev = 1/sqrt(length): then expected initial L2 norm is 1
399 |
400 |             # construct part pretrained
401 |             if use_pretrained and base_vocab.emb_length is not None:
402 |                 # load embeddings into numpy tensor with shape (count_pretrained, min(input_size, emb_length))
403 |                 np_E_pre = np.zeros([n_pre, min(input_size, base_vocab.emb_length)]).astype("float32")
404 |                 for id in base_vocab.get_ids_pretrained():
405 |                     sym = base_vocab.id2sym[id]
406 |                     i = id - n_oov  # shifted to start from 0
407 |                     np_E_pre[i, :] = base_vocab.emb(sym)[:min(input_size, base_vocab.emb_length)]
408 |                     if unit_normalize:
409 |                         np_E_pre[i, :] = np_normalize(np_E_pre[i, :])
410 |                 E_pre = tf.get_variable("embeddings_pretrained",
411 |                                         initializer=tf.identity(np_E_pre),
412 |                                         trainable=train_pretrained, dtype="float32")
413 |
414 |                 if input_size > base_vocab.emb_length:
415 |                     E_pre_ext = tf.get_variable("embeddings_extra", [n_pre, input_size - base_vocab.emb_length],
416 |                                                 initializer=tf.random_normal_initializer(0.0, 1. / np.sqrt(base_vocab.emb_length)), dtype="float32", trainable=True)
417 |                     # note: stdev = 1/sqrt(emb_length) means: elements from same normal distr. as normalized first part (in case normally distr.)
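                    # the concat along axis 1 below widens each pre-trained vector (length emb_length,
                    # trainable only if train_pretrained=True) with the extra trainable dimensions in
                    # E_pre_ext, so every row of E_pre ends up with length input_size; further down,
                    # E_oov and E_pre are stacked along axis 0, so row i of the final embedding matrix
                    # corresponds to normalized vocab id i (out-of-vocab id's 0..n_oov-1 first, then
                    # the pre-trained id's from n_oov onwards)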
418 | E_pre = tf.concat([E_pre, E_pre_ext], 1, name="embeddings_pretrained_extended") 419 | else: 420 | # initialize all randomly anyway 421 | E_pre = tf.get_variable("embeddings_not_pretrained", [n_pre, input_size], 422 | initializer=tf.random_normal_initializer(0., 1./np.sqrt(input_size)), 423 | trainable=True, dtype="float32") 424 | # again: initialize with expected unit norm 425 | 426 | # must be provided is embedding_matrix is None 427 | self.input_size = input_size 428 | self.embedding_matrix = tf.concat([E_oov, E_pre], 0, name="embeddings") 429 | 430 | else: 431 | # ignore input argument input_size 432 | self.input_size = embedding_matrix.get_shape()[1] 433 | self.embedding_matrix = embedding_matrix 434 | 435 | if isinstance(reduced_input_size, int) and reduced_input_size > 0: 436 | # uniform=False for truncated normal 437 | init = tf.contrib.layers.xavier_initializer(uniform=True) 438 | self.embedding_matrix = tf.contrib.layers.fully_connected(self.embedding_matrix, reduced_input_size, 439 | weights_initializer=init, activation_fn=None) 440 | 441 | # pre-assign embedding vectors to all ids 442 | # always OK if frozen 443 | self.id2vec = [tf.nn.embedding_lookup(self.embedding_matrix, idx) for idx in range(len(self))] 444 | 445 | def embed_symbol(self, ids): 446 | """returns embedded id's 447 | 448 | Args: 449 | `ids`: integer, ndarray with np.int32 integers, or tensor with tf.int32 integers. 450 | These integers correspond to (normalized) id's for symbols in `self.base_vocab`. 451 | 452 | Returns: 453 | tensor with id's embedded by numerical vectors (in last dimension) 454 | """ 455 | return tf.nn.embedding_lookup(self.embedding_matrix, ids) 456 | 457 | def __call__(self, *args, **kwargs): 458 | """ 459 | Calling the NeuralVocab object with symbol id's, 460 | returns a `Tensor` with corresponding embeddings. 461 | 462 | Args: 463 | `*args`: `Tensor` with integer indices 464 | (such as a placeholder, to be evaluated when run in a `tf.Session`), 465 | or list of integer id's, 466 | or just multiple integer ids as input arguments 467 | 468 | Returns: 469 | Embedded `Tensor` in case a `Tensor` was provided as input, 470 | and otherwise a list of embedded input id's under the form of fixed-length embeddings (`Tensor` objects). 471 | """ 472 | # tuple with length 1: then either list with ids, tensor with ids, or single id 473 | if len(args) == 1: 474 | if isinstance(args[0], list): 475 | ids = args[0] 476 | elif tf.contrib.framework.is_tensor(args[0]): 477 | # return embedded tensor 478 | return self.embed_symbol(args[0]) 479 | else: 480 | return self.id2vec[args[0]] 481 | else: # tuple with ids 482 | ids = args 483 | return [self.id2vec[id] for id in ids] 484 | 485 | def get_embedding_matrix(self): 486 | return self.embedding_matrix 487 | 488 | 489 | if __name__ == '__main__': 490 | import doctest 491 | tf.set_random_seed(1337) 492 | 493 | print(doctest.testmod()) 494 | -------------------------------------------------------------------------------- /preproc/map.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import re 3 | import numpy as np 4 | import pprint 5 | from preproc.vocab import Vocab 6 | #from jtr.util.rs import DefaultRandomState 7 | 8 | rs = np.random.RandomState(1337) 9 | 10 | #rs = DefaultRandomState(1337)#new seed ignored if set previously 11 | 12 | # sym (e.g. token, token id or class label) 13 | # seq (e.g. 
sequence of tokens) 14 | # seqs (sequence of sequences) 15 | # corpus (sequence of sequence of sequences) 16 | # e.g. hypotheses (sequence of sequences) 17 | # premises (sequence of sequences) 18 | # support (sequence of sequence of sequences) 19 | # labels (sequence of symbols) 20 | # corpus = [hypotheses, premises, support, labels] 21 | 22 | 23 | def tokenize(xs, pattern="([\s'\-\.\,\!])"): 24 | """Splits sentences into tokens by regex over punctuation: ( -.,!])[""" 25 | return [x for x in re.split(pattern, xs) 26 | if not re.match("\s", x) and x != ""] 27 | 28 | def notokenize(xs): 29 | """Embeds deepest itemns into a list""" 30 | return [xs] 31 | 32 | 33 | def lower(xs): 34 | """returns lowercase for sequence of strings""" 35 | # """performs lowercasing on string or sequence of strings""" 36 | # if isinstance(xs, str): 37 | # return xs.lower() 38 | return [x.lower() for x in xs] 39 | 40 | 41 | def deep_map(xs, fun, keys=None, fun_name='trf', expand=False, cache_fun=False): 42 | """Applies fun to a dict or list; adds the results in-place. 43 | 44 | Usage: Transform a corpus iteratively by applying functions like 45 | `tokenize`, `lower`, or vocabulary functions (word -> embedding id) to it. 46 | :: 47 | from jtr.sisyphos.vocab import Vocab 48 | vocab = Vocab() 49 | keys = ['question', 'support'] 50 | corpus = deep_map(corpus, lambda x: x.lower(), keys) 51 | corpus = deep_map(corpus, tokenize, keys) 52 | corpus = deep_map(corpus, vocab, keys) 53 | corpus = deep_map(corpus, vocab._normalize, keys=keys) 54 | 55 | From here we can create batches from the corpus and feed it into a model. 56 | 57 | In case `expand==False` each top-level entry of `xs` to be transformed 58 | replaces the original entry. 59 | `deep_map` supports `xs` to be a dictionary or a list/tuple: 60 | - In case `xs` is a dictionary, its transformed value is also a dictionary, and `keys` contains the keys of the 61 | values to be transformed. 62 | - In case `xs` is a list/tuple, `keys` contains the indices of the entries to be transformed 63 | The function `deep_map` is recursively applied to the values of `xs`, 64 | only at the deepest level, where the entries are no longer sequences/dicts, after which `fun` is applied. 65 | 66 | Args: 67 | `xs`: a sequence (list/tuple) of objects or sequences of objects. 68 | `fun`: a function to transform objects 69 | `keys`: seq with keys if `xs` is dict; seq with integer indices if `xs` is seq. 70 | For entries not in `keys`, the original `xs` value is retained. 71 | `fun_name`: default value 'trf'; string with function tag (e.g. 'lengths'), 72 | used if '''expand==True''' and '''isinstance(xs,dict)''' 73 | Say for example fun_name='lengths', and `keys` contains 'sentence', then the transformed dict would look like 74 | '''{'sentence':[sentences], 'sentence_lengths':[fun(sentences)] ...}''' 75 | `cache_fun`: should the function values for seen inputs be cached. Use with care, as it will affect functions with side effects. 76 | 77 | Returns: 78 | Transformed sequence or dictionary. 79 | 80 | Example: 81 | 82 | >>> #(1) Test with sequence of stuff 83 | >>> dave = [ 84 | ... "All work and no play makes Jack a dull boy", 85 | ... "All work and no play makes Jack a dull boy.", 86 | ... "All work and no play makes Jack a very dull boy!"] 87 | >>> jack = [ 88 | ... "I'm sorry Dave, I'm afraid I can't do that!", 89 | ... "I'm sorry Dave, I'm afraid I can't do that", 90 | ... "I'm sorry Dave, I'm afraid I cannot do that"] 91 | >>> support = [ 92 | ... 
["Play makes really dull", "really dull"], 93 | ... ["Dave is human"], 94 | ... ["All work", "all dull", "dull"]] 95 | >>> data1 = [dave, jack, support] 96 | >>> vocab1 = Vocab() 97 | >>> data1_lower = deep_map(data1, lambda s:s.lower()) 98 | >>> data1_tokenized = deep_map(data1_lower, tokenize) 99 | >>> data1_ids = deep_map(data1_tokenized, vocab1) 100 | >>> pprint.pprint(data1_ids) 101 | [[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 102 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 103 | [1, 2, 3, 4, 5, 6, 7, 8, 12, 9, 10, 13]], 104 | [[14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24, 13], 105 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24], 106 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 25, 23, 24]], 107 | [[[5, 6, 26, 9], [26, 9]], [[18, 27, 28]], [[1, 2], [1, 9], [9]]]] 108 | >>> data1_ids_with_lengths = deep_seq_map(data1_ids, lambda xs: len(xs), 109 | ... fun_name='lengths', expand=True) 110 | >>> pprint.pprint(data1_ids_with_lengths) 111 | [[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 112 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 113 | [1, 2, 3, 4, 5, 6, 7, 8, 12, 9, 10, 13]], 114 | [10, 11, 12], 115 | [[14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24, 13], 116 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24], 117 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 25, 23, 24]], 118 | [17, 16, 14], 119 | [[[5, 6, 26, 9], [26, 9]], [[18, 27, 28]], [[1, 2], [1, 9], [9]]], 120 | [[4, 2], [3], [2, 2, 1]]] 121 | 122 | 123 | >>> #(2) Test with data dictionary 124 | >>> data2 = {'dave': dave, 'jack': jack, 'support': support} 125 | >>> pprint.pprint(data2) 126 | {'dave': ['All work and no play makes Jack a dull boy', 127 | 'All work and no play makes Jack a dull boy.', 128 | 'All work and no play makes Jack a very dull boy!'], 129 | 'jack': ["I'm sorry Dave, I'm afraid I can't do that!", 130 | "I'm sorry Dave, I'm afraid I can't do that", 131 | "I'm sorry Dave, I'm afraid I cannot do that"], 132 | 'support': [['Play makes really dull', 'really dull'], 133 | ['Dave is human'], 134 | ['All work', 'all dull', 'dull']]} 135 | >>> data2_tokenized = deep_map(data2, tokenize) 136 | >>> pprint.pprint(data2_tokenized['support']) 137 | [[['Play', 'makes', 'really', 'dull'], ['really', 'dull']], 138 | [['Dave', 'is', 'human']], 139 | [['All', 'work'], ['all', 'dull'], ['dull']]] 140 | """ 141 | 142 | cache = {} 143 | 144 | def deep_map_recursion(inner_xs, keys=None): 145 | if cache_fun and id(inner_xs) in cache: 146 | return cache[id(inner_xs)] 147 | if isinstance(inner_xs, dict): 148 | xs_mapped = {} 149 | for k, x in sorted(inner_xs.items(), 150 | key=lambda it: it[0]): # to make deterministic (e.g. 
for consistent symbol id's) 151 | if keys is None or k in keys: 152 | if expand: 153 | xs_mapped[k] = x 154 | # if expand: create new key for transformed element, else use same key 155 | k = '%s_%s' % (str(k), str(fun_name)) 156 | if isinstance(x, list) or isinstance(x, dict): 157 | x_mapped = deep_map_recursion(x) 158 | else: 159 | x_mapped = fun(x) 160 | xs_mapped[k] = x_mapped 161 | else: 162 | xs_mapped[k] = x 163 | else: 164 | xs_mapped = [] 165 | for k, x in enumerate(inner_xs): 166 | if keys is None or k in keys: 167 | if expand: 168 | xs_mapped.append(x) 169 | if isinstance(x, list) or isinstance(x, dict): 170 | x_mapped = deep_map_recursion(x) #deep_map(x, fun, fun_name=fun_name) 171 | else: 172 | x_mapped = fun(x) 173 | xs_mapped.append(x_mapped) 174 | else: 175 | xs_mapped.append(x) 176 | if cache_fun: 177 | cache[id(inner_xs)] = xs_mapped 178 | return xs_mapped 179 | 180 | return deep_map_recursion(xs,keys) 181 | 182 | 183 | def deep_seq_map(xss, fun, keys=None, fun_name=None, expand=False): 184 | """Applies fun to list of or dict of lists; adds the results in-place. 185 | 186 | Usage: Transform a corpus iteratively by applying functions like 187 | `tokenize`, `lower`, or vocabulary functions (word -> embedding id) to it. 188 | 189 | from jtr.sisyphos.vocab import Vocab 190 | vocab = Vocab() 191 | keys = ['question', 'support'] 192 | 193 | corpus = deep_map(corpus, lambda x: x.lower(), keys) 194 | corpus = deep_map(corpus, tokenize, keys) 195 | corpus = deep_map(corpus, vocab, keys) 196 | corpus = deep_map(corpus, vocab._normalize, keys=keys) 197 | -> through tokenize we go from a dict of sentences to 198 | a dict of words (list of lists), thus we now apply deep_seq_map for 199 | processing to add start of and end of sentence tags: 200 | corpus = deep_seq_map(corpus, lambda xs: [""] + xs + 201 | [""], 202 | ['question', 'support']) 203 | 204 | -> From here we can create batches from the corpus and feed it into a model. 205 | 206 | In case `expand==False` each top-level entry of `xs` to be transformed 207 | replaces the original entry. 208 | `deep_map` supports `xs` to be a dictionary or a list/tuple: 209 | - In case `xs` is a dictionary, its transformed value is also a dictionary, and `keys` contains the keys of the 210 | values to be transformed. 211 | - In case `xs` is a list/tuple, `keys` contains the indices of the entries to be transformed 212 | The function `deep_map` is recursively applied to the values of `xs`; 213 | the function `fun` takes a sequence as input, and is applied at the one but deepest level, 214 | where the entries are sequences of objects (no longer sequences of sequences). 215 | This is the only difference with `deep_map` 216 | 217 | Args: 218 | `xs`: a sequence (list/tuple) of objects or sequences of objects. 219 | `fun`: a function to transform sequences 220 | `keys`: seq with keys if `xs` is dict; seq with integer indices if `xs` is seq. 221 | For entries not in `keys`, the original `xs` value is retained. 222 | `fun_name`: default value 'trf'; string with function tag (e.g. 'lengths'), 223 | used if '''expand==True''' and '''isinstance(xs,dict)''' 224 | Say for example fun_name='count', and `keys` contains 'sentence', then the transformed dict would look like 225 | '''{'sentence':[sentences], 'sentence_lengths':[fun(sentences)] ...}''' 226 | 227 | Returns: 228 | Transformed sequence or dictionary. 229 | 230 | Example: 231 | >>> dave = [ 232 | ... "All work and no play makes Jack a dull boy", 233 | ... 
"All work and no play makes Jack a dull boy.", 234 | ... "All work and no play makes Jack a very dull boy!"] 235 | >>> jack = [ 236 | ... "I'm sorry Dave, I'm afraid I can't do that!", 237 | ... "I'm sorry Dave, I'm afraid I can't do that", 238 | ... "I'm sorry Dave, I'm afraid I cannot do that"] 239 | >>> support = [ 240 | ... ["Play makes really dull", "really dull"], 241 | ... ["Dave is human"], 242 | ... ["All work", "all dull", "dull"]] 243 | >>> data2 = {'dave': dave, 'jack': jack, 'support': support} 244 | >>> vocab2 = Vocab() 245 | >>> data2_processed = deep_map(data2, lambda x: tokenize(x.lower())) 246 | >>> data2_ids = deep_map(data2_processed, vocab2) 247 | >>> data2_ids_with_lengths = deep_seq_map(data2_ids, lambda xs: len(xs), keys=['dave','jack','support'], 248 | ... fun_name='lengths', expand=True) 249 | >>> pprint.pprint(data2_ids_with_lengths) 250 | {'dave': [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 251 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 252 | [1, 2, 3, 4, 5, 6, 7, 8, 12, 9, 10, 13]], 253 | 'dave_lengths': [10, 11, 12], 254 | 'jack': [[14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24, 13], 255 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 21, 15, 22, 23, 24], 256 | [14, 15, 16, 17, 18, 19, 14, 15, 16, 20, 14, 25, 23, 24]], 257 | 'jack_lengths': [17, 16, 14], 258 | 'support': [[[5, 6, 26, 9], [26, 9]], [[18, 27, 28]], [[1, 2], [1, 9], [9]]], 259 | 'support_lengths': [[4, 2], [3], [2, 2, 1]]} 260 | """ 261 | 262 | if isinstance(xss, list) and all([not isinstance(xs, list) for xs in xss]): 263 | return fun(xss) 264 | else: 265 | if isinstance(xss, dict): 266 | xss_mapped = {} 267 | for k, xs in xss.items(): 268 | if keys is None or k in keys: 269 | if expand: 270 | xss_mapped[k] = xs 271 | k = '%s_%s' % (str(k), str(fun_name) if fun_name is not None else 'trf') 272 | if isinstance(xs, list) and all([not isinstance(x, list) for x in xs]): 273 | xss_mapped[k] = fun(xs) 274 | else: 275 | xss_mapped[k] = deep_seq_map(xs, fun) # fun_name not needed, because expand==False 276 | else: 277 | xss_mapped[k] = xs 278 | else: 279 | xss_mapped = [] 280 | for k, xs in enumerate(xss): 281 | if keys is None or k in keys: 282 | if expand: 283 | xss_mapped.append(xs) 284 | if isinstance(xs, list) and all([not isinstance(x, list) for x in xs]): 285 | xss_mapped.append(fun(xs)) 286 | else: 287 | xss_mapped.append(deep_seq_map(xs, fun)) 288 | else: 289 | xss_mapped.append(xs) 290 | return xss_mapped 291 | 292 | 293 | def dynamic_subsample(xs, candidate_key, answer_key, how_many=1, avoid=[]): 294 | """Replaces candidates by a mix of answers and random candidates. 295 | 296 | Creates negative samples by combining the true answers and some random 297 | deletion of entries in the candidates. Then replaces the candidates 298 | dictionary and returns it. 299 | 300 | Replace a list of lists with a list of dynamically subsampled lists. The dynamic list will 301 | always contain the elements from the `answer_key` list, and a subsample of size `how_many` from 302 | the corresponding `candidate_key` list 303 | Args: 304 | xs: a dictionary of keys to lists 305 | candidate_key: the key of the candidate list 306 | answer_key: the key of the answer list 307 | how_many: how many samples from the candidate list should we take 308 | avoid: list of candidates to be avoided 309 | (note: only those are avoided, any instances according to `answer_key` which are not 310 | in `avoid`, may still be sampled!) 311 | 312 | Returns: 313 | a new dictionary identical to `xs` for all but the `candidate_key`. 
For that key the value 314 | is a list of `DynamicSubsampledList` objects. 315 | 316 | Example: 317 | >>> data = {'answers':[[1,2],[3,4]], 'candidates': [range(0,100), range(0,100)]} 318 | >>> processed = dynamic_subsample(data, 'candidates', 'answers', 2) 319 | >>> " | ".join([" ".join([str(elem) for elem in elems]) for elems in processed['candidates']]) 320 | '1 2 89 39 | 3 4 90 82' 321 | >>> " | ".join([" ".join([str(elem) for elem in elems]) for elems in processed['candidates']]) 322 | '1 2 84 72 | 3 4 9 6' 323 | >>> " | ".join([" ".join([str(elem) for elem in elems]) for elems in processed['answers']]) 324 | '1 2 | 3 4' 325 | >>> processed = dynamic_subsample(data, 'candidates', 'answers', 5, avoid=range(91)) 326 | >>> " | ".join([" ".join([str(elem) for elem in elems]) for elems in processed['candidates']]) 327 | '1 2 93 91 91 95 97 | 3 4 93 99 92 98 93' 328 | """ 329 | candidate_dataset = xs[candidate_key] 330 | answer_dataset = xs[answer_key] 331 | new_candidates = [] 332 | assert (len(candidate_dataset) == len(answer_dataset)) 333 | for i in range(0, len(candidate_dataset)): 334 | candidates = candidate_dataset[i] 335 | answers = [answer_dataset[i]] if not hasattr(answer_dataset[i],'__len__') else answer_dataset[i] 336 | new_candidates.append(DynamicSubsampledList(answers, candidates, how_many, avoid=avoid, rand=rs)) 337 | result = {} 338 | result.update(xs) 339 | result[candidate_key] = new_candidates 340 | return result 341 | 342 | 343 | 344 | 345 | class DynamicSubsampledList: 346 | """ 347 | A container that produces different list subsamples on every call to `__iter__`. 348 | 349 | >>> dlist = DynamicSubsampledList([1,2], range(0,100),2, rand=rs) 350 | >>> print(" ".join([str(e) for e in dlist])) 351 | 1 2 23 61 352 | >>> print(" ".join([str(e) for e in dlist])) 353 | 1 2 92 39 354 | """ 355 | 356 | def __init__(self, always_in, to_sample_from, how_many, avoid=[], rand=rs): 357 | self.always_in = always_in 358 | self.to_sample_from = to_sample_from 359 | self.how_many = how_many 360 | self.avoid = set(avoid) 361 | self.random = rand 362 | 363 | def __iter__(self): 364 | result = [] 365 | result += self.always_in 366 | if len(self.avoid) == 0: 367 | result.extend(list(self.random.choice(self.to_sample_from, size=self.how_many, replace=True))) 368 | else: 369 | for _ in range(self.how_many): 370 | avoided = False 371 | trial, max_trial = 0, 50 372 | while (not avoided and trial < max_trial): 373 | samp = self.random.choice(self.to_sample_from) 374 | trial += 1 375 | avoided = False if samp in self.avoid else True 376 | result.append(samp) 377 | return result.__iter__() 378 | 379 | def __len__(self): 380 | return len(self.always_in)+self.how_many#number of items is the number of answers plus number of negative samples 381 | 382 | def __getitem__(self, key): 383 | #todo: verify 384 | return self.always_in[0] 385 | 386 | 387 | def get_list_shape(xs): 388 | if isinstance(xs,int): 389 | shape=[] 390 | else: 391 | shape = [len(xs)] 392 | for i, x in enumerate(xs): 393 | if isinstance(x, list) or isinstance(x, DynamicSubsampledList): 394 | if len(shape) == 1: 395 | shape.append(0) 396 | shape[1] = max(len(x), shape[1]) 397 | for j, y in enumerate(x): 398 | if isinstance(y, list) or isinstance(y, DynamicSubsampledList): 399 | if len(shape) == 2: 400 | shape.append(0) 401 | shape[2] = max(len(y), shape[2]) 402 | return shape 403 | 404 | 405 | def get_seq_depth(xs): 406 | return [n - 1 for n in get_list_shape(xs)] 407 | 408 | 409 | 410 | def get_entry_dims(corpus): 411 | """ 412 | get 
number of dimensions for each entry; needed for placeholder generation 413 | """ 414 | #todo: implement recursive form; now only OK for 'regular' (=most common type of) data structures 415 | if isinstance(corpus, dict): 416 | keys = list(corpus.keys()) 417 | dims = {key: 0 for key in keys} 418 | else: 419 | keys = range(len(corpus)) 420 | dims = [0 for i in range(len(corpus))] #scalars have dim 0 (but tensor version will have shape length 1) 421 | for key in keys: 422 | entry = corpus[key] 423 | try: 424 | while hasattr(entry, '__len__'): 425 | dims[key] += 1 426 | entry = entry[0] #will fail if entry is dict 427 | except: 428 | dims[key] = None 429 | return dims 430 | 431 | 432 | 433 | def numpify(xs, pad=0, keys=None, dtypes=None): 434 | """Converts a dict or list of Python data into a dict of numpy arrays.""" 435 | is_dict = isinstance(xs, dict) 436 | xs_np = {} if is_dict else [0] * len(xs) 437 | xs_iter = xs.items() if is_dict else enumerate(xs) 438 | 439 | for i, (key, x) in enumerate(xs_iter): 440 | if keys is None or key in keys: 441 | shape = get_list_shape(x) 442 | if dtypes is None: 443 | dtype = np.int64 444 | else: 445 | dtype = dtypes[i] 446 | x_np = np.full(shape, pad, dtype) 447 | dims = len(shape) 448 | if dims == 0: 449 | x_np=x 450 | elif dims == 1: 451 | x_np[0:shape[0]] = x 452 | elif dims == 2: 453 | for j, y in enumerate(x): 454 | x_np[j, 0:len(y)] = [ys for ys in y]#this comprehension turns DynamicSubsampledList into a list 455 | elif dims == 3: 456 | for j, ys in enumerate(x): 457 | for k, y in enumerate(ys): 458 | x_np[j, k, 0:len(y)] = y 459 | else: 460 | raise (NotImplementedError) 461 | # todo: extend to general case 462 | pass 463 | xs_np[key] = x_np 464 | else: 465 | xs_np[key] = x 466 | return xs_np 467 | 468 | 469 | def map_to_targets(xs, cands_name, ans_name): 470 | """ 471 | Create cand-length vector for each training instance with 1.0s for cands which are the correct answ and 0.0s for cands which are the wrong answ 472 | #@todo: integrate this function with the one below - the pipeline() method only works with this function 473 | """ 474 | targs = [] 475 | for i in range(len(xs[ans_name])): 476 | targ = [] 477 | for cand in xs[cands_name]: 478 | if xs[ans_name][i] == cand: 479 | targ.append(1.0) 480 | else: 481 | targ.append(0.0) 482 | targs.append(targ) 483 | xs["targets"] = targs 484 | return xs 485 | 486 | if __name__ == '__main__': 487 | import doctest 488 | 489 | print(doctest.testmod()) 490 | -------------------------------------------------------------------------------- /mtl/training.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from collections import defaultdict 3 | import numpy as np 4 | from mtl.nn import bicond_reader, relabel_model 5 | import os 6 | from sklearn.metrics import classification_report 7 | from preproc.log_utils import log_results, task2score, postproc_stance 8 | from sklearn.metrics import f1_score 9 | from copy import deepcopy 10 | from constants import TASK_NAMES_SHORT 11 | import copy 12 | from preproc import batch 13 | 14 | def alternate_epochs(target_sizes, max_iter, train_feed_dicts): 15 | """Return a batch generator that returns one epoch per batch and then 16 | switches tasks.""" 17 | for task in target_sizes.keys(): 18 | for j in range(0, max_iter): 19 | yield task, train_feed_dicts[task][j] 20 | 21 | 22 | def alternate_batches(target_sizes, max_iter, train_feed_dicts): 23 | """Return a batch generator that returns one batch per task and then 24 | 
switches tasks.""" 25 | for j in range(0, max_iter): 26 | for task in target_sizes.keys(): 27 | yield task, train_feed_dicts[task][j] 28 | 29 | 30 | def balanced_mtl_training_loop(placeholders, target_sizes, train_feed_dicts, dev_feed_dicts, max_iter, 31 | min_op, logits_dict, loss_dict, preds_dict, sess, **options): 32 | # trains a MTL model, samples equal amounts of training data from each task and weighs the updates equally 33 | # early stopping based on main task dev set 34 | 35 | max_epochs = options["max_epochs"] 36 | main_task = options["main_task"] 37 | early_stopping = options["early_stopping"] 38 | batch_iter = alternate_batches if options["alternate_batches"] else alternate_epochs 39 | 40 | main_task_dev_acc = [] 41 | stopping_criteron_reached = False 42 | 43 | for i in range(1, max_epochs + 1): 44 | task2loss_all, task2correct_all = defaultdict(list), defaultdict(float) 45 | task2total, task2correct_dev_all = defaultdict(float), defaultdict(float) 46 | task2total_dev = defaultdict(float) 47 | for task, batch in batch_iter(target_sizes, max_iter, train_feed_dicts): 48 | _, current_loss, p = sess.run([min_op[task], loss_dict[task], preds_dict[task]], feed_dict=batch) 49 | task2loss_all[task].extend(current_loss) 50 | hits = [pp for ii, pp in enumerate(p) if np.argmax(pp) == np.argmax(batch[placeholders["targets"]][ii])] 51 | task2correct_all[task] += len(hits) 52 | task2total[task] += len(batch[placeholders["targets"]]) 53 | 54 | for task in target_sizes.keys(): 55 | if options['dev_res_during_training']: 56 | p_inds_dev, g_inds_dev = [], [] 57 | for j, batch_dev in enumerate(dev_feed_dicts[task]): 58 | p_dev = sess.run(preds_dict[task], feed_dict=batch_dev) 59 | 60 | # this is for super detailed results -- maybe we don't want to print this every epoch later on 61 | if i % 1 == 0: 62 | pred_inds = [np.argmax(pp_dev) for pp_dev in p_dev] 63 | p_inds_dev.extend(pred_inds) 64 | gold_inds = [np.argmax(batch_dev[placeholders["targets"]][i_d]) for i_d, targ in 65 | enumerate(batch_dev[placeholders["targets"]])] 66 | g_inds_dev.extend(gold_inds) 67 | 68 | hits = [pp for k, pp in enumerate(p_dev) if 69 | np.argmax(pp) == np.argmax(batch_dev[placeholders["targets"]][k])] 70 | task2correct_dev_all[task] += len(hits) 71 | task2total_dev[task] += len(batch_dev[placeholders["targets"]]) 72 | 73 | # Randomise batch IDs, so that selection of batch is random 74 | np.random.shuffle(train_feed_dicts[task]) 75 | np.random.shuffle(dev_feed_dicts[task]) 76 | acc = task2correct_all[task] / task2total[task] 77 | acc_dev = 0 78 | if options['dev_res_during_training']: 79 | acc_dev = task2correct_dev_all[task] / task2total_dev[task] 80 | if task != main_task: 81 | print('Epoch %d :' % i, "Task: " + task, "Loss: ", np.mean(task2loss_all[task]), "Acc: ", acc, "Acc Dev: ", 82 | acc_dev) 83 | else: 84 | print('Epoch %d :' % i, "Task: " + task, "Loss: ", np.mean(task2loss_all[task]), "Acc: ", acc, "Acc Dev: ", 85 | acc_dev, "Previous Acc Dev: ", main_task_dev_acc) 86 | 87 | # too much information 88 | #if options['dev_res_during_training'] and i % 1 == 0: 89 | # print(classification_report(g_inds_dev, p_inds_dev, target_names=target_labels[task])) 90 | 91 | if task == main_task: 92 | if acc_dev >= early_stopping and len(main_task_dev_acc) >= 3 and acc_dev < main_task_dev_acc[-3]: 93 | print("Dev accuracy is smaller than 4 epochs ago, early stopping criteron reached.") 94 | stopping_criteron_reached = True 95 | break 96 | main_task_dev_acc.append(acc_dev) 97 | if stopping_criteron_reached == True: 98 | break 
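    # note: the early stopping above only kicks in once the main task's dev accuracy has reached the
    # `early_stopping` threshold and then drops below its value from three epochs earlier; it simply
    # stops training. No best-epoch checkpoint is restored here (saving happens later in train()),
    # and the learned parameters live in `sess`; the returned dicts are the same TF graph nodes.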
99 | 100 | return logits_dict, loss_dict, preds_dict 101 | 102 | 103 | 104 | 105 | def balanced_mtl_with_ltn_training_loop(placeholders, target_sizes, train_feed_dicts, dev_feed_dicts, max_iter, min_op, min_op_ltn, 106 | logits_dict, loss_dict, preds_dict, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict, label_to_labelvocab, sess, **options): 107 | 108 | 109 | max_epochs_ltn = options["max_epochs_ltn"] 110 | main_task = options["main_task"] 111 | early_stopping = options["early_stopping"] 112 | hard_or_soft = options['ltn_pred_type'] 113 | batch_iter = alternate_batches if options["alternate_batches"] else alternate_epochs 114 | 115 | main_task_dev_acc = [] 116 | stopping_criteron_reached = False 117 | 118 | augment_data_from_epoch = max_epochs_ltn 119 | if options["relabel_with_ltn"]: 120 | # extend the training loop - for post-LTN data augmentation 121 | augment_data_from_epoch = max_epochs_ltn 122 | max_epochs_ltn = max_epochs_ltn + options["max_epochs_after_ltn"] 123 | 124 | for i in range(1, max_epochs_ltn + 1): 125 | task2loss_all, task2correct_all = defaultdict(list), defaultdict(float) 126 | task2total, task2correct_dev_all = defaultdict(float), defaultdict(float) 127 | task2total_dev = defaultdict(float) 128 | task2loss_all_ltn, task2correct_all_ltn = defaultdict(list), defaultdict(float) 129 | task2total_ltn, task2correct_dev_all_ltn = defaultdict(float), defaultdict(float) 130 | 131 | # we collect these only if we want to relabel 132 | task2preds = defaultdict(list) 133 | batches_to_relab = [] # collect those we've covered in this round 134 | batch2task = defaultdict() # this one is for error analysis 135 | 136 | batch_id = 0 137 | for task, batch in batch_iter(target_sizes, max_iter, train_feed_dicts): 138 | 139 | # this is just the normal training step - we minimise the loss on the task's own training data here 140 | #batch = train_feed_dicts[task][j] 141 | _, current_loss, p = sess.run([min_op[task], loss_dict[task], preds_dict[task]], feed_dict=batch) 142 | task2loss_all[task].extend(current_loss) 143 | hits = [pp for ii, pp in enumerate(p) if np.argmax(pp) == np.argmax(batch[placeholders["targets"]][ii])] 144 | task2correct_all[task] += len(hits) 145 | task2total[task] += len(batch[placeholders["targets"]]) 146 | 147 | # now we apply the models for all the other tasks to the batch and collect the predictions 148 | # this is used as input to the LTN model, to determine which of the instances in the batch can 149 | # also be used as training data for any of the other tasks 150 | batch = get_preds_for_ltn(sess, batch, placeholders, target_sizes, task, main_task, preds_dict, 151 | hard_or_soft, label_to_labelvocab, options["lab_emb_dim"], options["model_type"]) 152 | 153 | if task != main_task: # then we want to have a relabelling model training step 154 | _, current_loss_ltn, p_ltn = sess.run([min_op_ltn[task], loss_dict_ltn[task], preds_dict_ltn[task]], feed_dict=batch) 155 | task2loss_all_ltn[task].extend(current_loss_ltn) 156 | hits_ltn = [pp for ii, pp in enumerate(p_ltn) if np.argmax(pp) == np.argmax(batch[placeholders["targets"]][ii])] 157 | task2correct_all_ltn[task] += len(hits_ltn) 158 | task2total_ltn[task] += len(batch[placeholders["targets"]]) 159 | 160 | if task == main_task and options["relabel_with_ltn"] and i >= augment_data_from_epoch: 161 | p_ltn = sess.run(predict_main_dict[task], feed_dict=batch) 162 | task2preds[task].append(p_ltn) 163 | batches_to_relab.append(batch) 164 | batch2task[batch_id] = task 165 | 166 | batch_id += 1 167 | 168 | 
if options["relabel_with_ltn"] and i >= augment_data_from_epoch: 169 | # we need to apply the relabelling function on the main task data first here, then pass the results on as train_feed_dicts 170 | 171 | # Before we reshuffle, see if we should augment the main task data 172 | train_data_additional = relabel_data_with_ltn_preds(batches_to_relab, task2preds, options["batch_size"]) 173 | train_feed_dicts[main_task].append(train_data_additional) 174 | 175 | for task in target_sizes.keys(): 176 | p_inds_dev, g_inds_dev, p_ids_ltn = [], [], [] 177 | if options['dev_res_during_training']: 178 | for batch_dev in dev_feed_dicts[task]: 179 | 180 | batch_dev = get_preds_for_ltn(sess, batch_dev, placeholders, target_sizes, task, main_task, preds_dict, 181 | hard_or_soft, label_to_labelvocab, options["lab_emb_dim"], options["model_type"]) 182 | 183 | p_dev = sess.run(preds_dict[task], feed_dict=batch_dev) 184 | hits = [pp for k, pp in enumerate(p_dev) if 185 | np.argmax(pp) == np.argmax(batch_dev[placeholders["targets"]][k])] 186 | task2correct_dev_all[task] += len(hits) 187 | task2total_dev[task] += len(batch_dev[placeholders["targets"]]) 188 | 189 | # this is for super detailed results -- maybe we don't want to print this every epoch later on 190 | if i % 1 == 0: 191 | pred_inds = [np.argmax(pp_dev) for pp_dev in p_dev] 192 | p_inds_dev.extend(pred_inds) 193 | gold_inds = [np.argmax(batch_dev[placeholders["targets"]][i_d]) for i_d, targ in 194 | enumerate(batch_dev[placeholders["targets"]])] 195 | g_inds_dev.extend(gold_inds) 196 | 197 | p_dev_ltn = sess.run(preds_dict_ltn[task], feed_dict=batch_dev) 198 | pred_inds_ltn = [np.argmax(pp_dev) for pp_dev in p_dev_ltn] 199 | p_ids_ltn.extend(pred_inds_ltn) 200 | 201 | 202 | # Randomise batch IDs, so that selection of batch is random 203 | np.random.shuffle(train_feed_dicts[task]) 204 | np.random.shuffle(dev_feed_dicts[task]) 205 | 206 | if options['dev_res_during_training']: 207 | acc, acc_dev = task2correct_all[task] / task2total[task], task2correct_dev_all[task] / task2total_dev[task] 208 | else: 209 | acc = task2correct_all[task] / task2total[task] 210 | acc_dev = 0.0 211 | 212 | try: 213 | acc_ltn_train = task2correct_all_ltn[task] / task2total_ltn[task] 214 | except ZeroDivisionError: 215 | acc_ltn_train = 0 216 | 217 | if options["model_type"] == "label-transfer" or (options["model_type"] == 'semi-supervised' and task != main_task): 218 | print('Epoch %d :' % i, "Task: " + task, "Loss: ", np.mean(task2loss_all[task]), "Acc: ", acc, "Acc Dev: ", 219 | acc_dev, "Acc LTN Train: ", acc_ltn_train) 220 | else: 221 | print('Epoch %d :' % i, "Task: " + task, "Loss: ", np.mean(task2loss_all[task]), "Acc: ", acc, "Acc Dev: ", 222 | acc_dev, "Previous Acc Dev: ", main_task_dev_acc, "Acc LTN Train: ", acc_ltn_train) 223 | 224 | 225 | if task == main_task: 226 | if acc_dev >= early_stopping and len(main_task_dev_acc) >= 3 and acc_dev < main_task_dev_acc[-3]: 227 | print("Dev accuracy is smaller than 4 epochs ago, early stopping criteron reached.") 228 | stopping_criteron_reached = True 229 | break 230 | main_task_dev_acc.append(acc_dev) 231 | if stopping_criteron_reached == True: 232 | break 233 | 234 | return logits_dict, loss_dict, preds_dict, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict 235 | 236 | 237 | def relabel_data_with_ltn_preds(all_batches, preds_ltn, batch_size): 238 | print("Executing the data augmentation function") 239 | 240 | # flatten batches for easier handling 241 | batches_flattenend = defaultdict(list) 242 | for key 
in all_batches[0].keys(): 243 | values_flat = [all_batches[i][key] for i in range(0, len(all_batches))] 244 | values_flatter = [ii for i in values_flat for ii in i] 245 | batches_flattenend[key] = np.stack(values_flatter, axis=0) 246 | print("number instances") 247 | print(len(batches_flattenend[key])) 248 | 249 | relabelled_insts = defaultdict(list) 250 | num_sampled = 0 251 | 252 | # for each task, get the predictions 253 | for task, preds in preds_ltn.items(): 254 | 255 | # flatten the preds batches 256 | preds = [pp for p in preds for pp in p] 257 | stacked = np.stack(preds, axis=1) 258 | 259 | # for each row (which are the probabilities for each label), sort the row by descending value and 260 | # store the index of the original array in the transformed array 261 | stacked_sorted = np.argsort(-stacked) 262 | 263 | label2inds = defaultdict(list) 264 | 265 | labelindex = 0 266 | for row in stacked_sorted: 267 | # the number of instances to sample are the top 10%, equally distributed across the labels 268 | num_inst_to_samp = int(len(row) * 0.1 * (1/len(stacked_sorted))) 269 | # determine how many batches this makes. We only want full batches so we might sometimes take slightly 270 | # less than the top 10% 271 | number_batches_to_samp = int(num_inst_to_samp/batch_size) 272 | num_inst_to_samp_final = number_batches_to_samp*batch_size 273 | label2inds[labelindex] = row[:num_inst_to_samp_final] 274 | 275 | num_sampled += num_inst_to_samp_final 276 | 277 | # store which batches are useful so we can iterate over these afterwards 278 | for instid in row[:num_inst_to_samp]: 279 | 280 | for key in all_batches[0].keys(): 281 | if key.name.startswith("label_vocab_inds:"): 282 | relabelled_insts[key].append(batches_flattenend[key][instid]) 283 | 284 | elif key.name.startswith("targets:"): 285 | # re-initialise the targets, then set the one for the predicted label to 1 286 | targets_here = np.zeros([len(batches_flattenend[key][instid])], np.int32) 287 | targets_here[labelindex] = 1 288 | relabelled_insts[key].append(targets_here) 289 | 290 | else: 291 | relabelled_insts[key].append(batches_flattenend[key][instid]) 292 | 293 | labelindex += 1 294 | 295 | # now all the relabelled data is in relabelled_insts and we need to change it to batch format again 296 | rebatched_instances = batch.batch_feed_dicts(relabelled_insts, batch_size, num_sampled) 297 | 298 | return rebatched_instances 299 | 300 | 301 | def get_preds_for_ltn(sess, batch, placeholders, target_sizes, task, main_task, preds_dict, hard_or_soft, label_to_labelvocab, lab_emb_dim, model_type): 302 | # get predictions on dev data for EM 303 | p_task_for_ltn = [] 304 | if lab_emb_dim > 0: 305 | # we don't want to modify the original batch 306 | batch_copy = dict.copy(batch) #copy.copy(batch) 307 | for taskjj in target_sizes.keys(): 308 | if model_type == 'semi-supervised' and (taskjj == task or taskjj == main_task): 309 | continue 310 | elif model_type == 'label-transfer' and (taskjj == task): 311 | if taskjj != main_task: 312 | continue 313 | if lab_emb_dim > 0: 314 | label_vocab_inds = np.array([label_to_labelvocab[taskjj] for i in range(0, len(batch[placeholders["seq1"]]))], np.int64) 315 | batch_copy[placeholders["label_vocab_inds"]] = label_vocab_inds 316 | p_jj = sess.run([preds_dict[taskjj]], feed_dict=batch_copy) 317 | else: 318 | p_jj = sess.run([preds_dict[taskjj]], feed_dict=batch) 319 | if hard_or_soft == 'hard': 320 | pred_inds = [np.argmax(pp, 1) for pp in p_jj] 321 | else: 322 | pred_inds = p_jj 323 | if p_task_for_ltn == []: 324 
| p_task_for_ltn = pred_inds 325 | else: 326 | p_task_for_ltn.extend(pred_inds) 327 | 328 | if model_type == 'label-transfer' or task != main_task: # then we want to have an LTN model training step 329 | # enter current predictions in feed_dicts so main model predictions can be used by LTN model 330 | if hard_or_soft == 'hard': 331 | preds_for_ltn = np.stack(p_task_for_ltn, 1) 332 | else: 333 | preds_for_ltn = np.concatenate(p_task_for_ltn, 1) 334 | batch[placeholders["preds_for_ltn"]] = preds_for_ltn 335 | 336 | return batch 337 | 338 | 339 | def train(placeholders, target_sizes, train_feed_dicts, dev_feed_dicts, vocab, label_vocab, input_size_preds, num_preds_ltn, label_to_labelvocab, sess=None, **options): 340 | 341 | max_num_batches = {} 342 | for task in target_sizes.keys(): 343 | max_num_batches[task] = len(train_feed_dicts[task]) 344 | if label_vocab == None: 345 | label_vocab_len = 0 346 | else: 347 | label_vocab_len = len(label_vocab) 348 | 349 | # create model 350 | logits_dict, loss_dict, preds_dict, label_embeddings = bicond_reader(placeholders, target_sizes, len(vocab), label_vocab_len, **options) # those return dicts where the keys are the task names 351 | 352 | optim = tf.train.RMSPropOptimizer(learning_rate=options["learning_rate"]) 353 | 354 | if options["model_type"] == "semi-supervised" or options["model_type"] == "label-transfer": 355 | # additional TF model needed for estimating relabelling function 356 | logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict = relabel_model(placeholders, target_sizes, input_size_preds, num_preds_ltn, label_embeddings, **options) # those return dicts where the keys are the task names 357 | min_op_ltn = {} 358 | for task in target_sizes.keys(): 359 | min_op_ltn[task] = optim.minimize(tf.reduce_mean(loss_dict_ltn[task])) 360 | 361 | min_op = {} 362 | for task in target_sizes.keys(): 363 | min_op[task] = optim.minimize(tf.reduce_mean(loss_dict[task])) 364 | # The maximum number of iterations should be based on the number of batches in the smallest training set 365 | max_iter = min(max_num_batches.values()) 366 | print("Max number batches for each task:", max_num_batches) 367 | print("Randomly sampling one from", str(max_iter), "batches for each task every training epoch") 368 | 369 | tf.global_variables_initializer().run(session=sess) 370 | 371 | if options["save_model"] == True: 372 | saver = tf.train.Saver(max_to_keep=100) 373 | 374 | if options["model_type"] == "hard-sharing": 375 | logits_dict, loss_dict, preds_dict = balanced_mtl_training_loop(placeholders, target_sizes, train_feed_dicts, 376 | dev_feed_dicts, max_iter, 377 | min_op, logits_dict, loss_dict, preds_dict, sess, **options) 378 | 379 | logits_dict_ltn, loss_dict_ltn, preds_dict_ltn = {}, {}, {} 380 | 381 | elif options["model_type"] == "semi-supervised" or options["model_type"] == "label-transfer": 382 | # load pre-trained mtl model 383 | print("Check if pre-trained MTL model exists...") 384 | save_path = get_save_path(create_path=False, **options) 385 | if not os.path.exists(save_path): 386 | print("Save path", save_path, "does not exist. Training MTL model first.") 387 | logits_dict, loss_dict, preds_dict = balanced_mtl_training_loop(placeholders, target_sizes, train_feed_dicts, 388 | dev_feed_dicts, max_iter, 389 | min_op, logits_dict, loss_dict, preds_dict, sess, **options) 390 | else: 391 | print("Model already exists. 
Restoring model.") 392 | saver = tf.train.Saver(max_to_keep=100) 393 | saver.restore(sess, save_path + "/model.ckpt") 394 | print("Model " + save_path + "/model.ckpt" + " restored.") 395 | 396 | print("\nStarting LTN training...") 397 | logits_dict, loss_dict, preds_dict, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict = balanced_mtl_with_ltn_training_loop(placeholders, target_sizes, train_feed_dicts, 398 | dev_feed_dicts, max_iter, min_op, min_op_ltn, logits_dict, loss_dict, preds_dict, 399 | logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict, label_to_labelvocab, sess, **options) 400 | 401 | if options["save_model"] == True: 402 | savepath = get_save_path(create_path=True, **options) 403 | print("Saving model at location:", savepath) 404 | saver.save(sess, savepath + "/model.ckpt") 405 | 406 | return logits_dict, loss_dict, preds_dict, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict 407 | 408 | 409 | def restore_trained_model(placeholders, target_sizes, train_feed_dicts, vocab, label_vocab_len, label_to_labelvocab, input_size_preds, num_preds_ltn, sess=None, **options): 410 | 411 | max_num_batches = {} 412 | for task in target_sizes.keys(): 413 | max_num_batches[task] = len(train_feed_dicts[task]) 414 | 415 | # create model 416 | logits_dict, loss_dict, preds_dict, label_embeddings = bicond_reader(placeholders, target_sizes, len(vocab), label_vocab_len, **options) # those return dicts where the keys are the task names 417 | 418 | logits_dict_ltn = loss_dict_ltn = preds_dict_ltn = None 419 | if options["model_type"] == "semi-supervised": 420 | logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict = relabel_model(placeholders, target_sizes, input_size_preds, num_preds_ltn, label_embeddings, **options) # those return dicts where the keys are the task names 421 | 422 | tf.global_variables_initializer().run(session=sess) 423 | 424 | saver = tf.train.Saver(max_to_keep=100) 425 | save_path = get_save_path(create_path=False, **options) 426 | if not os.path.exists(save_path): 427 | print("Save path", save_path, "does not exist. Model cannot be loaded. 
Aborting.")
428 |         return "", "", ""
429 |     saver.restore(sess, save_path + "/model.ckpt")
430 |     print("Model " + save_path + "/model.ckpt" + " restored.")
431 |
432 |     return logits_dict, loss_dict, preds_dict, logits_dict_ltn, loss_dict_ltn, preds_dict_ltn, predict_main_dict
433 |
434 |
435 | def get_save_path(create_path=True, **options):
436 |
437 |     tasks = ",".join([TASK_NAMES_SHORT[tsk] for tsk in options["tasks"]])
438 |     if isinstance(options["feature_sets"], list):
439 |         features = ",".join(options["feature_sets"])
440 |     else:
441 |         features = options["feature_sets"]
442 |     debug = False
443 |     if options["debug"]:
444 |         debug = True
445 |     lab_embs_for_ltn = False
446 |     if options["lab_embs_for_ltn"]:
447 |         lab_embs_for_ltn = True
448 |     skip_connections = False
449 |     if options["skip_connections"]:
450 |         skip_connections = True
451 |     attention = False
452 |     if options["attention"]:
453 |         attention = True
454 |     alternate_batches = False
455 |     if options["alternate_batches"]:
456 |         alternate_batches = True
457 |     ltn_pred_type, lel_hid_size, max_ltn = "", "", ""
458 |     if options["model_type"] != "hard-sharing":
459 |         ltn_pred_type = options['ltn_pred_type']
460 |         lel_hid_size = str(options["lel_hid_size"])
461 |         max_ltn = str(options["max_epochs_ltn"])
462 |
463 |     save_model_dir = "_".join([options["model_type"], ltn_pred_type, options["main_task"], tasks, str(debug),
464 |                                str(options["num_instances"]), str(options["emb_dim"]), lel_hid_size,
465 |                                str(options["task_specific_layer_size"]), str(options["lab_emb_dim"]),
466 |                                str(skip_connections), features, str(options["main_num_layers"]),
467 |                                str(options["rnn_cell_type"]),
468 |                                str(lab_embs_for_ltn), str(attention), str(alternate_batches),
469 |                                str(options["batch_size"]), str(options["max_epochs"]), max_ltn,
470 |                                str(options["early_stopping"]), str(options["learning_rate"]),
471 |                                str(options["l1_rate_main"]), str(options["l2_rate_main"]),
472 |                                str(options["l1_rate_ltn"]), str(options["l2_rate_ltn"]),
473 |                                str(options["dropout_rate"]), str(options["exp_id"])])
474 |
475 |     save_path = os.path.abspath(os.path.join("./save/", save_model_dir))
476 |     if create_path and not os.path.exists(save_path):
477 |         os.makedirs(save_path)
478 |     return save_path
--------------------------------------------------------------------------------
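# Usage sketch (illustrative, assumed toy inputs): how the two batch schedulers in
# mtl/training.py interleave tasks. The task names "A"/"B" and the string "feed dicts"
# below are made up for this demo; the real training loop passes TensorFlow feed
# dictionaries and sets max_iter to the number of batches in the smallest training set.

from mtl.training import alternate_epochs, alternate_batches  # needs the repo's dependencies (TF 1.5 etc.)

target_sizes = {"A": 3, "B": 4}  # per-task label-space sizes; only the keys are used by the schedulers
feeds = {task: ["%s-batch-%d" % (task, j) for j in range(2)] for task in target_sizes}

# one full pass over a task's batches before switching to the next task
print([task for task, _ in alternate_epochs(target_sizes, 2, feeds)])   # ['A', 'A', 'B', 'B']
# one batch per task, round-robin over tasks
print([task for task, _ in alternate_batches(target_sizes, 2, feeds)])  # ['A', 'B', 'A', 'B']
# (printed order assumes insertion-ordered dicts, i.e. Python 3.7+)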