├── README.md
├── code
│   ├── .DS_Store
│   ├── LSTM_attention.py
│   ├── LSTM_conditional.py
│   ├── README.md
│   ├── __init__.py
│   ├── basicLSTM_model_config.py
│   ├── bow_model_config.py
│   ├── data_analysis_plotting
│   │   ├── Results_loading_1.R
│   │   ├── data_analysis.Rmd
│   │   ├── results_analysis.Rmd
│   │   └── results_analysis_2.Rmd
│   ├── execute_bow_config.py
│   ├── execute_lstm_attention.py
│   ├── execute_lstm_conditional.py
│   ├── execute_lstm_config.py
│   ├── our_model_config.py
│   ├── our_util.py
│   ├── run_text_processing.py
│   └── test_script6.py
├── paper.pdf
└── poster.pdf

/README.md:
--------------------------------------------------------------------------------
# Stance Detection for the Fake News Challenge with Conditional Encoding and Attention LSTM
Stance detection model developed by Stephen Pfohl, Oskar Triebe and Ferdinand Legros for the Fake News Challenge, using conditional encoding and attention LSTMs, as a Stanford CS224N class project.

In addition to the code written for the Fake News Challenge (http://www.fakenewschallenge.org/), the poster and paper presenting this work are included.

We publish this to help other researchers kickstart their projects. Please feel free to use it with appropriate attribution.
--------------------------------------------------------------------------------
/code/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ourownstory/stance_detection/df9f34883005d203cb6d8ee7d56d9bc82ef39b62/code/.DS_Store
--------------------------------------------------------------------------------
/code/LSTM_attention.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

######
# Model class for LSTMAttention
# Based on starter code from PS3-CS224n
######
from __future__ import absolute_import
from __future__ import division

import argparse
import sys
import time
import logging
from datetime import datetime

import tensorflow as tf
import numpy as np

from our_util import Progbar, minibatches, get_performance, softmax
from our_model_config import OurModel

logger = logging.getLogger("hw3.q3")
logger.setLevel(logging.DEBUG)
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

class LSTMAttention(OurModel):

    def add_placeholders(self):
        """Generates placeholder variables to represent the input tensors
        MODIF: OVERWRITING
        """
        self.inputs_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.max_length), name = "x")
        self.labels_placeholder = tf.placeholder(tf.int64, shape=(None), name = "y")
        self.seqlen_placeholder = tf.placeholder(tf.int64, shape=(None), name = "seqlen")
        self.dropout_placeholder = tf.placeholder(tf.float64, name = 'dropout')

    def create_feed_dict(self, inputs_batch, seqlen_batch, labels_batch = None, dropout = 1.0):
        """Creates the feed_dict for the model.
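        Args:
            inputs_batch: np.ndarray of token ids, shape (n_samples, max_length)
            seqlen_batch: np.ndarray of true (unpadded) sequence lengths, shape (n_samples,)
            labels_batch: np.ndarray of stance labels, shape (n_samples,), optional
            dropout: keep probability fed to the LSTM DropoutWrapper (1.0 = no dropout)
        Returns:
            feed_dict: dict mapping each placeholder to the corresponding batch array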
40 | MODIF: OVERWRITING 41 | """ 42 | feed_dict = { 43 | self.inputs_placeholder: inputs_batch, 44 | } 45 | if labels_batch is not None: 46 | feed_dict[self.labels_placeholder] = labels_batch 47 | if dropout is not None: 48 | feed_dict[self.dropout_placeholder] = dropout 49 | feed_dict[self.seqlen_placeholder] = seqlen_batch 50 | return feed_dict 51 | 52 | def add_prediction_op(self): 53 | 54 | # Initialize 55 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 56 | 57 | # Configure LSTM cells 58 | 59 | cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 60 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob = self.dropout_placeholder) 61 | 62 | # Create an initializer 63 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 64 | 65 | # Get the inputs 66 | x = self.add_embedding(option = self.config.trainable_embeddings) 67 | 68 | if self.config.n_layers <= 1: 69 | rnnOutput = tf.nn.dynamic_rnn(cell, inputs = x, dtype = tf.float64, sequence_length = self.seqlen_placeholder) #MODIF 70 | Y = tf.slice(rnnOutput[0], begin = [0, 0, 0], size = [-1, self.config.attention_length, -1]) 71 | h_N = rnnOutput[1][1] # batch_size, cell.state_size 72 | elif self.config.n_layers > 1: 73 | stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell] * self.config.n_layers) 74 | rnnOutput = tf.nn.dynamic_rnn(stacked_lstm, inputs = x, dtype = tf.float64, sequence_length = self.seqlen_placeholder) #MODIF 75 | # print('rnnOutput[0] shape:', rnnOutput[0].get_shape()) 76 | Y = tf.slice(rnnOutput[0], begin = [0, 0, 0], size = [-1, self.config.attention_length, -1]) 77 | h_N = rnnOutput[1][self.config.n_layers - 1][1] 78 | # Run the RNN 79 | 80 | # Attention implementation, as in https://arxiv.org/abs/1509.06664 81 | W_y = tf.get_variable(name = 'Wy', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 82 | W_h = tf.get_variable(name = 'Wh', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 83 | w = tf.get_variable(name = 'w', shape = (self.config.hidden_size, 1), initializer = theInitializer, dtype = tf.float64) 84 | W_p = tf.get_variable(name = 'Wo', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 85 | W_x = tf.get_variable(name = 'Wx', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 86 | 87 | M_1 = tf.reshape(tf.matmul(tf.reshape(Y, shape = (-1, self.config.hidden_size)), W_y), shape = (-1, self.config.attention_length, self.config.hidden_size)) 88 | M_2 = tf.expand_dims(tf.matmul(h_N, W_h), axis = 1) 89 | M = tf.tanh(M_1 + M_2) 90 | alpha = tf.reshape(tf.nn.softmax(tf.matmul(tf.reshape(M, shape = (-1, self.config.hidden_size)), w)), shape = (-1, self.config.attention_length)) 91 | 92 | r = tf.squeeze(tf.batch_matmul(tf.transpose(tf.expand_dims(alpha, 2), perm = [0, 2, 1]), Y)) 93 | h_star = tf.tanh(tf.matmul(r, W_p) + tf.matmul(h_N, W_x)) 94 | 95 | # Output matrices 96 | U = tf.get_variable(name = 'U', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 97 | b = tf.get_variable(name = 'b', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 98 | 99 | # Compute predictions 100 | preds = tf.matmul(h_star, U) + b # batch_size, n_classes 101 | return preds 102 | 103 | def add_embedding(self, 
option = 'Constant'): 104 | """Adds an embedding layer that maps from input tokens (integers) to vectors and then 105 | concatenates those vectors: 106 | 107 | Returns: 108 | embeddings: tf.Tensor of shape (None, max_length, n_features*embed_size) 109 | """ 110 | # option = config.trainable_embeddings 111 | if option == 'Variable': 112 | embeddings_temp = tf.nn.embedding_lookup(params = tf.Variable(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 113 | elif option == 'Constant': 114 | embeddings_temp = tf.nn.embedding_lookup(params = tf.constant(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 115 | embeddings = tf.reshape(embeddings_temp, shape = (-1, self.config.max_length, self.config.embed_size)) 116 | ### END YOUR CODE 117 | return embeddings 118 | 119 | def train_on_batch(self, sess, inputs_batch, labels_batch, seqlen_batch): 120 | """ 121 | MODIF 122 | Perform one step of gradient descent on the provided batch of data. 123 | 124 | Args: 125 | sess: tf.Session() 126 | input_batch: np.ndarray of shape (n_samples, n_features) # CHECK: np.ndarray?? 127 | labels_batch: np.ndarray of shape (n_samples, n_classes) 128 | labels_batch: np.array of shape (n_samples) 129 | Returns: 130 | loss: loss over the batch (a scalar) 131 | """ 132 | # inputs_batch = np.reshape(inputs_batch, (-1, inputs_batch.shape[1], 1)) 133 | labels_batch = np.reshape(labels_batch, (-1, 1)) 134 | feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch, seqlen_batch = seqlen_batch, dropout = self.config.dropout) # MODIF 135 | print(inputs_batch.shape) 136 | print(len(labels_batch)) 137 | _, loss = sess.run([self.train_op, self.loss], feed_dict=feed) 138 | return loss 139 | 140 | def predict_on_batch(self, sess, inputs_batch, seqlen_batch): 141 | """Make predictions for the provided batch of data  142 | 143 | Args: 144 | sess: tf.Session() 145 | input_batch: np.ndarray of shape (n_samples, n_features) 146 | Returns: 147 | predictions: np.ndarray of shape (n_samples, n_classes) 148 | """ 149 | feed = self.create_feed_dict(inputs_batch, seqlen_batch) 150 | predictions = sess.run(self.pred, feed_dict=feed) 151 | return predictions 152 | 153 | def run_epoch(self, sess, train): 154 | prog = Progbar(target=1 + int(len(train) / self.config.batch_size)) 155 | losses = [] 156 | for i, batch in enumerate(minibatches(train, self.config.batch_size)): 157 | loss = self.train_on_batch(sess, *batch) 158 | losses.append(loss) 159 | # grad_norms.append(grad_norm) 160 | prog.update(i + 1, [("train loss", loss)]) 161 | return losses 162 | 163 | def fit(self, sess, train, dev_data_np, dev_seqlen, dev_labels): # MODIF # CAREFUL DEV/dev 164 | ''' 165 | Returns LISTS: 166 | - losses_epochs 167 | - dev_performances_epochs 168 | - dev_predictions_epochs 169 | - dev_predicted_classes_epochs 170 | ''' 171 | losses_epochs = [] #M 172 | dev_performances_epochs = [] # MODIF 173 | dev_predictions_epochs = [] #M 174 | dev_predicted_classes_epochs = [] #M 175 | for epoch in range(self.config.n_epochs): 176 | logger.info("Epoch %d out of %d", epoch + 1, self.config.n_epochs) 177 | loss = self.run_epoch(sess, train) 178 | 179 | # Computing predictions # MODIF 180 | dev_predictions = self.predict_on_batch(sess, dev_data_np, dev_seqlen) #OUCH 181 | 182 | # Computing development performance #MODIF 183 | dev_predictions = softmax(np.array(dev_predictions)) 184 | dev_predicted_classes = np.argmax(dev_predictions, axis = 1) 185 | dev_performance = get_performance(dev_predicted_classes, dev_labels, n_classes = 4) 
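            # Note: predict_on_batch returns unnormalized logits; softmax (from our_util)
            # converts each row to class probabilities and argmax picks the most likely of
            # the 4 stance classes. For example, softmax([[2.0, 1.0, 0.1, 0.5]]) is roughly
            # [[0.57, 0.21, 0.09, 0.13]], giving predicted class index 0. get_performance
            # then scores these predicted class indices against dev_labels.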
186 | 187 | # Adding to global outputs #MODIF 188 | dev_predictions_epochs.append(dev_predictions) 189 | dev_predicted_classes_epochs.append(dev_predicted_classes) 190 | dev_performances_epochs.append(dev_performance) 191 | losses_epochs.append(loss) 192 | 193 | return losses_epochs, dev_performances_epochs, dev_predicted_classes_epochs, dev_predictions_epochs 194 | 195 | def build(self): 196 | self.add_placeholders() 197 | self.pred = self.add_prediction_op() 198 | self.loss = self.add_loss_op(self.pred) 199 | self.train_op = self.add_training_op(self.loss) 200 | 201 | def __init__(self, config): 202 | self.config = config 203 | self.inputs_placeholder = None 204 | self.labels_placeholder = None 205 | self.seqlen_placeholder = None 206 | self.dropout_placeholder = None 207 | self.build() -------------------------------------------------------------------------------- /code/LSTM_conditional.py: -------------------------------------------------------------------------------- 1 | ###### 2 | # basic BOW model with architecture extendable to more complex LSTM models which use both headings and bodies separately. 3 | ###### 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | from our_model_config import OurModel 9 | from our_util import Progbar, minibatches, get_performance, softmax 10 | 11 | class LSTMCondModel(OurModel): 12 | 13 | def add_placeholders(self): 14 | """Generates placeholder variables to represent the input tensors 15 | """ 16 | self.headings_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.h_max_len), name = "headings") 17 | self.bodies_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.b_max_len), name = "bodies") 18 | self.headings_lengths_placeholder = tf.placeholder(tf.float64, shape=(None), name = "headings_lengths") 19 | self.bodies_lengths_placeholder = tf.placeholder(tf.float64, shape=(None), name = "bodies_lengths") 20 | self.labels_placeholder = tf.placeholder(tf.int64, shape=(None), name = "labels") 21 | self.dropout_placeholder = tf.placeholder(tf.float64, name = 'dropout') 22 | 23 | def create_feed_dict(self, headings_batch, bodies_batch, headings_lengths_batch, bodies_lengths_batch, labels_batch=None, dropout = 1.0): 24 | """Creates the feed_dict for the model. 
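        Args:
            headings_batch: np.ndarray of headline token ids, shape (n_samples, h_max_len)
            bodies_batch: np.ndarray of body token ids, shape (n_samples, b_max_len)
            headings_lengths_batch: true headline lengths, shape (n_samples,)
            bodies_lengths_batch: true body lengths, shape (n_samples,)
            labels_batch: stance labels, shape (n_samples,), optional
            dropout: keep probability fed to the LSTM DropoutWrappers (1.0 = no dropout)
        Returns:
            feed_dict: dict mapping each placeholder to the corresponding batch array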
25 | """ 26 | feed_dict = { 27 | self.headings_placeholder: headings_batch, 28 | self.bodies_placeholder: bodies_batch, 29 | self.headings_lengths_placeholder: headings_lengths_batch, 30 | self.bodies_lengths_placeholder: bodies_lengths_batch 31 | } 32 | if labels_batch is not None: 33 | feed_dict[self.labels_placeholder] = labels_batch 34 | if dropout is not None: 35 | feed_dict[self.dropout_placeholder] = dropout 36 | return feed_dict 37 | 38 | def add_embedding(self, option = 'Constant'): 39 | """Adds an embedding layer that maps from input tokens (integers) to vectors for both the headings and bodies: 40 | 41 | Returns: 42 | embeddings_headings: tf.Tensor of shape (None, h_max_len, embed_size) 43 | embeddings_bodies: tf.Tensor of shape (None, b_max_len, embed_size) 44 | """ 45 | if option == 'Constant': 46 | embeddings_headings_temp = tf.nn.embedding_lookup(params = tf.constant(self.config.pretrained_embeddings), ids = self.headings_placeholder) 47 | embeddings_bodies_temp = tf.nn.embedding_lookup(params = tf.constant(self.config.pretrained_embeddings), ids = self.bodies_placeholder) 48 | elif option == 'Variable': 49 | embeddings_headings_temp = tf.nn.embedding_lookup(params = tf.Variable(self.config.pretrained_embeddings), ids = self.headings_placeholder) 50 | embeddings_bodies_temp = tf.nn.embedding_lookup(params = tf.Variable(self.config.pretrained_embeddings), ids = self.bodies_placeholder) 51 | embeddings_headings = tf.reshape(embeddings_headings_temp, shape = (-1, self.config.h_max_len, self.config.embed_size)) 52 | embeddings_bodies = tf.reshape(embeddings_bodies_temp, shape = (-1, self.config.b_max_len, self.config.embed_size)) 53 | return embeddings_headings, embeddings_bodies 54 | 55 | def add_prediction_op(self): 56 | 57 | with tf.variable_scope('head'): 58 | 59 | # LSTM that handles the headers 60 | cell_h = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 61 | cell_h = tf.nn.rnn_cell.DropoutWrapper(cell_h, output_keep_prob = self.dropout_placeholder) 62 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 63 | 64 | # x = self.inputs_placeholder 65 | x_header, x_body = self.add_embedding(option = self.config.trainable_embeddings) 66 | # print('Predict op: x', x) 67 | rnnOutput_h = tf.nn.dynamic_rnn(cell_h, inputs = x_header, dtype = tf.float64, sequence_length = self.headings_lengths_placeholder) #MODIF 68 | Y = tf.slice(rnnOutput_h[0], begin = [0, 0, 0], size = [-1, self.config.attention_length, -1]) 69 | 70 | with tf.variable_scope('body'): 71 | # LSTM that handles the bodies 72 | cell_b = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 73 | cell_b = tf.nn.rnn_cell.DropoutWrapper(cell_b, output_keep_prob = self.dropout_placeholder) 74 | 75 | U_b = tf.get_variable(name = 'U_b', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 76 | b_b = tf.get_variable(name = 'b_b', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 77 | 78 | rnnOutput_b = tf.nn.dynamic_rnn(cell_b, inputs = x_body, dtype = tf.float64, initial_state = rnnOutput_h[1], sequence_length = self.bodies_lengths_placeholder) 79 | h_N = rnnOutput_b[1][1] # batch_size, cell.state_size 80 | 81 | ## ATTENTION! 
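        # Attention over the headline states, following Rocktäschel et al. (2015),
        # https://arxiv.org/abs/1509.06664 (same scheme as in LSTM_attention.py).
        # In the code below:
        #   Y     : headline LSTM outputs over the first attention_length steps,
        #           shape (batch_size, attention_length, hidden_size)
        #   h_N   : final hidden state of the body LSTM, shape (batch_size, hidden_size)
        #   M     = tanh(Y W_y + h_N W_h)   (the h_N term is broadcast over the time axis)
        #   alpha = softmax(M w)            attention weights over headline positions
        #   r     = alpha^T Y               attention-weighted headline representation
        #   h*    = tanh(r W_p + h_N W_x)   combined representation
        #   preds = h* U_b + b_b            unnormalized class scores (logits)
        # (Caveat: tf.nn.softmax is applied here to a (batch*attention_length, 1) tensor,
        # i.e. each row has a single element, so alpha may come out as all ones; applying
        # the softmax after reshaping to (batch, attention_length) would normalize across
        # the headline positions as intended.)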
82 | W_y = tf.get_variable(name = 'Wy', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 83 | W_h = tf.get_variable(name = 'Wh', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 84 | w = tf.get_variable(name = 'w', shape = (self.config.hidden_size, 1), initializer = theInitializer, dtype = tf.float64) 85 | W_p = tf.get_variable(name = 'Wo', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 86 | W_x = tf.get_variable(name = 'Wx', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 87 | 88 | M_1 = tf.reshape(tf.matmul(tf.reshape(Y, shape = (-1, self.config.hidden_size)), W_y), shape = (-1, self.config.attention_length, self.config.hidden_size)) 89 | M_2 = tf.expand_dims(tf.matmul(h_N, W_h), axis = 1) 90 | M = tf.tanh(M_1 + M_2) 91 | alpha = tf.reshape(tf.nn.softmax(tf.matmul(tf.reshape(M, shape = (-1, self.config.hidden_size)), w)), shape = (-1, self.config.attention_length)) 92 | 93 | r = tf.squeeze(tf.batch_matmul(tf.transpose(tf.expand_dims(alpha, 2), perm = [0, 2, 1]), Y)) 94 | h_star = tf.tanh(tf.matmul(r, W_p) + tf.matmul(h_N, W_x)) 95 | 96 | # Compute predictions 97 | preds = tf.matmul(h_star, U_b) + b_b # batch_size, n_classes 98 | return preds 99 | 100 | def add_prediction_op(self): 101 | 102 | with tf.variable_scope('head'): 103 | 104 | # LSTM that handles the headers 105 | cell_h = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 106 | cell_h = tf.nn.rnn_cell.DropoutWrapper(cell_h, output_keep_prob = self.dropout_placeholder) 107 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 108 | 109 | x_header, x_body = self.add_embedding(option = self.config.trainable_embeddings) 110 | 111 | if self.config.n_layers <= 1: 112 | rnnOutput_h = tf.nn.dynamic_rnn(cell_h, inputs = x_header, dtype = tf.float64, sequence_length = self.headings_lengths_placeholder) #MODIF 113 | elif self.config.n_layers > 1: 114 | stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell_h] * self.config.n_layers) 115 | rnnOutput_h = tf.nn.dynamic_rnn(stacked_lstm, inputs = x_header, dtype = tf.float64, sequence_length = self.headings_lengths_placeholder) #MODIF 116 | Y = tf.slice(rnnOutput_h[0], begin = [0, 0, 0], size = [-1, self.config.attention_length, -1]) 117 | 118 | with tf.variable_scope('body'): 119 | # LSTM that handles the bodies 120 | cell_b = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 121 | cell_b = tf.nn.rnn_cell.DropoutWrapper(cell_b, output_keep_prob = self.dropout_placeholder) 122 | 123 | U_b = tf.get_variable(name = 'U_b', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 124 | b_b = tf.get_variable(name = 'b_b', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 125 | 126 | if self.config.n_layers <= 1: 127 | rnnOutput_b = tf.nn.dynamic_rnn(cell_b, inputs = x_body, dtype = tf.float64, initial_state = rnnOutput_h[1], sequence_length = self.bodies_lengths_placeholder) 128 | h_N = rnnOutput_b[1][1] # batch_size, cell.state_size 129 | elif self.config.n_layers > 1: 130 | print('header rnn, ', len(rnnOutput_h[1])) 131 | stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell_b] * self.config.n_layers) 132 | rnnOutput_b = tf.nn.dynamic_rnn(stacked_lstm, inputs = x_body, dtype = tf.float64, initial_state = 
rnnOutput_h[1], sequence_length = self.bodies_lengths_placeholder) 133 | h_N = rnnOutput_b[1][self.config.n_layers - 1][1] 134 | 135 | ## ATTENTION! 136 | W_y = tf.get_variable(name = 'Wy', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 137 | W_h = tf.get_variable(name = 'Wh', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 138 | w = tf.get_variable(name = 'w', shape = (self.config.hidden_size, 1), initializer = theInitializer, dtype = tf.float64) 139 | W_p = tf.get_variable(name = 'Wo', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 140 | W_x = tf.get_variable(name = 'Wx', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 141 | 142 | M_1 = tf.reshape(tf.matmul(tf.reshape(Y, shape = (-1, self.config.hidden_size)), W_y), shape = (-1, self.config.attention_length, self.config.hidden_size)) 143 | M_2 = tf.expand_dims(tf.matmul(h_N, W_h), axis = 1) 144 | M = tf.tanh(M_1 + M_2) 145 | alpha = tf.reshape(tf.nn.softmax(tf.matmul(tf.reshape(M, shape = (-1, self.config.hidden_size)), w)), shape = (-1, self.config.attention_length)) 146 | 147 | r = tf.squeeze(tf.batch_matmul(tf.transpose(tf.expand_dims(alpha, 2), perm = [0, 2, 1]), Y)) 148 | h_star = tf.tanh(tf.matmul(r, W_p) + tf.matmul(h_N, W_x)) 149 | 150 | # Compute predictions 151 | preds = tf.matmul(h_star, U_b) + b_b # batch_size, n_classes 152 | return preds 153 | 154 | 155 | def train_on_batch(self, sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch): 156 | """Perform one step of gradient descent on the provided batch of data. 157 | Args: 158 | sess: tf.Session() 159 | headings_batch: np.ndarray of shape (n_samples, n_features) 160 | bodies_batch: np.ndarray of shape (n_samples, n_features) 161 | headings_lengths_batch: np.ndarray of shape (n_samples, 1) 162 | bodies_lengths_batch: np.ndarray of shape (n_samples, 1) 163 | labels_batch: np.ndarray of shape (n_samples, n_classes) 164 | Returns: 165 | loss: loss over the batch (a scalar) 166 | """ 167 | feed = self.create_feed_dict(h_batch, b_batch, h_len_batch, b_len_batch, y_batch, dropout = self.config.dropout) 168 | # print('feed', feed) 169 | _, loss = sess.run([self.train_op, self.loss], feed_dict=feed) 170 | ## for debugging / testing 171 | if (np.isnan(loss)): 172 | print('headings', h_batch) 173 | print('bodies', b_batch) 174 | print('nh_len', h_len_batch) 175 | print('b_len', b_len_batch) 176 | print('labels', y_batch) 177 | assert(False) 178 | return loss 179 | 180 | def predict_on_batch(self, sess, h_batch, b_batch, h_len_batch, b_len_batch): 181 | """Make predictions for the provided batch of data 182 | Args: 183 | sess: tf.Session() 184 | headings_batch: np.ndarray of shape (n_samples, n_features) 185 | bodies_batch: np.ndarray of shape (n_samples, n_features) 186 | headings_lengths_batch: np.ndarray of shape (n_samples, 1) 187 | bodies_lengths_batch: np.ndarray of shape (n_samples, 1) 188 | Returns: 189 | predictions: np.ndarray of shape (n_samples, n_classes) 190 | """ 191 | feed = self.create_feed_dict(h_batch, b_batch, h_len_batch, b_len_batch) 192 | predictions = sess.run(self.pred, feed_dict=feed) 193 | return predictions 194 | 195 | def run_epoch(self, sess, h_np, b_np, h_len, b_len, y): 196 | # prog = Progbar(target=1 + int(len(train) / self.config.batch_size)) 197 | losses = [] 198 | # shuffle 199 | ind = 
range(self.config.num_samples) 200 | random.shuffle(ind) 201 | # sizes 202 | batch_start = 0 203 | batch_end = 0 204 | N = self.config.batch_size 205 | num_batches = self.config.num_samples / N 206 | # run batches 207 | for i in range(num_batches): 208 | batch_start = (i*N) 209 | batch_end = (i+1)*N 210 | indices = ind[batch_start:batch_end] 211 | h_batch = h_np[indices,:] 212 | b_batch = b_np[indices,:] 213 | h_len_batch = h_len[indices] 214 | b_len_batch = b_len[indices] 215 | y_batch = y[indices] 216 | loss = self.train_on_batch(sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch) 217 | losses.append(loss) 218 | if (i % (1 + num_batches/10)) == 0: 219 | print('batch: ', i, ', loss: ', loss) 220 | # run last smaller batch 221 | if (batch_end < self.config.num_samples): 222 | indices = ind[batch_end:] 223 | h_batch = h_np[indices,:] 224 | b_batch = b_np[indices,:] 225 | h_len_batch = h_len[indices] 226 | b_len_batch = b_len[indices] 227 | y_batch = y[indices] 228 | # loss 229 | loss = self.train_on_batch(sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch) 230 | losses.append(loss) 231 | return losses 232 | 233 | def fit(self, sess, h_np, b_np, h_len, b_len, y, dev_h, dev_b, dev_h_len, dev_b_len, dev_y): #M 234 | #losses = [] 235 | losses_epochs = [] #M 236 | dev_performances_epochs = [] # M 237 | dev_predictions_epochs = [] #M 238 | dev_predicted_classes_epochs = [] #M 239 | 240 | for epoch in range(self.config.n_epochs): 241 | print('-------new epoch---------') 242 | loss = self.run_epoch(sess, h_np, b_np, h_len, b_len, y) 243 | 244 | # Computing predictions #MODIF 245 | dev_predictions = self.predict_on_batch(sess, dev_h, dev_b, dev_h_len, dev_b_len) 246 | 247 | # Computing development performance #MODIF 248 | dev_predictions = softmax(np.array(dev_predictions)) 249 | dev_predicted_classes = np.argmax(dev_predictions, axis = 1) 250 | dev_performance = get_performance(dev_predicted_classes, dev_y, n_classes = 4) 251 | 252 | # Adding to global outputs #MODIF 253 | dev_predictions_epochs.append(dev_predictions) 254 | dev_predicted_classes_epochs.append(dev_predicted_classes) 255 | dev_performances_epochs.append(dev_performance) 256 | losses_epochs.append(loss) 257 | 258 | print('EPOCH: ', epoch, ', LOSS: ', np.mean(loss)) 259 | 260 | return losses_epochs, dev_performances_epochs, dev_predicted_classes_epochs, dev_predictions_epochs 261 | 262 | def __init__(self, config): 263 | self.config = config 264 | self.headings_placeholder = None 265 | self.bodies_placeholder = None 266 | self.headings_lengths_placeholder = None 267 | self.bodies_lengths_placeholder = None 268 | self.labels_placeholder = None 269 | self.dropout_placeholder = None 270 | self.build() -------------------------------------------------------------------------------- /code/README.md: -------------------------------------------------------------------------------- 1 | CS 224n Project Directory 2 | 3 | Winter 2017 4 | 5 | Stephen Pfohl 6 | Ferdinand Legros 7 | Oskar Triebe 8 | 9 | Model Files: 10 | our_model_config.py 11 | contains abstract model class to be extended by other models. Is based off of the model classes used in the course assignments. 
12 | bow_model_config.py 13 | Bag of words model class that extends our_model_config.py 14 | basicLSTM_model_config.py 15 | model class for the basic LSTM model that operates on the concatenated input 16 | LSTM_attention.py 17 | model class for the LSTM model that has been augmented by attention 18 | LSTM_conditional.py 19 | model class for the LSTM with attention and conditional encoding 20 | 21 | Model Execution Files 22 | execute_bow_config 23 | script that executes a single experiment of the bag of words model for a given set of parameters 24 | execute_lstm_config.py 25 | script that executes a single experiment of the basic LSTM model for a given set of parameters 26 | execute_lstm_attention.py 27 | script that executes a single experiment of the lstm model that has been augmented by attention for a given set of parameters 28 | execute_lstm_conditional.py 29 | script that executes a single experiment of the LSTM model with conditional encoding and attention for a given set of parameters for a given set of parameters 30 | 31 | Utility Files 32 | our_util.py 33 | Utility functions for use in other files. Based on the example of the util.py files provided in course assignments. 34 | run_text_processing.py 35 | File that performas tokenization, loads the data, etc 36 | 37 | Runtime scripts 38 | test_script6.py 39 | Allows the user to define a set of experiments for any of the models described above. 40 | 41 | fnc_baseline directory 42 | Required and provided by the competition organizers at https://github.com/FakeNewsChallenge/fnc-1-baseline 43 | Not included with this submission due to size constraints 44 | 45 | Plotting 46 | Contains .Rmd files for plotting -------------------------------------------------------------------------------- /code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ourownstory/stance_detection/df9f34883005d203cb6d8ee7d56d9bc82ef39b62/code/__init__.py -------------------------------------------------------------------------------- /code/basicLSTM_model_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | ###### 5 | # Model class for Baseline_LSTM 6 | # Based on starter code from PS3-CS224n 7 | ###### 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | 11 | import argparse 12 | import sys 13 | import time 14 | import logging 15 | from datetime import datetime 16 | 17 | import tensorflow as tf 18 | import numpy as np 19 | 20 | from our_util import Progbar, minibatches, get_performance, softmax 21 | from our_model_config import OurModel 22 | 23 | logger = logging.getLogger("hw3.q3") 24 | logger.setLevel(logging.DEBUG) 25 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) 26 | 27 | class BaselineLSTM(OurModel): 28 | 29 | def add_placeholders(self): 30 | """Generates placeholder variables to represent the input tensors 31 | MODIF: OVERWRITING 32 | """ 33 | self.inputs_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.max_length), name = "x") 34 | self.labels_placeholder = tf.placeholder(tf.int64, shape=(None), name = "y") 35 | self.seqlen_placeholder = tf.placeholder(tf.int64, shape=(None), name = "seqlen") 36 | self.dropout_placeholder = tf.placeholder(tf.float64, name = 'dropout') 37 | 38 | def create_feed_dict(self, inputs_batch, seqlen_batch, labels_batch = None, dropout = 1.0): 39 | """Creates the feed_dict for 
the model. 40 | MODIF: OVERWRITING 41 | """ 42 | feed_dict = { 43 | self.inputs_placeholder: inputs_batch, 44 | } 45 | if labels_batch is not None: 46 | feed_dict[self.labels_placeholder] = labels_batch 47 | if dropout is not None: 48 | feed_dict[self.dropout_placeholder] = dropout 49 | feed_dict[self.seqlen_placeholder] = seqlen_batch 50 | return feed_dict 51 | 52 | def add_prediction_op(self): 53 | """ 54 | Returns: 55 | preds: tf.Tensor of shape (batch_size, 1) 56 | """ 57 | 58 | if self.config.n_layers <= 1: 59 | print('layers = ', self.config.n_layers) 60 | cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 61 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob = self.dropout_placeholder) 62 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 63 | U = tf.get_variable(name = 'U', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 64 | b = tf.get_variable(name = 'b', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 65 | 66 | x = self.add_embedding(option = self.config.trainable_embeddings) 67 | rnnOutput = tf.nn.dynamic_rnn(cell, inputs = x, dtype = tf.float64, sequence_length = self.seqlen_placeholder) #MODIF 68 | finalState = rnnOutput[1][1] # batch_size, cell.state_size 69 | preds = tf.matmul(finalState, U) + b # batch_size, n_classes 70 | # print('Predict op: preds', preds) 71 | elif self.config.n_layers > 1: # MODIF 72 | cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 73 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob = self.dropout_placeholder) 74 | stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell] * self.config.n_layers) 75 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 76 | U = tf.get_variable(name = 'U', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 77 | b = tf.get_variable(name = 'b', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 78 | x = self.add_embedding(option = self.config.trainable_embeddings) 79 | rnnOutput = tf.nn.dynamic_rnn(stacked_lstm, inputs = x, dtype = tf.float64, sequence_length = self.seqlen_placeholder) #MODIF 80 | print('layers = ', self.config.n_layers) 81 | finalState = rnnOutput[1][self.config.n_layers - 1][1] # batch_size, cell.state_size 82 | preds = tf.matmul(finalState, U) + b # batch_size, n_classes 83 | return preds 84 | 85 | def add_embedding(self, option = 'Constant'): 86 | """Adds an embedding layer that maps from input tokens (integers) to vectors and then 87 | concatenates those vectors" 88 | 89 | Returns: 90 | embeddings: tf.Tensor of shape (None, max_length, n_features*embed_size) 91 | """ 92 | # option = config.trainable_embeddings 93 | if option == 'Variable': 94 | embeddings_temp = tf.nn.embedding_lookup(params = tf.Variable(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 95 | elif option == 'Constant': 96 | embeddings_temp = tf.nn.embedding_lookup(params = tf.constant(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 97 | embeddings = tf.reshape(embeddings_temp, shape = (-1, self.config.max_length, self.config.embed_size)) 98 | ### END YOUR CODE 99 | return embeddings 100 | 101 | def train_on_batch(self, sess, inputs_batch, labels_batch, seqlen_batch): 102 | """ 103 | MODIF 104 | Perform one step of gradient descent on the provided batch of data. 
105 | 106 | Args: 107 | sess: tf.Session() 108 | input_batch: np.ndarray of shape (n_samples, n_features) # CHECK: np.ndarray?? 109 | labels_batch: np.ndarray of shape (n_samples, n_classes) 110 | labels_batch: np.array of shape (n_samples) 111 | Returns: 112 | loss: loss over the batch (a scalar) 113 | """ 114 | labels_batch = np.reshape(labels_batch, (-1, 1)) 115 | feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch, seqlen_batch = seqlen_batch, dropout = self.config.dropout) # MODIF 116 | print(inputs_batch.shape) 117 | print(len(labels_batch)) 118 | _, loss = sess.run([self.train_op, self.loss], feed_dict=feed) 119 | return loss 120 | 121 | def predict_on_batch(self, sess, inputs_batch, seqlen_batch): 122 | """Make predictions for the provided batch of data  123 | 124 | Args: 125 | sess: tf.Session() 126 | input_batch: np.ndarray of shape (n_samples, n_features) 127 | Returns: 128 | predictions: np.ndarray of shape (n_samples, n_classes) 129 | """ 130 | feed = self.create_feed_dict(inputs_batch, seqlen_batch) 131 | predictions = sess.run(self.pred, feed_dict=feed) 132 | return predictions 133 | 134 | def run_epoch(self, sess, train): 135 | prog = Progbar(target=1 + int(len(train) / self.config.batch_size)) 136 | losses = [] 137 | for i, batch in enumerate(minibatches(train, self.config.batch_size)): 138 | loss = self.train_on_batch(sess, *batch) 139 | losses.append(loss) 140 | # grad_norms.append(grad_norm) 141 | prog.update(i + 1, [("train loss", loss)]) 142 | return losses 143 | 144 | def fit(self, sess, train, dev_data_np, dev_seqlen, dev_labels): # MODIF # CAREFUL DEV/dev 145 | ''' 146 | Returns LISTS: 147 | - losses_epochs 148 | - dev_performances_epochs 149 | - dev_predictions_epochs 150 | - dev_predicted_classes_epochs 151 | ''' 152 | losses_epochs = [] #M 153 | dev_performances_epochs = [] # MODIF 154 | dev_predictions_epochs = [] #M 155 | dev_predicted_classes_epochs = [] #M 156 | for epoch in range(self.config.n_epochs): 157 | logger.info("Epoch %d out of %d", epoch + 1, self.config.n_epochs) 158 | loss = self.run_epoch(sess, train) 159 | 160 | # Computing predictions # MODIF 161 | dev_predictions = self.predict_on_batch(sess, dev_data_np, dev_seqlen) #OUCH 162 | 163 | # Computing development performance #MODIF 164 | dev_predictions = softmax(np.array(dev_predictions)) 165 | dev_predicted_classes = np.argmax(dev_predictions, axis = 1) 166 | dev_performance = get_performance(dev_predicted_classes, dev_labels, n_classes = 4) 167 | 168 | # Adding to global outputs #MODIF 169 | dev_predictions_epochs.append(dev_predictions) 170 | dev_predicted_classes_epochs.append(dev_predicted_classes) 171 | dev_performances_epochs.append(dev_performance) 172 | losses_epochs.append(loss) 173 | 174 | return losses_epochs, dev_performances_epochs, dev_predicted_classes_epochs, dev_predictions_epochs 175 | 176 | def build(self): 177 | self.add_placeholders() 178 | self.pred = self.add_prediction_op() 179 | self.loss = self.add_loss_op(self.pred) 180 | self.train_op = self.add_training_op(self.loss) 181 | 182 | def __init__(self, config): 183 | self.config = config 184 | self.inputs_placeholder = None 185 | self.labels_placeholder = None 186 | self.seqlen_placeholder = None 187 | self.dropout_placeholder = None 188 | self.build() -------------------------------------------------------------------------------- /code/bow_model_config.py: -------------------------------------------------------------------------------- 1 | ###### 2 | # basic BOW model with architecture extendable to 
more complex LSTM models which use both headings and bodies separately. 3 | ###### 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | from our_model_config import OurModel 9 | from our_util import Progbar, minibatches, get_performance, softmax 10 | 11 | class BOWModel(OurModel): 12 | 13 | def add_placeholders(self): 14 | """Generates placeholder variables to represent the input tensors 15 | """ 16 | self.headings_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.h_max_len), name="headings") 17 | self.bodies_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.b_max_len), name="bodies") 18 | self.headings_lengths_placeholder = tf.placeholder(tf.float64, shape=(None), name="headings_lengths") 19 | self.bodies_lengths_placeholder = tf.placeholder(tf.float64, shape=(None), name="bodies_lengths") 20 | self.labels_placeholder = tf.placeholder(tf.int64, shape=(None), name="labels") 21 | 22 | def create_feed_dict(self, headings_batch, bodies_batch, headings_lengths_batch, bodies_lengths_batch, labels_batch=None): 23 | """Creates the feed_dict for the model. 24 | """ 25 | feed_dict = { 26 | self.headings_placeholder: headings_batch, 27 | self.bodies_placeholder: bodies_batch, 28 | self.headings_lengths_placeholder: headings_lengths_batch, 29 | self.bodies_lengths_placeholder: bodies_lengths_batch, 30 | } 31 | if labels_batch is not None: 32 | feed_dict[self.labels_placeholder] = labels_batch 33 | return feed_dict 34 | 35 | def add_embedding(self, option = 'Constant'): 36 | """Adds an embedding layer that maps from input tokens (integers) to vectors for both the headings and bodies: 37 | 38 | Returns: 39 | embeddings_headings: tf.Tensor of shape (None, h_max_len, embed_size) 40 | embeddings_bodies: tf.Tensor of shape (None, b_max_len, embed_size) 41 | """ 42 | # 43 | # embeddings_headings_temp = tf.nn.embedding_lookup(params = tf.Constant(self.config.pretrained_embeddings), ids = self.headings_placeholder) 44 | # embeddings_bodies_temp = tf.nn.embedding_lookup(params = tf.Constant(self.config.pretrained_embeddings), ids = self.bodies_placeholder) 45 | embeddings_headings_temp = tf.nn.embedding_lookup(params = self.config.pretrained_embeddings, ids = self.headings_placeholder) 46 | embeddings_bodies_temp = tf.nn.embedding_lookup(params = self.config.pretrained_embeddings, ids = self.bodies_placeholder) 47 | embeddings_headings = tf.reshape(embeddings_headings_temp, shape = (-1, self.config.h_max_len, self.config.embed_size)) 48 | embeddings_bodies = tf.reshape(embeddings_bodies_temp, shape = (-1, self.config.b_max_len, self.config.embed_size)) 49 | return embeddings_headings, embeddings_bodies 50 | 51 | def add_bow_input(self): 52 | headings, bodies = self.add_embedding(option = self.config.trainable_embeddings) 53 | headings_bag = tf.divide(tf.reduce_sum(headings, axis=1), tf.reshape(self.headings_lengths_placeholder, shape = (-1, 1))) 54 | bodies_bag = tf.divide(tf.reduce_sum(bodies, axis=1), tf.reshape(self.bodies_lengths_placeholder, shape = (-1, 1))) 55 | x = tf.concat_v2(values=[headings_bag, bodies_bag], axis=1) 56 | return x 57 | 58 | def add_prediction_op(self): 59 | """Runs an rnn on the input using TensorFlows's 60 | @tf.nn.dynamic_rnn function, and returns the final state as a prediction. 
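        Note: in this BOW model the prediction is computed by a feed-forward stack of fully
        connected ReLU layers (depth chosen by config.n_layers) applied to the bag-of-words
        input from add_bow_input, i.e. the length-normalized sums of the headline and body
        word embeddings concatenated together; no RNN is involved despite the summary above.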
61 | 62 | Returns: 63 | logits: tf.Tensor of shape (batch_size, n_classes) 64 | """ 65 | hidden_size_2 = np.floor(self.config.hidden_next**2 * self.config.hidden_size) 66 | hidden_size_3 = np.floor(self.config.hidden_next**3 * self.config.hidden_size) 67 | hidden_size_4 = np.floor(self.config.hidden_next**4 * self.config.hidden_size) 68 | hidden_size_5 = np.floor(self.config.hidden_next**5 * self.config.hidden_size) 69 | 70 | x = self.add_bow_input() 71 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 72 | if not self.config.n_layers: 73 | W = tf.get_variable(name = 'W', shape = (2*self.config.embed_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 74 | c = tf.get_variable(name = 'c', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 75 | pred = tf.matmul(x, W) + c # batch_size, n_classes 76 | elif self.config.n_layers == 1: 77 | U0 = tf.get_variable(name = 'U0', shape = (2*self.config.embed_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 78 | c0 = tf.get_variable(name = 'c0', shape = (self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 79 | h1 = tf.nn.relu(tf.matmul(x, U0) + c0) # batch_size, hidden_size 80 | U1 = tf.get_variable(name = 'U1', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 81 | c1 = tf.get_variable(name = 'c1', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 82 | pred = tf.matmul(h1, U1) + c1 # batch_size, n_classes 83 | elif self.config.n_layers == 2: 84 | U0 = tf.get_variable(name = 'U0', shape = (2*self.config.embed_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 85 | c0 = tf.get_variable(name = 'c0', shape = (self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 86 | h1 = tf.nn.relu(tf.matmul(x, U0) + c0) # batch_size, hidden_size 87 | U1 = tf.get_variable(name = 'U1', shape = (self.config.hidden_size, hidden_size_2), initializer = theInitializer, dtype = tf.float64) 88 | c1 = tf.get_variable(name = 'c1', shape = (hidden_size_2), initializer = theInitializer, dtype = tf.float64) 89 | h2 = tf.nn.relu(tf.matmul(h1, U1) + c1) # batch_size, hidden_size_2 90 | U2 = tf.get_variable(name = 'U2', shape = (hidden_size_2, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 91 | c2 = tf.get_variable(name = 'c2', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 92 | pred = tf.matmul(h2, U2) + c2 # batch_size, n_classes 93 | elif self.config.n_layers == 3: 94 | U0 = tf.get_variable(name = 'U0', shape = (2*self.config.embed_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 95 | c0 = tf.get_variable(name = 'c0', shape = (self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 96 | h1 = tf.nn.relu(tf.matmul(x, U0) + c0) # batch_size, hidden_size 97 | U1 = tf.get_variable(name = 'U1', shape = (self.config.hidden_size, hidden_size_2), initializer = theInitializer, dtype = tf.float64) 98 | c1 = tf.get_variable(name = 'c1', shape = (hidden_size_2), initializer = theInitializer, dtype = tf.float64) 99 | h2 = tf.nn.relu(tf.matmul(h1, U1) + c1) # batch_size, hidden_size_2 100 | U2 = tf.get_variable(name = 'U2', shape = (hidden_size_2, hidden_size_3), initializer = theInitializer, dtype = tf.float64) 101 | c2 = tf.get_variable(name = 'c2', shape = (hidden_size_3), initializer 
= theInitializer, dtype = tf.float64) 102 | h3 = tf.nn.relu(tf.matmul(h2, U2) + c2) # batch_size, hidden_size_3 103 | U3 = tf.get_variable(name = 'U3', shape = (hidden_size_3, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 104 | c3 = tf.get_variable(name = 'c3', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 105 | pred = tf.matmul(h3, U3) + c3 # batch_size, n_classes 106 | elif self.config.n_layers == 4: 107 | U0 = tf.get_variable(name = 'U0', shape = (2*self.config.embed_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 108 | c0 = tf.get_variable(name = 'c0', shape = (self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 109 | h1 = tf.nn.relu(tf.matmul(x, U0) + c0) # batch_size, hidden_size 110 | U1 = tf.get_variable(name = 'U1', shape = (self.config.hidden_size, hidden_size_2), initializer = theInitializer, dtype = tf.float64) 111 | c1 = tf.get_variable(name = 'c1', shape = (hidden_size_2), initializer = theInitializer, dtype = tf.float64) 112 | h2 = tf.nn.relu(tf.matmul(h1, U1) + c1) # batch_size, hidden_size_2 113 | U2 = tf.get_variable(name = 'U2', shape = (hidden_size_2, hidden_size_3), initializer = theInitializer, dtype = tf.float64) 114 | c2 = tf.get_variable(name = 'c2', shape = (hidden_size_3), initializer = theInitializer, dtype = tf.float64) 115 | h3 = tf.nn.relu(tf.matmul(h2, U2) + c2) # batch_size, hidden_size_3 116 | U3 = tf.get_variable(name = 'U3', shape = (hidden_size_3, hidden_size_4), initializer = theInitializer, dtype = tf.float64) 117 | c3 = tf.get_variable(name = 'c3', shape = (hidden_size_4), initializer = theInitializer, dtype = tf.float64) 118 | h4 = tf.nn.relu(tf.matmul(h3, U3) + c3) # batch_size, hidden_size_4 119 | U4 = tf.get_variable(name = 'U4', shape = (hidden_size_4, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 120 | c4 = tf.get_variable(name = 'c4', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 121 | pred = tf.matmul(h4, U4) + c4 # batch_size, n_classes 122 | elif self.config.n_layers == 5: 123 | U0 = tf.get_variable(name = 'U0', shape = (2*self.config.embed_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 124 | c0 = tf.get_variable(name = 'c0', shape = (self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 125 | h1 = tf.nn.relu(tf.matmul(x, U0) + c0) # batch_size, hidden_size 126 | U1 = tf.get_variable(name = 'U1', shape = (self.config.hidden_size, hidden_size_2), initializer = theInitializer, dtype = tf.float64) 127 | c1 = tf.get_variable(name = 'c1', shape = (hidden_size_2), initializer = theInitializer, dtype = tf.float64) 128 | h2 = tf.nn.relu(tf.matmul(h1, U1) + c1) # batch_size, hidden_size_2 129 | U2 = tf.get_variable(name = 'U2', shape = (hidden_size_2, hidden_size_3), initializer = theInitializer, dtype = tf.float64) 130 | c2 = tf.get_variable(name = 'c2', shape = (hidden_size_3), initializer = theInitializer, dtype = tf.float64) 131 | h3 = tf.nn.relu(tf.matmul(h2, U2) + c2) # batch_size, hidden_size_3 132 | U3 = tf.get_variable(name = 'U3', shape = (hidden_size_3, hidden_size_4), initializer = theInitializer, dtype = tf.float64) 133 | c3 = tf.get_variable(name = 'c3', shape = (hidden_size_4), initializer = theInitializer, dtype = tf.float64) 134 | h4 = tf.nn.relu(tf.matmul(h3, U3) + c3) # batch_size, hidden_size_4 135 | U4 = tf.get_variable(name = 'U4', shape = (hidden_size_4, hidden_size_5), initializer = 
theInitializer, dtype = tf.float64) 136 | c4 = tf.get_variable(name = 'c4', shape = (hidden_size_5), initializer = theInitializer, dtype = tf.float64) 137 | h5 = tf.nn.relu(tf.matmul(h4, U4) + c4) # batch_size, hidden_size_5 138 | U5 = tf.get_variable(name = 'U5', shape = (hidden_size_5, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 139 | c5 = tf.get_variable(name = 'c5', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 140 | pred = tf.matmul(h5, U5) + c5 # batch_size, n_classes 141 | return pred 142 | 143 | def train_on_batch(self, sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch): 144 | """Perform one step of gradient descent on the provided batch of data. 145 | Args: 146 | sess: tf.Session() 147 | headings_batch: np.ndarray of shape (n_samples, n_features) 148 | bodies_batch: np.ndarray of shape (n_samples, n_features) 149 | headings_lengths_batch: np.ndarray of shape (n_samples, 1) 150 | bodies_lengths_batch: np.ndarray of shape (n_samples, 1) 151 | labels_batch: np.ndarray of shape (n_samples, n_classes) 152 | Returns: 153 | loss: loss over the batch (a scalar) 154 | """ 155 | feed = self.create_feed_dict(h_batch, b_batch, h_len_batch, b_len_batch, y_batch) 156 | _, loss = sess.run([self.train_op, self.loss], feed_dict=feed) 157 | ## for debugging / testing 158 | if (np.isnan(loss)): 159 | print('headings', h_batch) 160 | print('bodies', b_batch) 161 | print('nh_len', h_len_batch) 162 | print('b_len', b_len_batch) 163 | print('labels', y_batch) 164 | assert(False) 165 | return loss 166 | 167 | def predict_on_batch(self, sess, h_batch, b_batch, h_len_batch, b_len_batch): 168 | """Make predictions for the provided batch of data 169 | Args: 170 | sess: tf.Session() 171 | headings_batch: np.ndarray of shape (n_samples, n_features) 172 | bodies_batch: np.ndarray of shape (n_samples, n_features) 173 | headings_lengths_batch: np.ndarray of shape (n_samples, 1) 174 | bodies_lengths_batch: np.ndarray of shape (n_samples, 1) 175 | Returns: 176 | predictions: np.ndarray of shape (n_samples, n_classes) 177 | """ 178 | feed = self.create_feed_dict(h_batch, b_batch, h_len_batch, b_len_batch) 179 | predictions = sess.run(self.pred, feed_dict=feed) 180 | return predictions 181 | 182 | def run_epoch(self, sess, h_np, b_np, h_len, b_len, y): 183 | losses = [] 184 | # shuffle 185 | ind = range(self.config.num_samples) 186 | random.shuffle(ind) 187 | # sizes 188 | batch_start = 0 189 | batch_end = 0 190 | N = self.config.batch_size 191 | num_batches = self.config.num_samples / N 192 | # run batches 193 | for i in range(num_batches): 194 | batch_start = (i*N) 195 | batch_end = (i+1)*N 196 | indices = ind[batch_start:batch_end] 197 | h_batch = h_np[indices,:] 198 | b_batch = b_np[indices,:] 199 | h_len_batch = h_len[indices] 200 | b_len_batch = b_len[indices] 201 | y_batch = y[indices] 202 | # loss 203 | loss = self.train_on_batch(sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch) 204 | losses.append(loss) 205 | # prog.update(i + 1, [("train loss", loss)]) 206 | if (i % (1 + num_batches/10)) == 0: 207 | print('batch: ', i, ', loss: ', loss) 208 | # run last smaller batch 209 | if (batch_end < self.config.num_samples): 210 | indices = ind[batch_end:] 211 | h_batch = h_np[indices,:] 212 | b_batch = b_np[indices,:] 213 | h_len_batch = h_len[indices] 214 | b_len_batch = b_len[indices] 215 | y_batch = y[indices] 216 | # loss 217 | loss = self.train_on_batch(sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch) 218 | 
losses.append(loss) 219 | print('batch: ', i, ', loss: ', loss) 220 | # print('-------last batch---------') 221 | return losses 222 | 223 | 224 | def fit(self, sess, h_np, b_np, h_len, b_len, y, dev_h, dev_b, dev_h_len, dev_b_len, dev_y): #M 225 | #losses = [] 226 | losses_epochs = [] #M 227 | dev_performances_epochs = [] # M 228 | dev_predictions_epochs = [] #M 229 | dev_predicted_classes_epochs = [] #M 230 | 231 | for epoch in range(self.config.n_epochs): 232 | print('-------new epoch---------') 233 | loss = self.run_epoch(sess, h_np, b_np, h_len, b_len, y) 234 | 235 | # Computing predictions #MODIF 236 | dev_predictions = self.predict_on_batch(sess, dev_h, dev_b, dev_h_len, dev_b_len) 237 | 238 | # Computing development performance #MODIF 239 | dev_predictions = softmax(np.array(dev_predictions)) 240 | dev_predicted_classes = np.argmax(dev_predictions, axis = 1) 241 | dev_performance = get_performance(dev_predicted_classes, dev_y, n_classes = 4) 242 | 243 | # Adding to global outputs #MODIF 244 | dev_predictions_epochs.append(dev_predictions) 245 | dev_predicted_classes_epochs.append(dev_predicted_classes) 246 | dev_performances_epochs.append(dev_performance) 247 | losses_epochs.append(loss) 248 | 249 | print('EPOCH: ', epoch, ', LOSS: ', np.mean(loss)) 250 | 251 | return losses_epochs, dev_performances_epochs, dev_predicted_classes_epochs, dev_predictions_epochs 252 | 253 | def __init__(self, config): 254 | self.config = config 255 | self.headings_placeholder = None 256 | self.bodies_placeholder = None 257 | self.headings_lengths_placeholder = None 258 | self.bodies_lengths_placeholder = None 259 | self.labels_placeholder = None 260 | self.build() -------------------------------------------------------------------------------- /code/data_analysis_plotting/Results_loading_1.R: -------------------------------------------------------------------------------- 1 | ### Results loading 2 | library(tidyverse) 3 | library(stringr) 4 | library(forcats) 5 | # help(package = 'forcats') 6 | 7 | 8 | ###Load data function################################################################# 9 | ## Paths 10 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/' 11 | 12 | 13 | ## Load data function 14 | load_data <- function(names) { 15 | num <- length(names) 16 | data <- read_csv(str_c(path_res, names[1])) 17 | if (num > 1) { 18 | for (i in 2:num) { 19 | data <- data %>% 20 | bind_rows(read_csv(str_c(path_res, names[i]))) 21 | } 22 | } 23 | return(data) 24 | } 25 | 26 | ## Load data combine function 27 | load_data_combine <- 28 | function(perf_names, loss_names, model_ = NULL, xp_ = NULL) { 29 | perf <- load_data(perf_names) %>% 30 | mutate(epoch = epoch + 1) 31 | loss <- load_data(loss_names) %>% 32 | rename(train_loss = loss) 33 | data <- 34 | inner_join(perf, loss) %>% 35 | mutate( 36 | model = model_, 37 | xp = xp_, 38 | downsample = FALSE) %>% 39 | select(model, xp, downsample, everything()) 40 | return(data) 41 | } 42 | 43 | 44 | #####LSTM Experiment 01########################################################################### 45 | ## Experiment 01 46 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/' 47 | 48 | # Sensitivity Analysis over max_length and n_layers, without downsampling. 49 | # Naive data splitting testing script: test_script3_config.py 50 | # 51 | # Sensitivity Analysis over max_length and n_layers, with downsampling. 52 | # Downsample first and then split data. Naive data splitting wrt headlines. 
53 | # testing script: test_script4_config.py 54 | 55 | 56 | ## Data 01 57 | 58 | # {r 1 No Downsampling max_length} 59 | ### No Downsampling 60 | ## Collect max_length XP data 61 | #max_length 62 | perf_names <- 63 | str_c('old(lstm)/', 64 | c( 65 | 'perf_148979940586.csv', 66 | 'perf_148980307237.csv', 67 | 'perf_148980957009.csv' 68 | ) 69 | ) 70 | #max_length 71 | loss_names <- 72 | str_c('old(lstm)/', 73 | c( 74 | 'losses_148979940587.csv', 75 | 'losses_148980307238.csv', 76 | 'losses_148980957009.csv' 77 | ) 78 | ) 79 | 80 | ## Read data #max_length 81 | perf <- load_data(perf_names) %>% 82 | mutate(epoch = epoch + 1) 83 | loss <- load_data(loss_names) %>% 84 | rename(train_loss = loss) 85 | 86 | results1 <- 87 | inner_join(perf, loss) %>% 88 | mutate(xp = 'max_length', 89 | downsample = FALSE) %>% 90 | select(xp, downsample, everything()) 91 | 92 | # unique(perf$max_length) 93 | # unique(loss$max_length) 94 | 95 | 96 | 97 | 98 | # {r 2 No Downsampling n_layers} 99 | ### No Downsampling 100 | ## Collect n_layers data 101 | #n_layers 102 | perf_names <- 103 | str_c('old(lstm)/', 104 | c( 105 | 'perf_148982227549.csv', 106 | 'perf_148981534989.csv', 107 | 'perf_148981166478.csv' 108 | ) 109 | ) 110 | #n_layers 111 | loss_names <- 112 | str_c('old(lstm)/', 113 | c( 114 | 'losses_148973251886.csv', 115 | 'losses_148973620163.csv', 116 | 'losses_148974313682.csv' 117 | ) 118 | ) 119 | 120 | ## Read data #n_layers 121 | perf <- load_data(perf_names) %>% 122 | mutate(epoch = epoch + 1) 123 | loss <- load_data(loss_names) %>% 124 | rename(train_loss = loss) 125 | 126 | results2 <- 127 | inner_join(perf, loss) %>% 128 | mutate(xp = 'n_layers', 129 | downsample = FALSE) %>% 130 | select(xp, downsample, everything()) 131 | 132 | # unique(perf$max_length) 133 | # unique(loss$max_length) 134 | 135 | 136 | 137 | 138 | # {r 3 With Downsampling max_length} 139 | ### With Downsampling 140 | ## Collect max_length XP data 141 | #max_length 142 | perf_names <- 143 | str_c('old(lstm)/', 144 | c( 145 | 'perf_148978975462.csv', 146 | 'perf_148978680439.csv', 147 | 'perf_148978512014.csv' 148 | ) 149 | ) 150 | #max_length 151 | loss_names <- 152 | str_c('old(lstm)/', 153 | c( 154 | 'losses_148978975462.csv', 155 | 'losses_148978680439.csv', 156 | 'losses_148978512015.csv' 157 | ) 158 | ) 159 | 160 | ## Read data #max_length 161 | perf <- load_data(perf_names) %>% 162 | mutate(epoch = epoch + 1) 163 | loss <- load_data(loss_names) %>% 164 | rename(train_loss = loss) 165 | 166 | results3 <- 167 | inner_join(perf, loss) %>% 168 | mutate(xp = 'max_length', 169 | downsample = TRUE) %>% 170 | select(xp, downsample, everything()) 171 | 172 | # unique(perf$max_length) 173 | # unique(loss$max_length) 174 | 175 | # results3 <- 176 | # anti_join(perf, loss) 177 | 178 | 179 | 180 | 181 | # {r 4 With Downsampling n_layers} 182 | ### With Downsampling 183 | ## Collect n_layers data 184 | #n_layers 185 | perf_names <- 186 | str_c('old(lstm)/', 187 | c( 188 | 'perf_148979553502.csv', 189 | 'perf_148979239173.csv', 190 | 'perf_148979071576.csv' 191 | ) 192 | ) 193 | #n_layers 194 | loss_names <- 195 | str_c('old(lstm)/', 196 | c( 197 | 'losses_148979071576.csv', 198 | 'losses_148979239173.csv', 199 | 'losses_148979553502.csv' 200 | ) 201 | ) 202 | 203 | ## Read data #n_layers 204 | perf <- load_data(perf_names) %>% 205 | mutate(epoch = epoch + 1) 206 | loss <- load_data(loss_names) %>% 207 | rename(train_loss = loss) 208 | 209 | results4 <- 210 | inner_join(perf, loss) %>% 211 | mutate(xp = 'n_layers', 212 | downsample = 
TRUE) %>% 213 | select(xp, downsample, everything()) 214 | 215 | # unique(perf$max_length) 216 | # unique(loss$max_length) 217 | 218 | 219 | 220 | 221 | ##Check all 222 | # print('1 No Downsampling max_length') 223 | # sapply(results1 %>% select(1:12), unique) 224 | # print('2 No Downsampling n_layers') 225 | # sapply(results2 %>% select(1:12), unique) 226 | # print('3 With Downsampling max_length') 227 | # sapply(results3 %>% select(1:12), unique) 228 | # print('4 With Downsampling n_layers') 229 | # sapply(results4 %>% select(1:12), unique) 230 | 231 | ##Combine all 232 | results_lstm1 <- bind_rows(results1, results2, results3, results4) 233 | 234 | # results_lstm1 %>% write_rds(str_c(path_res, 'old(lstm)/', 'results_lstm1.rds')) 235 | 236 | 237 | #####LSTM Experiment 02########################################################################### 238 | ### Experiment 02 239 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/basiclstm/wrangled/' 240 | 241 | perf_names_drop <- 242 | c( 243 | 'perf_148991935973_drop.csv', 244 | 'perf_148992159916_drop.csv', 245 | 'perf_148992383928_drop.csv'#, 246 | # 'perf_148992558606_maxl.csv' 247 | ) 248 | 249 | perf_names_maxl <- 250 | c( 251 | # 'perf_148991935973_drop.csv', 252 | # 'perf_148992159916_drop.csv', 253 | # 'perf_148992383928_drop.csv'#, 254 | 'perf_148992558606_maxl.csv' 255 | ) 256 | 257 | loss_names_drop <- 258 | c( 259 | 'losses_148991935973_drop.csv', 260 | 'losses_148992159916_drop.csv', 261 | 'losses_148992383928_drop.csv'#, 262 | # 'losses_148992558606_maxl.csv', 263 | # 'losses_148992694411_maxl.csv' 264 | ) 265 | 266 | loss_names_maxl <- 267 | c( 268 | # 'losses_148991935973_drop.csv', 269 | # 'losses_148992159916_drop.csv', 270 | # 'losses_148992383928_drop.csv', 271 | 'losses_148992558606_maxl.csv', 272 | 'losses_148992694411_maxl.csv' 273 | ) 274 | 275 | ## Read data #max_length, dropout 276 | perf_drop <- load_data(perf_names_drop) %>% 277 | mutate(epoch = epoch + 1) %>% 278 | mutate(xp = 'dropout') 279 | perf_maxl <- load_data(perf_names_maxl) %>% 280 | mutate(epoch = epoch + 1) %>% 281 | mutate(xp = 'max_length') 282 | # perf <- bind_rows(perf_drop, 283 | 284 | loss_drop <- load_data(loss_names_drop) %>% 285 | rename(train_loss = loss) 286 | 287 | loss_maxl <- load_data(loss_names_maxl) %>% 288 | rename(train_loss = loss) 289 | 290 | results_drop <- 291 | inner_join(perf_drop, loss_drop) %>% 292 | mutate(#xp = 'max_length', 293 | downsample = FALSE) %>% 294 | select(xp, downsample, everything()) 295 | 296 | results_maxl <- 297 | inner_join(perf_maxl, loss_maxl) %>% 298 | mutate(#xp = 'max_length', 299 | downsample = FALSE) %>% 300 | select(xp, downsample, everything()) 301 | 302 | 303 | ## add baselines for other experiments 304 | # base1 <- 305 | # results_lstm1 %>% 306 | # filter(xp == 'max_length') %>% 307 | # filter(max_length == 150) %>% 308 | # mutate(xp = 'base_150') 309 | 310 | base_drop <- 311 | results_lstm1 %>% 312 | filter(xp == 'max_length') %>% 313 | filter(max_length == 75) %>% 314 | mutate(xp = 'dropout') 315 | 316 | ### final results of Exp 2 317 | results_lstm2 <- 318 | bind_rows(results_drop, results_maxl, base_drop) 319 | 320 | # unique(perf$max_length) 321 | # unique(loss$max_length) 322 | 323 | #####Combine LSTM data################################################################# 324 | ## Combine LSTM data 325 | 326 | 327 | results_lstm <- bind_rows(results_lstm1, results_lstm2) %>% 328 | mutate(model = 'Basic LSTM') %>% 329 | select(model, xp, everything()) 330 | 331 | # results_lstm %>% 
write_rds(str_c(path_res, 'results_lstm.rds')) 332 | 333 | #####Attention_data 01########################################################################### 334 | ## Attention_data 335 | 336 | ## Paths 337 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/attention/wrangled/' 338 | 339 | ### Experiments 340 | perf_names <- c( 341 | 'perf_148990711237_base.csv' 342 | ) 343 | loss_names <- c( 344 | 'losses_148990711238_base.csv' 345 | ) 346 | att_base <- load_data_combine(perf_names, loss_names, 347 | model_ = 'attention', xp_ = 'base150') 348 | 349 | perf_names <- c( 350 | 'perf_148990459067_maxl.csv', 351 | 'perf_148990711237_maxl.csv', 352 | 'perf_148991552302_maxl.csv', 353 | 'perf_148992831854_maxl.csv' 354 | ) 355 | loss_names <- c( 356 | 'losses_148990459067_maxl.csv', 357 | 'losses_148990711238_maxl.csv', 358 | 'losses_148991552302_maxl.csv', 359 | 'losses_148992831855_maxl.csv' 360 | ) 361 | att_maxl <- load_data_combine(perf_names, loss_names, 362 | model_ = 'attention', xp_ = 'max_length') 363 | 364 | perf_names <- c( 365 | 'perf_148990711237_att.csv', 366 | 'perf_148993077478_att.csv', 367 | 'perf_148993325155_att.csv', 368 | 'perf_148989809876_att.csv' 369 | ) 370 | loss_names <- c( 371 | 'losses_148990711238_att.csv', 372 | 'losses_148993077478_att.csv', 373 | 'losses_148993325156_att.csv', 374 | 'losses_148989809876_att.csv' 375 | ) 376 | att_att <- load_data_combine(perf_names, loss_names, 377 | model_ = 'attention', xp_ = 'attention_length') 378 | 379 | perf_names <- c( 380 | 'perf_148990711237_lr.csv', 381 | 'perf_148993571229_lr.csv', 382 | 'perf_148993821388_lr.csv' 383 | ) 384 | loss_names <- c( 385 | 'losses_148990711238_lr.csv', 386 | 'losses_148993571229_lr.csv', 387 | 'losses_148993821388_lr.csv' 388 | ) 389 | att_lr <- load_data_combine(perf_names, loss_names, 390 | model_ = 'attention', xp_ = 'lr') 391 | 392 | ### att_att has 40 rows too much!! 
because: max(att_att$n_epochs) is 50 ## 393 | results_att1 <- bind_rows(#att_base, 394 | att_maxl, att_att, att_lr) 395 | 396 | # results_att %>% write_rds(str_c(path_res, 'results_attention.rds')) 397 | 398 | 399 | #####Attention_data 02, Combine########################################################################### 400 | ## Attention_data 401 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/attention/wrangled/' 402 | perf_names <- c( 403 | 'perf_148998721083_nlay.csv', 404 | 'perf_148997977122_nlay.csv' 405 | ) 406 | 407 | results_att2 <- load_data(perf_names) %>% 408 | mutate(epoch = epoch + 1) %>% 409 | mutate(model = 'attention', xp = 'n_layers') %>% 410 | mutate(downsample = FALSE) 411 | 412 | results_att2 <- bind_rows(results_att2, (att_base %>% mutate(xp = 'n_layers'))) 413 | 414 | results_att <- bind_rows(results_att2, results_att1) %>% 415 | mutate(model = 'Attention LSTM') 416 | # results_att %>% write_rds(str_c(path_res, 'results_attention.rds')) 417 | 418 | 419 | #####Conditional ########################################################################### 420 | ### Conditional Data 421 | 422 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/conditional/wrangled/' 423 | perf_names <- c( 424 | 'perf_148995686583_max75.csv', 425 | 'perf_148996029453_max150.csv', 426 | 'perf_148996505757_max300.csv' 427 | ) 428 | 429 | results_cond1 <- load_data(perf_names) %>% 430 | mutate(epoch = epoch + 1, 431 | max_length = b_max_len, 432 | xp = 'max_length') 433 | 434 | 435 | perf_names <- c( 436 | 'perf_148996029453_max150.csv', 437 | 'perf_149000293039_nlay.csv', 438 | 'perf_14899932587_nlay.csv' 439 | ) 440 | 441 | results_cond2 <- load_data(perf_names) %>% 442 | mutate(epoch = epoch + 1, 443 | max_length = b_max_len, 444 | xp = 'n_layers') 445 | 446 | results_cond <- bind_rows(results_cond1, results_cond2) %>% 447 | mutate(model = 'CEA LSTM') 448 | 449 | 450 | # results_cond %>% write_rds(str_c('C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/', 451 | # 'results_cond.rds')) 452 | 453 | 454 | #####BOW Data#################################################################### 455 | 456 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/bow/wrangled/' 457 | 458 | perf_names <- c( 459 | 'perf_14899707540.csv', 460 | 'perf_148996933874.csv', 461 | 'perf_148996941198.csv', 462 | 'perf_148996948691.csv', 463 | 'perf_148997030832.csv', 464 | 'perf_148997038112.csv', 465 | 'perf_148997045544.csv', 466 | 'perf_148997052838.csv', 467 | 'perf_148997060346.csv', 468 | 'perf_148997067842.csv', 469 | 'perf_148997083262.csv', 470 | 'perf_148997091668.csv', 471 | 'perf_148997099947.csv', 472 | 'perf_148997108384.csv', 473 | 'perf_148997116878.csv' 474 | ) 475 | 476 | results_bow <- load_data(perf_names) 477 | 478 | ## add missing variable embedding 4 layer runs for 75, 300 and 600 max_length 479 | results_bow_add <- results_bow %>% 480 | filter(trainable_embeddings == 'Constant', 481 | n_layers == 3, 482 | b_max_len %in% c(75, 300, 600)) %>% 483 | mutate(trainable_embeddings = 'Variable') 484 | results_bow <- bind_rows(results_bow, results_bow_add) 485 | 486 | ## add 150 max_len as n_layers experiment 487 | results_bow_add <- results_bow %>% 488 | filter(trainable_embeddings == 'Variable', 489 | b_max_len %in% c(150)) %>% 490 | mutate(xp = 'n_layers') 491 | results_bow <- bind_rows(results_bow, results_bow_add) 492 | 493 | ## add 4 layers as max_len experiment 494 | results_bow_add <- results_bow %>% 495 | filter(trainable_embeddings == 'Variable', 496 | n_layers == 3, 
497 | xp == 'layers') %>% 498 | mutate(xp = 'max_length') 499 | results_bow <- bind_rows(results_bow, results_bow_add) 500 | 501 | 502 | 503 | results_bow <- results_bow %>% 504 | mutate( 505 | epoch = epoch + 1, 506 | n_layers = n_layers + 1, 507 | max_length = b_max_len, 508 | model = 'BOW' 509 | ) 510 | 511 | # results_bow %>% write_rds(str_c(path_res, 'results_bow.rds')) 512 | 513 | # model_bow <- results_bow %>% select(attention_length:xp) %>% distinct %>% 514 | # arrange(model, xp, trainable_embeddings, max_length, n_layers) 515 | 516 | #####COMBINE ALL########################################################################### 517 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/' 518 | 519 | results <- bind_rows( 520 | results_lstm, 521 | results_att %>% mutate(b_max_len = NA_integer_, h_max_len = NA_integer_), 522 | results_cond %>% mutate(downsample = FALSE), 523 | results_bow %>% mutate(downsample = FALSE) 524 | ) 525 | 526 | # results %>% write_rds(str_c('C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/', 'results.rds')) 527 | 528 | ## debug 529 | # names(results_lstm) 530 | # names(results_att) 531 | # names(results_cond) 532 | # names(results_bow) 533 | 534 | # results_lstm %>% select(b_max_len) %>% head(5) 535 | # results_att %>% select(b_max_len) %>% head(5) 536 | # results_cond %>% select(b_max_len) %>% head(5) 537 | # results_bow %>% select(b_max_len) %>% head(5) 538 | 539 | # results_lstm %>% select(downsample) %>% head(5) 540 | # results_att %>% select(downsample) %>% head(5) 541 | # results_cond %>% select(downsample) %>% head(5) 542 | # results_bow %>% select(downsample) %>% head(5) 543 | 544 | #####FINAL RESULTS################################################################# 545 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/final/' 546 | 547 | results_final_bow <- load_data('perf_149004079896.csv') %>% 548 | mutate(max_length = b_max_len, 549 | model = 'BOW') 550 | results_final_lstm <- load_data('perf_149004484911.csv') %>% 551 | mutate(model = 'Basic LSTM') 552 | results_final_att <- load_data('perf_149004809705.csv') %>% 553 | mutate(model = 'Attention LSTM') 554 | results_final_cond <- load_data('perf_149005331987.csv') %>% 555 | mutate(max_length = b_max_len, 556 | model = 'CEA LSTM') 557 | 558 | results_final <- bind_rows(results_final_bow, 559 | results_final_lstm, 560 | results_final_att, 561 | results_final_cond) %>% 562 | mutate(epoch = epoch + 1) 563 | 564 | # results_final %>% distinct(model) 565 | 566 | # results_final %>% write_rds(str_c(path_res, 'results_final.rds')) 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | -------------------------------------------------------------------------------- /code/data_analysis_plotting/data_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "FakeNewsChallenge" 3 | author: "Oskar Triebe" 4 | date: "February 3, 2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ```{r, message=FALSE} 13 | library(tidyverse) 14 | library(stringr) 15 | library(modelr) 16 | ``` 17 | 18 | ## Data Loading 19 | 20 | ```{r} 21 | ### Paths 22 | # url_train_bodies <- 23 | # 'https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/train_bodies.csv' 24 | # url_train_stances <- 25 | # 'https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/train_stances.csv' 26 | # url_train_stances.random <- 
'https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/train_stances.random.csv' 27 | 28 | path_data <- 'C:/Users/OurOwnStory/Desktop/MyDrive/6_MSC/00_16-17 Winter/CS224n Natural Language Processing with Deep Learning/Project/R_Data/' 29 | path_fig <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/Figures/ggplot2/' 30 | ``` 31 | 32 | 33 | ```{r} 34 | ### save data 35 | # read_csv( 36 | # url_train_bodies, 37 | # col_names = TRUE, 38 | # col_types = cols( 39 | # `Body ID` = col_integer(), 40 | # articleBody = col_character() 41 | # ) 42 | # ) %>% 43 | # rename(body_id = `Body ID`, 44 | # body = articleBody 45 | # ) %>% 46 | # write_rds(str_c(path_data, 'bodies.rds')) 47 | # 48 | # read_csv( 49 | # url_train_stances, 50 | # col_names = TRUE, 51 | # col_types = cols( 52 | # Headline = col_character(), 53 | # `Body ID` = col_integer(), 54 | # Stance = col_character()) 55 | # ) %>% 56 | # rename(body_id = `Body ID`, 57 | # stance = Stance, 58 | # headline = Headline 59 | # ) %>% 60 | # write_rds(str_c(path_data, 'stances.rds')) 61 | # 62 | # read_csv( 63 | # url_train_stances.random, 64 | # col_names = TRUE, 65 | # col_types = cols( 66 | # Headline = col_character(), 67 | # `Body ID` = col_integer(), 68 | # Stance = col_character() 69 | # ) 70 | # ) %>% 71 | # rename(body_id = `Body ID`, 72 | # stance = Stance, 73 | # headline = Headline 74 | # ) %>% 75 | # write_rds(str_c(path_data, 'stances_random.rds')) 76 | 77 | ``` 78 | 79 | ```{r load data} 80 | bodies <- read_rds(str_c(path_data, 'bodies.rds')) 81 | stances <- read_rds(str_c(path_data, 'stances.rds')) 82 | stances_random <- read_rds(str_c(path_data, 'stances_random.rds')) 83 | 84 | data <- 85 | stances %>% 86 | inner_join(bodies, by = 'body_id') %>% 87 | select(body_id, stance, body, headline) %>% 88 | arrange(body_id, stance, headline) 89 | 90 | ## Remove doubles 91 | data <- 92 | data %>% 93 | distinct(body_id, stance, body, headline) 94 | 95 | ## String lengths 96 | data <- 97 | data %>% 98 | mutate(h_len = map_int(headline, str_length), 99 | b_len = map_int(body, str_length), 100 | h_words = map_int(headline, str_count, pattern = '[^\\w]+'), 101 | b_words = map_int(body, str_count, pattern = '[^\\w]+')) 102 | ``` 103 | 104 | ## Data Distribution 105 | 106 | 107 | ```{r} 108 | bodies_unique <- 109 | bodies %>% 110 | distinct(body) #%>% left_join(bodies, by = 'articleBody') 111 | 112 | stances_unique <- 113 | stances %>% 114 | distinct(headline) #%>% left_join(stances, by = 'Headline') 115 | 116 | n_bodies <- 117 | (bodies %>% nrow()) 118 | n_bodies_unique <- 119 | (bodies_unique %>% nrow()) 120 | 121 | n_stances_unique <- 122 | (stances_unique %>% nrow()) 123 | n_stances <- 124 | (stances %>% nrow()) 125 | 126 | n_data <- 127 | data %>% nrow() 128 | 129 | n_data_distinct <- 130 | data %>% 131 | distinct(body_id, headline, stance) %>% nrow() 132 | 133 | n_data_double <- 134 | n_data - n_data_distinct 135 | 136 | prop_bodies_unique <- 137 | n_bodies_unique / n_bodies 138 | prop_stances_unique <- 139 | n_stances_unique / n_stances 140 | 141 | ``` 142 | 143 | 144 | ```{r} 145 | str_c('Propp of unique bodies: ', prop_bodies_unique) 146 | 147 | str_c('Propp of unique headlines: ', prop_stances_unique) 148 | ``` 149 | 150 | 151 | 152 | 153 | ```{r} 154 | # Check doubles 155 | data %>% 156 | count(body_id, headline) %>% filter(n > 1) 157 | 158 | #Check match 159 | stances %>% 160 | anti_join(bodies, by = 'body_id') 161 | 162 | #Check unique id 163 | bodies %>% count(body_id) %>% filter(n > 1) 164 | 165 | ## 402 double entries 166 | 
headline_double <- 167 | stances %>% 168 | count(body_id, headline, stance) %>% 169 | filter(n > 1) 170 | 171 | n_data_double == headline_double %>% nrow() 172 | 173 | ``` 174 | 175 | 176 | 177 | ```{r} 178 | data %>% count(stance) %>% mutate(prop = n / n_data) 179 | ``` 180 | 181 | 182 | ## Lengths of bodies and headlines 183 | 184 | ```{r} 185 | print(str_c('Median headline words: ', median(data$h_words))) 186 | summary(data$h_words) 187 | print(str_c('Median body words: ', median(data$b_words))) 188 | summary(data$b_words) 189 | ``` 190 | 191 | 192 | ```{r} 193 | head_distr <- 194 | data %>% 195 | ggplot() + 196 | geom_ref_line(v = median(data$h_words), 197 | colour = 'grey70') + 198 | geom_histogram(aes(h_words), binwidth = 1) + 199 | labs(title = 'Article Headline Length Distribution', 200 | subtitle = 'The median headline has 10 words (first quartile 8, thrid quartile 13, minimum 1, maximum 40).', 201 | x = 'Number of Words', y = 'Count', 202 | caption = 'based on full dataset fnc-1 from FakeNewsChallenge.org') 203 | head_distr 204 | ggsave(plot = head_distr, filename = str_c('head_distr', '.png'), 205 | width = 7, height = 3, dpi = 900, units = 'in', 206 | path = path_fig, device = 'png') 207 | ``` 208 | 209 | ```{r} 210 | body_distr <- 211 | data %>% 212 | ggplot() + 213 | geom_ref_line(v = median(data$b_words), 214 | colour = 'grey70') + 215 | geom_histogram(aes(b_words), binwidth = 50) + 216 | scale_x_continuous(limits = c(-100, 2500)) + 217 | labs(title = 'Article Body Length Distribution', 218 | subtitle = 'The body headline has 315 words (first quartile 206, thrid quartile 477, minimum 3, maximum 4937).', 219 | x = 'Number of Words', y = 'Count', 220 | caption = 'based on full dataset fnc-1 from FakeNewsChallenge.org') 221 | body_distr 222 | ggsave(plot = body_distr, filename = 'body_distr.png', 223 | width = 7, height = 3, dpi = 900, units = 'in', 224 | path = path_fig, device = 'png') 225 | ``` 226 | 227 | 228 | 229 | 230 | ## Distribution among bodies and headlines 231 | 232 | ```{r} 233 | # Distribution of stances among bodies 234 | distr_body_stance <- 235 | data %>% 236 | group_by(body_id) %>% 237 | count(stance) %>% 238 | spread(key = stance, value = n, fill = 0) %>% 239 | gather(key = stance, value = n, c(agree, disagree, discuss, unrelated)) %>% 240 | mutate(prop = n / sum(n)) 241 | 242 | # Distribution of stances among headlines 243 | distr_headline_stance <- 244 | data %>% 245 | group_by(headline) %>% 246 | count(stance) %>% 247 | spread(key = stance, value = n, fill = 0) %>% 248 | gather(key = stance, value = n, c(agree, disagree, discuss, unrelated)) %>% 249 | mutate(prop = n / sum(n)) 250 | 251 | # Number of uses of each body 252 | distr_body <- 253 | distr_body_stance %>% 254 | group_by(body_id) %>% 255 | count(wt = n) 256 | 257 | # Number of uses of each headline 258 | distr_headline <- distr_headline_stance %>% 259 | group_by(headline) %>% 260 | count(wt = n) 261 | ``` 262 | 263 | ```{r, fig.asp=0.5, warning = FALSE} 264 | # Number of uses of each body 265 | distr_body %>% 266 | ggplot(aes(nn)) + 267 | # geom_histogram(binwidth = 2) + 268 | geom_freqpoly(binwidth = 2, color = 'black', size = 0.5) + 269 | # stat_ecdf() + 270 | theme_minimal() 271 | 272 | # Number of uses of each headline 273 | distr_headline %>% 274 | ggplot(aes(nn)) + 275 | # geom_histogram(binwidth = 2) + 276 | geom_freqpoly(binwidth = 2, color = 'black', size = 0.5) + 277 | # stat_ecdf() + 278 | theme_minimal() 279 | 280 | # Number of uses of each body and headline 281 | distr_body %>% 282 | 
ggplot(aes(nn)) + 283 | # geom_histogram(binwidth = 2) + 284 | # geom_freqpoly(binwidth = 2, color = 'black', size = 0.5) + 285 | stat_ecdf(color = 'red', size = 1) + 286 | stat_ecdf(data = distr_headline, size = 1) + 287 | theme_minimal() 288 | ``` 289 | 290 | 291 | ```{r, fig.asp=1/2, warning = FALSE} 292 | # HIST Distribution of stances among bodies 293 | 294 | # distr_body_stance %>% 295 | # ggplot(aes(n)) + 296 | # geom_histogram(binwidth = 1) + 297 | # # geom_freqpoly(binwidth = 1, color = 'blue') + 298 | # facet_wrap(~stance, nrow = 1) 299 | # 300 | # distr_body_stance %>% 301 | # ggplot(aes(prop)) + 302 | # geom_histogram(binwidth = 0.02) + 303 | # # geom_freqpoly(binwidth = 0.01, color = 'blue') + 304 | # facet_wrap(~stance, nrow = 1) 305 | # 306 | # ## log 307 | # distr_body_stance %>% 308 | # ggplot(aes(n)) + 309 | # geom_histogram(binwidth = 1) + 310 | # # geom_freqpoly(binwidth = 1, color = 'blue') + 311 | # facet_wrap(~stance, nrow = 1) + 312 | # scale_y_log10() 313 | # 314 | # distr_body_stance %>% 315 | # ggplot(aes(prop)) + 316 | # geom_histogram(binwidth = 0.02) + 317 | # # geom_freqpoly(binwidth = 0.01, color = 'blue') + 318 | # facet_wrap(~stance, nrow = 1) + 319 | # scale_y_log10() 320 | 321 | ``` 322 | 323 | 324 | 325 | ```{r, fig.asp=1/2, warning = FALSE} 326 | ## ECDF Distribution of stances among bodies 327 | distr_body_stance %>% 328 | ggplot(aes(n, color = stance)) + 329 | stat_ecdf(size = 1) + 330 | theme_minimal() 331 | 332 | distr_body_stance %>% 333 | ggplot(aes(prop,color = stance)) + 334 | stat_ecdf(size = 1) + 335 | theme_minimal() 336 | 337 | ## FREQPOLY Distribution of stances among bodies 338 | distr_body_stance %>% 339 | ggplot(aes(n)) + 340 | geom_freqpoly(binwidth = 3, color = 'black', size = 0.5) + 341 | facet_wrap(~stance, nrow = 1) + 342 | theme_minimal() 343 | 344 | distr_body_stance %>% 345 | ggplot(aes(prop)) + 346 | geom_freqpoly(binwidth = 0.04, color = 'black') + 347 | facet_wrap(~stance, nrow = 1) + 348 | theme_minimal() 349 | 350 | ## log 351 | # distr_body_stance %>% 352 | # ggplot(aes(n)) + 353 | # geom_freqpoly(binwidth = 5, color = 'blue') + 354 | # facet_wrap(~stance, nrow = 1) + 355 | # scale_y_log10() 356 | # 357 | # distr_body_stance %>% 358 | # ggplot(aes(prop)) + 359 | # geom_freqpoly(binwidth = 0.05, color = 'blue') + 360 | # facet_wrap(~stance, nrow = 1) + 361 | # scale_y_log10() 362 | 363 | ``` 364 | 365 | 366 | ```{r, fig.asp=1/2, warning = FALSE} 367 | ## ECDF Distribution of stances among bodies 368 | distr_headline_stance %>% 369 | ggplot(aes(n, color = stance)) + 370 | stat_ecdf(size = 1) + 371 | theme_minimal() 372 | 373 | distr_headline_stance %>% 374 | ggplot(aes(prop,color = stance)) + 375 | stat_ecdf(size = 1) + 376 | theme_minimal() 377 | 378 | 379 | ## FREQPOLY Distribution of stances among stances 380 | distr_headline_stance %>% 381 | ggplot(aes(n)) + 382 | geom_freqpoly(binwidth = 3, color = 'black', size = 0.5) + 383 | facet_wrap(~stance, nrow = 1) + 384 | theme_minimal() 385 | 386 | distr_headline_stance %>% 387 | ggplot(aes(prop)) + 388 | geom_freqpoly(binwidth = 0.03, color = 'black') + 389 | facet_wrap(~stance, nrow = 1) + 390 | theme_minimal() 391 | 392 | ## log 393 | # distr_headline_stance %>% 394 | # ggplot(aes(n)) + 395 | # geom_freqpoly(binwidth = 5, color = 'blue') + 396 | # facet_wrap(~stance, nrow = 1) + 397 | # scale_y_log10() 398 | # 399 | # distr_headline_stance %>% 400 | # ggplot(aes(prop)) + 401 | # geom_freqpoly(binwidth = 0.05, color = 'blue') + 402 | # facet_wrap(~stance, nrow = 1) + 403 | # 
scale_y_log10() 404 | 405 | ``` 406 | 407 | 408 | ## Conclusions 409 | 410 | * Headlines always have at least one body pair that is unrelated. 411 | 412 | 413 | 414 | ## Better Data management 415 | 416 | 417 | ```{r} 418 | headers_new <- 419 | stances_unique %>% 420 | mutate(h_id = row_number(), 421 | h_words = map_int(headline, str_count, pattern = '[^\\w]+')) 422 | 423 | bodies_new <- 424 | bodies_unique %>% 425 | mutate(b_id = row_number(), 426 | b_words = map_int(body, str_count, pattern = '[^\\w]+')) 427 | 428 | 429 | data_new <- 430 | data %>% 431 | select(stance, headline, body) %>% 432 | left_join(headers_new, by = 'headline') %>% 433 | left_join(bodies_new, by = 'body') 434 | 435 | ## Check 436 | # data_new %>% 437 | # anti_join(data) 438 | 439 | data_new <- 440 | data_new %>% 441 | select(stance, h_id, b_id) 442 | 443 | ``` 444 | 445 | 446 | ```{r} 447 | ## Test naive splitting bleedover 448 | data_new_random <- sample.int(n = (data_new %>% nrow()), size = 10) 449 | 450 | 451 | ## Test Random headline splitting bleedover 452 | ``` 453 | 454 | 455 | -------------------------------------------------------------------------------- /code/data_analysis_plotting/results_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Results FakeNewsChallenge" 3 | author: "Oskar Triebe" 4 | date: "March 18, 2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ```{r} 13 | library(tidyverse) 14 | library(stringr) 15 | library(forcats) 16 | # help(package = 'forcats') 17 | ``` 18 | 19 | 20 | ## Plots 21 | 22 | ```{r} 23 | # mutate( 24 | # max_length = fct_rev(factor(max_length)), 25 | # n_layers = fct_rev(factor(n_layers)) 26 | # ) 27 | ``` 28 | 29 | 30 | ### Train Loss 31 | 32 | ```{r} 33 | # results %>% 34 | # filter(!downsample) %>% 35 | # filter(xp == 'max_length') %>% 36 | # ggplot(aes(x = epoch)) + 37 | # geom_line(aes(y = train_loss, color = max_length), 38 | # size = 1) + 39 | # labs(title = 'No Downsampling max_length') 40 | 41 | # title = str_c('XP: ', xp, ', downsampled: ', downsample) 42 | ``` 43 | 44 | ```{r} 45 | # results %>% 46 | # filter(downsample) %>% 47 | # filter(xp == 'max_length') %>% 48 | # ggplot(aes(x = epoch)) + 49 | # geom_line(aes(y = train_loss, color = max_length), 50 | # size = 1) + 51 | # labs(title = 'With Downsampling max_length') 52 | ``` 53 | 54 | ```{r} 55 | # results %>% 56 | # filter(!downsample) %>% 57 | # filter(xp == 'n_layers') %>% 58 | # ggplot(aes(x = epoch)) + 59 | # geom_line(aes(y = train_loss, color = n_layers), 60 | # size = 1) + 61 | # labs(title = 'No Downsampling n_layers') 62 | ``` 63 | 64 | ```{r} 65 | # results %>% 66 | # filter(downsample) %>% 67 | # filter(xp == 'n_layers') %>% 68 | # ggplot(aes(x = epoch)) + 69 | # geom_line(aes(y = train_loss, color = n_layers), 70 | # size = 1) + 71 | # labs(title = 'With Downsampling n_layers') 72 | ``` 73 | 74 | 75 | ```{r} 76 | results %>% 77 | # filter(!downsample) %>% 78 | filter(xp == 'max_length') %>% 79 | ggplot(aes(x = epoch)) + 80 | geom_line(aes(y = train_loss, color = max_length), 81 | size = 1) + 82 | labs(title = 'Train Loss: max_length, downsample') + 83 | facet_wrap(~downsample) + 84 | coord_cartesian(ylim = c(0.0, 0.8)) 85 | ``` 86 | 87 | ```{r} 88 | results %>% 89 | # filter(downsample) %>% 90 | filter(xp == 'n_layers') %>% 91 | ggplot(aes(x = epoch)) + 92 | geom_line(aes(y = train_loss, color = n_layers), 93 | size = 1) + 94 | labs(title = 'Train 
Loss: n_layers, downsample') + 95 | facet_wrap(~downsample) + 96 | coord_cartesian(ylim = c(0.0, 0.8)) 97 | ``` 98 | 99 | ### Competition Score 100 | 101 | 102 | ```{r} 103 | results %>% 104 | filter(xp == 'max_length') %>% 105 | mutate( 106 | max_length = fct_rev(max_length), 107 | n_layers = fct_rev(n_layers)) %>% 108 | ggplot(aes(x = epoch)) + 109 | geom_line(aes(y = competition, color = max_length), 110 | size = 1) + 111 | labs(title = 'Competition Score: max_length, downsample') + 112 | facet_wrap(~downsample) + 113 | coord_cartesian(ylim = c(0.45, 0.75)) 114 | ``` 115 | 116 | ```{r} 117 | results %>% 118 | filter(xp == 'n_layers') %>% 119 | mutate( 120 | max_length = fct_rev(max_length), 121 | n_layers = fct_rev(n_layers)) %>% 122 | ggplot(aes(x = epoch)) + 123 | geom_line(aes(y = competition, color = n_layers), 124 | size = 1) + 125 | labs(title = 'Competition Score: n_layers, downsample') + 126 | facet_wrap(~downsample) + 127 | coord_cartesian(ylim = c(0.45, 0.75)) 128 | ``` 129 | 130 | 131 | ### F1 for each Stance 132 | 133 | ```{r, fig.asp = 1} 134 | results %>% 135 | filter(xp == 'n_layers') %>% 136 | mutate(max_length = fct_rev(max_length), 137 | n_layers = fct_rev(n_layers)) %>% 138 | ggplot(aes(x = epoch)) + 139 | geom_line(aes(y = f1, color = n_layers), 140 | size = 1) + 141 | labs(title = 'F1: n_layers, stance, downsample') + 142 | facet_grid(class~downsample, scales = 'free') 143 | # facet_grid(downsample~class, scales = 'free') 144 | ``` 145 | 146 | 147 | ```{r, fig.asp = 1} 148 | results %>% 149 | filter(xp == 'n_layers') %>% 150 | # mutate(max_length = fct_rev(max_length), 151 | # n_layers = fct_rev(n_layers)) %>% 152 | ggplot(aes(x = epoch)) + 153 | geom_line(aes(y = f1, color = downsample), 154 | size = 1) + 155 | labs(title = 'F1: n_layers, stance, downsample') + 156 | facet_grid(class~n_layers) 157 | # coord_cartesian(ylim = c(0.45, 0.75)) 158 | ``` 159 | 160 | 161 | 162 | 163 | 164 | 165 | ```{r, fig.asp = 1} 166 | results %>% 167 | filter(xp == 'max_length') %>% 168 | mutate(max_length = fct_rev(max_length)) %>% 169 | ggplot(aes(x = epoch)) + 170 | geom_line(aes(y = f1, color = max_length), 171 | size = 1) + 172 | labs(title = 'F1: max_length, stance, downsample') + 173 | facet_grid(class~downsample, scales = 'free') 174 | # facet_grid(downsample~class, scales = 'free') 175 | ``` 176 | 177 | 178 | ```{r, fig.asp = 1} 179 | results %>% 180 | filter(xp == 'max_length') %>% 181 | mutate(max_length = fct_rev(max_length)) %>% 182 | ggplot(aes(x = epoch)) + 183 | geom_line(aes(y = f1, color = downsample), 184 | size = 1) + 185 | labs(title = 'F1: max_length, stance, downsample') + 186 | facet_grid(class~max_length) 187 | # coord_cartesian(ylim = c(0.45, 0.75)) 188 | ``` 189 | 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /code/data_analysis_plotting/results_analysis_2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Results FakeNewsChallenge" 3 | author: "Oskar Triebe" 4 | date: "March 18, 2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ```{r} 13 | library(tidyverse) 14 | library(stringr) 15 | library(forcats) 16 | # help(package = 'forcats') 17 | ``` 18 | 19 | ## Options 20 | 21 | ### Data 22 | ```{r} 23 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/' 24 | results <- read_rds(str_c(path_res, 'results.rds')) 25 | 26 | path_res_final <- 
'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/final/' 27 | results_final <- read_rds(str_c(path_res_final, 'results_final.rds')) 28 | path_fig <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/Figures/ggplot2/' 29 | 30 | 31 | results <- results %>% 32 | filter(!downsample) %>% 33 | filter(trainable_embeddings == 'Variable') 34 | 35 | # name_model <- function(x){ 36 | # if (x == 'bow') {return('BOW')} 37 | # if (x == 'basiclstm') {return('Basic LSTM')} 38 | # if (x == 'attention') {return('Attention LSTM')} 39 | # if (x == 'conditional') {return('CEA LSTM')} 40 | # } 41 | 42 | model_names = c('BOW', 'Basic LSTM', 'Attention LSTM', 'CEA LSTM') 43 | results <- results %>% 44 | select(model:hidden_size, lr:n_classes, everything()) %>% 45 | mutate(model = factor(model, levels = model_names)) #%>% 46 | # mutate( 47 | # max_length = fct_rev(factor(max_length), levels = c(50, 75, 150, 200, 300, 600)), 48 | # n_layers = fct_rev(factor(n_layers), levels = c(1, 2, 4)) 49 | # ) 50 | 51 | # same for final res 52 | results_final <- results_final %>% 53 | select(model:hidden_size, lr:n_classes, everything()) %>% 54 | mutate(model = factor(model, levels = model_names)) 55 | 56 | ## What we can analyze: 57 | params <- sapply(results %>% select(model:n_classes), unique) 58 | 59 | models <- 60 | results %>% 61 | select(model:n_classes) %>% 62 | # filter(model == 'basiclstm') %>% 63 | # filter(xp == 'base_150') %>% 64 | distinct() %>% 65 | arrange(model, xp, trainable_embeddings, max_length, n_layers) 66 | 67 | hyperparams <- names(models) 68 | hyperparams_class <- c(hyperparams, 'class') 69 | 70 | ### Max F1 score 71 | results <- 72 | results %>% 73 | group_by_(.dots = hyperparams_class) %>% 74 | mutate( 75 | competition_max = max(competition), 76 | epoch_max_comp = epoch[which.max(competition)], 77 | f1_max_comp = f1[which.max(competition)] 78 | ) %>% 79 | ungroup() 80 | 81 | ## Rename Stances to classes: 82 | results$class[results$class == 0] = 'Related: Agree' 83 | results$class[results$class == 1] = 'Related: Disagree' 84 | results$class[results$class == 2] = 'Related: Discuss' 85 | results$class[results$class == 3] = 'Unrelated' 86 | ## same for final res 87 | results_final$class[results_final$class == 0] = 'Related: Agree' 88 | results_final$class[results_final$class == 1] = 'Related: Disagree' 89 | results_final$class[results_final$class == 2] = 'Related: Discuss' 90 | results_final$class[results_final$class == 3] = 'Unrelated' 91 | 92 | ``` 93 | 94 | ### Plotting 95 | 96 | ```{r} 97 | ## set theme light 98 | # theme_set(theme_light() + theme(panel.grid = element_blank())) 99 | theme_set(theme_light() + theme(panel.grid.major.x = element_blank(), 100 | panel.grid.minor.x = element_blank())) 101 | ## set theme dark 102 | # theme_set(theme_dark() + theme(panel.grid = element_blank())) 103 | # theme_set(theme_dark() + theme(panel.grid.major.x = element_blank(), 104 | # panel.grid.minor.x = element_blank())) 105 | ``` 106 | 107 | ```{r} 108 | breaks_max_length = c(75, 150, 300, 600) 109 | breaks_max_length_short = c(75, 150, 300) 110 | breaks_max_length_all = c(50, 75, 150, 300, 600) 111 | breaks_n_layers = c(1, 2, 4) 112 | ``` 113 | 114 | 115 | ```{r} 116 | ## plotting vars for Competition Scores 117 | comp_ylim = c(0.60, 0.80) 118 | comp_ylim_low = c(0.60, 0.75) 119 | comp_ylim_high = c(0.65, 0.80) 120 | comp_ylim_zoom = c(0.65, 0.75) 121 | comp_ylim_bow = c(0.40, 0.80) 122 | ``` 123 | 124 | ```{r} 125 | ## plotting vars for F1 126 | f1_ylim = c(0.30, 1.00) 127 | f1_ylim_low = c(0.60, 0.75) 128 | 
f1_ylim_high = c(0.65, 0.80) 129 | f1_ylim_zoom = c(0.65, 0.75) 130 | f1_ylim_bow = c(0.40, 0.80) 131 | ``` 132 | 133 | ```{r} 134 | # trancation_labels <- c( 135 | # 30 = 136 | ``` 137 | 138 | 139 | 140 | 141 | ## Competition Score 142 | 143 | ```{r, fig.asp = 0.8} 144 | score_max_length <- 145 | results %>% 146 | filter(xp %in% c('max_length')) %>% 147 | filter(max_length != 50) %>% 148 | mutate( 149 | max_length = str_c('Truncation: ', max_length), 150 | max_length = factor(max_length, levels = str_c('Truncation: ', c(75, 150, 200, 300, 600))) 151 | ) %>% 152 | ggplot(aes(x = epoch, color = model)) + 153 | geom_line(aes(y = competition), size = 1) + 154 | geom_point( 155 | data = results %>% distinct_(.dots = c(hyperparams, 'epoch_max_comp', 'competition_max')) %>% 156 | filter(xp %in% c('max_length')) %>% filter(max_length != 50) %>% 157 | mutate(max_length = str_c('Truncation: ', max_length), 158 | max_length = factor(max_length, levels = str_c('Truncation: ', c(75, 150, 200, 300, 600)))), 159 | mapping = aes(x = epoch_max_comp, y = competition_max), size = 2, alpha = 0.6 160 | ) + 161 | coord_cartesian(ylim = comp_ylim) + 162 | facet_wrap(~max_length, nrow = 1) + 163 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 164 | labs( 165 | title = 'Sensitivity of Competition Score to Sequence Truncation', 166 | x = 'Epoch', y = 'Competition Score', 167 | subtitle = 'BOW and CEA LSTM models perform best at shortest and longest truncation lengths. \nBasic LSTM and Attention LSTM models perform best at shortest truncation lengths.', 168 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%)' 169 | ) + 170 | guides(color = guide_legend(title = 'Model')) 171 | 172 | score_max_length 173 | 174 | ggsave(plot = score_max_length, filename = str_c('score_max_length', '.png'), 175 | width = 8, height = 6, dpi = 900, units = 'in', 176 | path = path_fig, device = 'png') 177 | ``` 178 | 179 | 180 | ```{r, fig.asp = 0.8} 181 | ### For the report 182 | score_max_length <- 183 | results %>% 184 | filter(xp %in% c('max_length')) %>% 185 | filter(max_length != 50) %>% 186 | mutate( 187 | max_length = str_c('Truncation: ', max_length), 188 | max_length = factor(max_length, levels = str_c('Truncation: ', c(75, 150, 200, 300, 600))) 189 | ) %>% 190 | ggplot(aes(x = epoch, color = model)) + 191 | geom_line(aes(y = competition), size = 1) + 192 | geom_point( 193 | data = results %>% distinct_(.dots = c(hyperparams, 'epoch_max_comp', 'competition_max')) %>% 194 | filter(xp %in% c('max_length')) %>% filter(max_length != 50) %>% 195 | mutate(max_length = str_c('Truncation: ', max_length), 196 | max_length = factor(max_length, levels = str_c('Truncation: ', c(75, 150, 200, 300, 600)))), 197 | mapping = aes(x = epoch_max_comp, y = competition_max), size = 2, alpha = 0.6 198 | ) + 199 | coord_cartesian(ylim = comp_ylim) + 200 | facet_wrap(~max_length, nrow = 1) + 201 | theme(legend.position = 'bottom', legend.text = element_text(size = 10), 202 | title = element_blank()) + 203 | labs( 204 | title = 'Sensitivity of Competition Score to Sequence Truncation', 205 | x = 'Epoch', y = 'Competition Score', 206 | subtitle = 'BOW and CEA LSTM models perform best at shortest and longest truncation lengths. 
\nBasic LSTM and Attention LSTM models perform best at shortest truncation lengths.', 207 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%)' 208 | ) + 209 | guides(color = guide_legend(title = 'Model')) 210 | 211 | score_max_length 212 | 213 | ggsave(plot = score_max_length, filename = str_c('score_max_length_report', '.png'), 214 | width = 8, height = 4, dpi = 900, units = 'in', 215 | path = path_fig, device = 'png') 216 | ``` 217 | 218 | 219 | 220 | ```{r, fig.asp = 0.8} 221 | score_n_layers <- 222 | results %>% 223 | filter(xp %in% c('n_layers')) %>% 224 | mutate( 225 | n_layers = str_c('Layers: ', n_layers), 226 | n_layers = factor(n_layers, levels = str_c('Layers: ', c(1, 2, 4))) 227 | ) %>% 228 | ggplot(aes(x = epoch, color = model)) + 229 | geom_line(aes(y = competition), 230 | size = 1) + 231 | geom_point( 232 | data = results %>% distinct_(.dots = c(hyperparams, 'epoch_max_comp', 'competition_max')) %>% 233 | filter(xp %in% c('n_layers')) %>% filter(max_length != 50) %>% 234 | mutate(n_layers = str_c('Layers: ', n_layers), 235 | n_layers = factor(n_layers, levels = str_c('Layers: ', c(1, 2, 4)))), 236 | mapping = aes(x = epoch_max_comp, y = competition_max), size = 2, alpha = 0.6 237 | ) + 238 | coord_cartesian(ylim = comp_ylim) + # cuts off 1 layer BOW 239 | facet_wrap(~n_layers, nrow = 1) + 240 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 241 | labs( 242 | title = 'Sensitivity of Competition Score to Hidden Layers', 243 | x = 'Epoch', y = 'Competition Score', 244 | subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. \nBOW and Attention LSTM models perform best with more hidden layers.', 245 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%)' 246 | ) + 247 | guides(color = guide_legend(title = 'Model')) 248 | 249 | 250 | score_n_layers 251 | 252 | ggsave(plot = score_n_layers, filename = str_c('score_n_layers', '.png'), 253 | width = 6, height = 6, dpi = 900, units = 'in', 254 | path = path_fig, device = 'png') 255 | ``` 256 | 257 | 258 | ```{r, fig.asp = 0.8} 259 | ### For Report 260 | score_n_layers <- 261 | results %>% 262 | filter(xp %in% c('n_layers')) %>% 263 | mutate( 264 | n_layers = str_c('Layers: ', n_layers), 265 | n_layers = factor(n_layers, levels = str_c('Layers: ', c(1, 2, 4))) 266 | ) %>% 267 | ggplot(aes(x = epoch, color = model)) + 268 | geom_line(aes(y = competition), 269 | size = 1) + 270 | geom_point( 271 | data = results %>% distinct_(.dots = c(hyperparams, 'epoch_max_comp', 'competition_max')) %>% 272 | filter(xp %in% c('n_layers')) %>% filter(max_length != 50) %>% 273 | mutate(n_layers = str_c('Layers: ', n_layers), 274 | n_layers = factor(n_layers, levels = str_c('Layers: ', c(1, 2, 4)))), 275 | mapping = aes(x = epoch_max_comp, y = competition_max), size = 2, alpha = 0.6 276 | ) + 277 | coord_cartesian(ylim = comp_ylim) + # cuts off 1 layer BOW 278 | facet_wrap(~n_layers, nrow = 1) + 279 | theme(legend.position = 'bottom', legend.text = element_text(size = 10), 280 | title = element_blank()) + 281 | labs( 282 | title = 'Sensitivity of Competition Score to Hidden Layers', 283 | x = 'Epoch', y = 'Competition Score', 284 | subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. 
\nBOW and Attention LSTM models perform best with more hidden layers.', 285 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%)' 286 | ) + 287 | guides(color = guide_legend(title = 'Model')) 288 | 289 | 290 | score_n_layers 291 | 292 | ggsave(plot = score_n_layers, filename = str_c('score_n_layers_report', '.png'), 293 | width = 6, height = 4, dpi = 900, units = 'in', 294 | path = path_fig, device = 'png') 295 | ``` 296 | 297 | 298 | 299 | ## F1 Scores 300 | 301 | 302 | ```{r, fig.asp = 0.8} 303 | f1_max_length <- 304 | results %>% 305 | filter(xp %in% c('max_length')) %>% 306 | filter(max_length != 50) %>% 307 | # filter(max_length != 600) %>% 308 | filter(epoch == epoch_max_comp) %>% 309 | ggplot(aes(x = max_length, color = model)) + 310 | geom_line(aes(y = f1), size = 1) + 311 | geom_point(aes(y = f1), size = 2, alpha = 0.6) + 312 | scale_x_continuous(trans = 'log2', breaks = breaks_max_length) + 313 | facet_wrap(~class, nrow = 1 314 | # ,scales = 'free_y' 315 | ) + coord_cartesian(ylim = f1_ylim) + 316 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 317 | labs( 318 | title = 'Sensitivity of Stance F1 Scores to Truncation Length', 319 | x = 'Truncation Length', y = 'F1 Score', 320 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. \nBOW and Attention LSTM models perform best with more hidden layers.', 321 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%), shown for epoch with maximal competition score' 322 | ) + 323 | guides(color = guide_legend(title = 'Model')) 324 | 325 | 326 | f1_max_length 327 | 328 | ggsave(plot = f1_max_length, filename = str_c('f1_max_length', '.png'), 329 | width = 8, height = 6, dpi = 900, units = 'in', 330 | path = path_fig, device = 'png') 331 | 332 | 333 | 334 | 335 | f1_max_length_report <- f1_max_length + theme(title = element_blank()) 336 | 337 | ggsave(plot = f1_max_length_report, filename = str_c('f1_max_length_report', '.png'), 338 | width = 8, height = 4, dpi = 900, units = 'in', 339 | path = path_fig, device = 'png') 340 | ``` 341 | 342 | ```{r, fig.asp = 0.8} 343 | f1_n_layers <- 344 | results %>% 345 | filter(xp %in% c('n_layers')) %>% 346 | filter(epoch == epoch_max_comp) %>% 347 | ggplot(aes(x = n_layers, color = model)) + 348 | geom_line(aes(y = f1), size = 1) + 349 | geom_point(aes(y = f1), size = 2, alpha = 0.6) + 350 | scale_x_continuous(trans = 'log2', breaks = breaks_n_layers) + 351 | facet_wrap(~class, nrow = 1 352 | # ,scales = 'free_y' 353 | ) + coord_cartesian(ylim = f1_ylim) + 354 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 355 | labs( 356 | title = 'Sensitivity of Stance F1 Scores to Hidden Layers', 357 | x = 'Truncation Length', y = 'F1 Score', 358 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. 
\nBOW and Attention LSTM models perform best with more hidden layers.', 359 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%), shown for epoch with maximal competition score' 360 | ) + 361 | guides(color = guide_legend(title = 'Model')) 362 | 363 | 364 | 365 | f1_n_layers 366 | 367 | ggsave(plot = f1_n_layers, filename = str_c('f1_n_layers', '.png'), 368 | width = 8, height = 6, dpi = 900, units = 'in', 369 | path = path_fig, device = 'png') 370 | 371 | 372 | f1_n_layers_report <- f1_n_layers + theme(title = element_blank()) 373 | 374 | ggsave(plot = f1_n_layers_report, filename = str_c('f1_n_layers_report', '.png'), 375 | width = 8, height = 4, dpi = 900, units = 'in', 376 | path = path_fig, device = 'png') 377 | ``` 378 | 379 | 380 | ## Final Results 381 | 382 | ```{r} 383 | ### TABLE 384 | results_table_all <- 385 | results_final %>% 386 | filter(epoch == 40) %>% 387 | spread(key = class, value = f1) %>% 388 | group_by(model) %>% 389 | mutate( 390 | f1_agree = sum(`Related: Agree`, na.rm = TRUE), 391 | f1_disagree = sum(`Related: Disagree`, na.rm = TRUE), 392 | f1_discuss = sum(`Related: Discuss`, na.rm = TRUE), 393 | f1_unrelated = sum(`Unrelated`, na.rm = TRUE) 394 | ) %>% 395 | select(-`Related: Agree`,-`Related: Disagree`, -`Related: Discuss`, -`Unrelated`) %>% 396 | select( -(fp:tn), -(specificity:accuracy), -xp) %>% 397 | select(model, everything()) 398 | 399 | variables_results <- names(results_table_all) 400 | 401 | results_table_all <- 402 | results_table_all %>% 403 | distinct_(.dots = variables_results) 404 | 405 | results_table_all <- 406 | results_table_all[,-ncol(results_table_all)] %>% 407 | mutate(max_length = if_else(model %in% c('BOW', 'CEA LSTM'), NA_integer_, max_length)) 408 | 409 | results_table <- 410 | results_table_all %>% 411 | select(model, competition:f1_unrelated) 412 | 413 | results_table %>% write_excel_csv(path = str_c(path_fig, 'results_table.csv')) 414 | 415 | hyperparam_table <- 416 | results_table_all %>% 417 | select(model:epoch) 418 | 419 | hyperparam_table %>% write_excel_csv(path = str_c(path_fig, 'hyperparam_table.csv')) 420 | 421 | 422 | ``` 423 | 424 | 425 | ```{r, fig.asp = 0.8} 426 | final_loss <- 427 | results_final %>% 428 | ggplot(aes(x = epoch, color = model)) + 429 | geom_line(aes(y = train_loss), 430 | size = 1) + 431 | coord_cartesian(ylim = c(0, 0.5)) + 432 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 433 | labs( 434 | title = 'Training Loss of Selected Models', 435 | x = 'Epoch', y = 'Training Loss', 436 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. 
\nBOW and Attention LSTM models perform best with more hidden layers.', 437 | caption = 'Fitted on train and dev set (80%) and evaluated on test set (20%)' 438 | ) + 439 | guides(color = guide_legend(title = 'Model')) 440 | 441 | final_loss 442 | 443 | ggsave(plot = final_loss, filename = str_c('final_loss', '.png'), 444 | width = 5, height = 5, dpi = 900, units = 'in', 445 | path = path_fig, device = 'png') 446 | ``` 447 | 448 | ```{r, fig.asp = 0.8} 449 | final_f1 <- 450 | results_final %>% 451 | filter(epoch == 40) %>% 452 | ggplot(aes(x = model, fill = model)) + 453 | geom_col(aes(y = f1), size = 1) + 454 | theme(legend.position = 'bottom', legend.text = element_text(size = 10), 455 | axis.text.x = element_blank(), axis.ticks.x = element_blank(), 456 | axis.title.x = element_blank()) + 457 | facet_wrap(~class, nrow = 1) + 458 | labs( 459 | title = 'Stance F1 Scores of Selected Models', 460 | x = 'Model', y = 'F1 Score', 461 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. \nBOW and Attention LSTM models perform best with more hidden layers.', 462 | caption = 'Fitted on train and dev set (80%) and evaluated on test set (20%)' 463 | ) + 464 | guides(fill = guide_legend(title = 'Model')) 465 | 466 | final_f1 467 | 468 | ggsave(plot = final_f1, filename = str_c('final_f1', '.png'), 469 | width = 8, height = 6, dpi = 900, units = 'in', 470 | path = path_fig, device = 'png') 471 | ``` 472 | 473 | ```{r, fig.asp = 0.8} 474 | final_comp <- 475 | results_final %>% 476 | filter(epoch == 40) %>% 477 | distinct(model, competition) %>% 478 | ggplot(aes(x = model, fill = model)) + 479 | geom_col(aes(y = competition), size = 1) + 480 | coord_cartesian(ylim = c(0.725, 0.825)) + 481 | 482 | theme(legend.position = 'bottom', legend.text = element_text(size = 10), 483 | axis.text.x = element_blank(), axis.ticks.x = element_blank(), 484 | axis.title.x = element_blank()) + 485 | # facet_wrap(~class, nrow = 1) + 486 | labs( 487 | title = 'Competition Scores of Selected Models', 488 | x = 'Model', y = 'Competition Score', 489 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. \nBOW and Attention LSTM models perform best with more hidden layers.', 490 | caption = 'Fitted on train and dev set (80%) and evaluated on test set (20%)' 491 | ) + 492 | guides(fill = guide_legend(title = 'Model')) 493 | 494 | final_comp 495 | 496 | ggsave(plot = final_comp, filename = str_c('final_comp', '.png'), 497 | width = 5, height = 5, dpi = 900, units = 'in', 498 | path = path_fig, device = 'png') 499 | ``` 500 | 501 | 502 | ```{r, fig.asp = 0.8} 503 | ### DO NOT use this plot 504 | results_final %>% 505 | ggplot(aes(x = epoch, color = model)) + 506 | geom_line(aes(y = competition), 507 | size = 1) + 508 | # coord_cartesian(ylim = comp_ylim) + 509 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 510 | labs( 511 | title = 'Final Competition Scores of Selected Models', 512 | x = 'Epoch', y = 'Competition Score', 513 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. 
\nBOW and Attention LSTM models perform best with more hidden layers.', 514 | caption = 'Fitted on train and de set (80%) and evaluated on test set (20%)' 515 | ) + 516 | guides(color = guide_legend(title = 'Model')) 517 | ``` 518 | 519 | 520 | 521 | ## Model Details 522 | 523 | ### BOW - Competition Score 524 | 525 | ```{r} 526 | results %>% 527 | filter(model %in% c('BOW')) %>% 528 | ggplot(aes(x = epoch)) + 529 | geom_line(aes(y = competition, 530 | # linetype = trainable_embeddings, 531 | color = factor(n_layers)), 532 | size = 1) + 533 | labs(title = 'BOW Competition Score: max_length, n_layers') + 534 | facet_grid(~max_length) + coord_cartesian(ylim = comp_ylim_bow) 535 | ``` 536 | 537 | ```{r} 538 | results %>% 539 | filter(model %in% c('BOW')) %>% 540 | 541 | ggplot(aes(x = epoch)) + 542 | geom_line(aes(y = competition, 543 | # linetype = trainable_embeddings, 544 | color = factor(max_length)), 545 | size = 1) + 546 | labs(title = 'BOW Competition Score: max_length, n_layers') + 547 | facet_grid(~n_layers) + coord_cartesian(ylim = comp_ylim_bow) 548 | ``` 549 | 550 | ### Basic LSTM - Competition Score 551 | 552 | ```{r} 553 | results %>% 554 | filter(model %in% c('Basic LSTM'), 555 | xp %in% c('max_length', 'n_layers')) %>% 556 | filter(max_length != 50) %>% 557 | ggplot(aes(x = epoch)) + 558 | geom_line(aes(y = competition, 559 | # linetype = trainable_embeddings, 560 | color = factor(n_layers)), 561 | size = 1) + 562 | labs(title = 'Basic LSTM Competition Score: max_length, n_layers') + 563 | facet_wrap(~max_length, nrow = 1) + coord_cartesian(ylim = comp_ylim_zoom) 564 | ``` 565 | 566 | ```{r} 567 | results %>% 568 | filter(model %in% c('Basic LSTM'), 569 | xp %in% c('max_length', 'n_layers')) %>% 570 | filter(max_length != 50) %>% 571 | ggplot(aes(x = epoch)) + 572 | geom_line(aes(y = competition, 573 | # linetype = trainable_embeddings, 574 | color = factor(max_length)), 575 | size = 1) + 576 | labs(title = 'Basic LSTM Competition Score: max_length, n_layers') + 577 | facet_wrap(~n_layers, nrow = 1) + coord_cartesian(ylim = comp_ylim_zoom) 578 | ``` 579 | 580 | ### Attention LSTM - Competition Score 581 | 582 | 583 | ```{r} 584 | results %>% 585 | filter(model %in% c('Attention LSTM'), 586 | xp %in% c('max_length', 'n_layers')) %>% 587 | ggplot(aes(x = epoch)) + 588 | geom_line(aes(y = competition, 589 | # linetype = trainable_embeddings, 590 | color = factor(n_layers)), 591 | size = 1) + 592 | labs(title = 'Attention LSTM Competition Score: max_length, n_layers') + 593 | facet_wrap(~max_length, nrow = 1) + coord_cartesian(ylim = comp_ylim_high) 594 | ``` 595 | 596 | 597 | 598 | ```{r} 599 | results %>% 600 | filter(model %in% c('Attention LSTM'), 601 | xp %in% c('max_length', 'n_layers')) %>% 602 | ggplot(aes(x = epoch)) + 603 | geom_line(aes(y = competition, 604 | # linetype = trainable_embeddings, 605 | color = factor(max_length)), 606 | size = 1) + 607 | labs(title = 'Attention LSTM Competition Score: max_length, n_layers') + 608 | facet_wrap(~n_layers, nrow = 1) + coord_cartesian(ylim = comp_ylim_high) 609 | ``` 610 | 611 | ### Conditional LSTM - Competition Score 612 | 613 | 614 | ```{r} 615 | results %>% 616 | filter(model %in% c('CEA LSTM'), 617 | xp %in% c('max_length', 'n_layers')) %>% 618 | ggplot(aes(x = epoch)) + 619 | geom_line(aes(y = competition, 620 | # linetype = trainable_embeddings, 621 | color = factor(n_layers)), 622 | size = 1) + 623 | labs(title = 'Conditional LSTM Competition Score: max_length, n_layers') + 624 | facet_wrap(~max_length, nrow = 1) + 
coord_cartesian(ylim = comp_ylim_high) 625 | ``` 626 | 627 | 628 | ```{r} 629 | results %>% 630 | filter(model %in% c('CEA LSTM'), 631 | xp %in% c('max_length', 'n_layers')) %>% 632 | ggplot(aes(x = epoch)) + 633 | geom_line(aes(y = competition, 634 | # linetype = trainable_embeddings, 635 | color = factor(max_length)), 636 | size = 1) + 637 | labs(title = 'Conditional LSTM Competition Score: max_length, n_layers') + 638 | facet_wrap(~n_layers, nrow = 1) + coord_cartesian(ylim = comp_ylim_high) 639 | ``` 640 | -------------------------------------------------------------------------------- /code/execute_bow_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | ###### 4 | # Initial test of LSTM model for Fake News Challenge 5 | # Based on starter code from PS3-CS224n 6 | # Based on Stephen's rnn_test1 7 | ###### 8 | ## General libraries 9 | import tensorflow as tf 10 | import numpy as np 11 | import random 12 | 13 | ## Our Own Code 14 | # from our_model import Config 15 | from bow_model_config import BOWModel 16 | from run_text_processing import save_data_pickle, get_data 17 | # from run_text_processing import get_data 18 | ## currently using: split_indices 19 | # from our_util import Progbar, minibatches, pack_labels, split_data, split_indices, softmax, get_performance 20 | from our_util import split_indices, softmax, get_performance, convertOutputs #M 21 | 22 | def run_save_data_pickle(): ## Needs NLTK to be installed! 23 | save_data_pickle(outfilename = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 24 | embedding_type = 'twitter.27B.50d', 25 | parserOption = 'nltk') 26 | 27 | def run_bow(config, split = True, outputpath = '../../xp', final = False): #M 28 | 29 | 30 | 31 | ## Get data 32 | # config, y, h, b, h_len, b_len = get_BOW_data(config, reload = True, save_data = False) 33 | config, data_dict = get_data(config, 34 | filename_embeddings = '/../../glove/glove.twitter.27B.50d.txt', 35 | pickle_path = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 36 | concat = False) 37 | 38 | ## pass data into local namespace: 39 | y = data_dict['y'] 40 | h = data_dict['h_np'] 41 | b = data_dict['b_np'] 42 | h_len = data_dict['h_seqlen'] 43 | b_len = data_dict['b_seqlen'] 44 | 45 | # Do shortening of dataset ## affects number of samples and max_len. 
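    # Shape notes (inferred from how these arrays are used below, not taken from any
    # original documentation): h and b are matrices of word ids with shapes
    # [num_samples, max_headline_len] and [num_samples, max_body_len], each padded to a
    # common width; h_len and b_len give each example's true token count before
    # padding; y holds integer stance labels in {0, 1, 2, 3}.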
46 | if config.num_samples is not None: 47 | ## Random seed 48 | np.random.seed(1) 49 | ind = range(np.shape(h)[0]) 50 | np.random.shuffle(ind) 51 | indices = ind[0:config.num_samples] 52 | h = h[indices,:] 53 | b = b[indices,:] 54 | h_len = h_len[indices] 55 | b_len = b_len[indices] 56 | y = y[indices] 57 | 58 | if config.h_max_len is not None: 59 | h_max_len = config.h_max_len 60 | if np.shape(h)[1] > h_max_len: 61 | h = h[:, 0:h_max_len] 62 | h_len = np.minimum(h_len, h_max_len) 63 | 64 | if config.b_max_len is not None: 65 | b_max_len = config.b_max_len 66 | if np.shape(b)[1] > b_max_len: 67 | b = b[:, 0:b_max_len] 68 | b_len = np.minimum(b_len, b_max_len) 69 | 70 | if split: 71 | # Split data 72 | train_indices, dev_indices, test_indices = split_indices(np.shape(h)[0]) 73 | # Divide data 74 | train_h = h[train_indices,:] 75 | train_b = b[train_indices,:] 76 | train_h_len = h_len[train_indices] 77 | train_b_len = b_len[train_indices] 78 | train_y = y[train_indices] 79 | 80 | # Development 81 | dev_h = h[dev_indices,:] 82 | dev_b = b[dev_indices,:] 83 | dev_h_len = h_len[dev_indices] 84 | dev_b_len = b_len[dev_indices] 85 | dev_y = y[dev_indices] 86 | 87 | if final: 88 | # Combine train and dev 89 | train_dev_indices = train_indices + dev_indices 90 | train_h = h[train_dev_indices,:] 91 | train_b = b[train_dev_indices,:] 92 | train_h_len = h_len[train_dev_indices] 93 | train_b_len = b_len[train_dev_indices] 94 | train_y = y[train_dev_indices] 95 | 96 | # Set dev to test 97 | dev_h = h[test_indices,:] 98 | dev_b = b[test_indices,:] 99 | dev_h_len = h_len[test_indices] 100 | dev_b_len = b_len[test_indices] 101 | dev_y = y[test_indices] 102 | 103 | 104 | 105 | ## Passing parameter_dict to config settings 106 | ## Changes to config based on data shape 107 | assert(np.shape(train_h)[0] == np.shape(train_b)[0] == np.shape(train_y)[0] == np.shape(train_h_len)[0] == np.shape(train_b_len)[0]) 108 | config.num_samples = np.shape(train_h)[0] 109 | config.h_max_len = np.shape(train_h)[1] 110 | config.b_max_len = np.shape(train_b)[1] 111 | 112 | ## Start Tensorflow! 113 | print('Starting TensorFlow operations') 114 | print 'With hidden layers: ', config.n_layers ## hidden layer?
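    # Sketch of the flow below, as read from this script: build the graph with a
    # fixed random seed, initialize variables, then model.fit() trains for
    # config.n_epochs and returns per-epoch training losses plus dev-set
    # performance metrics, predicted classes and raw prediction scores;
    # convertOutputs() then writes the losses and dev performance to CSV files
    # (the perf_*/losses_* files consumed by the R analysis scripts).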
115 | with tf.Graph().as_default(): 116 | tf.set_random_seed(1) 117 | model = BOWModel(config) 118 | init = tf.global_variables_initializer() 119 | with tf.Session() as session: 120 | session.run(init) 121 | losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(session, train_h, train_b, train_h_len, train_b_len, train_y, dev_h, dev_b, dev_h_len, dev_b_len, dev_y) #M 122 | 123 | # Write results to csv 124 | convertOutputs(outputpath, config, losses_ep, dev_performances_ep) 125 | 126 | print('Losses ', losses_ep) 127 | print('Dev Performance ', dev_performances_ep) #M 128 | return losses_ep, dev_predicted_classes_ep, dev_performances_ep #M 129 | 130 | ## for debugging 131 | if __name__ == "__main__": 132 | print('Doing something!') 133 | losses, dev_predicted_classes, dev_performance = run_bow(num_samples = 1028) 134 | print('Execution Complete') 135 | -------------------------------------------------------------------------------- /code/execute_lstm_attention.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | ###### 5 | # Execution file for the LSTM attention model 6 | # Based on starter code from PS3-CS224n 7 | ###### 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | 11 | import argparse 12 | import logging 13 | import sys 14 | import time 15 | import os 16 | from datetime import datetime 17 | 18 | import tensorflow as tf 19 | import numpy as np 20 | import cPickle as pickle 21 | 22 | from run_text_processing import get_data, save_data_pickle 23 | 24 | from our_util import Progbar, minibatches, pack_labels, split_data, softmax, get_performance, convertOutputs, downsample_label 25 | # from our_model import OurModel, Config 26 | 27 | from LSTM_attention import * 28 | 29 | logger = logging.getLogger("hw3.q3") 30 | logger.setLevel(logging.DEBUG) 31 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) 32 | 33 | def run_save_data_pickle(): 34 | save_data_pickle(outfilename = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 35 | embedding_type = 'twitter.27B.50d', 36 | parserOption = 'nltk') 37 | 38 | def run_lstm_attention(config, outputpath = '../../xp', final = False): 39 | config, data_dict = get_data(config, 40 | filename_embeddings = '/../../glove/glove.twitter.27B.50d.txt', 41 | pickle_path = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 42 | concat = True) 43 | 44 | y = data_dict['y'] 45 | h_b_np = data_dict['h_b_np'] 46 | seqlen = data_dict['seqlen'] 47 | 48 | # Perform downsampling 49 | if 'downsample' in config.__dict__: 50 | if config.downsample == True: 51 | downsample_indices = downsample_label(y, label_for_ds = 3, downsample_factor = 4) 52 | y = y[downsample_indices] 53 | h_b_np = h_b_np[downsample_indices, :] 54 | seqlen = seqlen[downsample_indices] 55 | 56 | if config.max_length is not None: 57 | max_length = config.max_length 58 | if np.shape(h_b_np)[1] > max_length: 59 | h_b_np = h_b_np[:, 0:max_length] 60 | seqlen = np.minimum(seqlen, max_length) 61 | 62 | # Set maximum dataset size for testing purposes 63 | data = pack_labels(h_b_np, y, seqlen) 64 | if config.num_samples is not None: 65 | num_samples = config.num_samples 66 | data = data[0:num_samples - 1] 67 | 68 | # Split data, result is still packed 69 | train_data, dev_data, test_data, train_indices, dev_indices, test_indices = split_data(data, prop_train = 0.6, prop_dev = 0.2, seed = 56) 70 | 71 | # Compute some convenience 
sub-sets 72 | # Dev 73 | dev_labels = y[dev_indices] 74 | dev_data_np = h_b_np[dev_indices, :] 75 | dev_seqlen = seqlen[dev_indices] 76 | # Test 77 | test_labels = y[test_indices] 78 | test_data_np = h_b_np[test_indices, :] 79 | test_seqlen = seqlen[test_indices] 80 | 81 | ## Config determined at data loading: 82 | config.num_samples = len(train_indices) 83 | config.max_length = np.shape(h_b_np)[1] 84 | 85 | 86 | # If this is the final test: 87 | # Combine test and dev 88 | # Reassign test to dev - for compatibility with rest of the code 89 | if final: 90 | # train_dev_indices = train_indices.extend(dev_indices) 91 | train_dev_indices = train_indices + dev_indices 92 | train_data = [data[i] for i in train_dev_indices] 93 | dev_data_np = test_data_np 94 | dev_seqlen = test_seqlen 95 | dev_labels = test_labels 96 | config.num_samples = len(train_dev_indices) 97 | 98 | with tf.Graph().as_default(): 99 | 100 | tf.set_random_seed(59) 101 | 102 | logger.info("Building model...",) 103 | start = time.time() 104 | model = LSTMAttention(config) 105 | logger.info("took %.2f seconds", time.time() - start) 106 | 107 | init = tf.global_variables_initializer() 108 | 109 | with tf.Session() as session: 110 | session.run(init) 111 | # losses = model.fit(session, train_data) 112 | losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(session, train_data, dev_data_np, dev_seqlen, dev_labels) # MODIF 113 | # dev_predictions = model.predict_on_batch(session, dev_data_np, dev_seqlen) 114 | 115 | 116 | #test_predictions = model.predict_on_batch(session, test_data_np, test_seqlen) 117 | 118 | # outputpath = '../../xp' # MODIF 119 | convertOutputs(outputpath, config, losses_ep, dev_performances_ep) # MODIF 120 | # Compute testing predictions --> MODIF --> SHOULD BE REMOVED WHEN OK 121 | print('Dev Performance ', dev_performances_ep) #M 122 | return losses_ep, dev_predicted_classes_ep, dev_performances_ep #MODIF 123 | 124 | if __name__ == "__main__": 125 | 126 | # print('Doing something!') 127 | # # run_save_data_pickle() 128 | # # test_model_loading_functions('') 129 | # # test_run_model_with_parameters('') 130 | # # test_save_load_data_pickle('twitter50d_h_ids_b_ids_pickle.p') 131 | # # losses = test_model_with_real_data_pickle('args') 132 | print('Execution Complete') 133 | # # print(losses) -------------------------------------------------------------------------------- /code/execute_lstm_conditional.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | ###### 4 | # Execution script for the conditional LSTM with attention 5 | # Based on starter code from PS3-CS224n 6 | ###### 7 | ## General libraries 8 | import tensorflow as tf 9 | import numpy as np 10 | import random 11 | 12 | ## Our Own Code 13 | from LSTM_conditional import LSTMCondModel 14 | from run_text_processing import save_data_pickle, get_data 15 | from our_util import Progbar, minibatches, pack_labels, split_data, softmax, get_performance, convertOutputs, downsample_label, split_indices 16 | 17 | def run_save_data_pickle(): ## Needs NLTK to be installed! 
18 | save_data_pickle(outfilename = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 19 | embedding_type = 'twitter.27B.50d', 20 | parserOption = 'nltk') 21 | 22 | def run_lstm_conditional(config, split = True, outputpath = '../../xp', final = False): 23 | ## Get data 24 | config, data_dict = get_data(config, 25 | filename_embeddings = '/../../glove/glove.twitter.27B.50d.txt', 26 | pickle_path = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 27 | concat = False) 28 | 29 | ## pass data into local namespace: 30 | y = data_dict['y'] 31 | h = data_dict['h_np'] 32 | b = data_dict['b_np'] 33 | h_len = data_dict['h_seqlen'] 34 | b_len = data_dict['b_seqlen'] 35 | 36 | # Do shortening of dataset ## affects number of samples and max_len. 37 | if config.num_samples is not None: 38 | ## Random seed 39 | np.random.seed(1) 40 | ind = range(np.shape(h)[0]) 41 | random.shuffle(ind) 42 | indices = ind[0:config.num_samples ] 43 | h = h[indices,:] 44 | b = b[indices,:] 45 | h_len = h_len[indices] 46 | b_len = b_len[indices] 47 | y = y[indices] 48 | 49 | # Truncate headlines and bodies 50 | if config.h_max_len is not None: 51 | h_max_len = config.h_max_len 52 | if np.shape(h)[1] > h_max_len: 53 | h = h[:, 0:h_max_len] 54 | h_len = np.minimum(h_len, h_max_len) 55 | 56 | if config.b_max_len is not None: 57 | b_max_len = config.b_max_len 58 | if np.shape(b)[1] > b_max_len: 59 | b = b[:, 0:b_max_len] 60 | b_len = np.minimum(b_len, b_max_len) 61 | 62 | if split: 63 | # Split data 64 | train_indices, dev_indices, test_indices = split_indices(np.shape(h)[0]) 65 | # Divide data 66 | train_h = h[train_indices,:] 67 | train_b = b[train_indices,:] 68 | train_h_len = h_len[train_indices] 69 | train_b_len = b_len[train_indices] 70 | train_y = y[train_indices] 71 | # test 72 | dev_h = h[dev_indices,:] 73 | dev_b = b[dev_indices,:] 74 | dev_h_len = h_len[dev_indices] 75 | dev_b_len = b_len[dev_indices] 76 | dev_y = y[dev_indices] 77 | 78 | if final: 79 | # Combine train and dev 80 | train_dev_indices = train_indices + dev_indices 81 | train_h = h[train_dev_indices,:] 82 | train_b = b[train_dev_indices,:] 83 | train_h_len = h_len[train_dev_indices] 84 | train_b_len = b_len[train_dev_indices] 85 | train_y = y[train_dev_indices] 86 | 87 | # Set dev to test 88 | dev_h = h[test_indices,:] 89 | dev_b = b[test_indices,:] 90 | dev_h_len = h_len[test_indices] 91 | dev_b_len = b_len[test_indices] 92 | dev_y = y[test_indices] 93 | 94 | ## Passing parameter_dict to config settings 95 | ## Changes to config based on data shape 96 | assert(np.shape(train_h)[0] == np.shape(train_b)[0] == np.shape(train_y)[0] == np.shape(train_h_len)[0] == np.shape(train_b_len)[0]) 97 | config.num_samples = np.shape(train_h)[0] 98 | config.h_max_len = np.shape(train_h)[1] 99 | config.b_max_len = np.shape(train_b)[1] 100 | 101 | ## Start Tensorflow! 102 | print('Starting TensorFlow operations') 103 | print 'With hidden layers: ', config.n_layers ## hidden layer? 
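The split just performed follows `our_util.split_indices`, which returns a 60/20/20 shuffle of the row indices; with `final = True` the model is re-fit on train plus dev and evaluated on the held-out test rows, which are simply renamed to the `dev_*` variables so the downstream code is unchanged. A small sketch with an assumed toy size:

```python
import numpy as np

# Editor's sketch (toy size) of the 60/20/20 split convention used above.
num_samples   = 10
indices       = list(range(num_samples))
np.random.shuffle(indices)
train_indices = indices[0:6]                      # 60% train
dev_indices   = indices[6:8]                      # 20% dev
test_indices  = indices[8:10]                     # 20% test

# final = True: train on train + dev, report on the untouched test rows.
train_dev_indices = train_indices + dev_indices   # plain list concatenation
assert len(train_dev_indices) == 8 and len(test_indices) == 2
```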
104 | with tf.Graph().as_default(): 105 | tf.set_random_seed(1) 106 | model = LSTMCondModel(config) 107 | init = tf.global_variables_initializer() 108 | with tf.Session() as session: 109 | session.run(init) 110 | losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(session, train_h, train_b, train_h_len, train_b_len, train_y, dev_h, dev_b, dev_h_len, dev_b_len, dev_y) #M 111 | 112 | # Write results to csv 113 | convertOutputs(outputpath, config, losses_ep, dev_performances_ep) 114 | 115 | print('Losses ', losses_ep) 116 | print('Dev Performance ', dev_performances_ep) #M 117 | return losses_ep, dev_predicted_classes_ep, dev_performances_ep #M 118 | 119 | ## for debugging 120 | if __name__ == "__main__": 121 | print('Doing something!') 122 | # run_save_data_pickle() 123 | losses, dev_predicted_classes, dev_performance = run_bow(num_samples = 1028) 124 | print('Execution Complete') -------------------------------------------------------------------------------- /code/execute_lstm_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | ###### 5 | # Initial test of LSTM model for Fake News Challenge - Using actual data 6 | # Based on starter code from PS3-CS224n 7 | ###### 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | 11 | import argparse 12 | import logging 13 | import sys 14 | import time 15 | import os 16 | from datetime import datetime 17 | 18 | import tensorflow as tf 19 | import numpy as np 20 | import cPickle as pickle 21 | 22 | from run_text_processing import get_data, save_data_pickle 23 | 24 | from our_util import Progbar, minibatches, pack_labels, split_data, softmax, get_performance, convertOutputs, downsample_label 25 | 26 | from basicLSTM_model_config import * 27 | 28 | logger = logging.getLogger("hw3.q3") 29 | logger.setLevel(logging.DEBUG) 30 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) 31 | 32 | def run_save_data_pickle(): 33 | save_data_pickle(outfilename = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 34 | embedding_type = 'twitter.27B.50d', 35 | parserOption = 'nltk') 36 | 37 | def run_lstm(config, outputpath = '../../xp', final = False): 38 | config, data_dict = get_data(config, 39 | filename_embeddings = '/../../glove/glove.twitter.27B.50d.txt', 40 | pickle_path = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 41 | concat = True) 42 | 43 | y = data_dict['y'] 44 | h_b_np = data_dict['h_b_np'] 45 | seqlen = data_dict['seqlen'] 46 | 47 | # Perform downsampling 48 | if 'downsample' in config.__dict__: 49 | if config.downsample == True: 50 | downsample_indices = downsample_label(y, label_for_ds = 3, downsample_factor = 4) 51 | y = y[downsample_indices] 52 | h_b_np = h_b_np[downsample_indices, :] 53 | seqlen = seqlen[downsample_indices] 54 | 55 | if config.max_length is not None: 56 | max_length = config.max_length 57 | if np.shape(h_b_np)[1] > max_length: 58 | h_b_np = h_b_np[:, 0:max_length] 59 | seqlen = np.minimum(seqlen, max_length) 60 | 61 | # Set maximum dataset size for testing purposes 62 | data = pack_labels(h_b_np, y, seqlen) 63 | if config.num_samples is not None: 64 | num_samples = config.num_samples 65 | data = data[0:num_samples - 1] 66 | 67 | # Split data, result is still packed 68 | train_data, dev_data, test_data, train_indices, dev_indices, test_indices = split_data(data, prop_train = 0.6, prop_dev = 0.2, seed = 56) 69 | 70 | # Compute some convenience 
sub-sets 71 | # Dev 72 | dev_labels = y[dev_indices] 73 | dev_data_np = h_b_np[dev_indices, :] 74 | dev_seqlen = seqlen[dev_indices] 75 | # Test 76 | test_labels = y[test_indices] 77 | test_data_np = h_b_np[test_indices, :] 78 | test_seqlen = seqlen[test_indices] 79 | 80 | 81 | ## Config determined at data loading: 82 | config.num_samples = len(train_indices) 83 | config.max_length = np.shape(h_b_np)[1] 84 | 85 | # If this is the final test: 86 | # Combine test and dev 87 | # Reassign test to dev - for compatibility with rest of the code 88 | if final: 89 | # train_dev_indices = train_indices.extend(dev_indices) 90 | train_dev_indices = train_indices + dev_indices 91 | train_data = [data[i] for i in train_dev_indices] 92 | dev_data_np = test_data_np 93 | dev_seqlen = test_seqlen 94 | dev_labels = test_labels 95 | config.num_samples = len(train_dev_indices) 96 | 97 | with tf.Graph().as_default(): 98 | 99 | tf.set_random_seed(59) 100 | 101 | logger.info("Building model...",) 102 | start = time.time() 103 | model = BaselineLSTM(config) 104 | logger.info("took %.2f seconds", time.time() - start) 105 | 106 | init = tf.global_variables_initializer() 107 | 108 | with tf.Session() as session: 109 | session.run(init) 110 | losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(session, train_data, dev_data_np, dev_seqlen, dev_labels) # MODIF 111 | 112 | # outputpath = '../../xp' # MODIF 113 | convertOutputs(outputpath, config, losses_ep, dev_performances_ep) # MODIF 114 | print('Dev Performance ', dev_performances_ep) #M 115 | return losses_ep, dev_predicted_classes_ep, dev_performances_ep #MODIF 116 | 117 | if __name__ == "__main__": 118 | print('Doing something!') 119 | print('Execution Complete') -------------------------------------------------------------------------------- /code/our_model_config.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class OurModel(object): 4 | """Abstracts a Tensorflow graph for use on final project. 5 | """ 6 | 7 | def add_placeholders(self): 8 | """Generates placeholder variables to represent the input tensors 9 | """ 10 | self.inputs_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.max_length), name="x") 11 | self.labels_placeholder = tf.placeholder(tf.int64, shape=(None), name="y") 12 | 13 | def create_feed_dict(self, inputs_batch, labels_batch=None): 14 | """Creates the feed_dict for the model. 15 | """ 16 | feed_dict = { 17 | self.inputs_placeholder: inputs_batch, 18 | } 19 | if labels_batch is not None: 20 | feed_dict[self.labels_placeholder] = labels_batch 21 | return feed_dict 22 | 23 | def add_embedding(self, option = 'Constant'): 24 | """Adds an embedding layer that maps from input tokens (integers) to vectors and then 25 | concatenates those vectors. 
26 | 27 | Returns: 28 | embeddings: tf.Tensor of shape (None, max_length, n_features*embed_size) 29 | """ 30 | if option == 'Variable': 31 | embeddings_temp = tf.nn.embedding_lookup(params = tf.Variable(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 32 | elif option == 'Constant': 33 | embeddings_temp = tf.nn.embedding_lookup(params = tf.constant(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 34 | embeddings = tf.reshape(embeddings_temp, shape = (-1, self.config.max_length, self.config.embed_size)) 35 | ### END YOUR CODE 36 | return embeddings 37 | 38 | def add_prediction_op(self): 39 | """Implements the core of the model that transforms a batch of input data into predictions. 40 | 41 | Returns: 42 | pred: A tensor of shape (batch_size, n_classes) 43 | """ 44 | raise NotImplementedError("Each Model must re-implement this method.") 45 | 46 | def add_loss_op(self, pred): 47 | """Adds ops to compute the loss function. 48 | 49 | Args: 50 | pred: A tensor of shape (batch_size, 1) containing the last 51 | state of the neural network. 52 | Returns: 53 | loss: A 0-d tensor (scalar) 54 | """ 55 | y = tf.reshape(self.labels_placeholder, (-1, )) # Check whether this is necessary 56 | loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = pred, labels = y)) 57 | return loss 58 | 59 | def add_training_op(self, loss): 60 | """Sets up the training Ops. 61 | 62 | Creates an optimizer and applies the gradients to all trainable variables. 63 | The Op returned by this function is what must be passed to the 64 | `sess.run()` call to cause the model to train. 65 | Args: 66 | loss: Loss tensor. 67 | Returns: 68 | train_op: The Op for training. 69 | """ 70 | # Check if Adam has adaptive learning rate 71 | train_op = tf.train.AdamOptimizer(self.config.lr).minimize(loss) 72 | return train_op 73 | 74 | def train_on_batch(self, sess, inputs_batch, labels_batch): 75 | """Perform one step of gradient descent on the provided batch of data. 
76 | 77 | Args: 78 | sess: tf.Session() 79 | input_batch: np.ndarray of shape (n_samples, n_features) 80 | labels_batch: np.ndarray of shape (n_samples, n_classes) 81 | Returns: 82 | loss: loss over the batch (a scalar) 83 | """ 84 | feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch) 85 | _, loss = sess.run([self.train_op, self.loss], feed_dict=feed) 86 | return loss 87 | 88 | def predict_on_batch(self, sess, inputs_batch): 89 | """Make predictions for the provided batch of data 90 | 91 | Args: 92 | sess: tf.Session() 93 | input_batch: np.ndarray of shape (n_samples, n_features) 94 | Returns: 95 | predictions: np.ndarray of shape (n_samples, n_classes) 96 | """ 97 | feed = self.create_feed_dict(inputs_batch) 98 | predictions = sess.run(self.pred, feed_dict=feed) 99 | return predictions 100 | 101 | def run_epoch(self, sess, train): 102 | prog = Progbar(target=1 + int(len(train) / self.config.batch_size)) 103 | losses = [] 104 | for i, batch in enumerate(minibatches(train, self.config.batch_size)): 105 | loss = self.train_on_batch(sess, *batch) 106 | losses.append(loss) 107 | prog.update(i + 1, [("train loss", loss)]) 108 | return losses 109 | 110 | def fit(self, sess, train): 111 | losses = [] 112 | for epoch in range(self.config.n_epochs): 113 | logger.info("Epoch %d out of %d", epoch + 1, self.config.n_epochs) 114 | loss = self.run_epoch(sess, train) 115 | losses.append(loss) 116 | return losses 117 | 118 | def build(self): 119 | self.add_placeholders() 120 | self.pred = self.add_prediction_op() 121 | self.loss = self.add_loss_op(self.pred) 122 | self.train_op = self.add_training_op(self.loss) -------------------------------------------------------------------------------- /code/our_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Utility functions 5 | """ 6 | 7 | from __future__ import division 8 | 9 | import sys 10 | import time 11 | import logging 12 | import StringIO 13 | import pandas as pd 14 | from collections import defaultdict, Counter, OrderedDict 15 | import numpy as np 16 | from numpy import array, zeros, allclose 17 | 18 | 19 | def split_data(data, prop_train = 0.6, prop_dev = 0.2, seed = None): 20 | ## Generate hold-out data 21 | np.random.seed(seed) 22 | # If data is a numpy object 23 | 24 | assert prop_train + prop_dev <= 1 25 | 26 | if (type(data).__module__ == np.__name__): 27 | 28 | num_samples = data.shape[0] 29 | num_train_samples = int(np.floor(num_samples * prop_train)) 30 | num_dev_samples = int(np.floor(num_samples * prop_dev)) 31 | 32 | indices = range(num_samples) 33 | np.random.shuffle(indices) 34 | 35 | train_indices = indices[0:num_train_samples] 36 | dev_indices = indices[num_train_samples:num_train_samples + num_dev_samples] 37 | test_indices = indices[num_train_samples+num_dev_samples:num_samples] 38 | 39 | train_data = data[indices[train_indices], :] 40 | dev_data = data[indices[dev_indices], :] 41 | test_data = data[indices[test_indices], :] 42 | 43 | elif isinstance(data, list): 44 | 45 | num_samples = len(data) 46 | num_train_samples = int(np.floor(num_samples * prop_train)) 47 | num_dev_samples = int(np.floor(num_samples * prop_dev)) 48 | 49 | indices = range(num_samples) 50 | np.random.shuffle(indices) 51 | 52 | # train_indices = indices[range(num_train_samples)] 53 | train_indices = indices[0:num_train_samples] 54 | dev_indices = indices[num_train_samples:num_train_samples + num_dev_samples] 55 | test_indices = 
indices[num_train_samples+num_dev_samples:num_samples] 56 | 57 | train_data = [data[i] for i in train_indices] 58 | dev_data = [data[i] for i in dev_indices] 59 | test_data = [data[i] for i in test_indices] 60 | 61 | return train_data, dev_data, test_data, train_indices, dev_indices, test_indices, 62 | 63 | def split_indices(num_samples, prop_train = 0.6, prop_dev = 0.2): 64 | num_train_samples = int(np.floor(num_samples * prop_train)) 65 | num_dev_samples = int(np.floor(num_samples * prop_dev)) 66 | indices = range(num_samples) 67 | np.random.shuffle(indices) 68 | train_indices = indices[0:num_train_samples] 69 | dev_indices = indices[num_train_samples:num_train_samples + num_dev_samples] 70 | test_indices = indices[num_train_samples + num_dev_samples:num_samples] 71 | return train_indices, dev_indices, test_indices 72 | 73 | def test_data_splitting(data): 74 | test_data, train_data = split_data(data) 75 | print 'Full data' + str(len(data)) 76 | print 'Test' + str(len(test_data)) 77 | print 'Train' + str(len(train_data)) 78 | 79 | # Returns a list of indices that should remain in the dataset 80 | def downsample_label(y, label_for_ds = 3, downsample_factor = 4): 81 | y = np.asarray(y) 82 | indices = np.asarray(range(len(y))) 83 | indices_to_sample = indices[y == label_for_ds] 84 | n_samples = int(np.floor(len(indices_to_sample)/downsample_factor)) 85 | sampled_indices = np.random.choice(indices_to_sample, size = n_samples, replace = False) 86 | output = np.append(indices[y != label_for_ds], sampled_indices) 87 | return(output) 88 | 89 | def pack_labels(data, labels, seqlen): # MODIF 90 | output = [] 91 | num_rows = data.shape[0] 92 | assert num_rows == len(labels) 93 | for i in range(data.shape[0]): 94 | the_row = data[i, :] 95 | output.append((the_row, labels[i], seqlen[i])) 96 | return output 97 | 98 | def softmax(x): 99 | """Compute the softmax function for each row of the input x. 
100 | """ 101 | orig_shape = x.shape 102 | 103 | if len(x.shape) > 1: 104 | # Matrix 105 | x = x - np.amax(x, axis = 1).reshape(x.shape[0], 1) 106 | rowSums = np.sum(np.exp(x), axis = 1).reshape(x.shape[0], 1) 107 | x = np.exp(x) / rowSums 108 | else: 109 | # Vector 110 | x = x - np.max(x) 111 | theSum = np.sum(np.exp(x)) 112 | x = np.exp(x) / theSum 113 | 114 | assert x.shape == orig_shape 115 | return x 116 | 117 | # Compute performance metrics 118 | def get_performance(predicted, truth, n_classes = None, outputStyle = 'dict'): 119 | # Predicted and observed are both integer vectors of class label 120 | 121 | # Cast both predicted and observed to numpy integer 122 | predicted = np.asarray(predicted, dtype = np.int64) 123 | truth = np.asarray(truth, dtype = np.int64) 124 | 125 | assert len(predicted) == len(truth) 126 | 127 | # Compute competition score: 128 | competition_score = scorer(predicted, truth) 129 | 130 | output = [] 131 | # If n_classes is unknown, infer from the labels 132 | if n_classes is None: 133 | n_classes = len(np.unique(predicted.extend(truth))) 134 | 135 | for i in range(n_classes): 136 | 137 | # Get 2-way table 138 | tp = sum((predicted == i) & (truth == i)) 139 | tn = sum((predicted != i) & (truth != i)) 140 | fp = sum((predicted == i) & (truth != i)) 141 | fn = sum((predicted != i) & (truth == i)) 142 | 143 | print 'tp ' + str(tp) 144 | print 'tn ' + str(tn) 145 | print 'fp ' + str(fp) 146 | print 'fn ' + str(fn) 147 | 148 | # Compute performance metrics 149 | recall = tp / (tp + fn) # aka sensitivity 150 | print 'recall ' + str(recall) 151 | precision = tp / (tp + fp) # aka ppv 152 | print 'precision ' + str(precision) 153 | specificity = tn / (tn + fp) 154 | print 'specificity ' + str(specificity) 155 | f1 = 2 * tp / (2 * tp + fp + fn) 156 | print 'f1 ' + str(f1) 157 | accuracy = (tp + tn)/len(truth) 158 | 159 | keys = ['tp', 'tn', 'fp', 'fn', 'recall', 'precision', 'specificity', 'f1', 'accuracy', 'competition'] 160 | values = [tp, tn , fp, fn, recall, precision, specificity, f1, accuracy, competition_score] 161 | output.append(dict(zip(keys, values))) 162 | 163 | return output 164 | 165 | # Computes competition score 166 | def scorer(pred, truth): 167 | # Maximum possible score 168 | max_score = 0.25 * sum(truth == 3) + 1 * sum(truth != 3) 169 | # Computing achieved sore 170 | # Score from unrelated correct 171 | unrelated_score = 0.25 * sum((truth == 3) & (pred == truth)) 172 | # Score from related correct, but specific class incorrect 173 | related_score1 = 0.25 * sum((truth != 3) & (pred != truth) & (pred != 3)) 174 | # Score from getting related correct, specific class correct 175 | related_score2 = 0.75 * sum((truth != 3) & (pred == truth)) 176 | 177 | final_score = (unrelated_score + related_score1 + related_score2) / max_score 178 | return final_score 179 | 180 | def convertOutputs(outputpath, config, losses_ep, dev_performances_ep): #MODIF 181 | 182 | ''' 183 | Inputs are lists of length n_epochs 184 | - losses_ep: list. 
losses_ep[i][j] --> loss after batch j 185 | - dev_performances_ep: dictionnary 186 | - dev_predicted_classes_ep: np.array 187 | - dev_predictions_ep: np.array 188 | ''' 189 | 190 | # Define parameter keys 191 | parameter_keys = dir(config) 192 | params_remove = ['__doc__', '__module__','pretrained_embeddings'] 193 | parameter_keys = [param for param in parameter_keys if param not in params_remove] 194 | print('parameter_keys', parameter_keys) 195 | 196 | n_epochs = getattr(config,'n_epochs') 197 | 198 | # Define column names 199 | common_keys = parameter_keys + ['epoch'] # Common keys to all csv files 200 | performance_keys = (dev_performances_ep[0][0]).keys() # [0] for epoch / [0] for 1st class 201 | # Keys specific to performance output 202 | 203 | # Initialization 204 | performances_pds = [] 205 | 206 | for i in range(n_epochs): 207 | # Performance csv 208 | performance_pd = pd.DataFrame(index = range(4), columns = common_keys + ['class'] + performance_keys) 209 | performance_pd['class'] = range(4) 210 | for j, outp in enumerate(dev_performances_ep[i]): 211 | for key in outp.keys(): 212 | performance_pd.loc[j, key] = outp[key] 213 | performance_pd['epoch'] = i 214 | performance_pd['train_loss'] = 1.0 * sum(losses_ep[i]) / len(losses_ep[i]) 215 | performances_pds.append(performance_pd) 216 | # Append all dataframes 217 | performance_pd_global = pd.concat(performances_pds, axis = 0) 218 | 219 | # Loss dataframe 220 | losses_pd_global = pd.DataFrame(columns = common_keys + ['loss']) 221 | losses_ep = np.array(losses_ep) 222 | losses_pd_global['epoch'] = range(1, n_epochs+1) 223 | losses_pd_global['loss'] = np.mean(losses_ep, axis = 1) 224 | 225 | # Adding parameter columns 226 | output_pds = [performance_pd_global, losses_pd_global] 227 | for par_name in parameter_keys: 228 | for output_pd in output_pds: 229 | output_pd[par_name] = getattr(config,par_name) 230 | 231 | # --- Writing to csv --- 232 | performance_pd_global.to_csv(outputpath+'/perf_'+ str(time.time()).replace('.','') + '.csv',index = False) 233 | losses_pd_global.to_csv(outputpath+'/losses_'+ str(time.time()).replace('.','') + '.csv', index = False) 234 | 235 | 236 | # BACK-UP FUNCTION 237 | def convertOutputs0(outputpath, config, losses_ep, dev_performances_ep): #MODIF 238 | 239 | ''' 240 | Inputs are lists of length n_epochs 241 | - losses_ep: list. 
losses_ep[i][j] --> loss after batch j 242 | - dev_performances_ep: dictionnary 243 | - dev_predicted_classes_ep: np.array 244 | - dev_predictions_ep: np.array 245 | ''' 246 | 247 | # Define parameter dict 248 | parameter_dict = config.__dict__ 249 | parameter_dict.pop('pretrained_embeddings', None) # Removing embedding matrix 250 | # Added line to handle list-valued parameter 251 | # if 'extra_hidden_size' in parameter_dict & parameter_dict['extra_hidden_size'] is not None: 252 | # parameter_dict['extra_hidden_size'] = str(parameter_dict['extra_hidden_size']) 253 | parameter_keys = parameter_dict.keys() 254 | print('parameter_keys', parameter_keys) 255 | n_epochs = parameter_dict['n_epochs'] 256 | 257 | # Define column names 258 | common_keys = parameter_keys + ['epoch'] # Common keys to all csv files 259 | performance_keys = (dev_performances_ep[0][0]).keys() # [0] for epoch / [0] for 1st class 260 | # Keys specific to performance output 261 | 262 | # Initialization 263 | performances_pds = [] 264 | 265 | for i in range(n_epochs): 266 | # Performance csv 267 | performance_pd = pd.DataFrame(index = range(4), columns = common_keys + ['class'] + performance_keys) 268 | performance_pd['class'] = range(4) 269 | for j, outp in enumerate(dev_performances_ep[i]): 270 | for key in outp.keys(): 271 | performance_pd.loc[j, key] = outp[key] 272 | performance_pd['epoch'] = i 273 | performances_pds.append(performance_pd) 274 | # Append all dataframes 275 | performance_pd_global = pd.concat(performances_pds, axis = 0) 276 | 277 | # Loss dataframe 278 | losses_pd_global = pd.DataFrame(columns = common_keys + ['loss']) 279 | losses_ep = np.array(losses_ep) 280 | losses_pd_global['epoch'] = range(1, n_epochs+1) 281 | losses_pd_global['loss'] = np.mean(losses_ep, axis = 1) 282 | 283 | # Adding parameter columns 284 | output_pds = [performance_pd_global, losses_pd_global] 285 | for par_name in parameter_keys: 286 | for output_pd in output_pds: 287 | output_pd[par_name] = parameter_dict[par_name] 288 | 289 | # --- Writing to csv --- 290 | performance_pd_global.to_csv(outputpath+'/perf_'+ str(time.time()).replace('.','') + '.csv',index = False) 291 | losses_pd_global.to_csv(outputpath+'/losses_'+ str(time.time()).replace('.','') + '.csv', index = False) 292 | 293 | # Ferdinand 294 | def get_minibatches(data, minibatch_size, shuffle=True): 295 | 296 | ''' 297 | MODIF 298 | Assuming we have a list [examples, labels, seqlen] of np.array 299 | ''' 300 | 301 | list_data = type(data) is list and (type(data[0]) is list or type(data[0]) is np.ndarray) 302 | data_size = len(data[0]) if list_data else len(data) 303 | indices = np.arange(data_size) 304 | if shuffle: 305 | np.random.shuffle(indices) 306 | for minibatch_start in np.arange(0, data_size, minibatch_size): 307 | minibatch_indices = indices[minibatch_start:minibatch_start + minibatch_size] 308 | 309 | if list_data: 310 | examples_minibatch = minibatch(data[0], minibatch_indices) # np.array of shape (batch_size, max_length_global) 311 | labels_minibatch = minibatch(data[1], minibatch_indices) 312 | seqlen_minibatch = minibatch(data[2], minibatch_indices) 313 | 314 | # Truncating sentences to the max_length of the minibatch --> NOT HERE, placeholders have fixed side 315 | #max_len_minibatch = max(seqlen_minibatch) 316 | #examples_minibatch = examples_minibatch[:,:max_len_minibatch] 317 | 318 | yield [examples_minibatch, labels_minibatch, seqlen_minibatch] 319 | 320 | else: # no truncating if data not in the 'packed' list format [examples, labels, seqlen] 321 | 
yield minibatch(data, minibatch_indices) 322 | 323 | 324 | ## Derived from Stanford CS 224n started code provided for assignment 3. 325 | def minibatch(data, minibatch_idx): 326 | return data[minibatch_idx] if type(data) is np.ndarray else [data[i] for i in minibatch_idx] 327 | 328 | def minibatches(data, batch_size, shuffle=True): 329 | batches = [np.array(col) for col in zip(*data)] 330 | return get_minibatches(batches, batch_size, shuffle) 331 | 332 | 333 | class Progbar(object): 334 | """ 335 | Progbar class copied from keras (https://github.com/fchollet/keras/) 336 | Displays a progress bar. 337 | # Arguments 338 | target: Total number of steps expected. 339 | interval: Minimum visual progress update interval (in seconds). 340 | """ 341 | 342 | def __init__(self, target, width=30, verbose=1): 343 | self.width = width 344 | self.target = target 345 | self.sum_values = {} 346 | self.unique_values = [] 347 | self.start = time.time() 348 | self.total_width = 0 349 | self.seen_so_far = 0 350 | self.verbose = verbose 351 | 352 | def update(self, current, values=None, exact=None): 353 | """ 354 | Updates the progress bar. 355 | # Arguments 356 | current: Index of current step. 357 | values: List of tuples (name, value_for_last_step). 358 | The progress bar will display averages for these values. 359 | exact: List of tuples (name, value_for_last_step). 360 | The progress bar will display these values directly. 361 | """ 362 | values = values or [] 363 | exact = exact or [] 364 | 365 | for k, v in values: 366 | if k not in self.sum_values: 367 | self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far] 368 | self.unique_values.append(k) 369 | else: 370 | self.sum_values[k][0] += v * (current - self.seen_so_far) 371 | self.sum_values[k][1] += (current - self.seen_so_far) 372 | for k, v in exact: 373 | if k not in self.sum_values: 374 | self.unique_values.append(k) 375 | self.sum_values[k] = [v, 1] 376 | self.seen_so_far = current 377 | 378 | now = time.time() 379 | if self.verbose == 1: 380 | prev_total_width = self.total_width 381 | sys.stdout.write("\b" * prev_total_width) 382 | sys.stdout.write("\r") 383 | 384 | numdigits = int(np.floor(np.log10(self.target))) + 1 385 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 386 | bar = barstr % (current, self.target) 387 | prog = float(current)/self.target 388 | prog_width = int(self.width*prog) 389 | if prog_width > 0: 390 | bar += ('='*(prog_width-1)) 391 | if current < self.target: 392 | bar += '>' 393 | else: 394 | bar += '=' 395 | bar += ('.'*(self.width-prog_width)) 396 | bar += ']' 397 | sys.stdout.write(bar) 398 | self.total_width = len(bar) 399 | 400 | if current: 401 | time_per_unit = (now - self.start) / current 402 | else: 403 | time_per_unit = 0 404 | eta = time_per_unit*(self.target - current) 405 | info = '' 406 | if current < self.target: 407 | info += ' - ETA: %ds' % eta 408 | else: 409 | info += ' - %ds' % (now - self.start) 410 | for k in self.unique_values: 411 | if isinstance(self.sum_values[k], list): 412 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 413 | else: 414 | info += ' - %s: %s' % (k, self.sum_values[k]) 415 | 416 | self.total_width += len(info) 417 | if prev_total_width > self.total_width: 418 | info += ((prev_total_width-self.total_width) * " ") 419 | 420 | sys.stdout.write(info) 421 | sys.stdout.flush() 422 | 423 | if current >= self.target: 424 | sys.stdout.write("\n") 425 | 426 | if self.verbose == 2: 427 | if current >= self.target: 428 | info = 
'%ds' % (now - self.start) 429 | for k in self.unique_values: 430 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 431 | sys.stdout.write(info + "\n") 432 | 433 | def add(self, n, values=None): 434 | self.update(self.seen_so_far+n, values) 435 | -------------------------------------------------------------------------------- /code/run_text_processing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Text processing of data 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import os 10 | 11 | import cPickle as pickle 12 | 13 | from fnc_baseline.utils.score import report_score, LABELS, score_submission 14 | from fnc_baseline.utils.dataset import DataSet 15 | 16 | import codecs 17 | import sys 18 | reload(sys) # for text processing 19 | sys.setdefaultencoding('utf8') # for text processing 20 | 21 | # ======== Load data ======= 22 | 23 | def read_data(base_path = '/Users/spfohl/Documents/CS_224n/project/altfactcheckers'): 24 | 25 | # Extracting data 26 | dataset = DataSet(path = base_path + '/data') 27 | stances = dataset.stances 28 | articles = dataset.articles 29 | 30 | # Data to lists 31 | h, b, y = [],[],[] 32 | for stance in stances: 33 | y.append(LABELS.index(stance['Stance'])) 34 | h.append(stance['Headline']) 35 | b.append(dataset.articles[stance['Body ID']]) 36 | y = np.asarray(y, dtype = np.int64) 37 | return h, b, y 38 | 39 | # ----- Loading Glove embeddings ---- 40 | def loadGloVe(filename): 41 | # Getting embedding dimension 42 | file0 = open(filename,'r') 43 | # file0 = codecs.open(filename, 'r', 'utf8', 'ignore') 44 | line = file0.readline() 45 | emb_dim = len(line.strip().split(' ')) - 1 46 | file0.close() 47 | 48 | # First row of embedding matrix is 0 for zero padding 49 | vocab = [''] 50 | embd = [[0.0] * emb_dim] 51 | 52 | # Reading embedding matrix 53 | file = open(filename,'r') 54 | # file = codecs.open(filename, 'r', 'utf8', 'ignore') 55 | for line in file.readlines(): 56 | row = line.strip().split(' ') 57 | vocab.append(row[0]) 58 | embd.append(map(float,row[1:])) 59 | print('Loaded GloVe!') 60 | file.close() 61 | return vocab,embd 62 | 63 | 64 | # ------ Clean quote signs --------- 65 | def clean_data(sentences): 66 | ''' 67 | Delete quote signs 68 | - Rational: quote signs mix with the parsing 69 | - Con: quote signs are meaningul --> distanciation from a statement 70 | ''' 71 | new_sentences = [] 72 | for sentence in sentences: 73 | new_sentences.append(sentence.replace("'","").replace('"','')) 74 | return new_sentences 75 | 76 | # ---- Build vocab dictionary from embedding matrix ----- 77 | def build_vocDict(vocab): 78 | voc_dict = {} 79 | for i in range(len(vocab)): 80 | voc_dict[vocab[i]] = i 81 | return voc_dict 82 | 83 | # -------- words to ids only ------- 84 | #==========ADDED BY OSKAR============# 85 | def words2ids(sentences, voc_dict, option = 'simple'): 86 | ''' 87 | Inputs: 88 | - sentences: list of sentences as string 89 | - embedding_vocab: list of vocab words in the order of the rows of embedding_matrix 90 | Ouptut: 91 | - new_sentences_ids: list of sentences as successive word indexes 92 | Processing: delete word which do no appear in vocabulary 93 | - Alternative: replace missing words by the mean 94 | ''' 95 | new_sentences_ids = [] 96 | j = 0 97 | for sentence in sentences: 98 | j+=1 99 | if j % 5000 == 0: 100 | print ('sentence',str(j)) 101 | sentence_ids = [] 102 | if option == 'nltk': 103 | 
sentence = sentence.decode('utf8', 'ignore') 104 | # print('sentence', sentence) 105 | word_list = tokenize(sentence) 106 | # print('word_list', word_list) 107 | elif option == 'simple': 108 | word_list = sentence.split(" ") 109 | 110 | for word in word_list: 111 | if word.lower() in voc_dict: # Only add word if in dictionary 112 | word_index = voc_dict[word.lower()] 113 | sentence_ids.append(word_index) 114 | 115 | new_sentences_ids.append(sentence_ids) 116 | #print ("added",j) 117 | return new_sentences_ids 118 | 119 | 120 | # -------- words to ids and vectors ------- 121 | def words2ids_vects(sentences, voc_dict, embedding_matrix, option = 'simple'): 122 | ''' 123 | Inputs: 124 | - sentences: list of sentences as string 125 | - embedding_vocab: list of vocab words in the order of the rows of embedding_matrix 126 | - embedding_matrix 127 | Ouptut: 128 | - new_sentences_ids: list of sentences as successive word indexes 129 | - new_sentences_vects: list of sentences as successive word vectors 130 | Processing: delete word which do no appear in vocabulary 131 | - Alternative: replace missing words by the mean 132 | ''' 133 | 134 | new_sentences_ids = [] 135 | new_sentences_vects = [] 136 | j = 0 137 | for sentence in sentences: 138 | j+=1 139 | if j % 5000 == 0: 140 | print ('sentence',str(j)) 141 | sentence_ids = [] 142 | sentence_vects = [] 143 | if option == 'nltk': 144 | sentence = sentence.decode('utf8', 'ignore') 145 | # print('sentence', sentence) 146 | word_list = tokenize(sentence) 147 | # print('word_list', word_list) 148 | elif option == 'simple': 149 | word_list = sentence.split(" ") 150 | 151 | for word in word_list: 152 | if word.lower() in voc_dict: # Only add word if in dictionary 153 | word_index = voc_dict[word.lower()] 154 | sentence_ids.append(word_index) 155 | sentence_vects.append(embedding_matrix[word_index]) 156 | 157 | new_sentences_ids.append(sentence_ids) 158 | #print ("added", j) 159 | new_sentences_vects.append(sentence_vects) 160 | return new_sentences_ids, new_sentences_vects 161 | 162 | def tokenize(sequence): 163 | tokens = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sequence)] 164 | # return tokens 165 | return map(lambda x:x.encode('utf8', errors = 'ignore'), tokens) 166 | 167 | # ---------- Averaging vectors for headline and truncated body --------- 168 | 169 | def avg_trunc(sentences_vects): 170 | s_vects_np = [] 171 | for sentence in sentences_vects: 172 | s_vects_np.append(np.array(sentence)) 173 | s_vects_avg = [] 174 | for sentence in s_vects_np: 175 | s_vects_avg.append(np.mean(sentence,axis=0)) 176 | return s_vects_avg 177 | 178 | def concatConvert_np(h_list, b_list): 179 | ''' 180 | 1. Concatenate headlines and bodies 181 | 2. Convert list data to numpy zero padded data 182 | 3. Also outputs sequences lengths as np vector 183 | ''' 184 | 185 | # Concatenate 186 | n_sentences = len(h_list) 187 | h_b_list = [] 188 | seqlen = [] 189 | for i in range(n_sentences): 190 | h_b_list.append(h_list[i] + b_list[i]) 191 | seqlen.append(len(h_b_list[i])) 192 | 193 | max_len = max(seqlen) 194 | 195 | # Convert to numpy with zero padding. No truncating 196 | h_b_np = np.zeros((n_sentences, max_len)) 197 | for i in range(n_sentences): 198 | h_b_np[i,:seqlen[i]] = h_b_list[i] 199 | 200 | return h_b_list, h_b_np, np.array(seqlen) 201 | 202 | def distinctConvert_np(h_list, b_list): 203 | ''' 204 | 1. Convert list data to numpy zero padded data, 2 distinct matrices for headlines and bodies 205 | 2. 
Also outputs sequences lengths as np vector 206 | ''' 207 | # Compute sequences lengths 208 | n_sentences = len(h_list) 209 | h_seqlen = [] 210 | b_seqlen = [] 211 | for i in range(n_sentences): 212 | h_seqlen.append(len(h_list[i])) 213 | b_seqlen.append(len(b_list[i])) 214 | 215 | h_max_len = max(h_seqlen) 216 | b_max_len = max(b_seqlen) 217 | 218 | # Convert to numpy 219 | h_np = np.zeros((n_sentences, h_max_len)) 220 | b_np = np.zeros((n_sentences, b_max_len)) 221 | for i in range(n_sentences): 222 | h_np[i,:h_seqlen[i]] = h_list[i] 223 | b_np[i,:b_seqlen[i]] = b_list[i] 224 | 225 | return h_np, np.array(h_seqlen), b_np, np.array(b_seqlen) 226 | 227 | #------for nn_test--------# 228 | #==========ADDED BY OSKAR============# 229 | def get_BOW_data(config, reload = None, save_data = None): 230 | ## Random seed 231 | np.random.seed(1) 232 | 233 | # Define path 234 | cwd = os.getcwd() 235 | filename_embeddings = cwd + '/../../glove/glove.6B.50d.txt' 236 | 237 | # GloVe embeddings 238 | vocab,embd = loadGloVe(filename_embeddings) 239 | vocab_size = len(vocab) 240 | embedding_dim = len(embd[0]) 241 | embedding = np.asarray(embd) 242 | 243 | if reload: 244 | # Get vocab dict 245 | voc_dict = build_vocDict(vocab) 246 | 247 | # Read and process data 248 | h, b, y = read_data(cwd + '/../../') # headline / bodies/ labels 249 | # h_ids, _ = words2ids_vects(h, voc_dict, embd) 250 | # b_ids, _ = words2ids_vects(b, voc_dict, embd) 251 | h_ids = words2ids(h, voc_dict) 252 | b_ids = words2ids(b, voc_dict) 253 | 254 | # zero padded np matrices for headlines and bodies; seq. lengths as np vector 255 | h, h_len, b, b_len = distinctConvert_np(h_ids, b_ids) 256 | 257 | # Find and delete empty headings/bodies 258 | ind_empty = [] 259 | for i in range(np.shape(h)[0]): 260 | if ((h_len[i] == 0) or (b_len[i] == 0)): 261 | ind_empty.append(i) 262 | # print(i) 263 | print('Empty sequences: ', ind_empty) 264 | if (len(ind_empty) > 0): 265 | y = np.delete(y, ind_empty) 266 | h = np.delete(h, ind_empty, 0) 267 | b = np.delete(b, ind_empty, 0) 268 | h_len = np.delete(h_len, ind_empty) 269 | b_len = np.delete(b_len, ind_empty) 270 | 271 | if save_data: 272 | # Attention: Bodies CSV is HUGE (800mb) 273 | assert(False) ## Do you REALLY want to do this? Consider saving it in a txt file instead. 
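Stepping back from the CSV-saving branch for a moment: the padding helpers above (`concatConvert_np` and `distinctConvert_np`) produce the two data layouts the execution scripts consume. A toy illustration of both, with assumed word ids:

```python
# Editor's toy illustration (assumed ids) of the two zero-padded layouts.
h_ids = [[1, 2], [3]]                  # headlines as lists of word ids
b_ids = [[4, 5, 6], [7]]               # bodies as lists of word ids

# concatConvert_np: headline and body joined into one padded matrix + one length vector
# (the layout used when get_data is called with concat = True)
h_b_np = [[1, 2, 4, 5, 6],
          [3, 7, 0, 0, 0]]
seqlen = [5, 2]

# distinctConvert_np: separate padded matrices and length vectors
# (the layout used by the bag-of-words and conditional-encoding models)
h_np = [[1, 2], [3, 0]];       h_seqlen = [2, 1]
b_np = [[4, 5, 6], [7, 0, 0]]; b_seqlen = [3, 1]
```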
274 | # Write 275 | y_pd = pd.DataFrame(y) 276 | h_pd = pd.DataFrame(h) 277 | b_pd = pd.DataFrame(b) 278 | h_len_pd = pd.DataFrame(h_len) 279 | b_len_pd = pd.DataFrame(b_len) 280 | y_pd.to_csv('saved_data/y_noempty.csv', index = False, header = False) 281 | h_pd.to_csv('saved_data/h_noempty.csv', index = False, header = False) 282 | b_pd.to_csv('saved_data/b_noempty.csv', index = False, header = False) 283 | h_len_pd.to_csv('saved_data/h_len_noempty.csv', index = False, header = False) 284 | b_len_pd.to_csv('saved_data/b_len_noempty.csv', index = False, header = False) 285 | # assert(False) 286 | 287 | if not reload: 288 | # Load 289 | # Attention: Bodies CSV is HUGE (800mb) 290 | print("Loading Data") 291 | y = np.asarray(pd.read_csv('saved_data/y_noempty.csv', header = None)) 292 | print("loaded labels") 293 | h = np.asarray(pd.read_csv('saved_data/h_noempty.csv', header = None)) 294 | print("loaded headings") 295 | b = np.asarray(pd.read_csv('saved_data/b_noempty.csv', header = None)) 296 | print("loaded headings") 297 | h_len = np.asarray(pd.read_csv('saved_data/h_len_noempty.csv', header = None)) 298 | b_len = np.asarray(pd.read_csv('saved_data/b_len_noempty.csv', header = None)) 299 | print("loaded lengths") 300 | # assert(False) 301 | 302 | # Modify the config 303 | config.embed_size = embedding_dim 304 | config.pretrained_embeddings = embedding 305 | config.vocab_size = vocab_size 306 | # finish 307 | return config, y, h, b, h_len, b_len 308 | 309 | ## Added by Stephen 310 | def save_data_pickle(outfilename, 311 | embedding_type = 'twitter.27B.50d', 312 | parserOption = 'nltk'): 313 | cwd = os.getcwd() 314 | if embedding_type == 'twitter.27B.50d': 315 | filename_embeddings = cwd + '/../../glove/glove.twitter.27B.50d.txt' 316 | else: 317 | filename_embeddings = cwd + '/../../glove/glove.6B.50d.txt' 318 | 319 | # filename_embeddings = cwd + filename_embeddings 320 | 321 | # GloVe embeddings 322 | vocab, embd = loadGloVe(filename_embeddings) 323 | vocab_size = len(vocab) 324 | embedding_dim = len(embd[0]) 325 | embedding = np.asarray(embd, dtype = np.float64) 326 | 327 | # Get vocab dict 328 | voc_dict = build_vocDict(vocab) 329 | 330 | # Read and process data 331 | h, b, y = read_data(cwd + '/../../') # headline / bodies/ labels 332 | h_ids, h_vects = words2ids_vects(h, voc_dict, embd, parserOption) 333 | b_ids, b_vects = words2ids_vects(b, voc_dict, embd, parserOption) 334 | 335 | # Concatenated headline_bodies zero padded np matrices; seq. 
lengths as np vector 336 | h_b_ids, h_b_np, seqlen = concatConvert_np(h_ids, b_ids) 337 | h_np, h_seqlen, b_np, b_seqlen = distinctConvert_np(h_ids, b_ids) 338 | 339 | data_dict = {'h_ids':h_ids, 'b_ids':b_ids, 'y':y} 340 | with open(cwd + outfilename, 'wb') as fp: 341 | pickle.dump(data_dict, fp) 342 | 343 | ## Added by Stephen 344 | def get_data(config, 345 | filename_embeddings = '/../../glove/glove.twitter.27B.50d.txt', 346 | pickle_path = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 347 | concat = True): 348 | # np.random.seed(41) 349 | 350 | # Base path 351 | cwd = os.getcwd() 352 | # filename_embeddings = cwd + '/../../glove/glove.6B.50d.txt' 353 | 354 | filename_embeddings = cwd + filename_embeddings 355 | 356 | # GloVe embeddings 357 | vocab, embd = loadGloVe(filename_embeddings) 358 | vocab_size = len(vocab) 359 | embedding_dim = len(embd[0]) 360 | embedding = np.asarray(embd, dtype = np.float64) 361 | 362 | # Get vocab dict 363 | voc_dict = build_vocDict(vocab) 364 | 365 | # Read and process data 366 | # h, b, y = read_data(cwd + '/../../') # headline / bodies/ labels 367 | 368 | print('Loading Pickle') 369 | load_path = cwd + pickle_path 370 | with open (load_path, 'rb') as fp: 371 | data_dict = pickle.load(fp) 372 | h_ids = data_dict['h_ids'] 373 | b_ids = data_dict['b_ids'] 374 | y = data_dict['y'] 375 | print('finished loading Pickle') 376 | 377 | # Concatenated headline_bodies zero padded np matrices; seq. lengths as np vector 378 | # h_b_ids, h_b_np, seqlen = concatConvert_np(h_ids, b_ids) 379 | # h_np, h_seqlen, b_np, b_seqlen = distinctConvert_np(h_ids, b_ids) 380 | 381 | if concat: 382 | h_b_ids, h_b_np, seqlen = concatConvert_np(h_ids, b_ids) 383 | output_dict = {'y':y, 384 | 'h_b_np':h_b_np, 385 | 'seqlen':seqlen} 386 | else: 387 | h_np, h_seqlen, b_np, b_seqlen = distinctConvert_np(h_ids, b_ids) 388 | # Find and delete empty 389 | ind_empty = [] 390 | for i in range(np.shape(h_np)[0]): 391 | if ((h_seqlen[i] == 0) or (b_seqlen[i] == 0)): 392 | ind_empty.append(i) 393 | print('Empty sequences: ', ind_empty) 394 | if (len(ind_empty) > 0): 395 | y = np.delete(y, ind_empty) 396 | h_np = np.delete(h_np, ind_empty, 0) 397 | b_np = np.delete(b_np, ind_empty, 0) 398 | h_seqlen = np.delete(h_seqlen, ind_empty) 399 | b_seqlen = np.delete(b_seqlen, ind_empty) 400 | output_dict = {'y':y, 401 | 'h_np':h_np, 402 | 'b_np':b_np, 403 | 'h_seqlen':h_seqlen, 404 | 'b_seqlen':b_seqlen} 405 | 406 | config.embed_size = embedding_dim 407 | config.pretrained_embeddings = embedding 408 | config.vocab_size = vocab_size 409 | return config, output_dict 410 | 411 | # if __name__ == '__main__': 412 | # # ========== YOUR OWN EMBEDDING MATRIX PATH HERE ========= 413 | # filename_embeddings = '/Users/spfohl/Documents/CS_224n/project/altfactcheckers/code/stephen_scratch/glove.6B/glove.6B.50d.txt' 414 | 415 | # # Glove 416 | # vocab,embd = loadGloVe(filename_embeddings) 417 | # vocab_size = len(vocab) 418 | # embedding_dim = len(embd[0]) 419 | # embedding = np.asarray(embd) 420 | 421 | # print(embedding[0:5, :]) 422 | # # Dictionary 423 | # voc_dict = build_vocDict(vocab) 424 | 425 | # # Read and process data 426 | # h, b, y = read_data() # headline / bodies/ labels 427 | # h_ids, h_vects = words2ids_vects(h, voc_dict, embd) 428 | # b_ids, b_vects = words2ids_vects(b, voc_dict, embd) 429 | 430 | # # Concatenated headline_bodies zero padded np matrices; seq. 
lengths as np vector 431 | # h_b_ids, h_b_np, seqlen = concatConvert_np(h_ids, b_ids) 432 | 433 | # # Distinct headline / bodies zero padded np matrices; seq lengths as np vectors 434 | # h_np, h_seqlen, b_np, b_seqlen = distinctConvert_np(h_ids, b_ids) -------------------------------------------------------------------------------- /code/test_script6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | ###### 5 | # Call all models with different hyperparameters 6 | ###### 7 | 8 | # standard libs 9 | import numpy as np 10 | 11 | # our code imports 12 | from execute_bow_config import run_bow 13 | from execute_lstm_config import run_lstm 14 | from execute_lstm_attention import run_lstm_attention 15 | from execute_lstm_conditional import run_lstm_conditional 16 | 17 | ### Parameter Overview: 18 | class Config: 19 | """Holds model hyperparams and data information. 20 | The config class is used to store various hyperparameters and dataset 21 | information parameters. Model objects are passed a Config() object at 22 | instantiation. Use self.config.? instead of Config.? 23 | """ 24 | ### Parameter Overview: 25 | ## For all models: 26 | # main params, 27 | n_epochs = 40 28 | lr = 0.001 29 | batch_size = 128 30 | n_classes = 4 31 | hidden_size = 100 32 | n_layers = 0 33 | xp = None 34 | model = None 35 | 36 | ## Determined at data loading: 37 | embed_size = None # not passed to config - assigned in get_data 38 | vocab_size = None # not passed to config - assigned in get_data 39 | pretrained_embeddings = [] # not passed to config - assigned in get_data 40 | num_samples = None # only indirectly passed to comfig, If defined, shortens the dataset, Otherwise determined at data loading, 41 | downsample = False 42 | 43 | ## LSTM specific: 44 | # main params 45 | dropout = 0.8 ## Attention: this is the keep_prob! 
# not assigned to BOW 46 | # extra_hidden_size = None 47 | trainable_embeddings = 'Variable' 48 | max_length = None # indirectly passed to config in LSTM, If defined, truncates sequences, Otherwise determined at data loading 49 | attention_length = 15 50 | 51 | ## BOW specific: 52 | # main params 53 | hidden_next = 0.6 # defines the number of hidden units in next layer 54 | # Determined at data loading: 55 | h_max_len = None # not passed to config 56 | b_max_len = None # not passed to config 57 | 58 | 59 | def run_bow_with_parameters(args): 60 | 61 | # Final test st 62 | np.random.seed(1) 63 | config = Config() 64 | config.n_layers = 1 65 | config.xp = 'final_test' 66 | config.model = 'bow' 67 | config.lr = 0.005 68 | config.trainable_embeddings = 'Variable' 69 | config.b_max_len = 600 70 | config.n_epochs = 40 71 | result = run_bow(config, final = True) 72 | 73 | ## Experiment 74 | # np.random.seed(1) 75 | # config = Config() 76 | # config.n_layers = 1 77 | # config.xp = 'layers' 78 | # config.model = 'bow' 79 | # config.lr = 0.005 80 | # config.trainable_embeddings = 'Variable' 81 | # config.b_max_len = 75 82 | # result = run_bow(config) 83 | 84 | # ## Experiment 85 | # np.random.seed(1) 86 | # config = Config() 87 | # config.n_layers = 3 88 | # config.xp = 'layers' 89 | # config.model = 'bow' 90 | # config.lr = 0.005 91 | # config.trainable_embeddings = 'Constant' 92 | # config.b_max_len = 75 93 | # result = run_bow(config) 94 | 95 | # ## Experiment 96 | # np.random.seed(1) 97 | # config = Config() 98 | # config.n_layers = 0 99 | # config.xp = 'layers' 100 | # config.model = 'bow' 101 | # config.lr = 0.005 102 | # config.trainable_embeddings = 'Variable' 103 | # config.b_max_len = 150 104 | # result = run_bow(config) 105 | 106 | # ## Experiment 107 | # np.random.seed(1) 108 | # config = Config() 109 | # config.n_layers = 1 110 | # config.xp = 'layers' 111 | # config.model = 'bow' 112 | # config.lr = 0.005 113 | # config.trainable_embeddings = 'Variable' 114 | # config.b_max_len = 150 115 | # result = run_bow(config) 116 | 117 | # ## Experiment 118 | # np.random.seed(1) 119 | # config = Config() 120 | # config.n_layers = 3 121 | # config.xp = 'layers' 122 | # config.model = 'bow' 123 | # config.lr = 0.005 124 | # config.trainable_embeddings = 'Variable' 125 | # config.b_max_len = 150 126 | # result = run_bow(config) 127 | 128 | # np.random.seed(1) 129 | # config = Config() 130 | # config.n_layers = 0 131 | # config.xp = 'layers' 132 | # config.model = 'bow' 133 | # config.lr = 0.005 134 | # config.trainable_embeddings = 'Variable' 135 | # config.b_max_len = 300 136 | # result = run_bow(config) 137 | 138 | # ## Experiment 139 | # np.random.seed(1) 140 | # config = Config() 141 | # config.n_layers = 1 142 | # config.xp = 'layers' 143 | # config.model = 'bow' 144 | # config.lr = 0.005 145 | # config.trainable_embeddings = 'Variable' 146 | # config.b_max_len = 300 147 | # result = run_bow(config) 148 | 149 | # ## Experiment 150 | # np.random.seed(1) 151 | # config = Config() 152 | # config.n_layers = 3 153 | # config.xp = 'layers' 154 | # config.model = 'bow' 155 | # config.lr = 0.005 156 | # config.trainable_embeddings = 'Constant' 157 | # config.b_max_len = 300 158 | # result = run_bow(config) 159 | 160 | # np.random.seed(1) 161 | # config = Config() 162 | # config.n_layers = 0 163 | # config.xp = 'layers' 164 | # config.model = 'bow' 165 | # config.lr = 0.005 166 | # config.trainable_embeddings = 'Variable' 167 | # config.b_max_len = 600 168 | # result = run_bow(config) 169 | 170 | # ## 
Experiment 171 | # np.random.seed(1) 172 | # config = Config() 173 | # config.n_layers = 1 174 | # config.xp = 'layers' 175 | # config.model = 'bow' 176 | # config.lr = 0.005 177 | # config.trainable_embeddings = 'Variable' 178 | # config.b_max_len = 600 179 | # result = run_bow(config) 180 | 181 | # ## Experiment 182 | # np.random.seed(1) 183 | # config = Config() 184 | # config.n_layers = 3 185 | # config.xp = 'layers' 186 | # config.model = 'bow' 187 | # config.lr = 0.005 188 | # config.trainable_embeddings = 'Constant' 189 | # config.b_max_len = 600 190 | # result = run_bow(config) 191 | 192 | ## Experiment 193 | # np.random.seed(1) 194 | # config = Config() 195 | # config.n_layers = 3 196 | # config.xp = 'layers' 197 | # config.model = 'bow' 198 | # config.lr = 0.005 199 | # config.trainable_embeddings = 'Constant' 200 | # config.b_max_len = 150 201 | # result = run_bow(config) 202 | 203 | 204 | 205 | 206 | def run_lstm_with_parameters(args): 207 | # Final test 208 | np.random.seed(1) 209 | config0 = Config() 210 | config0.max_length = 75 211 | config0.trainable_embeddings = 'Variable' 212 | config0.hidden_size = 100 213 | config0.n_epochs = 40 214 | config0.n_layers = 2 215 | config0.batch_size = 128 216 | config0.dropout = 0.8 217 | config0.lr = 0.001 218 | # config0.num_samples = 100 219 | config0.xp = 'final_test' 220 | config0.model = 'lstm_basic' 221 | result = run_lstm(config0, final = True) 222 | 223 | 224 | #### Testing Downsampling 225 | 226 | # # Experiment 1 227 | # # 2 layer, max_length = 75 228 | # np.random.seed(1) 229 | # config0 = Config() 230 | # # print('Running run_lstm_with_parameters') 231 | # # config0.n_layers = 0 232 | # config0.max_length = 75 233 | # config0.trainable_embeddings = 'Variable' 234 | # config0.hidden_size = 100 235 | # config0.n_epochs = 40 236 | # config0.n_layers = 1 237 | # config0.batch_size = 128 238 | # config0.dropout = 0.8 239 | # config0.n_layers = 1 240 | # # config0.downsample = True 241 | # config0.lr = 0.001 242 | # config0.attention_length = 15 243 | # result = run_lstm(config0) 244 | 245 | # # # # Experiment 2 246 | # # # # 2 layer, max_length = 150 247 | # np.random.seed(1) 248 | # config1 = Config() 249 | # config1.max_length = 150 250 | # config1.trainable_embeddings = 'Variable' 251 | # config1.hidden_size = 100 252 | # config1.n_epochs = 40 253 | # config1.batch_size = 128 254 | # config1.dropout = 0.8 255 | # config1.n_layers = 1 256 | # # config1.downsample = True 257 | # config1.lr = 0.001 258 | # config1.attention_length = 15 259 | # result = run_lstm(config1) 260 | 261 | # # ## Experiment 3 262 | # # # 2 layer, max_length = 300 263 | # np.random.seed(1) 264 | # config2 = Config() 265 | # config2.max_length = 250 266 | # config2.trainable_embeddings = 'Variable' 267 | # config2.hidden_size = 100 268 | # config2.n_epochs = 40 269 | # config2.batch_size = 128 270 | # config2.dropout = 0.8 271 | # config2.n_layers = 1 272 | # # config2.downsample = True 273 | # config2.lr = 0.001 274 | # config2.attention_length = 15 275 | # result = run_lstm(config2) 276 | 277 | # ## Experiment 4 278 | # # max_length = 150, n_layers = 1 279 | # np.random.seed(1) 280 | # config3 = Config() 281 | # config3.max_length = 150 282 | # config3.trainable_embeddings = 'Variable' 283 | # config3.hidden_size = 100 284 | # config3.n_epochs = 40 285 | # config3.batch_size = 128 286 | # config3.dropout = 0.8 287 | # config3.n_layers = 1 288 | # config3.downsample = True 289 | # # config3.extra_hidden_size = None 290 | # result = run_lstm(config3) 291 | 
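Several of the experiments in this block toggle `config.downsample`; under the hood this calls `our_util.downsample_label`, which keeps every example whose label is not 3 ('unrelated') and a random `1/downsample_factor` fraction of the label-3 ones. A worked example with assumed labels:

```python
import numpy as np
np.random.seed(0)

# Editor's sketch (toy labels) of the downsampling used when config.downsample = True.
y       = np.array([3, 3, 3, 3, 3, 3, 3, 3, 0, 1, 2, 0])    # 8 unrelated, 4 related
indices = np.arange(len(y))
n_keep  = int(np.floor(np.sum(y == 3) / 4))                  # downsample_factor = 4 -> keep 2
sampled = np.random.choice(indices[y == 3], size = n_keep, replace = False)
kept    = np.append(indices[y != 3], sampled)                # 4 related + 2 unrelated = 6 rows
assert len(kept) == 6
```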
223 |
224 |     #### Testing Downsampling
225 |
226 |     # # Experiment 1
227 |     # # 1 layer, max_length = 75
228 |     # np.random.seed(1)
229 |     # config0 = Config()
230 |     # # print('Running run_lstm_with_parameters')
231 |     # # config0.n_layers = 0
232 |     # config0.max_length = 75
233 |     # config0.trainable_embeddings = 'Variable'
234 |     # config0.hidden_size = 100
235 |     # config0.n_epochs = 40
236 |     # config0.n_layers = 1
237 |     # config0.batch_size = 128
238 |     # config0.dropout = 0.8
239 |     # config0.n_layers = 1
240 |     # # config0.downsample = True
241 |     # config0.lr = 0.001
242 |     # config0.attention_length = 15
243 |     # result = run_lstm(config0)
244 |
245 |     # # # # Experiment 2
246 |     # # # # 1 layer, max_length = 150
247 |     # np.random.seed(1)
248 |     # config1 = Config()
249 |     # config1.max_length = 150
250 |     # config1.trainable_embeddings = 'Variable'
251 |     # config1.hidden_size = 100
252 |     # config1.n_epochs = 40
253 |     # config1.batch_size = 128
254 |     # config1.dropout = 0.8
255 |     # config1.n_layers = 1
256 |     # # config1.downsample = True
257 |     # config1.lr = 0.001
258 |     # config1.attention_length = 15
259 |     # result = run_lstm(config1)
260 |
261 |     # # ## Experiment 3
262 |     # # # 1 layer, max_length = 250
263 |     # np.random.seed(1)
264 |     # config2 = Config()
265 |     # config2.max_length = 250
266 |     # config2.trainable_embeddings = 'Variable'
267 |     # config2.hidden_size = 100
268 |     # config2.n_epochs = 40
269 |     # config2.batch_size = 128
270 |     # config2.dropout = 0.8
271 |     # config2.n_layers = 1
272 |     # # config2.downsample = True
273 |     # config2.lr = 0.001
274 |     # config2.attention_length = 15
275 |     # result = run_lstm(config2)
276 |
277 |     # ## Experiment 4
278 |     # # max_length = 150, n_layers = 1
279 |     # np.random.seed(1)
280 |     # config3 = Config()
281 |     # config3.max_length = 150
282 |     # config3.trainable_embeddings = 'Variable'
283 |     # config3.hidden_size = 100
284 |     # config3.n_epochs = 40
285 |     # config3.batch_size = 128
286 |     # config3.dropout = 0.8
287 |     # config3.n_layers = 1
288 |     # config3.downsample = True
289 |     # # config3.extra_hidden_size = None
290 |     # result = run_lstm(config3)
291 |
292 |
293 |     # ## Experiment 5
294 |     # # max_length = 150, n_layers = 2
295 |     # np.random.seed(1)
296 |     # config4 = Config()
297 |     # config4.max_length = 150
298 |     # config4.trainable_embeddings = 'Variable'
299 |     # config4.hidden_size = 100
300 |     # config4.n_epochs = 40
301 |     # config4.batch_size = 128
302 |     # config4.dropout = 0.8
303 |     # config4.n_layers = 2
304 |     # config4.downsample = True
305 |     # result = run_lstm(config4)
306 |
307 |     # ## Experiment 6
308 |     # # max_length = 150, n_layers = 4
309 |     # np.random.seed(1)
310 |     # config5 = Config()
311 |     # config5.max_length = 150
312 |     # config5.trainable_embeddings = 'Variable'
313 |     # config5.hidden_size = 100
314 |     # config5.n_epochs = 40
315 |     # config5.batch_size = 128
316 |     # config5.dropout = 0.8
317 |     # config5.n_layers = 4
318 |     # config5.downsample = True
319 |     # result = run_lstm(config5)
320 |
321 |
322 |     # #### Testing Dropout
323 |
324 |     # # ## Experiment 1
325 |     # # # max_length = 75, n_layers = 2, dropout = 0.9
326 |     # np.random.seed(1)
327 |     # config = Config()
328 |     # config.max_length = 75
329 |     # config.trainable_embeddings = 'Variable'
330 |     # config.hidden_size = 100
331 |     # config.n_epochs = 40
332 |     # config.batch_size = 128
333 |     # config.dropout = 0.9
334 |     # config.n_layers = 2
335 |     # config.downsample = False
336 |     # config.lr = 0.005
337 |     # result = run_lstm(config)
338 |
339 |     # # ## Experiment 2
340 |     # # # max_length = 75, n_layers = 2, dropout = 0.65
341 |     # np.random.seed(1)
342 |     # config = Config()
343 |     # config.max_length = 75
344 |     # config.trainable_embeddings = 'Variable'
345 |     # config.hidden_size = 100
346 |     # config.n_epochs = 40
347 |     # config.batch_size = 128
348 |     # config.dropout = 0.65
349 |     # config.n_layers = 2
350 |     # config.downsample = False
351 |     # config.lr = 0.005
352 |     # result = run_lstm(config)
353 |
354 |     # # ## Experiment 3
355 |     # # # max_length = 75, n_layers = 2, dropout = 0.5
356 |     # np.random.seed(1)
357 |     # config = Config()
358 |     # config.max_length = 75
359 |     # config.trainable_embeddings = 'Variable'
360 |     # config.hidden_size = 100
361 |     # config.n_epochs = 40
362 |     # config.batch_size = 128
363 |     # config.dropout = 0.5
364 |     # config.n_layers = 2
365 |     # config.downsample = False
366 |     # config.lr = 0.005
367 |     # result = run_lstm(config)
368 |
369 |
370 |     # #### Testing max_length
371 |
372 |     # # ## Experiment 1
373 |     # # # max_length = 50, n_layers = 2
374 |     # np.random.seed(1)
375 |     # config = Config()
376 |     # config.max_length = 50
377 |     # config.trainable_embeddings = 'Variable'
378 |     # config.hidden_size = 100
379 |     # config.n_epochs = 40
380 |     # config.batch_size = 128
381 |     # config.dropout = 0.8
382 |     # config.n_layers = 2
383 |     # config.downsample = False
384 |     # config.lr = 0.005
385 |     # result = run_lstm(config)
386 |
387 |     # # ## Experiment 2
388 |     # # # max_length = 30, n_layers = 2
389 |     # np.random.seed(1)
390 |     # config = Config()
391 |     # config.max_length = 30
392 |     # config.trainable_embeddings = 'Variable'
393 |     # config.hidden_size = 100
394 |     # config.n_epochs = 40
395 |     # config.batch_size = 128
396 |     # config.dropout = 0.8
397 |     # config.n_layers = 2
398 |     # config.downsample = False
399 |     # config.lr = 0.005
400 |     # result = run_lstm(config)
401 |
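    # (Added summary, not part of the original script:) only the 'final_test'
    # configuration at the top of this function is active. The commented-out blocks
    # above sweep three axes for the basic LSTM: the downsampling block varies
    # max_length (75 / 150 / 250) and, in Experiments 4-6, enables downsampling with
    # n_layers in {1, 2, 4}; the dropout block tries 0.9 / 0.65 / 0.5; the max_length
    # block tries 50 and 30.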
402 | def run_lstm_attention_with_parameters(args):
403 |     # Final test
404 |     # 2 layers, max_length = 75, attention_length = 15
405 |     np.random.seed(1)
406 |     config0 = Config()
407 |     # print('Running run_lstm_attention_with_parameters')
408 |     config0.max_length = 75
409 |     config0.trainable_embeddings = 'Variable'
410 |     config0.hidden_size = 100
411 |     config0.n_epochs = 40
412 |     config0.batch_size = 128
413 |     config0.dropout = 0.8
414 |     config0.n_layers = 2
415 |     config0.lr = 0.001
416 |     config0.xp = 'final_test'
417 |     config0.model = 'lstm_attention'
418 |     # config0.num_samples = 100
419 |     config0.attention_length = 15
420 |     result = run_lstm_attention(config0, final = True)
421 |
422 |     # np.random.seed(1)
423 |     # config0 = Config()
424 |     # # print('Running run_lstm_with_parameters')
425 |     # config0.max_length = 150
426 |     # config0.trainable_embeddings = 'Variable'
427 |     # config0.hidden_size = 100
428 |     # config0.n_epochs = 40
429 |     # config0.n_layers = 2
430 |     # config0.batch_size = 128
431 |     # config0.dropout = 0.8
432 |     # config0.n_layers = 4
433 |     # # config0.downsample = False
434 |     # config0.lr = 0.001
435 |     # # config0.num_samples =
436 |     # config0.attention_length = 15
437 |     # result = run_lstm_attention(config0)
438 |
439 |
440 |
441 |     #### Testing attention_length # Experiment 1
442 |     ## 1 layer, max_length = 150, attention_length = 10
443 |     # np.random.seed(1)
444 |     # config0 = Config()
445 |     # # print('Running run_lstm_with_parameters')
446 |     # config0.max_length = 150
447 |     # config0.trainable_embeddings = 'Variable'
448 |     # config0.hidden_size = 100
449 |     # config0.n_epochs = 40
450 |     # config0.n_layers = 1
451 |     # config0.batch_size = 128
452 |     # config0.dropout = 0.8
453 |     # config0.n_layers = 1
454 |     # # config0.downsample = False
455 |     # config0.lr = 0.001
456 |     # config0.attention_length = 10
457 |     # result = run_lstm_attention(config0)
458 |
459 |     # #### Testing attention_length # Experiment 2
460 |     # ## 1 layer, max_length = 150, attention_length = 20
461 |     # np.random.seed(1)
462 |     # config0 = Config()
463 |     # # print('Running run_lstm_with_parameters')
464 |     # config0.max_length = 150
465 |     # config0.trainable_embeddings = 'Variable'
466 |     # config0.hidden_size = 100
467 |     # config0.n_epochs = 40
468 |     # config0.n_layers = 1
469 |     # config0.batch_size = 128
470 |     # config0.dropout = 0.8
471 |     # config0.n_layers = 1
472 |     # # config0.downsample = False
473 |     # config0.lr = 0.001
474 |     # config0.attention_length = 20
475 |     # result = run_lstm_attention(config0)
476 |
477 |
478 |     # #### Testing learning rate # Experiment 1
479 |     # ## 1 layer, max_length = 150, lr = 0.0005
480 |     # np.random.seed(1)
481 |     # config0 = Config()
482 |     # # print('Running run_lstm_with_parameters')
483 |     # config0.max_length = 150
484 |     # config0.trainable_embeddings = 'Variable'
485 |     # config0.hidden_size = 100
486 |     # config0.n_epochs = 40
487 |     # config0.n_layers = 1
488 |     # config0.batch_size = 128
489 |     # config0.dropout = 0.8
490 |     # config0.n_layers = 1
491 |     # # config0.downsample = False
492 |     # config0.lr = 0.0005
493 |     # config0.attention_length = 15
494 |     # result = run_lstm_attention(config0)
495 |
496 |     # #### Testing learning rate # Experiment 2
497 |     # ## 1 layer, max_length = 150, lr = 0.0002
498 |     # np.random.seed(1)
499 |     # config0 = Config()
500 |     # # print('Running run_lstm_with_parameters')
501 |     # config0.max_length = 150
502 |     # config0.trainable_embeddings = 'Variable'
503 |     # config0.hidden_size = 100
504 |     # config0.n_epochs = 40
505 |     # config0.n_layers = 1
506 |     # config0.batch_size = 128
507 |     # config0.dropout = 0.8
508 |     # config0.n_layers = 1
509 |     # # config0.downsample = False
510 |     # config0.lr = 0.0002
511 |     # config0.attention_length = 15
512 |     # result = run_lstm_attention(config0)
513 |
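    # (Added sketch, not an original experiment:) the two learning-rate experiments
    # above differ only in config0.lr; written as a loop over the same two values,
    # the sweep would look like this:
    # for lr in [0.0005, 0.0002]:
    #     np.random.seed(1)
    #     config = Config()
    #     config.max_length = 150
    #     config.trainable_embeddings = 'Variable'
    #     config.hidden_size = 100
    #     config.n_epochs = 40
    #     config.n_layers = 1
    #     config.batch_size = 128
    #     config.dropout = 0.8
    #     config.lr = lr
    #     config.attention_length = 15
    #     result = run_lstm_attention(config)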
514 | def run_lstm_conditional_with_parameters(args):
515 |     # Final test - to be refined; parameter saving not ready yet
516 |     np.random.seed(1)
517 |     config0 = Config()
518 |     # print('Running run_lstm_conditional_with_parameters')
519 |     config0.trainable_embeddings = 'Variable'
520 |     config0.hidden_size = 100
521 |     config0.n_epochs = 40
522 |     # config0.n_layers = 1
523 |     config0.batch_size = 128
524 |     config0.dropout = 0.8
525 |     config0.n_layers = 2
526 |     config0.lr = 0.001
527 |     # config0.num_samples = 100
528 |     config0.b_max_len = 75
529 |     config0.attention_length = 15
530 |     config0.xp = 'final_test'
531 |     config0.model = 'conditional_lstm'
532 |     # print 'config0' + str(config0.__dict__)
533 |     result0 = run_lstm_conditional(config0, final = True)
534 |
535 |
536 |     # np.random.seed(1)
537 |     # config0 = Config()
538 |     # # print('Running run_lstm_with_parameters')
539 |     # # config0.n_layers = 0
540 |     # # config0.max_length = 75
541 |     # config0.trainable_embeddings = 'Variable'
542 |     # config0.hidden_size = 100
543 |     # config0.n_epochs = 40
544 |     # config0.n_layers = 1
545 |     # config0.batch_size = 128
546 |     # config0.dropout = 0.8
547 |     # config0.n_layers = 4
548 |     # config0.lr = 0.001
549 |     # # config0.num_samples = 100
550 |     # config0.b_max_len = 150
551 |     # # config0.downsample = True
552 |     # config0.attention_length = 15
553 |     # config0.xp = 'layers'
554 |     # config0.model = 'conditional_lstm'
555 |     # # print 'config0' + str(config0.__dict__)
556 |     # result0 = run_lstm_conditional(config0)
557 |
558 |     # np.random.seed(1)
559 |     # config1 = Config()
560 |     # # print('Running run_lstm_with_parameters')
561 |     # # config0.n_layers = 0
562 |     # # config0.max_length = 75
563 |     # config1.trainable_embeddings = 'Variable'
564 |     # config1.hidden_size = 100
565 |     # config1.n_epochs = 40
566 |     # config1.n_layers = 1
567 |     # config1.batch_size = 128
568 |     # config1.dropout = 0.8
569 |     # config1.n_layers = 1
570 |     # config1.lr = 0.001
571 |     # # config0.num_samples = 1000
572 |     # config1.b_max_len = 150
573 |     # # config0.downsample = True
574 |     # config1.attention_length = 15
575 |     # config1.xp = 'body_length'
576 |     # config1.model = 'conditional_lstm'
577 |     # # print 'config0' + str(config0.__dict__)
578 |     # result1 = run_lstm_conditional(config1)
579 |
580 |     # np.random.seed(1)
581 |     # config2 = Config()
582 |     # # print('Running run_lstm_with_parameters')
583 |     # # config0.n_layers = 0
584 |     # # config0.max_length = 75
585 |     # config2.trainable_embeddings = 'Variable'
586 |     # config2.hidden_size = 100
587 |     # config2.n_epochs = 40
588 |     # config2.n_layers = 1
589 |     # config2.batch_size = 128
590 |     # config2.dropout = 0.8
591 |     # config2.n_layers = 1
592 |     # config2.lr = 0.001
593 |     # # config0.num_samples = 1000
594 |     # config2.b_max_len = 300
595 |     # # config0.downsample = True
596 |     # config2.attention_length = 15
597 |     # config2.xp = 'body_length'
598 |     # config2.model = 'conditional_lstm'
599 |     # # print 'config0' + str(config0.__dict__)
600 |     # result2 = run_lstm_conditional(config2)
601 |
602 | if __name__ == "__main__":
603 |     print("-- Running Test Script --")
604 |     print("-- Start BOW Experiments --")
605 |     run_bow_with_parameters('')
606 |     print("-- Start LSTM Basic Experiments --")
607 |     run_lstm_with_parameters('')
608 |     print("-- Start LSTM Attention Experiments --")
609 |     run_lstm_attention_with_parameters('')
610 |     print("-- Start LSTM Conditional Experiments --")
611 |     run_lstm_conditional_with_parameters('')
612 |     print("-- Finished Test Script --")
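# (Added sketch, not part of the original script:) the __main__ block above always
# runs all four experiment groups in sequence. A command-line selector over the same
# runner functions could look like the following; the '--experiments' flag and the
# group names are illustrative choices, not existing options of this script.
# if __name__ == "__main__":
#     import argparse
#     parser = argparse.ArgumentParser(description = 'Run stance detection experiments')
#     parser.add_argument('--experiments', nargs = '+',
#                         default = ['bow', 'lstm_basic', 'lstm_attention', 'lstm_conditional'],
#                         choices = ['bow', 'lstm_basic', 'lstm_attention', 'lstm_conditional'])
#     runners = {'bow': run_bow_with_parameters,
#                'lstm_basic': run_lstm_with_parameters,
#                'lstm_attention': run_lstm_attention_with_parameters,
#                'lstm_conditional': run_lstm_conditional_with_parameters}
#     for name in parser.parse_args().experiments:
#         print("-- Start %s Experiments --" % name)
#         runners[name]('')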
--------------------------------------------------------------------------------
/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ourownstory/stance_detection/df9f34883005d203cb6d8ee7d56d9bc82ef39b62/paper.pdf
--------------------------------------------------------------------------------
/poster.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ourownstory/stance_detection/df9f34883005d203cb6d8ee7d56d9bc82ef39b62/poster.pdf
--------------------------------------------------------------------------------