├── README.md
├── code
│   ├── .DS_Store
│   ├── LSTM_attention.py
│   ├── LSTM_conditional.py
│   ├── README.md
│   ├── __init__.py
│   ├── basicLSTM_model_config.py
│   ├── bow_model_config.py
│   ├── data_analysis_plotting
│   │   ├── Results_loading_1.R
│   │   ├── data_analysis.Rmd
│   │   ├── results_analysis.Rmd
│   │   └── results_analysis_2.Rmd
│   ├── execute_bow_config.py
│   ├── execute_lstm_attention.py
│   ├── execute_lstm_conditional.py
│   ├── execute_lstm_config.py
│   ├── our_model_config.py
│   ├── our_util.py
│   ├── run_text_processing.py
│   └── test_script6.py
├── paper.pdf
└── poster.pdf

/README.md:
--------------------------------------------------------------------------------
# Stance Detection for the Fake News Challenge with Conditional Encoding and Attention LSTM
Stance detection model developed by Stephen Pfohl, Oskar Triebe and Ferdinand Legros for the Fake News Challenge, using conditional encoding and attention LSTMs, as a Stanford CS224N class project.

In addition to the code written for the Fake News Challenge (http://www.fakenewschallenge.org/), the poster and paper presenting this work are included.

We publish this to help other researchers kickstart their projects. Please feel free to use it with appropriate attribution.
--------------------------------------------------------------------------------
/code/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ourownstory/stance_detection/df9f34883005d203cb6d8ee7d56d9bc82ef39b62/code/.DS_Store
--------------------------------------------------------------------------------
/code/LSTM_attention.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

######
# Model class for LSTMAttention
# Based on starter code from PS3-CS224n
######
from __future__ import absolute_import
from __future__ import division

import argparse
import sys
import time
import logging
from datetime import datetime

import tensorflow as tf
import numpy as np

from our_util import Progbar, minibatches, get_performance, softmax
from our_model_config import OurModel

logger = logging.getLogger("hw3.q3")
logger.setLevel(logging.DEBUG)
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

class LSTMAttention(OurModel):

    def add_placeholders(self):
        """Generates placeholder variables to represent the input tensors
        MODIF: OVERWRITING
        """
        self.inputs_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.max_length), name = "x")
        self.labels_placeholder = tf.placeholder(tf.int64, shape=(None), name = "y")
        self.seqlen_placeholder = tf.placeholder(tf.int64, shape=(None), name = "seqlen")
        self.dropout_placeholder = tf.placeholder(tf.float64, name = 'dropout')

    def create_feed_dict(self, inputs_batch, seqlen_batch, labels_batch = None, dropout = 1.0):
        """Creates the feed_dict for the model.
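        Args:
            inputs_batch: np.ndarray of token ids, shape (n_samples, max_length)
            seqlen_batch: np.ndarray of true (unpadded) sequence lengths, shape (n_samples,)
            labels_batch: np.ndarray of stance labels, shape (n_samples,), optional
            dropout: keep probability fed to the LSTM DropoutWrapper (1.0 = no dropout)
        Returns:
            feed_dict: dict mapping each placeholder to the corresponding batch array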
40 | MODIF: OVERWRITING 41 | """ 42 | feed_dict = { 43 | self.inputs_placeholder: inputs_batch, 44 | } 45 | if labels_batch is not None: 46 | feed_dict[self.labels_placeholder] = labels_batch 47 | if dropout is not None: 48 | feed_dict[self.dropout_placeholder] = dropout 49 | feed_dict[self.seqlen_placeholder] = seqlen_batch 50 | return feed_dict 51 | 52 | def add_prediction_op(self): 53 | 54 | # Initialize 55 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 56 | 57 | # Configure LSTM cells 58 | 59 | cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 60 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob = self.dropout_placeholder) 61 | 62 | # Create an initializer 63 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 64 | 65 | # Get the inputs 66 | x = self.add_embedding(option = self.config.trainable_embeddings) 67 | 68 | if self.config.n_layers <= 1: 69 | rnnOutput = tf.nn.dynamic_rnn(cell, inputs = x, dtype = tf.float64, sequence_length = self.seqlen_placeholder) #MODIF 70 | Y = tf.slice(rnnOutput[0], begin = [0, 0, 0], size = [-1, self.config.attention_length, -1]) 71 | h_N = rnnOutput[1][1] # batch_size, cell.state_size 72 | elif self.config.n_layers > 1: 73 | stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell] * self.config.n_layers) 74 | rnnOutput = tf.nn.dynamic_rnn(stacked_lstm, inputs = x, dtype = tf.float64, sequence_length = self.seqlen_placeholder) #MODIF 75 | # print('rnnOutput[0] shape:', rnnOutput[0].get_shape()) 76 | Y = tf.slice(rnnOutput[0], begin = [0, 0, 0], size = [-1, self.config.attention_length, -1]) 77 | h_N = rnnOutput[1][self.config.n_layers - 1][1] 78 | # Run the RNN 79 | 80 | # Attention implementation, as in https://arxiv.org/abs/1509.06664 81 | W_y = tf.get_variable(name = 'Wy', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 82 | W_h = tf.get_variable(name = 'Wh', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 83 | w = tf.get_variable(name = 'w', shape = (self.config.hidden_size, 1), initializer = theInitializer, dtype = tf.float64) 84 | W_p = tf.get_variable(name = 'Wo', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 85 | W_x = tf.get_variable(name = 'Wx', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 86 | 87 | M_1 = tf.reshape(tf.matmul(tf.reshape(Y, shape = (-1, self.config.hidden_size)), W_y), shape = (-1, self.config.attention_length, self.config.hidden_size)) 88 | M_2 = tf.expand_dims(tf.matmul(h_N, W_h), axis = 1) 89 | M = tf.tanh(M_1 + M_2) 90 | alpha = tf.reshape(tf.nn.softmax(tf.matmul(tf.reshape(M, shape = (-1, self.config.hidden_size)), w)), shape = (-1, self.config.attention_length)) 91 | 92 | r = tf.squeeze(tf.batch_matmul(tf.transpose(tf.expand_dims(alpha, 2), perm = [0, 2, 1]), Y)) 93 | h_star = tf.tanh(tf.matmul(r, W_p) + tf.matmul(h_N, W_x)) 94 | 95 | # Output matrices 96 | U = tf.get_variable(name = 'U', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 97 | b = tf.get_variable(name = 'b', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 98 | 99 | # Compute predictions 100 | preds = tf.matmul(h_star, U) + b # batch_size, n_classes 101 | return preds 102 | 103 | def add_embedding(self, 
option = 'Constant'): 104 | """Adds an embedding layer that maps from input tokens (integers) to vectors and then 105 | concatenates those vectors: 106 | 107 | Returns: 108 | embeddings: tf.Tensor of shape (None, max_length, n_features*embed_size) 109 | """ 110 | # option = config.trainable_embeddings 111 | if option == 'Variable': 112 | embeddings_temp = tf.nn.embedding_lookup(params = tf.Variable(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 113 | elif option == 'Constant': 114 | embeddings_temp = tf.nn.embedding_lookup(params = tf.constant(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 115 | embeddings = tf.reshape(embeddings_temp, shape = (-1, self.config.max_length, self.config.embed_size)) 116 | ### END YOUR CODE 117 | return embeddings 118 | 119 | def train_on_batch(self, sess, inputs_batch, labels_batch, seqlen_batch): 120 | """ 121 | MODIF 122 | Perform one step of gradient descent on the provided batch of data. 123 | 124 | Args: 125 | sess: tf.Session() 126 | input_batch: np.ndarray of shape (n_samples, n_features) # CHECK: np.ndarray?? 127 | labels_batch: np.ndarray of shape (n_samples, n_classes) 128 | labels_batch: np.array of shape (n_samples) 129 | Returns: 130 | loss: loss over the batch (a scalar) 131 | """ 132 | # inputs_batch = np.reshape(inputs_batch, (-1, inputs_batch.shape[1], 1)) 133 | labels_batch = np.reshape(labels_batch, (-1, 1)) 134 | feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch, seqlen_batch = seqlen_batch, dropout = self.config.dropout) # MODIF 135 | print(inputs_batch.shape) 136 | print(len(labels_batch)) 137 | _, loss = sess.run([self.train_op, self.loss], feed_dict=feed) 138 | return loss 139 | 140 | def predict_on_batch(self, sess, inputs_batch, seqlen_batch): 141 | """Make predictions for the provided batch of data  142 | 143 | Args: 144 | sess: tf.Session() 145 | input_batch: np.ndarray of shape (n_samples, n_features) 146 | Returns: 147 | predictions: np.ndarray of shape (n_samples, n_classes) 148 | """ 149 | feed = self.create_feed_dict(inputs_batch, seqlen_batch) 150 | predictions = sess.run(self.pred, feed_dict=feed) 151 | return predictions 152 | 153 | def run_epoch(self, sess, train): 154 | prog = Progbar(target=1 + int(len(train) / self.config.batch_size)) 155 | losses = [] 156 | for i, batch in enumerate(minibatches(train, self.config.batch_size)): 157 | loss = self.train_on_batch(sess, *batch) 158 | losses.append(loss) 159 | # grad_norms.append(grad_norm) 160 | prog.update(i + 1, [("train loss", loss)]) 161 | return losses 162 | 163 | def fit(self, sess, train, dev_data_np, dev_seqlen, dev_labels): # MODIF # CAREFUL DEV/dev 164 | ''' 165 | Returns LISTS: 166 | - losses_epochs 167 | - dev_performances_epochs 168 | - dev_predictions_epochs 169 | - dev_predicted_classes_epochs 170 | ''' 171 | losses_epochs = [] #M 172 | dev_performances_epochs = [] # MODIF 173 | dev_predictions_epochs = [] #M 174 | dev_predicted_classes_epochs = [] #M 175 | for epoch in range(self.config.n_epochs): 176 | logger.info("Epoch %d out of %d", epoch + 1, self.config.n_epochs) 177 | loss = self.run_epoch(sess, train) 178 | 179 | # Computing predictions # MODIF 180 | dev_predictions = self.predict_on_batch(sess, dev_data_np, dev_seqlen) #OUCH 181 | 182 | # Computing development performance #MODIF 183 | dev_predictions = softmax(np.array(dev_predictions)) 184 | dev_predicted_classes = np.argmax(dev_predictions, axis = 1) 185 | dev_performance = get_performance(dev_predicted_classes, dev_labels, n_classes = 4) 
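            # Note: predict_on_batch returns unnormalized logits; softmax (from our_util)
            # converts each row to class probabilities and argmax picks the most likely of
            # the 4 stance classes. For example, softmax([[2.0, 1.0, 0.1, 0.5]]) is roughly
            # [[0.57, 0.21, 0.09, 0.13]], giving predicted class index 0. get_performance
            # then scores these predicted class indices against dev_labels.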
186 | 187 | # Adding to global outputs #MODIF 188 | dev_predictions_epochs.append(dev_predictions) 189 | dev_predicted_classes_epochs.append(dev_predicted_classes) 190 | dev_performances_epochs.append(dev_performance) 191 | losses_epochs.append(loss) 192 | 193 | return losses_epochs, dev_performances_epochs, dev_predicted_classes_epochs, dev_predictions_epochs 194 | 195 | def build(self): 196 | self.add_placeholders() 197 | self.pred = self.add_prediction_op() 198 | self.loss = self.add_loss_op(self.pred) 199 | self.train_op = self.add_training_op(self.loss) 200 | 201 | def __init__(self, config): 202 | self.config = config 203 | self.inputs_placeholder = None 204 | self.labels_placeholder = None 205 | self.seqlen_placeholder = None 206 | self.dropout_placeholder = None 207 | self.build() -------------------------------------------------------------------------------- /code/LSTM_conditional.py: -------------------------------------------------------------------------------- 1 | ###### 2 | # basic BOW model with architecture extendable to more complex LSTM models which use both headings and bodies separately. 3 | ###### 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | from our_model_config import OurModel 9 | from our_util import Progbar, minibatches, get_performance, softmax 10 | 11 | class LSTMCondModel(OurModel): 12 | 13 | def add_placeholders(self): 14 | """Generates placeholder variables to represent the input tensors 15 | """ 16 | self.headings_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.h_max_len), name = "headings") 17 | self.bodies_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.b_max_len), name = "bodies") 18 | self.headings_lengths_placeholder = tf.placeholder(tf.float64, shape=(None), name = "headings_lengths") 19 | self.bodies_lengths_placeholder = tf.placeholder(tf.float64, shape=(None), name = "bodies_lengths") 20 | self.labels_placeholder = tf.placeholder(tf.int64, shape=(None), name = "labels") 21 | self.dropout_placeholder = tf.placeholder(tf.float64, name = 'dropout') 22 | 23 | def create_feed_dict(self, headings_batch, bodies_batch, headings_lengths_batch, bodies_lengths_batch, labels_batch=None, dropout = 1.0): 24 | """Creates the feed_dict for the model. 
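        Args:
            headings_batch: np.ndarray of headline token ids, shape (n_samples, h_max_len)
            bodies_batch: np.ndarray of body token ids, shape (n_samples, b_max_len)
            headings_lengths_batch: true headline lengths, shape (n_samples,)
            bodies_lengths_batch: true body lengths, shape (n_samples,)
            labels_batch: stance labels, shape (n_samples,), optional
            dropout: keep probability fed to the LSTM DropoutWrappers (1.0 = no dropout)
        Returns:
            feed_dict: dict mapping each placeholder to the corresponding batch array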
25 | """ 26 | feed_dict = { 27 | self.headings_placeholder: headings_batch, 28 | self.bodies_placeholder: bodies_batch, 29 | self.headings_lengths_placeholder: headings_lengths_batch, 30 | self.bodies_lengths_placeholder: bodies_lengths_batch 31 | } 32 | if labels_batch is not None: 33 | feed_dict[self.labels_placeholder] = labels_batch 34 | if dropout is not None: 35 | feed_dict[self.dropout_placeholder] = dropout 36 | return feed_dict 37 | 38 | def add_embedding(self, option = 'Constant'): 39 | """Adds an embedding layer that maps from input tokens (integers) to vectors for both the headings and bodies: 40 | 41 | Returns: 42 | embeddings_headings: tf.Tensor of shape (None, h_max_len, embed_size) 43 | embeddings_bodies: tf.Tensor of shape (None, b_max_len, embed_size) 44 | """ 45 | if option == 'Constant': 46 | embeddings_headings_temp = tf.nn.embedding_lookup(params = tf.constant(self.config.pretrained_embeddings), ids = self.headings_placeholder) 47 | embeddings_bodies_temp = tf.nn.embedding_lookup(params = tf.constant(self.config.pretrained_embeddings), ids = self.bodies_placeholder) 48 | elif option == 'Variable': 49 | embeddings_headings_temp = tf.nn.embedding_lookup(params = tf.Variable(self.config.pretrained_embeddings), ids = self.headings_placeholder) 50 | embeddings_bodies_temp = tf.nn.embedding_lookup(params = tf.Variable(self.config.pretrained_embeddings), ids = self.bodies_placeholder) 51 | embeddings_headings = tf.reshape(embeddings_headings_temp, shape = (-1, self.config.h_max_len, self.config.embed_size)) 52 | embeddings_bodies = tf.reshape(embeddings_bodies_temp, shape = (-1, self.config.b_max_len, self.config.embed_size)) 53 | return embeddings_headings, embeddings_bodies 54 | 55 | def add_prediction_op(self): 56 | 57 | with tf.variable_scope('head'): 58 | 59 | # LSTM that handles the headers 60 | cell_h = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 61 | cell_h = tf.nn.rnn_cell.DropoutWrapper(cell_h, output_keep_prob = self.dropout_placeholder) 62 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 63 | 64 | # x = self.inputs_placeholder 65 | x_header, x_body = self.add_embedding(option = self.config.trainable_embeddings) 66 | # print('Predict op: x', x) 67 | rnnOutput_h = tf.nn.dynamic_rnn(cell_h, inputs = x_header, dtype = tf.float64, sequence_length = self.headings_lengths_placeholder) #MODIF 68 | Y = tf.slice(rnnOutput_h[0], begin = [0, 0, 0], size = [-1, self.config.attention_length, -1]) 69 | 70 | with tf.variable_scope('body'): 71 | # LSTM that handles the bodies 72 | cell_b = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 73 | cell_b = tf.nn.rnn_cell.DropoutWrapper(cell_b, output_keep_prob = self.dropout_placeholder) 74 | 75 | U_b = tf.get_variable(name = 'U_b', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 76 | b_b = tf.get_variable(name = 'b_b', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 77 | 78 | rnnOutput_b = tf.nn.dynamic_rnn(cell_b, inputs = x_body, dtype = tf.float64, initial_state = rnnOutput_h[1], sequence_length = self.bodies_lengths_placeholder) 79 | h_N = rnnOutput_b[1][1] # batch_size, cell.state_size 80 | 81 | ## ATTENTION! 
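        # Attention over the headline states, following Rocktäschel et al. (2015),
        # https://arxiv.org/abs/1509.06664 (same scheme as in LSTM_attention.py).
        # In the code below:
        #   Y     : headline LSTM outputs over the first attention_length steps,
        #           shape (batch_size, attention_length, hidden_size)
        #   h_N   : final hidden state of the body LSTM, shape (batch_size, hidden_size)
        #   M     = tanh(Y W_y + h_N W_h)   (the h_N term is broadcast over the time axis)
        #   alpha = softmax(M w)            attention weights over headline positions
        #   r     = alpha^T Y               attention-weighted headline representation
        #   h*    = tanh(r W_p + h_N W_x)   combined representation
        #   preds = h* U_b + b_b            unnormalized class scores (logits)
        # (Caveat: tf.nn.softmax is applied here to a (batch*attention_length, 1) tensor,
        # i.e. each row has a single element, so alpha may come out as all ones; applying
        # the softmax after reshaping to (batch, attention_length) would normalize across
        # the headline positions as intended.)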
82 | W_y = tf.get_variable(name = 'Wy', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 83 | W_h = tf.get_variable(name = 'Wh', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 84 | w = tf.get_variable(name = 'w', shape = (self.config.hidden_size, 1), initializer = theInitializer, dtype = tf.float64) 85 | W_p = tf.get_variable(name = 'Wo', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 86 | W_x = tf.get_variable(name = 'Wx', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 87 | 88 | M_1 = tf.reshape(tf.matmul(tf.reshape(Y, shape = (-1, self.config.hidden_size)), W_y), shape = (-1, self.config.attention_length, self.config.hidden_size)) 89 | M_2 = tf.expand_dims(tf.matmul(h_N, W_h), axis = 1) 90 | M = tf.tanh(M_1 + M_2) 91 | alpha = tf.reshape(tf.nn.softmax(tf.matmul(tf.reshape(M, shape = (-1, self.config.hidden_size)), w)), shape = (-1, self.config.attention_length)) 92 | 93 | r = tf.squeeze(tf.batch_matmul(tf.transpose(tf.expand_dims(alpha, 2), perm = [0, 2, 1]), Y)) 94 | h_star = tf.tanh(tf.matmul(r, W_p) + tf.matmul(h_N, W_x)) 95 | 96 | # Compute predictions 97 | preds = tf.matmul(h_star, U_b) + b_b # batch_size, n_classes 98 | return preds 99 | 100 | def add_prediction_op(self): 101 | 102 | with tf.variable_scope('head'): 103 | 104 | # LSTM that handles the headers 105 | cell_h = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 106 | cell_h = tf.nn.rnn_cell.DropoutWrapper(cell_h, output_keep_prob = self.dropout_placeholder) 107 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 108 | 109 | x_header, x_body = self.add_embedding(option = self.config.trainable_embeddings) 110 | 111 | if self.config.n_layers <= 1: 112 | rnnOutput_h = tf.nn.dynamic_rnn(cell_h, inputs = x_header, dtype = tf.float64, sequence_length = self.headings_lengths_placeholder) #MODIF 113 | elif self.config.n_layers > 1: 114 | stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell_h] * self.config.n_layers) 115 | rnnOutput_h = tf.nn.dynamic_rnn(stacked_lstm, inputs = x_header, dtype = tf.float64, sequence_length = self.headings_lengths_placeholder) #MODIF 116 | Y = tf.slice(rnnOutput_h[0], begin = [0, 0, 0], size = [-1, self.config.attention_length, -1]) 117 | 118 | with tf.variable_scope('body'): 119 | # LSTM that handles the bodies 120 | cell_b = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 121 | cell_b = tf.nn.rnn_cell.DropoutWrapper(cell_b, output_keep_prob = self.dropout_placeholder) 122 | 123 | U_b = tf.get_variable(name = 'U_b', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 124 | b_b = tf.get_variable(name = 'b_b', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 125 | 126 | if self.config.n_layers <= 1: 127 | rnnOutput_b = tf.nn.dynamic_rnn(cell_b, inputs = x_body, dtype = tf.float64, initial_state = rnnOutput_h[1], sequence_length = self.bodies_lengths_placeholder) 128 | h_N = rnnOutput_b[1][1] # batch_size, cell.state_size 129 | elif self.config.n_layers > 1: 130 | print('header rnn, ', len(rnnOutput_h[1])) 131 | stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell_b] * self.config.n_layers) 132 | rnnOutput_b = tf.nn.dynamic_rnn(stacked_lstm, inputs = x_body, dtype = tf.float64, initial_state = 
rnnOutput_h[1], sequence_length = self.bodies_lengths_placeholder) 133 | h_N = rnnOutput_b[1][self.config.n_layers - 1][1] 134 | 135 | ## ATTENTION! 136 | W_y = tf.get_variable(name = 'Wy', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 137 | W_h = tf.get_variable(name = 'Wh', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 138 | w = tf.get_variable(name = 'w', shape = (self.config.hidden_size, 1), initializer = theInitializer, dtype = tf.float64) 139 | W_p = tf.get_variable(name = 'Wo', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 140 | W_x = tf.get_variable(name = 'Wx', shape = (self.config.hidden_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 141 | 142 | M_1 = tf.reshape(tf.matmul(tf.reshape(Y, shape = (-1, self.config.hidden_size)), W_y), shape = (-1, self.config.attention_length, self.config.hidden_size)) 143 | M_2 = tf.expand_dims(tf.matmul(h_N, W_h), axis = 1) 144 | M = tf.tanh(M_1 + M_2) 145 | alpha = tf.reshape(tf.nn.softmax(tf.matmul(tf.reshape(M, shape = (-1, self.config.hidden_size)), w)), shape = (-1, self.config.attention_length)) 146 | 147 | r = tf.squeeze(tf.batch_matmul(tf.transpose(tf.expand_dims(alpha, 2), perm = [0, 2, 1]), Y)) 148 | h_star = tf.tanh(tf.matmul(r, W_p) + tf.matmul(h_N, W_x)) 149 | 150 | # Compute predictions 151 | preds = tf.matmul(h_star, U_b) + b_b # batch_size, n_classes 152 | return preds 153 | 154 | 155 | def train_on_batch(self, sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch): 156 | """Perform one step of gradient descent on the provided batch of data. 157 | Args: 158 | sess: tf.Session() 159 | headings_batch: np.ndarray of shape (n_samples, n_features) 160 | bodies_batch: np.ndarray of shape (n_samples, n_features) 161 | headings_lengths_batch: np.ndarray of shape (n_samples, 1) 162 | bodies_lengths_batch: np.ndarray of shape (n_samples, 1) 163 | labels_batch: np.ndarray of shape (n_samples, n_classes) 164 | Returns: 165 | loss: loss over the batch (a scalar) 166 | """ 167 | feed = self.create_feed_dict(h_batch, b_batch, h_len_batch, b_len_batch, y_batch, dropout = self.config.dropout) 168 | # print('feed', feed) 169 | _, loss = sess.run([self.train_op, self.loss], feed_dict=feed) 170 | ## for debugging / testing 171 | if (np.isnan(loss)): 172 | print('headings', h_batch) 173 | print('bodies', b_batch) 174 | print('nh_len', h_len_batch) 175 | print('b_len', b_len_batch) 176 | print('labels', y_batch) 177 | assert(False) 178 | return loss 179 | 180 | def predict_on_batch(self, sess, h_batch, b_batch, h_len_batch, b_len_batch): 181 | """Make predictions for the provided batch of data 182 | Args: 183 | sess: tf.Session() 184 | headings_batch: np.ndarray of shape (n_samples, n_features) 185 | bodies_batch: np.ndarray of shape (n_samples, n_features) 186 | headings_lengths_batch: np.ndarray of shape (n_samples, 1) 187 | bodies_lengths_batch: np.ndarray of shape (n_samples, 1) 188 | Returns: 189 | predictions: np.ndarray of shape (n_samples, n_classes) 190 | """ 191 | feed = self.create_feed_dict(h_batch, b_batch, h_len_batch, b_len_batch) 192 | predictions = sess.run(self.pred, feed_dict=feed) 193 | return predictions 194 | 195 | def run_epoch(self, sess, h_np, b_np, h_len, b_len, y): 196 | # prog = Progbar(target=1 + int(len(train) / self.config.batch_size)) 197 | losses = [] 198 | # shuffle 199 | ind = 
range(self.config.num_samples) 200 | random.shuffle(ind) 201 | # sizes 202 | batch_start = 0 203 | batch_end = 0 204 | N = self.config.batch_size 205 | num_batches = self.config.num_samples / N 206 | # run batches 207 | for i in range(num_batches): 208 | batch_start = (i*N) 209 | batch_end = (i+1)*N 210 | indices = ind[batch_start:batch_end] 211 | h_batch = h_np[indices,:] 212 | b_batch = b_np[indices,:] 213 | h_len_batch = h_len[indices] 214 | b_len_batch = b_len[indices] 215 | y_batch = y[indices] 216 | loss = self.train_on_batch(sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch) 217 | losses.append(loss) 218 | if (i % (1 + num_batches/10)) == 0: 219 | print('batch: ', i, ', loss: ', loss) 220 | # run last smaller batch 221 | if (batch_end < self.config.num_samples): 222 | indices = ind[batch_end:] 223 | h_batch = h_np[indices,:] 224 | b_batch = b_np[indices,:] 225 | h_len_batch = h_len[indices] 226 | b_len_batch = b_len[indices] 227 | y_batch = y[indices] 228 | # loss 229 | loss = self.train_on_batch(sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch) 230 | losses.append(loss) 231 | return losses 232 | 233 | def fit(self, sess, h_np, b_np, h_len, b_len, y, dev_h, dev_b, dev_h_len, dev_b_len, dev_y): #M 234 | #losses = [] 235 | losses_epochs = [] #M 236 | dev_performances_epochs = [] # M 237 | dev_predictions_epochs = [] #M 238 | dev_predicted_classes_epochs = [] #M 239 | 240 | for epoch in range(self.config.n_epochs): 241 | print('-------new epoch---------') 242 | loss = self.run_epoch(sess, h_np, b_np, h_len, b_len, y) 243 | 244 | # Computing predictions #MODIF 245 | dev_predictions = self.predict_on_batch(sess, dev_h, dev_b, dev_h_len, dev_b_len) 246 | 247 | # Computing development performance #MODIF 248 | dev_predictions = softmax(np.array(dev_predictions)) 249 | dev_predicted_classes = np.argmax(dev_predictions, axis = 1) 250 | dev_performance = get_performance(dev_predicted_classes, dev_y, n_classes = 4) 251 | 252 | # Adding to global outputs #MODIF 253 | dev_predictions_epochs.append(dev_predictions) 254 | dev_predicted_classes_epochs.append(dev_predicted_classes) 255 | dev_performances_epochs.append(dev_performance) 256 | losses_epochs.append(loss) 257 | 258 | print('EPOCH: ', epoch, ', LOSS: ', np.mean(loss)) 259 | 260 | return losses_epochs, dev_performances_epochs, dev_predicted_classes_epochs, dev_predictions_epochs 261 | 262 | def __init__(self, config): 263 | self.config = config 264 | self.headings_placeholder = None 265 | self.bodies_placeholder = None 266 | self.headings_lengths_placeholder = None 267 | self.bodies_lengths_placeholder = None 268 | self.labels_placeholder = None 269 | self.dropout_placeholder = None 270 | self.build() -------------------------------------------------------------------------------- /code/README.md: -------------------------------------------------------------------------------- 1 | CS 224n Project Directory 2 | 3 | Winter 2017 4 | 5 | Stephen Pfohl 6 | Ferdinand Legros 7 | Oskar Triebe 8 | 9 | Model Files: 10 | our_model_config.py 11 | contains abstract model class to be extended by other models. Is based off of the model classes used in the course assignments. 
12 | bow_model_config.py 13 | Bag of words model class that extends our_model_config.py 14 | basicLSTM_model_config.py 15 | model class for the basic LSTM model that operates on the concatenated input 16 | LSTM_attention.py 17 | model class for the LSTM model that has been augmented by attention 18 | LSTM_conditional.py 19 | model class for the LSTM with attention and conditional encoding 20 | 21 | Model Execution Files 22 | execute_bow_config 23 | script that executes a single experiment of the bag of words model for a given set of parameters 24 | execute_lstm_config.py 25 | script that executes a single experiment of the basic LSTM model for a given set of parameters 26 | execute_lstm_attention.py 27 | script that executes a single experiment of the lstm model that has been augmented by attention for a given set of parameters 28 | execute_lstm_conditional.py 29 | script that executes a single experiment of the LSTM model with conditional encoding and attention for a given set of parameters for a given set of parameters 30 | 31 | Utility Files 32 | our_util.py 33 | Utility functions for use in other files. Based on the example of the util.py files provided in course assignments. 34 | run_text_processing.py 35 | File that performas tokenization, loads the data, etc 36 | 37 | Runtime scripts 38 | test_script6.py 39 | Allows the user to define a set of experiments for any of the models described above. 40 | 41 | fnc_baseline directory 42 | Required and provided by the competition organizers at https://github.com/FakeNewsChallenge/fnc-1-baseline 43 | Not included with this submission due to size constraints 44 | 45 | Plotting 46 | Contains .Rmd files for plotting -------------------------------------------------------------------------------- /code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ourownstory/stance_detection/df9f34883005d203cb6d8ee7d56d9bc82ef39b62/code/__init__.py -------------------------------------------------------------------------------- /code/basicLSTM_model_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | ###### 5 | # Model class for Baseline_LSTM 6 | # Based on starter code from PS3-CS224n 7 | ###### 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | 11 | import argparse 12 | import sys 13 | import time 14 | import logging 15 | from datetime import datetime 16 | 17 | import tensorflow as tf 18 | import numpy as np 19 | 20 | from our_util import Progbar, minibatches, get_performance, softmax 21 | from our_model_config import OurModel 22 | 23 | logger = logging.getLogger("hw3.q3") 24 | logger.setLevel(logging.DEBUG) 25 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) 26 | 27 | class BaselineLSTM(OurModel): 28 | 29 | def add_placeholders(self): 30 | """Generates placeholder variables to represent the input tensors 31 | MODIF: OVERWRITING 32 | """ 33 | self.inputs_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.max_length), name = "x") 34 | self.labels_placeholder = tf.placeholder(tf.int64, shape=(None), name = "y") 35 | self.seqlen_placeholder = tf.placeholder(tf.int64, shape=(None), name = "seqlen") 36 | self.dropout_placeholder = tf.placeholder(tf.float64, name = 'dropout') 37 | 38 | def create_feed_dict(self, inputs_batch, seqlen_batch, labels_batch = None, dropout = 1.0): 39 | """Creates the feed_dict for 
the model. 40 | MODIF: OVERWRITING 41 | """ 42 | feed_dict = { 43 | self.inputs_placeholder: inputs_batch, 44 | } 45 | if labels_batch is not None: 46 | feed_dict[self.labels_placeholder] = labels_batch 47 | if dropout is not None: 48 | feed_dict[self.dropout_placeholder] = dropout 49 | feed_dict[self.seqlen_placeholder] = seqlen_batch 50 | return feed_dict 51 | 52 | def add_prediction_op(self): 53 | """ 54 | Returns: 55 | preds: tf.Tensor of shape (batch_size, 1) 56 | """ 57 | 58 | if self.config.n_layers <= 1: 59 | print('layers = ', self.config.n_layers) 60 | cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 61 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob = self.dropout_placeholder) 62 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 63 | U = tf.get_variable(name = 'U', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 64 | b = tf.get_variable(name = 'b', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 65 | 66 | x = self.add_embedding(option = self.config.trainable_embeddings) 67 | rnnOutput = tf.nn.dynamic_rnn(cell, inputs = x, dtype = tf.float64, sequence_length = self.seqlen_placeholder) #MODIF 68 | finalState = rnnOutput[1][1] # batch_size, cell.state_size 69 | preds = tf.matmul(finalState, U) + b # batch_size, n_classes 70 | # print('Predict op: preds', preds) 71 | elif self.config.n_layers > 1: # MODIF 72 | cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = self.config.hidden_size) 73 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob = self.dropout_placeholder) 74 | stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell] * self.config.n_layers) 75 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 76 | U = tf.get_variable(name = 'U', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 77 | b = tf.get_variable(name = 'b', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 78 | x = self.add_embedding(option = self.config.trainable_embeddings) 79 | rnnOutput = tf.nn.dynamic_rnn(stacked_lstm, inputs = x, dtype = tf.float64, sequence_length = self.seqlen_placeholder) #MODIF 80 | print('layers = ', self.config.n_layers) 81 | finalState = rnnOutput[1][self.config.n_layers - 1][1] # batch_size, cell.state_size 82 | preds = tf.matmul(finalState, U) + b # batch_size, n_classes 83 | return preds 84 | 85 | def add_embedding(self, option = 'Constant'): 86 | """Adds an embedding layer that maps from input tokens (integers) to vectors and then 87 | concatenates those vectors" 88 | 89 | Returns: 90 | embeddings: tf.Tensor of shape (None, max_length, n_features*embed_size) 91 | """ 92 | # option = config.trainable_embeddings 93 | if option == 'Variable': 94 | embeddings_temp = tf.nn.embedding_lookup(params = tf.Variable(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 95 | elif option == 'Constant': 96 | embeddings_temp = tf.nn.embedding_lookup(params = tf.constant(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 97 | embeddings = tf.reshape(embeddings_temp, shape = (-1, self.config.max_length, self.config.embed_size)) 98 | ### END YOUR CODE 99 | return embeddings 100 | 101 | def train_on_batch(self, sess, inputs_batch, labels_batch, seqlen_batch): 102 | """ 103 | MODIF 104 | Perform one step of gradient descent on the provided batch of data. 
105 | 106 | Args: 107 | sess: tf.Session() 108 | input_batch: np.ndarray of shape (n_samples, n_features) # CHECK: np.ndarray?? 109 | labels_batch: np.ndarray of shape (n_samples, n_classes) 110 | labels_batch: np.array of shape (n_samples) 111 | Returns: 112 | loss: loss over the batch (a scalar) 113 | """ 114 | labels_batch = np.reshape(labels_batch, (-1, 1)) 115 | feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch, seqlen_batch = seqlen_batch, dropout = self.config.dropout) # MODIF 116 | print(inputs_batch.shape) 117 | print(len(labels_batch)) 118 | _, loss = sess.run([self.train_op, self.loss], feed_dict=feed) 119 | return loss 120 | 121 | def predict_on_batch(self, sess, inputs_batch, seqlen_batch): 122 | """Make predictions for the provided batch of data  123 | 124 | Args: 125 | sess: tf.Session() 126 | input_batch: np.ndarray of shape (n_samples, n_features) 127 | Returns: 128 | predictions: np.ndarray of shape (n_samples, n_classes) 129 | """ 130 | feed = self.create_feed_dict(inputs_batch, seqlen_batch) 131 | predictions = sess.run(self.pred, feed_dict=feed) 132 | return predictions 133 | 134 | def run_epoch(self, sess, train): 135 | prog = Progbar(target=1 + int(len(train) / self.config.batch_size)) 136 | losses = [] 137 | for i, batch in enumerate(minibatches(train, self.config.batch_size)): 138 | loss = self.train_on_batch(sess, *batch) 139 | losses.append(loss) 140 | # grad_norms.append(grad_norm) 141 | prog.update(i + 1, [("train loss", loss)]) 142 | return losses 143 | 144 | def fit(self, sess, train, dev_data_np, dev_seqlen, dev_labels): # MODIF # CAREFUL DEV/dev 145 | ''' 146 | Returns LISTS: 147 | - losses_epochs 148 | - dev_performances_epochs 149 | - dev_predictions_epochs 150 | - dev_predicted_classes_epochs 151 | ''' 152 | losses_epochs = [] #M 153 | dev_performances_epochs = [] # MODIF 154 | dev_predictions_epochs = [] #M 155 | dev_predicted_classes_epochs = [] #M 156 | for epoch in range(self.config.n_epochs): 157 | logger.info("Epoch %d out of %d", epoch + 1, self.config.n_epochs) 158 | loss = self.run_epoch(sess, train) 159 | 160 | # Computing predictions # MODIF 161 | dev_predictions = self.predict_on_batch(sess, dev_data_np, dev_seqlen) #OUCH 162 | 163 | # Computing development performance #MODIF 164 | dev_predictions = softmax(np.array(dev_predictions)) 165 | dev_predicted_classes = np.argmax(dev_predictions, axis = 1) 166 | dev_performance = get_performance(dev_predicted_classes, dev_labels, n_classes = 4) 167 | 168 | # Adding to global outputs #MODIF 169 | dev_predictions_epochs.append(dev_predictions) 170 | dev_predicted_classes_epochs.append(dev_predicted_classes) 171 | dev_performances_epochs.append(dev_performance) 172 | losses_epochs.append(loss) 173 | 174 | return losses_epochs, dev_performances_epochs, dev_predicted_classes_epochs, dev_predictions_epochs 175 | 176 | def build(self): 177 | self.add_placeholders() 178 | self.pred = self.add_prediction_op() 179 | self.loss = self.add_loss_op(self.pred) 180 | self.train_op = self.add_training_op(self.loss) 181 | 182 | def __init__(self, config): 183 | self.config = config 184 | self.inputs_placeholder = None 185 | self.labels_placeholder = None 186 | self.seqlen_placeholder = None 187 | self.dropout_placeholder = None 188 | self.build() -------------------------------------------------------------------------------- /code/bow_model_config.py: -------------------------------------------------------------------------------- 1 | ###### 2 | # basic BOW model with architecture extendable to 
more complex LSTM models which use both headings and bodies separately. 3 | ###### 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | from our_model_config import OurModel 9 | from our_util import Progbar, minibatches, get_performance, softmax 10 | 11 | class BOWModel(OurModel): 12 | 13 | def add_placeholders(self): 14 | """Generates placeholder variables to represent the input tensors 15 | """ 16 | self.headings_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.h_max_len), name="headings") 17 | self.bodies_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.b_max_len), name="bodies") 18 | self.headings_lengths_placeholder = tf.placeholder(tf.float64, shape=(None), name="headings_lengths") 19 | self.bodies_lengths_placeholder = tf.placeholder(tf.float64, shape=(None), name="bodies_lengths") 20 | self.labels_placeholder = tf.placeholder(tf.int64, shape=(None), name="labels") 21 | 22 | def create_feed_dict(self, headings_batch, bodies_batch, headings_lengths_batch, bodies_lengths_batch, labels_batch=None): 23 | """Creates the feed_dict for the model. 24 | """ 25 | feed_dict = { 26 | self.headings_placeholder: headings_batch, 27 | self.bodies_placeholder: bodies_batch, 28 | self.headings_lengths_placeholder: headings_lengths_batch, 29 | self.bodies_lengths_placeholder: bodies_lengths_batch, 30 | } 31 | if labels_batch is not None: 32 | feed_dict[self.labels_placeholder] = labels_batch 33 | return feed_dict 34 | 35 | def add_embedding(self, option = 'Constant'): 36 | """Adds an embedding layer that maps from input tokens (integers) to vectors for both the headings and bodies: 37 | 38 | Returns: 39 | embeddings_headings: tf.Tensor of shape (None, h_max_len, embed_size) 40 | embeddings_bodies: tf.Tensor of shape (None, b_max_len, embed_size) 41 | """ 42 | # 43 | # embeddings_headings_temp = tf.nn.embedding_lookup(params = tf.Constant(self.config.pretrained_embeddings), ids = self.headings_placeholder) 44 | # embeddings_bodies_temp = tf.nn.embedding_lookup(params = tf.Constant(self.config.pretrained_embeddings), ids = self.bodies_placeholder) 45 | embeddings_headings_temp = tf.nn.embedding_lookup(params = self.config.pretrained_embeddings, ids = self.headings_placeholder) 46 | embeddings_bodies_temp = tf.nn.embedding_lookup(params = self.config.pretrained_embeddings, ids = self.bodies_placeholder) 47 | embeddings_headings = tf.reshape(embeddings_headings_temp, shape = (-1, self.config.h_max_len, self.config.embed_size)) 48 | embeddings_bodies = tf.reshape(embeddings_bodies_temp, shape = (-1, self.config.b_max_len, self.config.embed_size)) 49 | return embeddings_headings, embeddings_bodies 50 | 51 | def add_bow_input(self): 52 | headings, bodies = self.add_embedding(option = self.config.trainable_embeddings) 53 | headings_bag = tf.divide(tf.reduce_sum(headings, axis=1), tf.reshape(self.headings_lengths_placeholder, shape = (-1, 1))) 54 | bodies_bag = tf.divide(tf.reduce_sum(bodies, axis=1), tf.reshape(self.bodies_lengths_placeholder, shape = (-1, 1))) 55 | x = tf.concat_v2(values=[headings_bag, bodies_bag], axis=1) 56 | return x 57 | 58 | def add_prediction_op(self): 59 | """Runs an rnn on the input using TensorFlows's 60 | @tf.nn.dynamic_rnn function, and returns the final state as a prediction. 
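        Note: in this BOW model the prediction is computed by a feed-forward stack of fully
        connected ReLU layers (depth chosen by config.n_layers) applied to the bag-of-words
        input from add_bow_input, i.e. the length-normalized sums of the headline and body
        word embeddings concatenated together; no RNN is involved despite the summary above.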
61 | 62 | Returns: 63 | logits: tf.Tensor of shape (batch_size, n_classes) 64 | """ 65 | hidden_size_2 = np.floor(self.config.hidden_next**2 * self.config.hidden_size) 66 | hidden_size_3 = np.floor(self.config.hidden_next**3 * self.config.hidden_size) 67 | hidden_size_4 = np.floor(self.config.hidden_next**4 * self.config.hidden_size) 68 | hidden_size_5 = np.floor(self.config.hidden_next**5 * self.config.hidden_size) 69 | 70 | x = self.add_bow_input() 71 | theInitializer = tf.contrib.layers.xavier_initializer(uniform = True, dtype = tf.float64) 72 | if not self.config.n_layers: 73 | W = tf.get_variable(name = 'W', shape = (2*self.config.embed_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 74 | c = tf.get_variable(name = 'c', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 75 | pred = tf.matmul(x, W) + c # batch_size, n_classes 76 | elif self.config.n_layers == 1: 77 | U0 = tf.get_variable(name = 'U0', shape = (2*self.config.embed_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 78 | c0 = tf.get_variable(name = 'c0', shape = (self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 79 | h1 = tf.nn.relu(tf.matmul(x, U0) + c0) # batch_size, hidden_size 80 | U1 = tf.get_variable(name = 'U1', shape = (self.config.hidden_size, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 81 | c1 = tf.get_variable(name = 'c1', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 82 | pred = tf.matmul(h1, U1) + c1 # batch_size, n_classes 83 | elif self.config.n_layers == 2: 84 | U0 = tf.get_variable(name = 'U0', shape = (2*self.config.embed_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 85 | c0 = tf.get_variable(name = 'c0', shape = (self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 86 | h1 = tf.nn.relu(tf.matmul(x, U0) + c0) # batch_size, hidden_size 87 | U1 = tf.get_variable(name = 'U1', shape = (self.config.hidden_size, hidden_size_2), initializer = theInitializer, dtype = tf.float64) 88 | c1 = tf.get_variable(name = 'c1', shape = (hidden_size_2), initializer = theInitializer, dtype = tf.float64) 89 | h2 = tf.nn.relu(tf.matmul(h1, U1) + c1) # batch_size, hidden_size_2 90 | U2 = tf.get_variable(name = 'U2', shape = (hidden_size_2, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 91 | c2 = tf.get_variable(name = 'c2', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 92 | pred = tf.matmul(h2, U2) + c2 # batch_size, n_classes 93 | elif self.config.n_layers == 3: 94 | U0 = tf.get_variable(name = 'U0', shape = (2*self.config.embed_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 95 | c0 = tf.get_variable(name = 'c0', shape = (self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 96 | h1 = tf.nn.relu(tf.matmul(x, U0) + c0) # batch_size, hidden_size 97 | U1 = tf.get_variable(name = 'U1', shape = (self.config.hidden_size, hidden_size_2), initializer = theInitializer, dtype = tf.float64) 98 | c1 = tf.get_variable(name = 'c1', shape = (hidden_size_2), initializer = theInitializer, dtype = tf.float64) 99 | h2 = tf.nn.relu(tf.matmul(h1, U1) + c1) # batch_size, hidden_size_2 100 | U2 = tf.get_variable(name = 'U2', shape = (hidden_size_2, hidden_size_3), initializer = theInitializer, dtype = tf.float64) 101 | c2 = tf.get_variable(name = 'c2', shape = (hidden_size_3), initializer 
= theInitializer, dtype = tf.float64) 102 | h3 = tf.nn.relu(tf.matmul(h2, U2) + c2) # batch_size, hidden_size_3 103 | U3 = tf.get_variable(name = 'U3', shape = (hidden_size_3, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 104 | c3 = tf.get_variable(name = 'c3', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 105 | pred = tf.matmul(h3, U3) + c3 # batch_size, n_classes 106 | elif self.config.n_layers == 4: 107 | U0 = tf.get_variable(name = 'U0', shape = (2*self.config.embed_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 108 | c0 = tf.get_variable(name = 'c0', shape = (self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 109 | h1 = tf.nn.relu(tf.matmul(x, U0) + c0) # batch_size, hidden_size 110 | U1 = tf.get_variable(name = 'U1', shape = (self.config.hidden_size, hidden_size_2), initializer = theInitializer, dtype = tf.float64) 111 | c1 = tf.get_variable(name = 'c1', shape = (hidden_size_2), initializer = theInitializer, dtype = tf.float64) 112 | h2 = tf.nn.relu(tf.matmul(h1, U1) + c1) # batch_size, hidden_size_2 113 | U2 = tf.get_variable(name = 'U2', shape = (hidden_size_2, hidden_size_3), initializer = theInitializer, dtype = tf.float64) 114 | c2 = tf.get_variable(name = 'c2', shape = (hidden_size_3), initializer = theInitializer, dtype = tf.float64) 115 | h3 = tf.nn.relu(tf.matmul(h2, U2) + c2) # batch_size, hidden_size_3 116 | U3 = tf.get_variable(name = 'U3', shape = (hidden_size_3, hidden_size_4), initializer = theInitializer, dtype = tf.float64) 117 | c3 = tf.get_variable(name = 'c3', shape = (hidden_size_4), initializer = theInitializer, dtype = tf.float64) 118 | h4 = tf.nn.relu(tf.matmul(h3, U3) + c3) # batch_size, hidden_size_4 119 | U4 = tf.get_variable(name = 'U4', shape = (hidden_size_4, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 120 | c4 = tf.get_variable(name = 'c4', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 121 | pred = tf.matmul(h4, U4) + c4 # batch_size, n_classes 122 | elif self.config.n_layers == 5: 123 | U0 = tf.get_variable(name = 'U0', shape = (2*self.config.embed_size, self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 124 | c0 = tf.get_variable(name = 'c0', shape = (self.config.hidden_size), initializer = theInitializer, dtype = tf.float64) 125 | h1 = tf.nn.relu(tf.matmul(x, U0) + c0) # batch_size, hidden_size 126 | U1 = tf.get_variable(name = 'U1', shape = (self.config.hidden_size, hidden_size_2), initializer = theInitializer, dtype = tf.float64) 127 | c1 = tf.get_variable(name = 'c1', shape = (hidden_size_2), initializer = theInitializer, dtype = tf.float64) 128 | h2 = tf.nn.relu(tf.matmul(h1, U1) + c1) # batch_size, hidden_size_2 129 | U2 = tf.get_variable(name = 'U2', shape = (hidden_size_2, hidden_size_3), initializer = theInitializer, dtype = tf.float64) 130 | c2 = tf.get_variable(name = 'c2', shape = (hidden_size_3), initializer = theInitializer, dtype = tf.float64) 131 | h3 = tf.nn.relu(tf.matmul(h2, U2) + c2) # batch_size, hidden_size_3 132 | U3 = tf.get_variable(name = 'U3', shape = (hidden_size_3, hidden_size_4), initializer = theInitializer, dtype = tf.float64) 133 | c3 = tf.get_variable(name = 'c3', shape = (hidden_size_4), initializer = theInitializer, dtype = tf.float64) 134 | h4 = tf.nn.relu(tf.matmul(h3, U3) + c3) # batch_size, hidden_size_4 135 | U4 = tf.get_variable(name = 'U4', shape = (hidden_size_4, hidden_size_5), initializer = 
theInitializer, dtype = tf.float64) 136 | c4 = tf.get_variable(name = 'c4', shape = (hidden_size_5), initializer = theInitializer, dtype = tf.float64) 137 | h5 = tf.nn.relu(tf.matmul(h4, U4) + c4) # batch_size, hidden_size_5 138 | U5 = tf.get_variable(name = 'U5', shape = (hidden_size_5, self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 139 | c5 = tf.get_variable(name = 'c5', shape = (self.config.n_classes), initializer = theInitializer, dtype = tf.float64) 140 | pred = tf.matmul(h5, U5) + c5 # batch_size, n_classes 141 | return pred 142 | 143 | def train_on_batch(self, sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch): 144 | """Perform one step of gradient descent on the provided batch of data. 145 | Args: 146 | sess: tf.Session() 147 | headings_batch: np.ndarray of shape (n_samples, n_features) 148 | bodies_batch: np.ndarray of shape (n_samples, n_features) 149 | headings_lengths_batch: np.ndarray of shape (n_samples, 1) 150 | bodies_lengths_batch: np.ndarray of shape (n_samples, 1) 151 | labels_batch: np.ndarray of shape (n_samples, n_classes) 152 | Returns: 153 | loss: loss over the batch (a scalar) 154 | """ 155 | feed = self.create_feed_dict(h_batch, b_batch, h_len_batch, b_len_batch, y_batch) 156 | _, loss = sess.run([self.train_op, self.loss], feed_dict=feed) 157 | ## for debugging / testing 158 | if (np.isnan(loss)): 159 | print('headings', h_batch) 160 | print('bodies', b_batch) 161 | print('nh_len', h_len_batch) 162 | print('b_len', b_len_batch) 163 | print('labels', y_batch) 164 | assert(False) 165 | return loss 166 | 167 | def predict_on_batch(self, sess, h_batch, b_batch, h_len_batch, b_len_batch): 168 | """Make predictions for the provided batch of data 169 | Args: 170 | sess: tf.Session() 171 | headings_batch: np.ndarray of shape (n_samples, n_features) 172 | bodies_batch: np.ndarray of shape (n_samples, n_features) 173 | headings_lengths_batch: np.ndarray of shape (n_samples, 1) 174 | bodies_lengths_batch: np.ndarray of shape (n_samples, 1) 175 | Returns: 176 | predictions: np.ndarray of shape (n_samples, n_classes) 177 | """ 178 | feed = self.create_feed_dict(h_batch, b_batch, h_len_batch, b_len_batch) 179 | predictions = sess.run(self.pred, feed_dict=feed) 180 | return predictions 181 | 182 | def run_epoch(self, sess, h_np, b_np, h_len, b_len, y): 183 | losses = [] 184 | # shuffle 185 | ind = range(self.config.num_samples) 186 | random.shuffle(ind) 187 | # sizes 188 | batch_start = 0 189 | batch_end = 0 190 | N = self.config.batch_size 191 | num_batches = self.config.num_samples / N 192 | # run batches 193 | for i in range(num_batches): 194 | batch_start = (i*N) 195 | batch_end = (i+1)*N 196 | indices = ind[batch_start:batch_end] 197 | h_batch = h_np[indices,:] 198 | b_batch = b_np[indices,:] 199 | h_len_batch = h_len[indices] 200 | b_len_batch = b_len[indices] 201 | y_batch = y[indices] 202 | # loss 203 | loss = self.train_on_batch(sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch) 204 | losses.append(loss) 205 | # prog.update(i + 1, [("train loss", loss)]) 206 | if (i % (1 + num_batches/10)) == 0: 207 | print('batch: ', i, ', loss: ', loss) 208 | # run last smaller batch 209 | if (batch_end < self.config.num_samples): 210 | indices = ind[batch_end:] 211 | h_batch = h_np[indices,:] 212 | b_batch = b_np[indices,:] 213 | h_len_batch = h_len[indices] 214 | b_len_batch = b_len[indices] 215 | y_batch = y[indices] 216 | # loss 217 | loss = self.train_on_batch(sess, h_batch, b_batch, h_len_batch, b_len_batch, y_batch) 218 | 
losses.append(loss) 219 | print('batch: ', i, ', loss: ', loss) 220 | # print('-------last batch---------') 221 | return losses 222 | 223 | 224 | def fit(self, sess, h_np, b_np, h_len, b_len, y, dev_h, dev_b, dev_h_len, dev_b_len, dev_y): #M 225 | #losses = [] 226 | losses_epochs = [] #M 227 | dev_performances_epochs = [] # M 228 | dev_predictions_epochs = [] #M 229 | dev_predicted_classes_epochs = [] #M 230 | 231 | for epoch in range(self.config.n_epochs): 232 | print('-------new epoch---------') 233 | loss = self.run_epoch(sess, h_np, b_np, h_len, b_len, y) 234 | 235 | # Computing predictions #MODIF 236 | dev_predictions = self.predict_on_batch(sess, dev_h, dev_b, dev_h_len, dev_b_len) 237 | 238 | # Computing development performance #MODIF 239 | dev_predictions = softmax(np.array(dev_predictions)) 240 | dev_predicted_classes = np.argmax(dev_predictions, axis = 1) 241 | dev_performance = get_performance(dev_predicted_classes, dev_y, n_classes = 4) 242 | 243 | # Adding to global outputs #MODIF 244 | dev_predictions_epochs.append(dev_predictions) 245 | dev_predicted_classes_epochs.append(dev_predicted_classes) 246 | dev_performances_epochs.append(dev_performance) 247 | losses_epochs.append(loss) 248 | 249 | print('EPOCH: ', epoch, ', LOSS: ', np.mean(loss)) 250 | 251 | return losses_epochs, dev_performances_epochs, dev_predicted_classes_epochs, dev_predictions_epochs 252 | 253 | def __init__(self, config): 254 | self.config = config 255 | self.headings_placeholder = None 256 | self.bodies_placeholder = None 257 | self.headings_lengths_placeholder = None 258 | self.bodies_lengths_placeholder = None 259 | self.labels_placeholder = None 260 | self.build() -------------------------------------------------------------------------------- /code/data_analysis_plotting/Results_loading_1.R: -------------------------------------------------------------------------------- 1 | ### Results loading 2 | library(tidyverse) 3 | library(stringr) 4 | library(forcats) 5 | # help(package = 'forcats') 6 | 7 | 8 | ###Load data function################################################################# 9 | ## Paths 10 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/' 11 | 12 | 13 | ## Load data function 14 | load_data <- function(names) { 15 | num <- length(names) 16 | data <- read_csv(str_c(path_res, names[1])) 17 | if (num > 1) { 18 | for (i in 2:num) { 19 | data <- data %>% 20 | bind_rows(read_csv(str_c(path_res, names[i]))) 21 | } 22 | } 23 | return(data) 24 | } 25 | 26 | ## Load data combine function 27 | load_data_combine <- 28 | function(perf_names, loss_names, model_ = NULL, xp_ = NULL) { 29 | perf <- load_data(perf_names) %>% 30 | mutate(epoch = epoch + 1) 31 | loss <- load_data(loss_names) %>% 32 | rename(train_loss = loss) 33 | data <- 34 | inner_join(perf, loss) %>% 35 | mutate( 36 | model = model_, 37 | xp = xp_, 38 | downsample = FALSE) %>% 39 | select(model, xp, downsample, everything()) 40 | return(data) 41 | } 42 | 43 | 44 | #####LSTM Experiment 01########################################################################### 45 | ## Experiment 01 46 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/' 47 | 48 | # Sensitivity Analysis over max_length and n_layers, without downsampling. 49 | # Naive data splitting testing script: test_script3_config.py 50 | # 51 | # Sensitivity Analysis over max_length and n_layers, with downsampling. 52 | # Downsample first and then split data. Naive data splitting wrt headlines. 
53 | # testing script: test_script4_config.py 54 | 55 | 56 | ## Data 01 57 | 58 | # {r 1 No Downsampling max_length} 59 | ### No Downsampling 60 | ## Collect max_length XP data 61 | #max_length 62 | perf_names <- 63 | str_c('old(lstm)/', 64 | c( 65 | 'perf_148979940586.csv', 66 | 'perf_148980307237.csv', 67 | 'perf_148980957009.csv' 68 | ) 69 | ) 70 | #max_length 71 | loss_names <- 72 | str_c('old(lstm)/', 73 | c( 74 | 'losses_148979940587.csv', 75 | 'losses_148980307238.csv', 76 | 'losses_148980957009.csv' 77 | ) 78 | ) 79 | 80 | ## Read data #max_length 81 | perf <- load_data(perf_names) %>% 82 | mutate(epoch = epoch + 1) 83 | loss <- load_data(loss_names) %>% 84 | rename(train_loss = loss) 85 | 86 | results1 <- 87 | inner_join(perf, loss) %>% 88 | mutate(xp = 'max_length', 89 | downsample = FALSE) %>% 90 | select(xp, downsample, everything()) 91 | 92 | # unique(perf$max_length) 93 | # unique(loss$max_length) 94 | 95 | 96 | 97 | 98 | # {r 2 No Downsampling n_layers} 99 | ### No Downsampling 100 | ## Collect n_layers data 101 | #n_layers 102 | perf_names <- 103 | str_c('old(lstm)/', 104 | c( 105 | 'perf_148982227549.csv', 106 | 'perf_148981534989.csv', 107 | 'perf_148981166478.csv' 108 | ) 109 | ) 110 | #n_layers 111 | loss_names <- 112 | str_c('old(lstm)/', 113 | c( 114 | 'losses_148973251886.csv', 115 | 'losses_148973620163.csv', 116 | 'losses_148974313682.csv' 117 | ) 118 | ) 119 | 120 | ## Read data #n_layers 121 | perf <- load_data(perf_names) %>% 122 | mutate(epoch = epoch + 1) 123 | loss <- load_data(loss_names) %>% 124 | rename(train_loss = loss) 125 | 126 | results2 <- 127 | inner_join(perf, loss) %>% 128 | mutate(xp = 'n_layers', 129 | downsample = FALSE) %>% 130 | select(xp, downsample, everything()) 131 | 132 | # unique(perf$max_length) 133 | # unique(loss$max_length) 134 | 135 | 136 | 137 | 138 | # {r 3 With Downsampling max_length} 139 | ### With Downsampling 140 | ## Collect max_length XP data 141 | #max_length 142 | perf_names <- 143 | str_c('old(lstm)/', 144 | c( 145 | 'perf_148978975462.csv', 146 | 'perf_148978680439.csv', 147 | 'perf_148978512014.csv' 148 | ) 149 | ) 150 | #max_length 151 | loss_names <- 152 | str_c('old(lstm)/', 153 | c( 154 | 'losses_148978975462.csv', 155 | 'losses_148978680439.csv', 156 | 'losses_148978512015.csv' 157 | ) 158 | ) 159 | 160 | ## Read data #max_length 161 | perf <- load_data(perf_names) %>% 162 | mutate(epoch = epoch + 1) 163 | loss <- load_data(loss_names) %>% 164 | rename(train_loss = loss) 165 | 166 | results3 <- 167 | inner_join(perf, loss) %>% 168 | mutate(xp = 'max_length', 169 | downsample = TRUE) %>% 170 | select(xp, downsample, everything()) 171 | 172 | # unique(perf$max_length) 173 | # unique(loss$max_length) 174 | 175 | # results3 <- 176 | # anti_join(perf, loss) 177 | 178 | 179 | 180 | 181 | # {r 4 With Downsampling n_layers} 182 | ### With Downsampling 183 | ## Collect n_layers data 184 | #n_layers 185 | perf_names <- 186 | str_c('old(lstm)/', 187 | c( 188 | 'perf_148979553502.csv', 189 | 'perf_148979239173.csv', 190 | 'perf_148979071576.csv' 191 | ) 192 | ) 193 | #n_layers 194 | loss_names <- 195 | str_c('old(lstm)/', 196 | c( 197 | 'losses_148979071576.csv', 198 | 'losses_148979239173.csv', 199 | 'losses_148979553502.csv' 200 | ) 201 | ) 202 | 203 | ## Read data #n_layers 204 | perf <- load_data(perf_names) %>% 205 | mutate(epoch = epoch + 1) 206 | loss <- load_data(loss_names) %>% 207 | rename(train_loss = loss) 208 | 209 | results4 <- 210 | inner_join(perf, loss) %>% 211 | mutate(xp = 'n_layers', 212 | downsample = 
TRUE) %>% 213 | select(xp, downsample, everything()) 214 | 215 | # unique(perf$max_length) 216 | # unique(loss$max_length) 217 | 218 | 219 | 220 | 221 | ##Check all 222 | # print('1 No Downsampling max_length') 223 | # sapply(results1 %>% select(1:12), unique) 224 | # print('2 No Downsampling n_layers') 225 | # sapply(results2 %>% select(1:12), unique) 226 | # print('3 With Downsampling max_length') 227 | # sapply(results3 %>% select(1:12), unique) 228 | # print('4 With Downsampling n_layers') 229 | # sapply(results4 %>% select(1:12), unique) 230 | 231 | ##Combine all 232 | results_lstm1 <- bind_rows(results1, results2, results3, results4) 233 | 234 | # results_lstm1 %>% write_rds(str_c(path_res, 'old(lstm)/', 'results_lstm1.rds')) 235 | 236 | 237 | #####LSTM Experiment 02########################################################################### 238 | ### Experiment 02 239 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/basiclstm/wrangled/' 240 | 241 | perf_names_drop <- 242 | c( 243 | 'perf_148991935973_drop.csv', 244 | 'perf_148992159916_drop.csv', 245 | 'perf_148992383928_drop.csv'#, 246 | # 'perf_148992558606_maxl.csv' 247 | ) 248 | 249 | perf_names_maxl <- 250 | c( 251 | # 'perf_148991935973_drop.csv', 252 | # 'perf_148992159916_drop.csv', 253 | # 'perf_148992383928_drop.csv'#, 254 | 'perf_148992558606_maxl.csv' 255 | ) 256 | 257 | loss_names_drop <- 258 | c( 259 | 'losses_148991935973_drop.csv', 260 | 'losses_148992159916_drop.csv', 261 | 'losses_148992383928_drop.csv'#, 262 | # 'losses_148992558606_maxl.csv', 263 | # 'losses_148992694411_maxl.csv' 264 | ) 265 | 266 | loss_names_maxl <- 267 | c( 268 | # 'losses_148991935973_drop.csv', 269 | # 'losses_148992159916_drop.csv', 270 | # 'losses_148992383928_drop.csv', 271 | 'losses_148992558606_maxl.csv', 272 | 'losses_148992694411_maxl.csv' 273 | ) 274 | 275 | ## Read data #max_length, dropout 276 | perf_drop <- load_data(perf_names_drop) %>% 277 | mutate(epoch = epoch + 1) %>% 278 | mutate(xp = 'dropout') 279 | perf_maxl <- load_data(perf_names_maxl) %>% 280 | mutate(epoch = epoch + 1) %>% 281 | mutate(xp = 'max_length') 282 | # perf <- bind_rows(perf_drop, 283 | 284 | loss_drop <- load_data(loss_names_drop) %>% 285 | rename(train_loss = loss) 286 | 287 | loss_maxl <- load_data(loss_names_maxl) %>% 288 | rename(train_loss = loss) 289 | 290 | results_drop <- 291 | inner_join(perf_drop, loss_drop) %>% 292 | mutate(#xp = 'max_length', 293 | downsample = FALSE) %>% 294 | select(xp, downsample, everything()) 295 | 296 | results_maxl <- 297 | inner_join(perf_maxl, loss_maxl) %>% 298 | mutate(#xp = 'max_length', 299 | downsample = FALSE) %>% 300 | select(xp, downsample, everything()) 301 | 302 | 303 | ## add baselines for other experiments 304 | # base1 <- 305 | # results_lstm1 %>% 306 | # filter(xp == 'max_length') %>% 307 | # filter(max_length == 150) %>% 308 | # mutate(xp = 'base_150') 309 | 310 | base_drop <- 311 | results_lstm1 %>% 312 | filter(xp == 'max_length') %>% 313 | filter(max_length == 75) %>% 314 | mutate(xp = 'dropout') 315 | 316 | ### final results of Exp 2 317 | results_lstm2 <- 318 | bind_rows(results_drop, results_maxl, base_drop) 319 | 320 | # unique(perf$max_length) 321 | # unique(loss$max_length) 322 | 323 | #####Combine LSTM data################################################################# 324 | ## Combine LSTM data 325 | 326 | 327 | results_lstm <- bind_rows(results_lstm1, results_lstm2) %>% 328 | mutate(model = 'Basic LSTM') %>% 329 | select(model, xp, everything()) 330 | 331 | # results_lstm %>% 
write_rds(str_c(path_res, 'results_lstm.rds')) 332 | 333 | #####Attention_data 01########################################################################### 334 | ## Attention_data 335 | 336 | ## Paths 337 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/attention/wrangled/' 338 | 339 | ### Experiments 340 | perf_names <- c( 341 | 'perf_148990711237_base.csv' 342 | ) 343 | loss_names <- c( 344 | 'losses_148990711238_base.csv' 345 | ) 346 | att_base <- load_data_combine(perf_names, loss_names, 347 | model_ = 'attention', xp_ = 'base150') 348 | 349 | perf_names <- c( 350 | 'perf_148990459067_maxl.csv', 351 | 'perf_148990711237_maxl.csv', 352 | 'perf_148991552302_maxl.csv', 353 | 'perf_148992831854_maxl.csv' 354 | ) 355 | loss_names <- c( 356 | 'losses_148990459067_maxl.csv', 357 | 'losses_148990711238_maxl.csv', 358 | 'losses_148991552302_maxl.csv', 359 | 'losses_148992831855_maxl.csv' 360 | ) 361 | att_maxl <- load_data_combine(perf_names, loss_names, 362 | model_ = 'attention', xp_ = 'max_length') 363 | 364 | perf_names <- c( 365 | 'perf_148990711237_att.csv', 366 | 'perf_148993077478_att.csv', 367 | 'perf_148993325155_att.csv', 368 | 'perf_148989809876_att.csv' 369 | ) 370 | loss_names <- c( 371 | 'losses_148990711238_att.csv', 372 | 'losses_148993077478_att.csv', 373 | 'losses_148993325156_att.csv', 374 | 'losses_148989809876_att.csv' 375 | ) 376 | att_att <- load_data_combine(perf_names, loss_names, 377 | model_ = 'attention', xp_ = 'attention_length') 378 | 379 | perf_names <- c( 380 | 'perf_148990711237_lr.csv', 381 | 'perf_148993571229_lr.csv', 382 | 'perf_148993821388_lr.csv' 383 | ) 384 | loss_names <- c( 385 | 'losses_148990711238_lr.csv', 386 | 'losses_148993571229_lr.csv', 387 | 'losses_148993821388_lr.csv' 388 | ) 389 | att_lr <- load_data_combine(perf_names, loss_names, 390 | model_ = 'attention', xp_ = 'lr') 391 | 392 | ### att_att has 40 rows too much!! 
because: max(att_att$n_epochs) is 50 ## 393 | results_att1 <- bind_rows(#att_base, 394 | att_maxl, att_att, att_lr) 395 | 396 | # results_att %>% write_rds(str_c(path_res, 'results_attention.rds')) 397 | 398 | 399 | #####Attention_data 02, Combine########################################################################### 400 | ## Attention_data 401 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/attention/wrangled/' 402 | perf_names <- c( 403 | 'perf_148998721083_nlay.csv', 404 | 'perf_148997977122_nlay.csv' 405 | ) 406 | 407 | results_att2 <- load_data(perf_names) %>% 408 | mutate(epoch = epoch + 1) %>% 409 | mutate(model = 'attention', xp = 'n_layers') %>% 410 | mutate(downsample = FALSE) 411 | 412 | results_att2 <- bind_rows(results_att2, (att_base %>% mutate(xp = 'n_layers'))) 413 | 414 | results_att <- bind_rows(results_att2, results_att1) %>% 415 | mutate(model = 'Attention LSTM') 416 | # results_att %>% write_rds(str_c(path_res, 'results_attention.rds')) 417 | 418 | 419 | #####Conditional ########################################################################### 420 | ### Conditional Data 421 | 422 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/conditional/wrangled/' 423 | perf_names <- c( 424 | 'perf_148995686583_max75.csv', 425 | 'perf_148996029453_max150.csv', 426 | 'perf_148996505757_max300.csv' 427 | ) 428 | 429 | results_cond1 <- load_data(perf_names) %>% 430 | mutate(epoch = epoch + 1, 431 | max_length = b_max_len, 432 | xp = 'max_length') 433 | 434 | 435 | perf_names <- c( 436 | 'perf_148996029453_max150.csv', 437 | 'perf_149000293039_nlay.csv', 438 | 'perf_14899932587_nlay.csv' 439 | ) 440 | 441 | results_cond2 <- load_data(perf_names) %>% 442 | mutate(epoch = epoch + 1, 443 | max_length = b_max_len, 444 | xp = 'n_layers') 445 | 446 | results_cond <- bind_rows(results_cond1, results_cond2) %>% 447 | mutate(model = 'CEA LSTM') 448 | 449 | 450 | # results_cond %>% write_rds(str_c('C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/', 451 | # 'results_cond.rds')) 452 | 453 | 454 | #####BOW Data#################################################################### 455 | 456 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/bow/wrangled/' 457 | 458 | perf_names <- c( 459 | 'perf_14899707540.csv', 460 | 'perf_148996933874.csv', 461 | 'perf_148996941198.csv', 462 | 'perf_148996948691.csv', 463 | 'perf_148997030832.csv', 464 | 'perf_148997038112.csv', 465 | 'perf_148997045544.csv', 466 | 'perf_148997052838.csv', 467 | 'perf_148997060346.csv', 468 | 'perf_148997067842.csv', 469 | 'perf_148997083262.csv', 470 | 'perf_148997091668.csv', 471 | 'perf_148997099947.csv', 472 | 'perf_148997108384.csv', 473 | 'perf_148997116878.csv' 474 | ) 475 | 476 | results_bow <- load_data(perf_names) 477 | 478 | ## add missing variable embedding 4 layer runs for 75, 300 and 600 max_length 479 | results_bow_add <- results_bow %>% 480 | filter(trainable_embeddings == 'Constant', 481 | n_layers == 3, 482 | b_max_len %in% c(75, 300, 600)) %>% 483 | mutate(trainable_embeddings = 'Variable') 484 | results_bow <- bind_rows(results_bow, results_bow_add) 485 | 486 | ## add 150 max_len as n_layers experiment 487 | results_bow_add <- results_bow %>% 488 | filter(trainable_embeddings == 'Variable', 489 | b_max_len %in% c(150)) %>% 490 | mutate(xp = 'n_layers') 491 | results_bow <- bind_rows(results_bow, results_bow_add) 492 | 493 | ## add 4 layers as max_len experiment 494 | results_bow_add <- results_bow %>% 495 | filter(trainable_embeddings == 'Variable', 496 | n_layers == 3, 
497 | xp == 'layers') %>% 498 | mutate(xp = 'max_length') 499 | results_bow <- bind_rows(results_bow, results_bow_add) 500 | 501 | 502 | 503 | results_bow <- results_bow %>% 504 | mutate( 505 | epoch = epoch + 1, 506 | n_layers = n_layers + 1, 507 | max_length = b_max_len, 508 | model = 'BOW' 509 | ) 510 | 511 | # results_bow %>% write_rds(str_c(path_res, 'results_bow.rds')) 512 | 513 | # model_bow <- results_bow %>% select(attention_length:xp) %>% distinct %>% 514 | # arrange(model, xp, trainable_embeddings, max_length, n_layers) 515 | 516 | #####COMBINE ALL########################################################################### 517 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/' 518 | 519 | results <- bind_rows( 520 | results_lstm, 521 | results_att %>% mutate(b_max_len = NA_integer_, h_max_len = NA_integer_), 522 | results_cond %>% mutate(downsample = FALSE), 523 | results_bow %>% mutate(downsample = FALSE) 524 | ) 525 | 526 | # results %>% write_rds(str_c('C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/', 'results.rds')) 527 | 528 | ## debug 529 | # names(results_lstm) 530 | # names(results_att) 531 | # names(results_cond) 532 | # names(results_bow) 533 | 534 | # results_lstm %>% select(b_max_len) %>% head(5) 535 | # results_att %>% select(b_max_len) %>% head(5) 536 | # results_cond %>% select(b_max_len) %>% head(5) 537 | # results_bow %>% select(b_max_len) %>% head(5) 538 | 539 | # results_lstm %>% select(downsample) %>% head(5) 540 | # results_att %>% select(downsample) %>% head(5) 541 | # results_cond %>% select(downsample) %>% head(5) 542 | # results_bow %>% select(downsample) %>% head(5) 543 | 544 | #####FINAL RESULTS################################################################# 545 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/final/' 546 | 547 | results_final_bow <- load_data('perf_149004079896.csv') %>% 548 | mutate(max_length = b_max_len, 549 | model = 'BOW') 550 | results_final_lstm <- load_data('perf_149004484911.csv') %>% 551 | mutate(model = 'Basic LSTM') 552 | results_final_att <- load_data('perf_149004809705.csv') %>% 553 | mutate(model = 'Attention LSTM') 554 | results_final_cond <- load_data('perf_149005331987.csv') %>% 555 | mutate(max_length = b_max_len, 556 | model = 'CEA LSTM') 557 | 558 | results_final <- bind_rows(results_final_bow, 559 | results_final_lstm, 560 | results_final_att, 561 | results_final_cond) %>% 562 | mutate(epoch = epoch + 1) 563 | 564 | # results_final %>% distinct(model) 565 | 566 | # results_final %>% write_rds(str_c(path_res, 'results_final.rds')) 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | -------------------------------------------------------------------------------- /code/data_analysis_plotting/data_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "FakeNewsChallenge" 3 | author: "Oskar Triebe" 4 | date: "February 3, 2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ```{r, message=FALSE} 13 | library(tidyverse) 14 | library(stringr) 15 | library(modelr) 16 | ``` 17 | 18 | ## Data Loading 19 | 20 | ```{r} 21 | ### Paths 22 | # url_train_bodies <- 23 | # 'https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/train_bodies.csv' 24 | # url_train_stances <- 25 | # 'https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/train_stances.csv' 26 | # url_train_stances.random <- 
'https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/train_stances.random.csv' 27 | 28 | path_data <- 'C:/Users/OurOwnStory/Desktop/MyDrive/6_MSC/00_16-17 Winter/CS224n Natural Language Processing with Deep Learning/Project/R_Data/' 29 | path_fig <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/Figures/ggplot2/' 30 | ``` 31 | 32 | 33 | ```{r} 34 | ### save data 35 | # read_csv( 36 | # url_train_bodies, 37 | # col_names = TRUE, 38 | # col_types = cols( 39 | # `Body ID` = col_integer(), 40 | # articleBody = col_character() 41 | # ) 42 | # ) %>% 43 | # rename(body_id = `Body ID`, 44 | # body = articleBody 45 | # ) %>% 46 | # write_rds(str_c(path_data, 'bodies.rds')) 47 | # 48 | # read_csv( 49 | # url_train_stances, 50 | # col_names = TRUE, 51 | # col_types = cols( 52 | # Headline = col_character(), 53 | # `Body ID` = col_integer(), 54 | # Stance = col_character()) 55 | # ) %>% 56 | # rename(body_id = `Body ID`, 57 | # stance = Stance, 58 | # headline = Headline 59 | # ) %>% 60 | # write_rds(str_c(path_data, 'stances.rds')) 61 | # 62 | # read_csv( 63 | # url_train_stances.random, 64 | # col_names = TRUE, 65 | # col_types = cols( 66 | # Headline = col_character(), 67 | # `Body ID` = col_integer(), 68 | # Stance = col_character() 69 | # ) 70 | # ) %>% 71 | # rename(body_id = `Body ID`, 72 | # stance = Stance, 73 | # headline = Headline 74 | # ) %>% 75 | # write_rds(str_c(path_data, 'stances_random.rds')) 76 | 77 | ``` 78 | 79 | ```{r load data} 80 | bodies <- read_rds(str_c(path_data, 'bodies.rds')) 81 | stances <- read_rds(str_c(path_data, 'stances.rds')) 82 | stances_random <- read_rds(str_c(path_data, 'stances_random.rds')) 83 | 84 | data <- 85 | stances %>% 86 | inner_join(bodies, by = 'body_id') %>% 87 | select(body_id, stance, body, headline) %>% 88 | arrange(body_id, stance, headline) 89 | 90 | ## Remove doubles 91 | data <- 92 | data %>% 93 | distinct(body_id, stance, body, headline) 94 | 95 | ## String lengths 96 | data <- 97 | data %>% 98 | mutate(h_len = map_int(headline, str_length), 99 | b_len = map_int(body, str_length), 100 | h_words = map_int(headline, str_count, pattern = '[^\\w]+'), 101 | b_words = map_int(body, str_count, pattern = '[^\\w]+')) 102 | ``` 103 | 104 | ## Data Distribution 105 | 106 | 107 | ```{r} 108 | bodies_unique <- 109 | bodies %>% 110 | distinct(body) #%>% left_join(bodies, by = 'articleBody') 111 | 112 | stances_unique <- 113 | stances %>% 114 | distinct(headline) #%>% left_join(stances, by = 'Headline') 115 | 116 | n_bodies <- 117 | (bodies %>% nrow()) 118 | n_bodies_unique <- 119 | (bodies_unique %>% nrow()) 120 | 121 | n_stances_unique <- 122 | (stances_unique %>% nrow()) 123 | n_stances <- 124 | (stances %>% nrow()) 125 | 126 | n_data <- 127 | data %>% nrow() 128 | 129 | n_data_distinct <- 130 | data %>% 131 | distinct(body_id, headline, stance) %>% nrow() 132 | 133 | n_data_double <- 134 | n_data - n_data_distinct 135 | 136 | prop_bodies_unique <- 137 | n_bodies_unique / n_bodies 138 | prop_stances_unique <- 139 | n_stances_unique / n_stances 140 | 141 | ``` 142 | 143 | 144 | ```{r} 145 | str_c('Propp of unique bodies: ', prop_bodies_unique) 146 | 147 | str_c('Propp of unique headlines: ', prop_stances_unique) 148 | ``` 149 | 150 | 151 | 152 | 153 | ```{r} 154 | # Check doubles 155 | data %>% 156 | count(body_id, headline) %>% filter(n > 1) 157 | 158 | #Check match 159 | stances %>% 160 | anti_join(bodies, by = 'body_id') 161 | 162 | #Check unique id 163 | bodies %>% count(body_id) %>% filter(n > 1) 164 | 165 | ## 402 double entries 166 | 
headline_double <- 167 | stances %>% 168 | count(body_id, headline, stance) %>% 169 | filter(n > 1) 170 | 171 | n_data_double == headline_double %>% nrow() 172 | 173 | ``` 174 | 175 | 176 | 177 | ```{r} 178 | data %>% count(stance) %>% mutate(prop = n / n_data) 179 | ``` 180 | 181 | 182 | ## Lengths of bodies and headlines 183 | 184 | ```{r} 185 | print(str_c('Median headline words: ', median(data$h_words))) 186 | summary(data$h_words) 187 | print(str_c('Median body words: ', median(data$b_words))) 188 | summary(data$b_words) 189 | ``` 190 | 191 | 192 | ```{r} 193 | head_distr <- 194 | data %>% 195 | ggplot() + 196 | geom_ref_line(v = median(data$h_words), 197 | colour = 'grey70') + 198 | geom_histogram(aes(h_words), binwidth = 1) + 199 | labs(title = 'Article Headline Length Distribution', 200 | subtitle = 'The median headline has 10 words (first quartile 8, thrid quartile 13, minimum 1, maximum 40).', 201 | x = 'Number of Words', y = 'Count', 202 | caption = 'based on full dataset fnc-1 from FakeNewsChallenge.org') 203 | head_distr 204 | ggsave(plot = head_distr, filename = str_c('head_distr', '.png'), 205 | width = 7, height = 3, dpi = 900, units = 'in', 206 | path = path_fig, device = 'png') 207 | ``` 208 | 209 | ```{r} 210 | body_distr <- 211 | data %>% 212 | ggplot() + 213 | geom_ref_line(v = median(data$b_words), 214 | colour = 'grey70') + 215 | geom_histogram(aes(b_words), binwidth = 50) + 216 | scale_x_continuous(limits = c(-100, 2500)) + 217 | labs(title = 'Article Body Length Distribution', 218 | subtitle = 'The body headline has 315 words (first quartile 206, thrid quartile 477, minimum 3, maximum 4937).', 219 | x = 'Number of Words', y = 'Count', 220 | caption = 'based on full dataset fnc-1 from FakeNewsChallenge.org') 221 | body_distr 222 | ggsave(plot = body_distr, filename = 'body_distr.png', 223 | width = 7, height = 3, dpi = 900, units = 'in', 224 | path = path_fig, device = 'png') 225 | ``` 226 | 227 | 228 | 229 | 230 | ## Distribution among bodies and headlines 231 | 232 | ```{r} 233 | # Distribution of stances among bodies 234 | distr_body_stance <- 235 | data %>% 236 | group_by(body_id) %>% 237 | count(stance) %>% 238 | spread(key = stance, value = n, fill = 0) %>% 239 | gather(key = stance, value = n, c(agree, disagree, discuss, unrelated)) %>% 240 | mutate(prop = n / sum(n)) 241 | 242 | # Distribution of stances among headlines 243 | distr_headline_stance <- 244 | data %>% 245 | group_by(headline) %>% 246 | count(stance) %>% 247 | spread(key = stance, value = n, fill = 0) %>% 248 | gather(key = stance, value = n, c(agree, disagree, discuss, unrelated)) %>% 249 | mutate(prop = n / sum(n)) 250 | 251 | # Number of uses of each body 252 | distr_body <- 253 | distr_body_stance %>% 254 | group_by(body_id) %>% 255 | count(wt = n) 256 | 257 | # Number of uses of each headline 258 | distr_headline <- distr_headline_stance %>% 259 | group_by(headline) %>% 260 | count(wt = n) 261 | ``` 262 | 263 | ```{r, fig.asp=0.5, warning = FALSE} 264 | # Number of uses of each body 265 | distr_body %>% 266 | ggplot(aes(nn)) + 267 | # geom_histogram(binwidth = 2) + 268 | geom_freqpoly(binwidth = 2, color = 'black', size = 0.5) + 269 | # stat_ecdf() + 270 | theme_minimal() 271 | 272 | # Number of uses of each headline 273 | distr_headline %>% 274 | ggplot(aes(nn)) + 275 | # geom_histogram(binwidth = 2) + 276 | geom_freqpoly(binwidth = 2, color = 'black', size = 0.5) + 277 | # stat_ecdf() + 278 | theme_minimal() 279 | 280 | # Number of uses of each body and headline 281 | distr_body %>% 282 | 
ggplot(aes(nn)) + 283 | # geom_histogram(binwidth = 2) + 284 | # geom_freqpoly(binwidth = 2, color = 'black', size = 0.5) + 285 | stat_ecdf(color = 'red', size = 1) + 286 | stat_ecdf(data = distr_headline, size = 1) + 287 | theme_minimal() 288 | ``` 289 | 290 | 291 | ```{r, fig.asp=1/2, warning = FALSE} 292 | # HIST Distribution of stances among bodies 293 | 294 | # distr_body_stance %>% 295 | # ggplot(aes(n)) + 296 | # geom_histogram(binwidth = 1) + 297 | # # geom_freqpoly(binwidth = 1, color = 'blue') + 298 | # facet_wrap(~stance, nrow = 1) 299 | # 300 | # distr_body_stance %>% 301 | # ggplot(aes(prop)) + 302 | # geom_histogram(binwidth = 0.02) + 303 | # # geom_freqpoly(binwidth = 0.01, color = 'blue') + 304 | # facet_wrap(~stance, nrow = 1) 305 | # 306 | # ## log 307 | # distr_body_stance %>% 308 | # ggplot(aes(n)) + 309 | # geom_histogram(binwidth = 1) + 310 | # # geom_freqpoly(binwidth = 1, color = 'blue') + 311 | # facet_wrap(~stance, nrow = 1) + 312 | # scale_y_log10() 313 | # 314 | # distr_body_stance %>% 315 | # ggplot(aes(prop)) + 316 | # geom_histogram(binwidth = 0.02) + 317 | # # geom_freqpoly(binwidth = 0.01, color = 'blue') + 318 | # facet_wrap(~stance, nrow = 1) + 319 | # scale_y_log10() 320 | 321 | ``` 322 | 323 | 324 | 325 | ```{r, fig.asp=1/2, warning = FALSE} 326 | ## ECDF Distribution of stances among bodies 327 | distr_body_stance %>% 328 | ggplot(aes(n, color = stance)) + 329 | stat_ecdf(size = 1) + 330 | theme_minimal() 331 | 332 | distr_body_stance %>% 333 | ggplot(aes(prop,color = stance)) + 334 | stat_ecdf(size = 1) + 335 | theme_minimal() 336 | 337 | ## FREQPOLY Distribution of stances among bodies 338 | distr_body_stance %>% 339 | ggplot(aes(n)) + 340 | geom_freqpoly(binwidth = 3, color = 'black', size = 0.5) + 341 | facet_wrap(~stance, nrow = 1) + 342 | theme_minimal() 343 | 344 | distr_body_stance %>% 345 | ggplot(aes(prop)) + 346 | geom_freqpoly(binwidth = 0.04, color = 'black') + 347 | facet_wrap(~stance, nrow = 1) + 348 | theme_minimal() 349 | 350 | ## log 351 | # distr_body_stance %>% 352 | # ggplot(aes(n)) + 353 | # geom_freqpoly(binwidth = 5, color = 'blue') + 354 | # facet_wrap(~stance, nrow = 1) + 355 | # scale_y_log10() 356 | # 357 | # distr_body_stance %>% 358 | # ggplot(aes(prop)) + 359 | # geom_freqpoly(binwidth = 0.05, color = 'blue') + 360 | # facet_wrap(~stance, nrow = 1) + 361 | # scale_y_log10() 362 | 363 | ``` 364 | 365 | 366 | ```{r, fig.asp=1/2, warning = FALSE} 367 | ## ECDF Distribution of stances among bodies 368 | distr_headline_stance %>% 369 | ggplot(aes(n, color = stance)) + 370 | stat_ecdf(size = 1) + 371 | theme_minimal() 372 | 373 | distr_headline_stance %>% 374 | ggplot(aes(prop,color = stance)) + 375 | stat_ecdf(size = 1) + 376 | theme_minimal() 377 | 378 | 379 | ## FREQPOLY Distribution of stances among stances 380 | distr_headline_stance %>% 381 | ggplot(aes(n)) + 382 | geom_freqpoly(binwidth = 3, color = 'black', size = 0.5) + 383 | facet_wrap(~stance, nrow = 1) + 384 | theme_minimal() 385 | 386 | distr_headline_stance %>% 387 | ggplot(aes(prop)) + 388 | geom_freqpoly(binwidth = 0.03, color = 'black') + 389 | facet_wrap(~stance, nrow = 1) + 390 | theme_minimal() 391 | 392 | ## log 393 | # distr_headline_stance %>% 394 | # ggplot(aes(n)) + 395 | # geom_freqpoly(binwidth = 5, color = 'blue') + 396 | # facet_wrap(~stance, nrow = 1) + 397 | # scale_y_log10() 398 | # 399 | # distr_headline_stance %>% 400 | # ggplot(aes(prop)) + 401 | # geom_freqpoly(binwidth = 0.05, color = 'blue') + 402 | # facet_wrap(~stance, nrow = 1) + 403 | # 
scale_y_log10() 404 | 405 | ``` 406 | 407 | 408 | ## Conclusions 409 | 410 | * Headlines always have at least one body pair that is unrelated. 411 | 412 | 413 | 414 | ## Better Data management 415 | 416 | 417 | ```{r} 418 | headers_new <- 419 | stances_unique %>% 420 | mutate(h_id = row_number(), 421 | h_words = map_int(headline, str_count, pattern = '[^\\w]+')) 422 | 423 | bodies_new <- 424 | bodies_unique %>% 425 | mutate(b_id = row_number(), 426 | b_words = map_int(body, str_count, pattern = '[^\\w]+')) 427 | 428 | 429 | data_new <- 430 | data %>% 431 | select(stance, headline, body) %>% 432 | left_join(headers_new, by = 'headline') %>% 433 | left_join(bodies_new, by = 'body') 434 | 435 | ## Check 436 | # data_new %>% 437 | # anti_join(data) 438 | 439 | data_new <- 440 | data_new %>% 441 | select(stance, h_id, b_id) 442 | 443 | ``` 444 | 445 | 446 | ```{r} 447 | ## Test naive splitting bleedover 448 | data_new_random <- sample.int(n = (data_new %>% nrow()), size = 10) 449 | 450 | 451 | ## Test Random headline splitting bleedover 452 | ``` 453 | 454 | 455 | -------------------------------------------------------------------------------- /code/data_analysis_plotting/results_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Results FakeNewsChallenge" 3 | author: "Oskar Triebe" 4 | date: "March 18, 2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ```{r} 13 | library(tidyverse) 14 | library(stringr) 15 | library(forcats) 16 | # help(package = 'forcats') 17 | ``` 18 | 19 | 20 | ## Plots 21 | 22 | ```{r} 23 | # mutate( 24 | # max_length = fct_rev(factor(max_length)), 25 | # n_layers = fct_rev(factor(n_layers)) 26 | # ) 27 | ``` 28 | 29 | 30 | ### Train Loss 31 | 32 | ```{r} 33 | # results %>% 34 | # filter(!downsample) %>% 35 | # filter(xp == 'max_length') %>% 36 | # ggplot(aes(x = epoch)) + 37 | # geom_line(aes(y = train_loss, color = max_length), 38 | # size = 1) + 39 | # labs(title = 'No Downsampling max_length') 40 | 41 | # title = str_c('XP: ', xp, ', downsampled: ', downsample) 42 | ``` 43 | 44 | ```{r} 45 | # results %>% 46 | # filter(downsample) %>% 47 | # filter(xp == 'max_length') %>% 48 | # ggplot(aes(x = epoch)) + 49 | # geom_line(aes(y = train_loss, color = max_length), 50 | # size = 1) + 51 | # labs(title = 'With Downsampling max_length') 52 | ``` 53 | 54 | ```{r} 55 | # results %>% 56 | # filter(!downsample) %>% 57 | # filter(xp == 'n_layers') %>% 58 | # ggplot(aes(x = epoch)) + 59 | # geom_line(aes(y = train_loss, color = n_layers), 60 | # size = 1) + 61 | # labs(title = 'No Downsampling n_layers') 62 | ``` 63 | 64 | ```{r} 65 | # results %>% 66 | # filter(downsample) %>% 67 | # filter(xp == 'n_layers') %>% 68 | # ggplot(aes(x = epoch)) + 69 | # geom_line(aes(y = train_loss, color = n_layers), 70 | # size = 1) + 71 | # labs(title = 'With Downsampling n_layers') 72 | ``` 73 | 74 | 75 | ```{r} 76 | results %>% 77 | # filter(!downsample) %>% 78 | filter(xp == 'max_length') %>% 79 | ggplot(aes(x = epoch)) + 80 | geom_line(aes(y = train_loss, color = max_length), 81 | size = 1) + 82 | labs(title = 'Train Loss: max_length, downsample') + 83 | facet_wrap(~downsample) + 84 | coord_cartesian(ylim = c(0.0, 0.8)) 85 | ``` 86 | 87 | ```{r} 88 | results %>% 89 | # filter(downsample) %>% 90 | filter(xp == 'n_layers') %>% 91 | ggplot(aes(x = epoch)) + 92 | geom_line(aes(y = train_loss, color = n_layers), 93 | size = 1) + 94 | labs(title = 'Train 
Loss: n_layers, downsample') + 95 | facet_wrap(~downsample) + 96 | coord_cartesian(ylim = c(0.0, 0.8)) 97 | ``` 98 | 99 | ### Competition Score 100 | 101 | 102 | ```{r} 103 | results %>% 104 | filter(xp == 'max_length') %>% 105 | mutate( 106 | max_length = fct_rev(max_length), 107 | n_layers = fct_rev(n_layers)) %>% 108 | ggplot(aes(x = epoch)) + 109 | geom_line(aes(y = competition, color = max_length), 110 | size = 1) + 111 | labs(title = 'Competition Score: max_length, downsample') + 112 | facet_wrap(~downsample) + 113 | coord_cartesian(ylim = c(0.45, 0.75)) 114 | ``` 115 | 116 | ```{r} 117 | results %>% 118 | filter(xp == 'n_layers') %>% 119 | mutate( 120 | max_length = fct_rev(max_length), 121 | n_layers = fct_rev(n_layers)) %>% 122 | ggplot(aes(x = epoch)) + 123 | geom_line(aes(y = competition, color = n_layers), 124 | size = 1) + 125 | labs(title = 'Competition Score: n_layers, downsample') + 126 | facet_wrap(~downsample) + 127 | coord_cartesian(ylim = c(0.45, 0.75)) 128 | ``` 129 | 130 | 131 | ### F1 for each Stance 132 | 133 | ```{r, fig.asp = 1} 134 | results %>% 135 | filter(xp == 'n_layers') %>% 136 | mutate(max_length = fct_rev(max_length), 137 | n_layers = fct_rev(n_layers)) %>% 138 | ggplot(aes(x = epoch)) + 139 | geom_line(aes(y = f1, color = n_layers), 140 | size = 1) + 141 | labs(title = 'F1: n_layers, stance, downsample') + 142 | facet_grid(class~downsample, scales = 'free') 143 | # facet_grid(downsample~class, scales = 'free') 144 | ``` 145 | 146 | 147 | ```{r, fig.asp = 1} 148 | results %>% 149 | filter(xp == 'n_layers') %>% 150 | # mutate(max_length = fct_rev(max_length), 151 | # n_layers = fct_rev(n_layers)) %>% 152 | ggplot(aes(x = epoch)) + 153 | geom_line(aes(y = f1, color = downsample), 154 | size = 1) + 155 | labs(title = 'F1: n_layers, stance, downsample') + 156 | facet_grid(class~n_layers) 157 | # coord_cartesian(ylim = c(0.45, 0.75)) 158 | ``` 159 | 160 | 161 | 162 | 163 | 164 | 165 | ```{r, fig.asp = 1} 166 | results %>% 167 | filter(xp == 'max_length') %>% 168 | mutate(max_length = fct_rev(max_length)) %>% 169 | ggplot(aes(x = epoch)) + 170 | geom_line(aes(y = f1, color = max_length), 171 | size = 1) + 172 | labs(title = 'F1: max_length, stance, downsample') + 173 | facet_grid(class~downsample, scales = 'free') 174 | # facet_grid(downsample~class, scales = 'free') 175 | ``` 176 | 177 | 178 | ```{r, fig.asp = 1} 179 | results %>% 180 | filter(xp == 'max_length') %>% 181 | mutate(max_length = fct_rev(max_length)) %>% 182 | ggplot(aes(x = epoch)) + 183 | geom_line(aes(y = f1, color = downsample), 184 | size = 1) + 185 | labs(title = 'F1: max_length, stance, downsample') + 186 | facet_grid(class~max_length) 187 | # coord_cartesian(ylim = c(0.45, 0.75)) 188 | ``` 189 | 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /code/data_analysis_plotting/results_analysis_2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Results FakeNewsChallenge" 3 | author: "Oskar Triebe" 4 | date: "March 18, 2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ```{r} 13 | library(tidyverse) 14 | library(stringr) 15 | library(forcats) 16 | # help(package = 'forcats') 17 | ``` 18 | 19 | ## Options 20 | 21 | ### Data 22 | ```{r} 23 | path_res <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/' 24 | results <- read_rds(str_c(path_res, 'results.rds')) 25 | 26 | path_res_final <- 
'C:/Users/OurOwnStory/GitHub/altfactcheckers/xp/final/' 27 | results_final <- read_rds(str_c(path_res_final, 'results_final.rds')) 28 | path_fig <- 'C:/Users/OurOwnStory/GitHub/altfactcheckers/Figures/ggplot2/' 29 | 30 | 31 | results <- results %>% 32 | filter(!downsample) %>% 33 | filter(trainable_embeddings == 'Variable') 34 | 35 | # name_model <- function(x){ 36 | # if (x == 'bow') {return('BOW')} 37 | # if (x == 'basiclstm') {return('Basic LSTM')} 38 | # if (x == 'attention') {return('Attention LSTM')} 39 | # if (x == 'conditional') {return('CEA LSTM')} 40 | # } 41 | 42 | model_names = c('BOW', 'Basic LSTM', 'Attention LSTM', 'CEA LSTM') 43 | results <- results %>% 44 | select(model:hidden_size, lr:n_classes, everything()) %>% 45 | mutate(model = factor(model, levels = model_names)) #%>% 46 | # mutate( 47 | # max_length = fct_rev(factor(max_length), levels = c(50, 75, 150, 200, 300, 600)), 48 | # n_layers = fct_rev(factor(n_layers), levels = c(1, 2, 4)) 49 | # ) 50 | 51 | # same for final res 52 | results_final <- results_final %>% 53 | select(model:hidden_size, lr:n_classes, everything()) %>% 54 | mutate(model = factor(model, levels = model_names)) 55 | 56 | ## What we can analyze: 57 | params <- sapply(results %>% select(model:n_classes), unique) 58 | 59 | models <- 60 | results %>% 61 | select(model:n_classes) %>% 62 | # filter(model == 'basiclstm') %>% 63 | # filter(xp == 'base_150') %>% 64 | distinct() %>% 65 | arrange(model, xp, trainable_embeddings, max_length, n_layers) 66 | 67 | hyperparams <- names(models) 68 | hyperparams_class <- c(hyperparams, 'class') 69 | 70 | ### Max F1 score 71 | results <- 72 | results %>% 73 | group_by_(.dots = hyperparams_class) %>% 74 | mutate( 75 | competition_max = max(competition), 76 | epoch_max_comp = epoch[which.max(competition)], 77 | f1_max_comp = f1[which.max(competition)] 78 | ) %>% 79 | ungroup() 80 | 81 | ## Rename Stances to classes: 82 | results$class[results$class == 0] = 'Related: Agree' 83 | results$class[results$class == 1] = 'Related: Disagree' 84 | results$class[results$class == 2] = 'Related: Discuss' 85 | results$class[results$class == 3] = 'Unrelated' 86 | ## same for final res 87 | results_final$class[results_final$class == 0] = 'Related: Agree' 88 | results_final$class[results_final$class == 1] = 'Related: Disagree' 89 | results_final$class[results_final$class == 2] = 'Related: Discuss' 90 | results_final$class[results_final$class == 3] = 'Unrelated' 91 | 92 | ``` 93 | 94 | ### Plotting 95 | 96 | ```{r} 97 | ## set theme light 98 | # theme_set(theme_light() + theme(panel.grid = element_blank())) 99 | theme_set(theme_light() + theme(panel.grid.major.x = element_blank(), 100 | panel.grid.minor.x = element_blank())) 101 | ## set theme dark 102 | # theme_set(theme_dark() + theme(panel.grid = element_blank())) 103 | # theme_set(theme_dark() + theme(panel.grid.major.x = element_blank(), 104 | # panel.grid.minor.x = element_blank())) 105 | ``` 106 | 107 | ```{r} 108 | breaks_max_length = c(75, 150, 300, 600) 109 | breaks_max_length_short = c(75, 150, 300) 110 | breaks_max_length_all = c(50, 75, 150, 300, 600) 111 | breaks_n_layers = c(1, 2, 4) 112 | ``` 113 | 114 | 115 | ```{r} 116 | ## plotting vars for Competition Scores 117 | comp_ylim = c(0.60, 0.80) 118 | comp_ylim_low = c(0.60, 0.75) 119 | comp_ylim_high = c(0.65, 0.80) 120 | comp_ylim_zoom = c(0.65, 0.75) 121 | comp_ylim_bow = c(0.40, 0.80) 122 | ``` 123 | 124 | ```{r} 125 | ## plotting vars for F1 126 | f1_ylim = c(0.30, 1.00) 127 | f1_ylim_low = c(0.60, 0.75) 128 | 
f1_ylim_high = c(0.65, 0.80) 129 | f1_ylim_zoom = c(0.65, 0.75) 130 | f1_ylim_bow = c(0.40, 0.80) 131 | ``` 132 | 133 | ```{r} 134 | # trancation_labels <- c( 135 | # 30 = 136 | ``` 137 | 138 | 139 | 140 | 141 | ## Competition Score 142 | 143 | ```{r, fig.asp = 0.8} 144 | score_max_length <- 145 | results %>% 146 | filter(xp %in% c('max_length')) %>% 147 | filter(max_length != 50) %>% 148 | mutate( 149 | max_length = str_c('Truncation: ', max_length), 150 | max_length = factor(max_length, levels = str_c('Truncation: ', c(75, 150, 200, 300, 600))) 151 | ) %>% 152 | ggplot(aes(x = epoch, color = model)) + 153 | geom_line(aes(y = competition), size = 1) + 154 | geom_point( 155 | data = results %>% distinct_(.dots = c(hyperparams, 'epoch_max_comp', 'competition_max')) %>% 156 | filter(xp %in% c('max_length')) %>% filter(max_length != 50) %>% 157 | mutate(max_length = str_c('Truncation: ', max_length), 158 | max_length = factor(max_length, levels = str_c('Truncation: ', c(75, 150, 200, 300, 600)))), 159 | mapping = aes(x = epoch_max_comp, y = competition_max), size = 2, alpha = 0.6 160 | ) + 161 | coord_cartesian(ylim = comp_ylim) + 162 | facet_wrap(~max_length, nrow = 1) + 163 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 164 | labs( 165 | title = 'Sensitivity of Competition Score to Sequence Truncation', 166 | x = 'Epoch', y = 'Competition Score', 167 | subtitle = 'BOW and CEA LSTM models perform best at shortest and longest truncation lengths. \nBasic LSTM and Attention LSTM models perform best at shortest truncation lengths.', 168 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%)' 169 | ) + 170 | guides(color = guide_legend(title = 'Model')) 171 | 172 | score_max_length 173 | 174 | ggsave(plot = score_max_length, filename = str_c('score_max_length', '.png'), 175 | width = 8, height = 6, dpi = 900, units = 'in', 176 | path = path_fig, device = 'png') 177 | ``` 178 | 179 | 180 | ```{r, fig.asp = 0.8} 181 | ### For the report 182 | score_max_length <- 183 | results %>% 184 | filter(xp %in% c('max_length')) %>% 185 | filter(max_length != 50) %>% 186 | mutate( 187 | max_length = str_c('Truncation: ', max_length), 188 | max_length = factor(max_length, levels = str_c('Truncation: ', c(75, 150, 200, 300, 600))) 189 | ) %>% 190 | ggplot(aes(x = epoch, color = model)) + 191 | geom_line(aes(y = competition), size = 1) + 192 | geom_point( 193 | data = results %>% distinct_(.dots = c(hyperparams, 'epoch_max_comp', 'competition_max')) %>% 194 | filter(xp %in% c('max_length')) %>% filter(max_length != 50) %>% 195 | mutate(max_length = str_c('Truncation: ', max_length), 196 | max_length = factor(max_length, levels = str_c('Truncation: ', c(75, 150, 200, 300, 600)))), 197 | mapping = aes(x = epoch_max_comp, y = competition_max), size = 2, alpha = 0.6 198 | ) + 199 | coord_cartesian(ylim = comp_ylim) + 200 | facet_wrap(~max_length, nrow = 1) + 201 | theme(legend.position = 'bottom', legend.text = element_text(size = 10), 202 | title = element_blank()) + 203 | labs( 204 | title = 'Sensitivity of Competition Score to Sequence Truncation', 205 | x = 'Epoch', y = 'Competition Score', 206 | subtitle = 'BOW and CEA LSTM models perform best at shortest and longest truncation lengths. 
\nBasic LSTM and Attention LSTM models perform best at shortest truncation lengths.', 207 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%)' 208 | ) + 209 | guides(color = guide_legend(title = 'Model')) 210 | 211 | score_max_length 212 | 213 | ggsave(plot = score_max_length, filename = str_c('score_max_length_report', '.png'), 214 | width = 8, height = 4, dpi = 900, units = 'in', 215 | path = path_fig, device = 'png') 216 | ``` 217 | 218 | 219 | 220 | ```{r, fig.asp = 0.8} 221 | score_n_layers <- 222 | results %>% 223 | filter(xp %in% c('n_layers')) %>% 224 | mutate( 225 | n_layers = str_c('Layers: ', n_layers), 226 | n_layers = factor(n_layers, levels = str_c('Layers: ', c(1, 2, 4))) 227 | ) %>% 228 | ggplot(aes(x = epoch, color = model)) + 229 | geom_line(aes(y = competition), 230 | size = 1) + 231 | geom_point( 232 | data = results %>% distinct_(.dots = c(hyperparams, 'epoch_max_comp', 'competition_max')) %>% 233 | filter(xp %in% c('n_layers')) %>% filter(max_length != 50) %>% 234 | mutate(n_layers = str_c('Layers: ', n_layers), 235 | n_layers = factor(n_layers, levels = str_c('Layers: ', c(1, 2, 4)))), 236 | mapping = aes(x = epoch_max_comp, y = competition_max), size = 2, alpha = 0.6 237 | ) + 238 | coord_cartesian(ylim = comp_ylim) + # cuts off 1 layer BOW 239 | facet_wrap(~n_layers, nrow = 1) + 240 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 241 | labs( 242 | title = 'Sensitivity of Competition Score to Hidden Layers', 243 | x = 'Epoch', y = 'Competition Score', 244 | subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. \nBOW and Attention LSTM models perform best with more hidden layers.', 245 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%)' 246 | ) + 247 | guides(color = guide_legend(title = 'Model')) 248 | 249 | 250 | score_n_layers 251 | 252 | ggsave(plot = score_n_layers, filename = str_c('score_n_layers', '.png'), 253 | width = 6, height = 6, dpi = 900, units = 'in', 254 | path = path_fig, device = 'png') 255 | ``` 256 | 257 | 258 | ```{r, fig.asp = 0.8} 259 | ### For Report 260 | score_n_layers <- 261 | results %>% 262 | filter(xp %in% c('n_layers')) %>% 263 | mutate( 264 | n_layers = str_c('Layers: ', n_layers), 265 | n_layers = factor(n_layers, levels = str_c('Layers: ', c(1, 2, 4))) 266 | ) %>% 267 | ggplot(aes(x = epoch, color = model)) + 268 | geom_line(aes(y = competition), 269 | size = 1) + 270 | geom_point( 271 | data = results %>% distinct_(.dots = c(hyperparams, 'epoch_max_comp', 'competition_max')) %>% 272 | filter(xp %in% c('n_layers')) %>% filter(max_length != 50) %>% 273 | mutate(n_layers = str_c('Layers: ', n_layers), 274 | n_layers = factor(n_layers, levels = str_c('Layers: ', c(1, 2, 4)))), 275 | mapping = aes(x = epoch_max_comp, y = competition_max), size = 2, alpha = 0.6 276 | ) + 277 | coord_cartesian(ylim = comp_ylim) + # cuts off 1 layer BOW 278 | facet_wrap(~n_layers, nrow = 1) + 279 | theme(legend.position = 'bottom', legend.text = element_text(size = 10), 280 | title = element_blank()) + 281 | labs( 282 | title = 'Sensitivity of Competition Score to Hidden Layers', 283 | x = 'Epoch', y = 'Competition Score', 284 | subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. 
\nBOW and Attention LSTM models perform best with more hidden layers.', 285 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%)' 286 | ) + 287 | guides(color = guide_legend(title = 'Model')) 288 | 289 | 290 | score_n_layers 291 | 292 | ggsave(plot = score_n_layers, filename = str_c('score_n_layers_report', '.png'), 293 | width = 6, height = 4, dpi = 900, units = 'in', 294 | path = path_fig, device = 'png') 295 | ``` 296 | 297 | 298 | 299 | ## F1 Scores 300 | 301 | 302 | ```{r, fig.asp = 0.8} 303 | f1_max_length <- 304 | results %>% 305 | filter(xp %in% c('max_length')) %>% 306 | filter(max_length != 50) %>% 307 | # filter(max_length != 600) %>% 308 | filter(epoch == epoch_max_comp) %>% 309 | ggplot(aes(x = max_length, color = model)) + 310 | geom_line(aes(y = f1), size = 1) + 311 | geom_point(aes(y = f1), size = 2, alpha = 0.6) + 312 | scale_x_continuous(trans = 'log2', breaks = breaks_max_length) + 313 | facet_wrap(~class, nrow = 1 314 | # ,scales = 'free_y' 315 | ) + coord_cartesian(ylim = f1_ylim) + 316 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 317 | labs( 318 | title = 'Sensitivity of Stance F1 Scores to Truncation Length', 319 | x = 'Truncation Length', y = 'F1 Score', 320 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. \nBOW and Attention LSTM models perform best with more hidden layers.', 321 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%), shown for epoch with maximal competition score' 322 | ) + 323 | guides(color = guide_legend(title = 'Model')) 324 | 325 | 326 | f1_max_length 327 | 328 | ggsave(plot = f1_max_length, filename = str_c('f1_max_length', '.png'), 329 | width = 8, height = 6, dpi = 900, units = 'in', 330 | path = path_fig, device = 'png') 331 | 332 | 333 | 334 | 335 | f1_max_length_report <- f1_max_length + theme(title = element_blank()) 336 | 337 | ggsave(plot = f1_max_length_report, filename = str_c('f1_max_length_report', '.png'), 338 | width = 8, height = 4, dpi = 900, units = 'in', 339 | path = path_fig, device = 'png') 340 | ``` 341 | 342 | ```{r, fig.asp = 0.8} 343 | f1_n_layers <- 344 | results %>% 345 | filter(xp %in% c('n_layers')) %>% 346 | filter(epoch == epoch_max_comp) %>% 347 | ggplot(aes(x = n_layers, color = model)) + 348 | geom_line(aes(y = f1), size = 1) + 349 | geom_point(aes(y = f1), size = 2, alpha = 0.6) + 350 | scale_x_continuous(trans = 'log2', breaks = breaks_n_layers) + 351 | facet_wrap(~class, nrow = 1 352 | # ,scales = 'free_y' 353 | ) + coord_cartesian(ylim = f1_ylim) + 354 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 355 | labs( 356 | title = 'Sensitivity of Stance F1 Scores to Hidden Layers', 357 | x = 'Truncation Length', y = 'F1 Score', 358 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. 
\nBOW and Attention LSTM models perform best with more hidden layers.', 359 | caption = 'Fitted on train set (60%) and evaluated on dev set (20%), shown for epoch with maximal competition score' 360 | ) + 361 | guides(color = guide_legend(title = 'Model')) 362 | 363 | 364 | 365 | f1_n_layers 366 | 367 | ggsave(plot = f1_n_layers, filename = str_c('f1_n_layers', '.png'), 368 | width = 8, height = 6, dpi = 900, units = 'in', 369 | path = path_fig, device = 'png') 370 | 371 | 372 | f1_n_layers_report <- f1_n_layers + theme(title = element_blank()) 373 | 374 | ggsave(plot = f1_n_layers_report, filename = str_c('f1_n_layers_report', '.png'), 375 | width = 8, height = 4, dpi = 900, units = 'in', 376 | path = path_fig, device = 'png') 377 | ``` 378 | 379 | 380 | ## Final Results 381 | 382 | ```{r} 383 | ### TABLE 384 | results_table_all <- 385 | results_final %>% 386 | filter(epoch == 40) %>% 387 | spread(key = class, value = f1) %>% 388 | group_by(model) %>% 389 | mutate( 390 | f1_agree = sum(`Related: Agree`, na.rm = TRUE), 391 | f1_disagree = sum(`Related: Disagree`, na.rm = TRUE), 392 | f1_discuss = sum(`Related: Discuss`, na.rm = TRUE), 393 | f1_unrelated = sum(`Unrelated`, na.rm = TRUE) 394 | ) %>% 395 | select(-`Related: Agree`,-`Related: Disagree`, -`Related: Discuss`, -`Unrelated`) %>% 396 | select( -(fp:tn), -(specificity:accuracy), -xp) %>% 397 | select(model, everything()) 398 | 399 | variables_results <- names(results_table_all) 400 | 401 | results_table_all <- 402 | results_table_all %>% 403 | distinct_(.dots = variables_results) 404 | 405 | results_table_all <- 406 | results_table_all[,-ncol(results_table_all)] %>% 407 | mutate(max_length = if_else(model %in% c('BOW', 'CEA LSTM'), NA_integer_, max_length)) 408 | 409 | results_table <- 410 | results_table_all %>% 411 | select(model, competition:f1_unrelated) 412 | 413 | results_table %>% write_excel_csv(path = str_c(path_fig, 'results_table.csv')) 414 | 415 | hyperparam_table <- 416 | results_table_all %>% 417 | select(model:epoch) 418 | 419 | hyperparam_table %>% write_excel_csv(path = str_c(path_fig, 'hyperparam_table.csv')) 420 | 421 | 422 | ``` 423 | 424 | 425 | ```{r, fig.asp = 0.8} 426 | final_loss <- 427 | results_final %>% 428 | ggplot(aes(x = epoch, color = model)) + 429 | geom_line(aes(y = train_loss), 430 | size = 1) + 431 | coord_cartesian(ylim = c(0, 0.5)) + 432 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 433 | labs( 434 | title = 'Training Loss of Selected Models', 435 | x = 'Epoch', y = 'Training Loss', 436 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. 
\nBOW and Attention LSTM models perform best with more hidden layers.', 437 | caption = 'Fitted on train and dev set (80%) and evaluated on test set (20%)' 438 | ) + 439 | guides(color = guide_legend(title = 'Model')) 440 | 441 | final_loss 442 | 443 | ggsave(plot = final_loss, filename = str_c('final_loss', '.png'), 444 | width = 5, height = 5, dpi = 900, units = 'in', 445 | path = path_fig, device = 'png') 446 | ``` 447 | 448 | ```{r, fig.asp = 0.8} 449 | final_f1 <- 450 | results_final %>% 451 | filter(epoch == 40) %>% 452 | ggplot(aes(x = model, fill = model)) + 453 | geom_col(aes(y = f1), size = 1) + 454 | theme(legend.position = 'bottom', legend.text = element_text(size = 10), 455 | axis.text.x = element_blank(), axis.ticks.x = element_blank(), 456 | axis.title.x = element_blank()) + 457 | facet_wrap(~class, nrow = 1) + 458 | labs( 459 | title = 'Stance F1 Scores of Selected Models', 460 | x = 'Model', y = 'F1 Score', 461 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. \nBOW and Attention LSTM models perform best with more hidden layers.', 462 | caption = 'Fitted on train and dev set (80%) and evaluated on test set (20%)' 463 | ) + 464 | guides(fill = guide_legend(title = 'Model')) 465 | 466 | final_f1 467 | 468 | ggsave(plot = final_f1, filename = str_c('final_f1', '.png'), 469 | width = 8, height = 6, dpi = 900, units = 'in', 470 | path = path_fig, device = 'png') 471 | ``` 472 | 473 | ```{r, fig.asp = 0.8} 474 | final_comp <- 475 | results_final %>% 476 | filter(epoch == 40) %>% 477 | distinct(model, competition) %>% 478 | ggplot(aes(x = model, fill = model)) + 479 | geom_col(aes(y = competition), size = 1) + 480 | coord_cartesian(ylim = c(0.725, 0.825)) + 481 | 482 | theme(legend.position = 'bottom', legend.text = element_text(size = 10), 483 | axis.text.x = element_blank(), axis.ticks.x = element_blank(), 484 | axis.title.x = element_blank()) + 485 | # facet_wrap(~class, nrow = 1) + 486 | labs( 487 | title = 'Competition Scores of Selected Models', 488 | x = 'Model', y = 'Competition Score', 489 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. \nBOW and Attention LSTM models perform best with more hidden layers.', 490 | caption = 'Fitted on train and dev set (80%) and evaluated on test set (20%)' 491 | ) + 492 | guides(fill = guide_legend(title = 'Model')) 493 | 494 | final_comp 495 | 496 | ggsave(plot = final_comp, filename = str_c('final_comp', '.png'), 497 | width = 5, height = 5, dpi = 900, units = 'in', 498 | path = path_fig, device = 'png') 499 | ``` 500 | 501 | 502 | ```{r, fig.asp = 0.8} 503 | ### DO NOT use this plot 504 | results_final %>% 505 | ggplot(aes(x = epoch, color = model)) + 506 | geom_line(aes(y = competition), 507 | size = 1) + 508 | # coord_cartesian(ylim = comp_ylim) + 509 | theme(legend.position = 'bottom', legend.text = element_text(size = 10)) + 510 | labs( 511 | title = 'Final Competition Scores of Selected Models', 512 | x = 'Epoch', y = 'Competition Score', 513 | # subtitle = 'Basic LSTM and CEA LSTM models perform best with 2 layers. 
\nBOW and Attention LSTM models perform best with more hidden layers.', 514 | caption = 'Fitted on train and de set (80%) and evaluated on test set (20%)' 515 | ) + 516 | guides(color = guide_legend(title = 'Model')) 517 | ``` 518 | 519 | 520 | 521 | ## Model Details 522 | 523 | ### BOW - Competition Score 524 | 525 | ```{r} 526 | results %>% 527 | filter(model %in% c('BOW')) %>% 528 | ggplot(aes(x = epoch)) + 529 | geom_line(aes(y = competition, 530 | # linetype = trainable_embeddings, 531 | color = factor(n_layers)), 532 | size = 1) + 533 | labs(title = 'BOW Competition Score: max_length, n_layers') + 534 | facet_grid(~max_length) + coord_cartesian(ylim = comp_ylim_bow) 535 | ``` 536 | 537 | ```{r} 538 | results %>% 539 | filter(model %in% c('BOW')) %>% 540 | 541 | ggplot(aes(x = epoch)) + 542 | geom_line(aes(y = competition, 543 | # linetype = trainable_embeddings, 544 | color = factor(max_length)), 545 | size = 1) + 546 | labs(title = 'BOW Competition Score: max_length, n_layers') + 547 | facet_grid(~n_layers) + coord_cartesian(ylim = comp_ylim_bow) 548 | ``` 549 | 550 | ### Basic LSTM - Competition Score 551 | 552 | ```{r} 553 | results %>% 554 | filter(model %in% c('Basic LSTM'), 555 | xp %in% c('max_length', 'n_layers')) %>% 556 | filter(max_length != 50) %>% 557 | ggplot(aes(x = epoch)) + 558 | geom_line(aes(y = competition, 559 | # linetype = trainable_embeddings, 560 | color = factor(n_layers)), 561 | size = 1) + 562 | labs(title = 'Basic LSTM Competition Score: max_length, n_layers') + 563 | facet_wrap(~max_length, nrow = 1) + coord_cartesian(ylim = comp_ylim_zoom) 564 | ``` 565 | 566 | ```{r} 567 | results %>% 568 | filter(model %in% c('Basic LSTM'), 569 | xp %in% c('max_length', 'n_layers')) %>% 570 | filter(max_length != 50) %>% 571 | ggplot(aes(x = epoch)) + 572 | geom_line(aes(y = competition, 573 | # linetype = trainable_embeddings, 574 | color = factor(max_length)), 575 | size = 1) + 576 | labs(title = 'Basic LSTM Competition Score: max_length, n_layers') + 577 | facet_wrap(~n_layers, nrow = 1) + coord_cartesian(ylim = comp_ylim_zoom) 578 | ``` 579 | 580 | ### Attention LSTM - Competition Score 581 | 582 | 583 | ```{r} 584 | results %>% 585 | filter(model %in% c('Attention LSTM'), 586 | xp %in% c('max_length', 'n_layers')) %>% 587 | ggplot(aes(x = epoch)) + 588 | geom_line(aes(y = competition, 589 | # linetype = trainable_embeddings, 590 | color = factor(n_layers)), 591 | size = 1) + 592 | labs(title = 'Attention LSTM Competition Score: max_length, n_layers') + 593 | facet_wrap(~max_length, nrow = 1) + coord_cartesian(ylim = comp_ylim_high) 594 | ``` 595 | 596 | 597 | 598 | ```{r} 599 | results %>% 600 | filter(model %in% c('Attention LSTM'), 601 | xp %in% c('max_length', 'n_layers')) %>% 602 | ggplot(aes(x = epoch)) + 603 | geom_line(aes(y = competition, 604 | # linetype = trainable_embeddings, 605 | color = factor(max_length)), 606 | size = 1) + 607 | labs(title = 'Attention LSTM Competition Score: max_length, n_layers') + 608 | facet_wrap(~n_layers, nrow = 1) + coord_cartesian(ylim = comp_ylim_high) 609 | ``` 610 | 611 | ### Conditional LSTM - Competition Score 612 | 613 | 614 | ```{r} 615 | results %>% 616 | filter(model %in% c('CEA LSTM'), 617 | xp %in% c('max_length', 'n_layers')) %>% 618 | ggplot(aes(x = epoch)) + 619 | geom_line(aes(y = competition, 620 | # linetype = trainable_embeddings, 621 | color = factor(n_layers)), 622 | size = 1) + 623 | labs(title = 'Conditional LSTM Competition Score: max_length, n_layers') + 624 | facet_wrap(~max_length, nrow = 1) + 
coord_cartesian(ylim = comp_ylim_high) 625 | ``` 626 | 627 | 628 | ```{r} 629 | results %>% 630 | filter(model %in% c('CEA LSTM'), 631 | xp %in% c('max_length', 'n_layers')) %>% 632 | ggplot(aes(x = epoch)) + 633 | geom_line(aes(y = competition, 634 | # linetype = trainable_embeddings, 635 | color = factor(max_length)), 636 | size = 1) + 637 | labs(title = 'Conditional LSTM Competition Score: max_length, n_layers') + 638 | facet_wrap(~n_layers, nrow = 1) + coord_cartesian(ylim = comp_ylim_high) 639 | ``` 640 | -------------------------------------------------------------------------------- /code/execute_bow_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | ###### 4 | # Initial test of LSTM model for Fake News Challenge 5 | # Based on starter code from PS3-CS224n 6 | # Based on Stephen's rnn_test1 7 | ###### 8 | ## General libraries 9 | import tensorflow as tf 10 | import numpy as np 11 | import random 12 | 13 | ## Our Own Code 14 | # from our_model import Config 15 | from bow_model_config import BOWModel 16 | from run_text_processing import save_data_pickle, get_data 17 | # from run_text_processing import get_data 18 | ## currently using: split_indices 19 | # from our_util import Progbar, minibatches, pack_labels, split_data, split_indices, softmax, get_performance 20 | from our_util import split_indices, softmax, get_performance, convertOutputs #M 21 | 22 | def run_save_data_pickle(): ## Needs NLTK to be installed! 23 | save_data_pickle(outfilename = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 24 | embedding_type = 'twitter.27B.50d', 25 | parserOption = 'nltk') 26 | 27 | def run_bow(config, split = True, outputpath = '../../xp', final = False): #M 28 | 29 | 30 | 31 | ## Get data 32 | # config, y, h, b, h_len, b_len = get_BOW_data(config, reload = True, save_data = False) 33 | config, data_dict = get_data(config, 34 | filename_embeddings = '/../../glove/glove.twitter.27B.50d.txt', 35 | pickle_path = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 36 | concat = False) 37 | 38 | ## pass data into local namespace: 39 | y = data_dict['y'] 40 | h = data_dict['h_np'] 41 | b = data_dict['b_np'] 42 | h_len = data_dict['h_seqlen'] 43 | b_len = data_dict['b_seqlen'] 44 | 45 | # Do shortening of dataset ## affects number of samples and max_len. 
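    # Shape notes (inferred from how these arrays are used below, not taken from any
    # original documentation): h and b are matrices of word ids with shapes
    # [num_samples, max_headline_len] and [num_samples, max_body_len], each padded to a
    # common width; h_len and b_len give each example's true token count before
    # padding; y holds integer stance labels in {0, 1, 2, 3}.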
46 | if config.num_samples is not None: 47 | ## Random seed 48 | np.random.seed(1) 49 | ind = range(np.shape(h)[0]) 50 | np.random.shuffle(ind) 51 | indices = ind[0:config.num_samples] 52 | h = h[indices,:] 53 | b = b[indices,:] 54 | h_len = h_len[indices] 55 | b_len = b_len[indices] 56 | y = y[indices] 57 | 58 | if config.h_max_len is not None: 59 | h_max_len = config.h_max_len 60 | if np.shape(h)[1] > h_max_len: 61 | h = h[:, 0:h_max_len] 62 | h_len = np.minimum(h_len, h_max_len) 63 | 64 | if config.b_max_len is not None: 65 | b_max_len = config.b_max_len 66 | if np.shape(b)[1] > b_max_len: 67 | b = b[:, 0:b_max_len] 68 | b_len = np.minimum(b_len, b_max_len) 69 | 70 | if split: 71 | # Split data 72 | train_indices, dev_indices, test_indices = split_indices(np.shape(h)[0]) 73 | # Divide data 74 | train_h = h[train_indices,:] 75 | train_b = b[train_indices,:] 76 | train_h_len = h_len[train_indices] 77 | train_b_len = b_len[train_indices] 78 | train_y = y[train_indices] 79 | 80 | # Development 81 | dev_h = h[dev_indices,:] 82 | dev_b = b[dev_indices,:] 83 | dev_h_len = h_len[dev_indices] 84 | dev_b_len = b_len[dev_indices] 85 | dev_y = y[dev_indices] 86 | 87 | if final: 88 | # Combine train and dev 89 | train_dev_indices = train_indices + dev_indices 90 | train_h = h[train_dev_indices,:] 91 | train_b = b[train_dev_indices,:] 92 | train_h_len = h_len[train_dev_indices] 93 | train_b_len = b_len[train_dev_indices] 94 | train_y = y[train_dev_indices] 95 | 96 | # Set dev to test 97 | dev_h = h[test_indices,:] 98 | dev_b = b[test_indices,:] 99 | dev_h_len = h_len[test_indices] 100 | dev_b_len = b_len[test_indices] 101 | dev_y = y[test_indices] 102 | 103 | 104 | 105 | ## Passing parameter_dict to config settings 106 | ## Changes to config based on data shape 107 | assert(np.shape(train_h)[0] == np.shape(train_b)[0] == np.shape(train_y)[0] == np.shape(train_h_len)[0] == np.shape(train_b_len)[0]) 108 | config.num_samples = np.shape(train_h)[0] 109 | config.h_max_len = np.shape(train_h)[1] 110 | config.b_max_len = np.shape(train_b)[1] 111 | 112 | ## Start Tensorflow! 113 | print('Starting TensorFlow operations') 114 | print 'With hidden layers: ', config.n_layers ## hidden layer?
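    # Sketch of the flow below, as read from this script: build the graph with a
    # fixed random seed, initialize variables, then model.fit() trains for
    # config.n_epochs and returns per-epoch training losses plus dev-set
    # performance metrics, predicted classes and raw prediction scores;
    # convertOutputs() then writes the losses and dev performance to CSV files
    # (the perf_*/losses_* files consumed by the R analysis scripts).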
115 | with tf.Graph().as_default(): 116 | tf.set_random_seed(1) 117 | model = BOWModel(config) 118 | init = tf.global_variables_initializer() 119 | with tf.Session() as session: 120 | session.run(init) 121 | losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(session, train_h, train_b, train_h_len, train_b_len, train_y, dev_h, dev_b, dev_h_len, dev_b_len, dev_y) #M 122 | 123 | # Write results to csv 124 | convertOutputs(outputpath, config, losses_ep, dev_performances_ep) 125 | 126 | print('Losses ', losses_ep) 127 | print('Dev Performance ', dev_performances_ep) #M 128 | return losses_ep, dev_predicted_classes_ep, dev_performances_ep #M 129 | 130 | ## for debugging 131 | if __name__ == "__main__": 132 | print('Doing something!') 133 | losses, dev_predicted_classes, dev_performance = run_bow(num_samples = 1028) 134 | print('Execution Complete') 135 | -------------------------------------------------------------------------------- /code/execute_lstm_attention.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | ###### 5 | # Execution file for the LSTM attention model 6 | # Based on starter code from PS3-CS224n 7 | ###### 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | 11 | import argparse 12 | import logging 13 | import sys 14 | import time 15 | import os 16 | from datetime import datetime 17 | 18 | import tensorflow as tf 19 | import numpy as np 20 | import cPickle as pickle 21 | 22 | from run_text_processing import get_data, save_data_pickle 23 | 24 | from our_util import Progbar, minibatches, pack_labels, split_data, softmax, get_performance, convertOutputs, downsample_label 25 | # from our_model import OurModel, Config 26 | 27 | from LSTM_attention import * 28 | 29 | logger = logging.getLogger("hw3.q3") 30 | logger.setLevel(logging.DEBUG) 31 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) 32 | 33 | def run_save_data_pickle(): 34 | save_data_pickle(outfilename = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 35 | embedding_type = 'twitter.27B.50d', 36 | parserOption = 'nltk') 37 | 38 | def run_lstm_attention(config, outputpath = '../../xp', final = False): 39 | config, data_dict = get_data(config, 40 | filename_embeddings = '/../../glove/glove.twitter.27B.50d.txt', 41 | pickle_path = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 42 | concat = True) 43 | 44 | y = data_dict['y'] 45 | h_b_np = data_dict['h_b_np'] 46 | seqlen = data_dict['seqlen'] 47 | 48 | # Perform downsampling 49 | if 'downsample' in config.__dict__: 50 | if config.downsample == True: 51 | downsample_indices = downsample_label(y, label_for_ds = 3, downsample_factor = 4) 52 | y = y[downsample_indices] 53 | h_b_np = h_b_np[downsample_indices, :] 54 | seqlen = seqlen[downsample_indices] 55 | 56 | if config.max_length is not None: 57 | max_length = config.max_length 58 | if np.shape(h_b_np)[1] > max_length: 59 | h_b_np = h_b_np[:, 0:max_length] 60 | seqlen = np.minimum(seqlen, max_length) 61 | 62 | # Set maximum dataset size for testing purposes 63 | data = pack_labels(h_b_np, y, seqlen) 64 | if config.num_samples is not None: 65 | num_samples = config.num_samples 66 | data = data[0:num_samples - 1] 67 | 68 | # Split data, result is still packed 69 | train_data, dev_data, test_data, train_indices, dev_indices, test_indices = split_data(data, prop_train = 0.6, prop_dev = 0.2, seed = 56) 70 | 71 | # Compute some convenience 
sub-sets 72 | # Dev 73 | dev_labels = y[dev_indices] 74 | dev_data_np = h_b_np[dev_indices, :] 75 | dev_seqlen = seqlen[dev_indices] 76 | # Test 77 | test_labels = y[test_indices] 78 | test_data_np = h_b_np[test_indices, :] 79 | test_seqlen = seqlen[test_indices] 80 | 81 | ## Config determined at data loading: 82 | config.num_samples = len(train_indices) 83 | config.max_length = np.shape(h_b_np)[1] 84 | 85 | 86 | # If this is the final test: 87 | # Combine test and dev 88 | # Reassign test to dev - for compatibility with rest of the code 89 | if final: 90 | # train_dev_indices = train_indices.extend(dev_indices) 91 | train_dev_indices = train_indices + dev_indices 92 | train_data = [data[i] for i in train_dev_indices] 93 | dev_data_np = test_data_np 94 | dev_seqlen = test_seqlen 95 | dev_labels = test_labels 96 | config.num_samples = len(train_dev_indices) 97 | 98 | with tf.Graph().as_default(): 99 | 100 | tf.set_random_seed(59) 101 | 102 | logger.info("Building model...",) 103 | start = time.time() 104 | model = LSTMAttention(config) 105 | logger.info("took %.2f seconds", time.time() - start) 106 | 107 | init = tf.global_variables_initializer() 108 | 109 | with tf.Session() as session: 110 | session.run(init) 111 | # losses = model.fit(session, train_data) 112 | losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(session, train_data, dev_data_np, dev_seqlen, dev_labels) # MODIF 113 | # dev_predictions = model.predict_on_batch(session, dev_data_np, dev_seqlen) 114 | 115 | 116 | #test_predictions = model.predict_on_batch(session, test_data_np, test_seqlen) 117 | 118 | # outputpath = '../../xp' # MODIF 119 | convertOutputs(outputpath, config, losses_ep, dev_performances_ep) # MODIF 120 | # Compute testing predictions --> MODIF --> SHOULD BE REMOVED WHEN OK 121 | print('Dev Performance ', dev_performances_ep) #M 122 | return losses_ep, dev_predicted_classes_ep, dev_performances_ep #MODIF 123 | 124 | if __name__ == "__main__": 125 | 126 | # print('Doing something!') 127 | # # run_save_data_pickle() 128 | # # test_model_loading_functions('') 129 | # # test_run_model_with_parameters('') 130 | # # test_save_load_data_pickle('twitter50d_h_ids_b_ids_pickle.p') 131 | # # losses = test_model_with_real_data_pickle('args') 132 | print('Execution Complete') 133 | # # print(losses) -------------------------------------------------------------------------------- /code/execute_lstm_conditional.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | ###### 4 | # Execution script for the conditional LSTM with attention 5 | # Based on starter code from PS3-CS224n 6 | ###### 7 | ## General libraries 8 | import tensorflow as tf 9 | import numpy as np 10 | import random 11 | 12 | ## Our Own Code 13 | from LSTM_conditional import LSTMCondModel 14 | from run_text_processing import save_data_pickle, get_data 15 | from our_util import Progbar, minibatches, pack_labels, split_data, softmax, get_performance, convertOutputs, downsample_label, split_indices 16 | 17 | def run_save_data_pickle(): ## Needs NLTK to be installed! 
18 | save_data_pickle(outfilename = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 19 | embedding_type = 'twitter.27B.50d', 20 | parserOption = 'nltk') 21 | 22 | def run_lstm_conditional(config, split = True, outputpath = '../../xp', final = False): 23 | ## Get data 24 | config, data_dict = get_data(config, 25 | filename_embeddings = '/../../glove/glove.twitter.27B.50d.txt', 26 | pickle_path = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 27 | concat = False) 28 | 29 | ## pass data into local namespace: 30 | y = data_dict['y'] 31 | h = data_dict['h_np'] 32 | b = data_dict['b_np'] 33 | h_len = data_dict['h_seqlen'] 34 | b_len = data_dict['b_seqlen'] 35 | 36 | # Do shortening of dataset ## affects number of samples and max_len. 37 | if config.num_samples is not None: 38 | ## Random seed 39 | np.random.seed(1) 40 | ind = range(np.shape(h)[0]) 41 | random.shuffle(ind) 42 | indices = ind[0:config.num_samples ] 43 | h = h[indices,:] 44 | b = b[indices,:] 45 | h_len = h_len[indices] 46 | b_len = b_len[indices] 47 | y = y[indices] 48 | 49 | # Truncate headlines and bodies 50 | if config.h_max_len is not None: 51 | h_max_len = config.h_max_len 52 | if np.shape(h)[1] > h_max_len: 53 | h = h[:, 0:h_max_len] 54 | h_len = np.minimum(h_len, h_max_len) 55 | 56 | if config.b_max_len is not None: 57 | b_max_len = config.b_max_len 58 | if np.shape(b)[1] > b_max_len: 59 | b = b[:, 0:b_max_len] 60 | b_len = np.minimum(b_len, b_max_len) 61 | 62 | if split: 63 | # Split data 64 | train_indices, dev_indices, test_indices = split_indices(np.shape(h)[0]) 65 | # Divide data 66 | train_h = h[train_indices,:] 67 | train_b = b[train_indices,:] 68 | train_h_len = h_len[train_indices] 69 | train_b_len = b_len[train_indices] 70 | train_y = y[train_indices] 71 | # test 72 | dev_h = h[dev_indices,:] 73 | dev_b = b[dev_indices,:] 74 | dev_h_len = h_len[dev_indices] 75 | dev_b_len = b_len[dev_indices] 76 | dev_y = y[dev_indices] 77 | 78 | if final: 79 | # Combine train and dev 80 | train_dev_indices = train_indices + dev_indices 81 | train_h = h[train_dev_indices,:] 82 | train_b = b[train_dev_indices,:] 83 | train_h_len = h_len[train_dev_indices] 84 | train_b_len = b_len[train_dev_indices] 85 | train_y = y[train_dev_indices] 86 | 87 | # Set dev to test 88 | dev_h = h[test_indices,:] 89 | dev_b = b[test_indices,:] 90 | dev_h_len = h_len[test_indices] 91 | dev_b_len = b_len[test_indices] 92 | dev_y = y[test_indices] 93 | 94 | ## Passing parameter_dict to config settings 95 | ## Changes to config based on data shape 96 | assert(np.shape(train_h)[0] == np.shape(train_b)[0] == np.shape(train_y)[0] == np.shape(train_h_len)[0] == np.shape(train_b_len)[0]) 97 | config.num_samples = np.shape(train_h)[0] 98 | config.h_max_len = np.shape(train_h)[1] 99 | config.b_max_len = np.shape(train_b)[1] 100 | 101 | ## Start Tensorflow! 102 | print('Starting TensorFlow operations') 103 | print 'With hidden layers: ', config.n_layers ## hidden layer? 
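The split just performed follows `our_util.split_indices`, which returns a 60/20/20 shuffle of the row indices; with `final = True` the model is re-fit on train plus dev and evaluated on the held-out test rows, which are simply renamed to the `dev_*` variables so the downstream code is unchanged. A small sketch with an assumed toy size:

```python
import numpy as np

# Editor's sketch (toy size) of the 60/20/20 split convention used above.
num_samples   = 10
indices       = list(range(num_samples))
np.random.shuffle(indices)
train_indices = indices[0:6]                      # 60% train
dev_indices   = indices[6:8]                      # 20% dev
test_indices  = indices[8:10]                     # 20% test

# final = True: train on train + dev, report on the untouched test rows.
train_dev_indices = train_indices + dev_indices   # plain list concatenation
assert len(train_dev_indices) == 8 and len(test_indices) == 2
```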
104 | with tf.Graph().as_default(): 105 | tf.set_random_seed(1) 106 | model = LSTMCondModel(config) 107 | init = tf.global_variables_initializer() 108 | with tf.Session() as session: 109 | session.run(init) 110 | losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(session, train_h, train_b, train_h_len, train_b_len, train_y, dev_h, dev_b, dev_h_len, dev_b_len, dev_y) #M 111 | 112 | # Write results to csv 113 | convertOutputs(outputpath, config, losses_ep, dev_performances_ep) 114 | 115 | print('Losses ', losses_ep) 116 | print('Dev Performance ', dev_performances_ep) #M 117 | return losses_ep, dev_predicted_classes_ep, dev_performances_ep #M 118 | 119 | ## for debugging 120 | if __name__ == "__main__": 121 | print('Doing something!') 122 | # run_save_data_pickle() 123 | losses, dev_predicted_classes, dev_performance = run_bow(num_samples = 1028) 124 | print('Execution Complete') -------------------------------------------------------------------------------- /code/execute_lstm_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | ###### 5 | # Initial test of LSTM model for Fake News Challenge - Using actual data 6 | # Based on starter code from PS3-CS224n 7 | ###### 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | 11 | import argparse 12 | import logging 13 | import sys 14 | import time 15 | import os 16 | from datetime import datetime 17 | 18 | import tensorflow as tf 19 | import numpy as np 20 | import cPickle as pickle 21 | 22 | from run_text_processing import get_data, save_data_pickle 23 | 24 | from our_util import Progbar, minibatches, pack_labels, split_data, softmax, get_performance, convertOutputs, downsample_label 25 | 26 | from basicLSTM_model_config import * 27 | 28 | logger = logging.getLogger("hw3.q3") 29 | logger.setLevel(logging.DEBUG) 30 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) 31 | 32 | def run_save_data_pickle(): 33 | save_data_pickle(outfilename = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 34 | embedding_type = 'twitter.27B.50d', 35 | parserOption = 'nltk') 36 | 37 | def run_lstm(config, outputpath = '../../xp', final = False): 38 | config, data_dict = get_data(config, 39 | filename_embeddings = '/../../glove/glove.twitter.27B.50d.txt', 40 | pickle_path = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 41 | concat = True) 42 | 43 | y = data_dict['y'] 44 | h_b_np = data_dict['h_b_np'] 45 | seqlen = data_dict['seqlen'] 46 | 47 | # Perform downsampling 48 | if 'downsample' in config.__dict__: 49 | if config.downsample == True: 50 | downsample_indices = downsample_label(y, label_for_ds = 3, downsample_factor = 4) 51 | y = y[downsample_indices] 52 | h_b_np = h_b_np[downsample_indices, :] 53 | seqlen = seqlen[downsample_indices] 54 | 55 | if config.max_length is not None: 56 | max_length = config.max_length 57 | if np.shape(h_b_np)[1] > max_length: 58 | h_b_np = h_b_np[:, 0:max_length] 59 | seqlen = np.minimum(seqlen, max_length) 60 | 61 | # Set maximum dataset size for testing purposes 62 | data = pack_labels(h_b_np, y, seqlen) 63 | if config.num_samples is not None: 64 | num_samples = config.num_samples 65 | data = data[0:num_samples - 1] 66 | 67 | # Split data, result is still packed 68 | train_data, dev_data, test_data, train_indices, dev_indices, test_indices = split_data(data, prop_train = 0.6, prop_dev = 0.2, seed = 56) 69 | 70 | # Compute some convenience 
sub-sets 71 | # Dev 72 | dev_labels = y[dev_indices] 73 | dev_data_np = h_b_np[dev_indices, :] 74 | dev_seqlen = seqlen[dev_indices] 75 | # Test 76 | test_labels = y[test_indices] 77 | test_data_np = h_b_np[test_indices, :] 78 | test_seqlen = seqlen[test_indices] 79 | 80 | 81 | ## Config determined at data loading: 82 | config.num_samples = len(train_indices) 83 | config.max_length = np.shape(h_b_np)[1] 84 | 85 | # If this is the final test: 86 | # Combine test and dev 87 | # Reassign test to dev - for compatibility with rest of the code 88 | if final: 89 | # train_dev_indices = train_indices.extend(dev_indices) 90 | train_dev_indices = train_indices + dev_indices 91 | train_data = [data[i] for i in train_dev_indices] 92 | dev_data_np = test_data_np 93 | dev_seqlen = test_seqlen 94 | dev_labels = test_labels 95 | config.num_samples = len(train_dev_indices) 96 | 97 | with tf.Graph().as_default(): 98 | 99 | tf.set_random_seed(59) 100 | 101 | logger.info("Building model...",) 102 | start = time.time() 103 | model = BaselineLSTM(config) 104 | logger.info("took %.2f seconds", time.time() - start) 105 | 106 | init = tf.global_variables_initializer() 107 | 108 | with tf.Session() as session: 109 | session.run(init) 110 | losses_ep, dev_performances_ep, dev_predicted_classes_ep, dev_predictions_ep = model.fit(session, train_data, dev_data_np, dev_seqlen, dev_labels) # MODIF 111 | 112 | # outputpath = '../../xp' # MODIF 113 | convertOutputs(outputpath, config, losses_ep, dev_performances_ep) # MODIF 114 | print('Dev Performance ', dev_performances_ep) #M 115 | return losses_ep, dev_predicted_classes_ep, dev_performances_ep #MODIF 116 | 117 | if __name__ == "__main__": 118 | print('Doing something!') 119 | print('Execution Complete') -------------------------------------------------------------------------------- /code/our_model_config.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class OurModel(object): 4 | """Abstracts a Tensorflow graph for use on final project. 5 | """ 6 | 7 | def add_placeholders(self): 8 | """Generates placeholder variables to represent the input tensors 9 | """ 10 | self.inputs_placeholder = tf.placeholder(tf.int64, shape=(None, self.config.max_length), name="x") 11 | self.labels_placeholder = tf.placeholder(tf.int64, shape=(None), name="y") 12 | 13 | def create_feed_dict(self, inputs_batch, labels_batch=None): 14 | """Creates the feed_dict for the model. 15 | """ 16 | feed_dict = { 17 | self.inputs_placeholder: inputs_batch, 18 | } 19 | if labels_batch is not None: 20 | feed_dict[self.labels_placeholder] = labels_batch 21 | return feed_dict 22 | 23 | def add_embedding(self, option = 'Constant'): 24 | """Adds an embedding layer that maps from input tokens (integers) to vectors and then 25 | concatenates those vectors. 
26 | 27 | Returns: 28 | embeddings: tf.Tensor of shape (None, max_length, n_features*embed_size) 29 | """ 30 | if option == 'Variable': 31 | embeddings_temp = tf.nn.embedding_lookup(params = tf.Variable(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 32 | elif option == 'Constant': 33 | embeddings_temp = tf.nn.embedding_lookup(params = tf.constant(self.config.pretrained_embeddings), ids = self.inputs_placeholder) 34 | embeddings = tf.reshape(embeddings_temp, shape = (-1, self.config.max_length, self.config.embed_size)) 35 | ### END YOUR CODE 36 | return embeddings 37 | 38 | def add_prediction_op(self): 39 | """Implements the core of the model that transforms a batch of input data into predictions. 40 | 41 | Returns: 42 | pred: A tensor of shape (batch_size, n_classes) 43 | """ 44 | raise NotImplementedError("Each Model must re-implement this method.") 45 | 46 | def add_loss_op(self, pred): 47 | """Adds ops to compute the loss function. 48 | 49 | Args: 50 | pred: A tensor of shape (batch_size, 1) containing the last 51 | state of the neural network. 52 | Returns: 53 | loss: A 0-d tensor (scalar) 54 | """ 55 | y = tf.reshape(self.labels_placeholder, (-1, )) # Check whether this is necessary 56 | loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = pred, labels = y)) 57 | return loss 58 | 59 | def add_training_op(self, loss): 60 | """Sets up the training Ops. 61 | 62 | Creates an optimizer and applies the gradients to all trainable variables. 63 | The Op returned by this function is what must be passed to the 64 | `sess.run()` call to cause the model to train. 65 | Args: 66 | loss: Loss tensor. 67 | Returns: 68 | train_op: The Op for training. 69 | """ 70 | # Check if Adam has adaptive learning rate 71 | train_op = tf.train.AdamOptimizer(self.config.lr).minimize(loss) 72 | return train_op 73 | 74 | def train_on_batch(self, sess, inputs_batch, labels_batch): 75 | """Perform one step of gradient descent on the provided batch of data. 
76 | 77 | Args: 78 | sess: tf.Session() 79 | input_batch: np.ndarray of shape (n_samples, n_features) 80 | labels_batch: np.ndarray of shape (n_samples, n_classes) 81 | Returns: 82 | loss: loss over the batch (a scalar) 83 | """ 84 | feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch) 85 | _, loss = sess.run([self.train_op, self.loss], feed_dict=feed) 86 | return loss 87 | 88 | def predict_on_batch(self, sess, inputs_batch): 89 | """Make predictions for the provided batch of data 90 | 91 | Args: 92 | sess: tf.Session() 93 | input_batch: np.ndarray of shape (n_samples, n_features) 94 | Returns: 95 | predictions: np.ndarray of shape (n_samples, n_classes) 96 | """ 97 | feed = self.create_feed_dict(inputs_batch) 98 | predictions = sess.run(self.pred, feed_dict=feed) 99 | return predictions 100 | 101 | def run_epoch(self, sess, train): 102 | prog = Progbar(target=1 + int(len(train) / self.config.batch_size)) 103 | losses = [] 104 | for i, batch in enumerate(minibatches(train, self.config.batch_size)): 105 | loss = self.train_on_batch(sess, *batch) 106 | losses.append(loss) 107 | prog.update(i + 1, [("train loss", loss)]) 108 | return losses 109 | 110 | def fit(self, sess, train): 111 | losses = [] 112 | for epoch in range(self.config.n_epochs): 113 | logger.info("Epoch %d out of %d", epoch + 1, self.config.n_epochs) 114 | loss = self.run_epoch(sess, train) 115 | losses.append(loss) 116 | return losses 117 | 118 | def build(self): 119 | self.add_placeholders() 120 | self.pred = self.add_prediction_op() 121 | self.loss = self.add_loss_op(self.pred) 122 | self.train_op = self.add_training_op(self.loss) -------------------------------------------------------------------------------- /code/our_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Utility functions 5 | """ 6 | 7 | from __future__ import division 8 | 9 | import sys 10 | import time 11 | import logging 12 | import StringIO 13 | import pandas as pd 14 | from collections import defaultdict, Counter, OrderedDict 15 | import numpy as np 16 | from numpy import array, zeros, allclose 17 | 18 | 19 | def split_data(data, prop_train = 0.6, prop_dev = 0.2, seed = None): 20 | ## Generate hold-out data 21 | np.random.seed(seed) 22 | # If data is a numpy object 23 | 24 | assert prop_train + prop_dev <= 1 25 | 26 | if (type(data).__module__ == np.__name__): 27 | 28 | num_samples = data.shape[0] 29 | num_train_samples = int(np.floor(num_samples * prop_train)) 30 | num_dev_samples = int(np.floor(num_samples * prop_dev)) 31 | 32 | indices = range(num_samples) 33 | np.random.shuffle(indices) 34 | 35 | train_indices = indices[0:num_train_samples] 36 | dev_indices = indices[num_train_samples:num_train_samples + num_dev_samples] 37 | test_indices = indices[num_train_samples+num_dev_samples:num_samples] 38 | 39 | train_data = data[indices[train_indices], :] 40 | dev_data = data[indices[dev_indices], :] 41 | test_data = data[indices[test_indices], :] 42 | 43 | elif isinstance(data, list): 44 | 45 | num_samples = len(data) 46 | num_train_samples = int(np.floor(num_samples * prop_train)) 47 | num_dev_samples = int(np.floor(num_samples * prop_dev)) 48 | 49 | indices = range(num_samples) 50 | np.random.shuffle(indices) 51 | 52 | # train_indices = indices[range(num_train_samples)] 53 | train_indices = indices[0:num_train_samples] 54 | dev_indices = indices[num_train_samples:num_train_samples + num_dev_samples] 55 | test_indices = 
indices[num_train_samples+num_dev_samples:num_samples] 56 | 57 | train_data = [data[i] for i in train_indices] 58 | dev_data = [data[i] for i in dev_indices] 59 | test_data = [data[i] for i in test_indices] 60 | 61 | return train_data, dev_data, test_data, train_indices, dev_indices, test_indices, 62 | 63 | def split_indices(num_samples, prop_train = 0.6, prop_dev = 0.2): 64 | num_train_samples = int(np.floor(num_samples * prop_train)) 65 | num_dev_samples = int(np.floor(num_samples * prop_dev)) 66 | indices = range(num_samples) 67 | np.random.shuffle(indices) 68 | train_indices = indices[0:num_train_samples] 69 | dev_indices = indices[num_train_samples:num_train_samples + num_dev_samples] 70 | test_indices = indices[num_train_samples + num_dev_samples:num_samples] 71 | return train_indices, dev_indices, test_indices 72 | 73 | def test_data_splitting(data): 74 | test_data, train_data = split_data(data) 75 | print 'Full data' + str(len(data)) 76 | print 'Test' + str(len(test_data)) 77 | print 'Train' + str(len(train_data)) 78 | 79 | # Returns a list of indices that should remain in the dataset 80 | def downsample_label(y, label_for_ds = 3, downsample_factor = 4): 81 | y = np.asarray(y) 82 | indices = np.asarray(range(len(y))) 83 | indices_to_sample = indices[y == label_for_ds] 84 | n_samples = int(np.floor(len(indices_to_sample)/downsample_factor)) 85 | sampled_indices = np.random.choice(indices_to_sample, size = n_samples, replace = False) 86 | output = np.append(indices[y != label_for_ds], sampled_indices) 87 | return(output) 88 | 89 | def pack_labels(data, labels, seqlen): # MODIF 90 | output = [] 91 | num_rows = data.shape[0] 92 | assert num_rows == len(labels) 93 | for i in range(data.shape[0]): 94 | the_row = data[i, :] 95 | output.append((the_row, labels[i], seqlen[i])) 96 | return output 97 | 98 | def softmax(x): 99 | """Compute the softmax function for each row of the input x. 
100 | """ 101 | orig_shape = x.shape 102 | 103 | if len(x.shape) > 1: 104 | # Matrix 105 | x = x - np.amax(x, axis = 1).reshape(x.shape[0], 1) 106 | rowSums = np.sum(np.exp(x), axis = 1).reshape(x.shape[0], 1) 107 | x = np.exp(x) / rowSums 108 | else: 109 | # Vector 110 | x = x - np.max(x) 111 | theSum = np.sum(np.exp(x)) 112 | x = np.exp(x) / theSum 113 | 114 | assert x.shape == orig_shape 115 | return x 116 | 117 | # Compute performance metrics 118 | def get_performance(predicted, truth, n_classes = None, outputStyle = 'dict'): 119 | # Predicted and observed are both integer vectors of class label 120 | 121 | # Cast both predicted and observed to numpy integer 122 | predicted = np.asarray(predicted, dtype = np.int64) 123 | truth = np.asarray(truth, dtype = np.int64) 124 | 125 | assert len(predicted) == len(truth) 126 | 127 | # Compute competition score: 128 | competition_score = scorer(predicted, truth) 129 | 130 | output = [] 131 | # If n_classes is unknown, infer from the labels 132 | if n_classes is None: 133 | n_classes = len(np.unique(predicted.extend(truth))) 134 | 135 | for i in range(n_classes): 136 | 137 | # Get 2-way table 138 | tp = sum((predicted == i) & (truth == i)) 139 | tn = sum((predicted != i) & (truth != i)) 140 | fp = sum((predicted == i) & (truth != i)) 141 | fn = sum((predicted != i) & (truth == i)) 142 | 143 | print 'tp ' + str(tp) 144 | print 'tn ' + str(tn) 145 | print 'fp ' + str(fp) 146 | print 'fn ' + str(fn) 147 | 148 | # Compute performance metrics 149 | recall = tp / (tp + fn) # aka sensitivity 150 | print 'recall ' + str(recall) 151 | precision = tp / (tp + fp) # aka ppv 152 | print 'precision ' + str(precision) 153 | specificity = tn / (tn + fp) 154 | print 'specificity ' + str(specificity) 155 | f1 = 2 * tp / (2 * tp + fp + fn) 156 | print 'f1 ' + str(f1) 157 | accuracy = (tp + tn)/len(truth) 158 | 159 | keys = ['tp', 'tn', 'fp', 'fn', 'recall', 'precision', 'specificity', 'f1', 'accuracy', 'competition'] 160 | values = [tp, tn , fp, fn, recall, precision, specificity, f1, accuracy, competition_score] 161 | output.append(dict(zip(keys, values))) 162 | 163 | return output 164 | 165 | # Computes competition score 166 | def scorer(pred, truth): 167 | # Maximum possible score 168 | max_score = 0.25 * sum(truth == 3) + 1 * sum(truth != 3) 169 | # Computing achieved sore 170 | # Score from unrelated correct 171 | unrelated_score = 0.25 * sum((truth == 3) & (pred == truth)) 172 | # Score from related correct, but specific class incorrect 173 | related_score1 = 0.25 * sum((truth != 3) & (pred != truth) & (pred != 3)) 174 | # Score from getting related correct, specific class correct 175 | related_score2 = 0.75 * sum((truth != 3) & (pred == truth)) 176 | 177 | final_score = (unrelated_score + related_score1 + related_score2) / max_score 178 | return final_score 179 | 180 | def convertOutputs(outputpath, config, losses_ep, dev_performances_ep): #MODIF 181 | 182 | ''' 183 | Inputs are lists of length n_epochs 184 | - losses_ep: list. 
losses_ep[i][j] --> loss after batch j 185 | - dev_performances_ep: dictionnary 186 | - dev_predicted_classes_ep: np.array 187 | - dev_predictions_ep: np.array 188 | ''' 189 | 190 | # Define parameter keys 191 | parameter_keys = dir(config) 192 | params_remove = ['__doc__', '__module__','pretrained_embeddings'] 193 | parameter_keys = [param for param in parameter_keys if param not in params_remove] 194 | print('parameter_keys', parameter_keys) 195 | 196 | n_epochs = getattr(config,'n_epochs') 197 | 198 | # Define column names 199 | common_keys = parameter_keys + ['epoch'] # Common keys to all csv files 200 | performance_keys = (dev_performances_ep[0][0]).keys() # [0] for epoch / [0] for 1st class 201 | # Keys specific to performance output 202 | 203 | # Initialization 204 | performances_pds = [] 205 | 206 | for i in range(n_epochs): 207 | # Performance csv 208 | performance_pd = pd.DataFrame(index = range(4), columns = common_keys + ['class'] + performance_keys) 209 | performance_pd['class'] = range(4) 210 | for j, outp in enumerate(dev_performances_ep[i]): 211 | for key in outp.keys(): 212 | performance_pd.loc[j, key] = outp[key] 213 | performance_pd['epoch'] = i 214 | performance_pd['train_loss'] = 1.0 * sum(losses_ep[i]) / len(losses_ep[i]) 215 | performances_pds.append(performance_pd) 216 | # Append all dataframes 217 | performance_pd_global = pd.concat(performances_pds, axis = 0) 218 | 219 | # Loss dataframe 220 | losses_pd_global = pd.DataFrame(columns = common_keys + ['loss']) 221 | losses_ep = np.array(losses_ep) 222 | losses_pd_global['epoch'] = range(1, n_epochs+1) 223 | losses_pd_global['loss'] = np.mean(losses_ep, axis = 1) 224 | 225 | # Adding parameter columns 226 | output_pds = [performance_pd_global, losses_pd_global] 227 | for par_name in parameter_keys: 228 | for output_pd in output_pds: 229 | output_pd[par_name] = getattr(config,par_name) 230 | 231 | # --- Writing to csv --- 232 | performance_pd_global.to_csv(outputpath+'/perf_'+ str(time.time()).replace('.','') + '.csv',index = False) 233 | losses_pd_global.to_csv(outputpath+'/losses_'+ str(time.time()).replace('.','') + '.csv', index = False) 234 | 235 | 236 | # BACK-UP FUNCTION 237 | def convertOutputs0(outputpath, config, losses_ep, dev_performances_ep): #MODIF 238 | 239 | ''' 240 | Inputs are lists of length n_epochs 241 | - losses_ep: list. 
losses_ep[i][j] --> loss after batch j 242 | - dev_performances_ep: dictionnary 243 | - dev_predicted_classes_ep: np.array 244 | - dev_predictions_ep: np.array 245 | ''' 246 | 247 | # Define parameter dict 248 | parameter_dict = config.__dict__ 249 | parameter_dict.pop('pretrained_embeddings', None) # Removing embedding matrix 250 | # Added line to handle list-valued parameter 251 | # if 'extra_hidden_size' in parameter_dict & parameter_dict['extra_hidden_size'] is not None: 252 | # parameter_dict['extra_hidden_size'] = str(parameter_dict['extra_hidden_size']) 253 | parameter_keys = parameter_dict.keys() 254 | print('parameter_keys', parameter_keys) 255 | n_epochs = parameter_dict['n_epochs'] 256 | 257 | # Define column names 258 | common_keys = parameter_keys + ['epoch'] # Common keys to all csv files 259 | performance_keys = (dev_performances_ep[0][0]).keys() # [0] for epoch / [0] for 1st class 260 | # Keys specific to performance output 261 | 262 | # Initialization 263 | performances_pds = [] 264 | 265 | for i in range(n_epochs): 266 | # Performance csv 267 | performance_pd = pd.DataFrame(index = range(4), columns = common_keys + ['class'] + performance_keys) 268 | performance_pd['class'] = range(4) 269 | for j, outp in enumerate(dev_performances_ep[i]): 270 | for key in outp.keys(): 271 | performance_pd.loc[j, key] = outp[key] 272 | performance_pd['epoch'] = i 273 | performances_pds.append(performance_pd) 274 | # Append all dataframes 275 | performance_pd_global = pd.concat(performances_pds, axis = 0) 276 | 277 | # Loss dataframe 278 | losses_pd_global = pd.DataFrame(columns = common_keys + ['loss']) 279 | losses_ep = np.array(losses_ep) 280 | losses_pd_global['epoch'] = range(1, n_epochs+1) 281 | losses_pd_global['loss'] = np.mean(losses_ep, axis = 1) 282 | 283 | # Adding parameter columns 284 | output_pds = [performance_pd_global, losses_pd_global] 285 | for par_name in parameter_keys: 286 | for output_pd in output_pds: 287 | output_pd[par_name] = parameter_dict[par_name] 288 | 289 | # --- Writing to csv --- 290 | performance_pd_global.to_csv(outputpath+'/perf_'+ str(time.time()).replace('.','') + '.csv',index = False) 291 | losses_pd_global.to_csv(outputpath+'/losses_'+ str(time.time()).replace('.','') + '.csv', index = False) 292 | 293 | # Ferdinand 294 | def get_minibatches(data, minibatch_size, shuffle=True): 295 | 296 | ''' 297 | MODIF 298 | Assuming we have a list [examples, labels, seqlen] of np.array 299 | ''' 300 | 301 | list_data = type(data) is list and (type(data[0]) is list or type(data[0]) is np.ndarray) 302 | data_size = len(data[0]) if list_data else len(data) 303 | indices = np.arange(data_size) 304 | if shuffle: 305 | np.random.shuffle(indices) 306 | for minibatch_start in np.arange(0, data_size, minibatch_size): 307 | minibatch_indices = indices[minibatch_start:minibatch_start + minibatch_size] 308 | 309 | if list_data: 310 | examples_minibatch = minibatch(data[0], minibatch_indices) # np.array of shape (batch_size, max_length_global) 311 | labels_minibatch = minibatch(data[1], minibatch_indices) 312 | seqlen_minibatch = minibatch(data[2], minibatch_indices) 313 | 314 | # Truncating sentences to the max_length of the minibatch --> NOT HERE, placeholders have fixed side 315 | #max_len_minibatch = max(seqlen_minibatch) 316 | #examples_minibatch = examples_minibatch[:,:max_len_minibatch] 317 | 318 | yield [examples_minibatch, labels_minibatch, seqlen_minibatch] 319 | 320 | else: # no truncating if data not in the 'packed' list format [examples, labels, seqlen] 321 | 
yield minibatch(data, minibatch_indices) 322 | 323 | 324 | ## Derived from Stanford CS 224n started code provided for assignment 3. 325 | def minibatch(data, minibatch_idx): 326 | return data[minibatch_idx] if type(data) is np.ndarray else [data[i] for i in minibatch_idx] 327 | 328 | def minibatches(data, batch_size, shuffle=True): 329 | batches = [np.array(col) for col in zip(*data)] 330 | return get_minibatches(batches, batch_size, shuffle) 331 | 332 | 333 | class Progbar(object): 334 | """ 335 | Progbar class copied from keras (https://github.com/fchollet/keras/) 336 | Displays a progress bar. 337 | # Arguments 338 | target: Total number of steps expected. 339 | interval: Minimum visual progress update interval (in seconds). 340 | """ 341 | 342 | def __init__(self, target, width=30, verbose=1): 343 | self.width = width 344 | self.target = target 345 | self.sum_values = {} 346 | self.unique_values = [] 347 | self.start = time.time() 348 | self.total_width = 0 349 | self.seen_so_far = 0 350 | self.verbose = verbose 351 | 352 | def update(self, current, values=None, exact=None): 353 | """ 354 | Updates the progress bar. 355 | # Arguments 356 | current: Index of current step. 357 | values: List of tuples (name, value_for_last_step). 358 | The progress bar will display averages for these values. 359 | exact: List of tuples (name, value_for_last_step). 360 | The progress bar will display these values directly. 361 | """ 362 | values = values or [] 363 | exact = exact or [] 364 | 365 | for k, v in values: 366 | if k not in self.sum_values: 367 | self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far] 368 | self.unique_values.append(k) 369 | else: 370 | self.sum_values[k][0] += v * (current - self.seen_so_far) 371 | self.sum_values[k][1] += (current - self.seen_so_far) 372 | for k, v in exact: 373 | if k not in self.sum_values: 374 | self.unique_values.append(k) 375 | self.sum_values[k] = [v, 1] 376 | self.seen_so_far = current 377 | 378 | now = time.time() 379 | if self.verbose == 1: 380 | prev_total_width = self.total_width 381 | sys.stdout.write("\b" * prev_total_width) 382 | sys.stdout.write("\r") 383 | 384 | numdigits = int(np.floor(np.log10(self.target))) + 1 385 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 386 | bar = barstr % (current, self.target) 387 | prog = float(current)/self.target 388 | prog_width = int(self.width*prog) 389 | if prog_width > 0: 390 | bar += ('='*(prog_width-1)) 391 | if current < self.target: 392 | bar += '>' 393 | else: 394 | bar += '=' 395 | bar += ('.'*(self.width-prog_width)) 396 | bar += ']' 397 | sys.stdout.write(bar) 398 | self.total_width = len(bar) 399 | 400 | if current: 401 | time_per_unit = (now - self.start) / current 402 | else: 403 | time_per_unit = 0 404 | eta = time_per_unit*(self.target - current) 405 | info = '' 406 | if current < self.target: 407 | info += ' - ETA: %ds' % eta 408 | else: 409 | info += ' - %ds' % (now - self.start) 410 | for k in self.unique_values: 411 | if isinstance(self.sum_values[k], list): 412 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 413 | else: 414 | info += ' - %s: %s' % (k, self.sum_values[k]) 415 | 416 | self.total_width += len(info) 417 | if prev_total_width > self.total_width: 418 | info += ((prev_total_width-self.total_width) * " ") 419 | 420 | sys.stdout.write(info) 421 | sys.stdout.flush() 422 | 423 | if current >= self.target: 424 | sys.stdout.write("\n") 425 | 426 | if self.verbose == 2: 427 | if current >= self.target: 428 | info = 
'%ds' % (now - self.start) 429 | for k in self.unique_values: 430 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 431 | sys.stdout.write(info + "\n") 432 | 433 | def add(self, n, values=None): 434 | self.update(self.seen_so_far+n, values) 435 | -------------------------------------------------------------------------------- /code/run_text_processing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Text processing of data 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import os 10 | 11 | import cPickle as pickle 12 | 13 | from fnc_baseline.utils.score import report_score, LABELS, score_submission 14 | from fnc_baseline.utils.dataset import DataSet 15 | 16 | import codecs 17 | import sys 18 | reload(sys) # for text processing 19 | sys.setdefaultencoding('utf8') # for text processing 20 | 21 | # ======== Load data ======= 22 | 23 | def read_data(base_path = '/Users/spfohl/Documents/CS_224n/project/altfactcheckers'): 24 | 25 | # Extracting data 26 | dataset = DataSet(path = base_path + '/data') 27 | stances = dataset.stances 28 | articles = dataset.articles 29 | 30 | # Data to lists 31 | h, b, y = [],[],[] 32 | for stance in stances: 33 | y.append(LABELS.index(stance['Stance'])) 34 | h.append(stance['Headline']) 35 | b.append(dataset.articles[stance['Body ID']]) 36 | y = np.asarray(y, dtype = np.int64) 37 | return h, b, y 38 | 39 | # ----- Loading Glove embeddings ---- 40 | def loadGloVe(filename): 41 | # Getting embedding dimension 42 | file0 = open(filename,'r') 43 | # file0 = codecs.open(filename, 'r', 'utf8', 'ignore') 44 | line = file0.readline() 45 | emb_dim = len(line.strip().split(' ')) - 1 46 | file0.close() 47 | 48 | # First row of embedding matrix is 0 for zero padding 49 | vocab = [''] 50 | embd = [[0.0] * emb_dim] 51 | 52 | # Reading embedding matrix 53 | file = open(filename,'r') 54 | # file = codecs.open(filename, 'r', 'utf8', 'ignore') 55 | for line in file.readlines(): 56 | row = line.strip().split(' ') 57 | vocab.append(row[0]) 58 | embd.append(map(float,row[1:])) 59 | print('Loaded GloVe!') 60 | file.close() 61 | return vocab,embd 62 | 63 | 64 | # ------ Clean quote signs --------- 65 | def clean_data(sentences): 66 | ''' 67 | Delete quote signs 68 | - Rational: quote signs mix with the parsing 69 | - Con: quote signs are meaningul --> distanciation from a statement 70 | ''' 71 | new_sentences = [] 72 | for sentence in sentences: 73 | new_sentences.append(sentence.replace("'","").replace('"','')) 74 | return new_sentences 75 | 76 | # ---- Build vocab dictionary from embedding matrix ----- 77 | def build_vocDict(vocab): 78 | voc_dict = {} 79 | for i in range(len(vocab)): 80 | voc_dict[vocab[i]] = i 81 | return voc_dict 82 | 83 | # -------- words to ids only ------- 84 | #==========ADDED BY OSKAR============# 85 | def words2ids(sentences, voc_dict, option = 'simple'): 86 | ''' 87 | Inputs: 88 | - sentences: list of sentences as string 89 | - embedding_vocab: list of vocab words in the order of the rows of embedding_matrix 90 | Ouptut: 91 | - new_sentences_ids: list of sentences as successive word indexes 92 | Processing: delete word which do no appear in vocabulary 93 | - Alternative: replace missing words by the mean 94 | ''' 95 | new_sentences_ids = [] 96 | j = 0 97 | for sentence in sentences: 98 | j+=1 99 | if j % 5000 == 0: 100 | print ('sentence',str(j)) 101 | sentence_ids = [] 102 | if option == 'nltk': 103 | 
sentence = sentence.decode('utf8', 'ignore') 104 | # print('sentence', sentence) 105 | word_list = tokenize(sentence) 106 | # print('word_list', word_list) 107 | elif option == 'simple': 108 | word_list = sentence.split(" ") 109 | 110 | for word in word_list: 111 | if word.lower() in voc_dict: # Only add word if in dictionary 112 | word_index = voc_dict[word.lower()] 113 | sentence_ids.append(word_index) 114 | 115 | new_sentences_ids.append(sentence_ids) 116 | #print ("added",j) 117 | return new_sentences_ids 118 | 119 | 120 | # -------- words to ids and vectors ------- 121 | def words2ids_vects(sentences, voc_dict, embedding_matrix, option = 'simple'): 122 | ''' 123 | Inputs: 124 | - sentences: list of sentences as string 125 | - embedding_vocab: list of vocab words in the order of the rows of embedding_matrix 126 | - embedding_matrix 127 | Ouptut: 128 | - new_sentences_ids: list of sentences as successive word indexes 129 | - new_sentences_vects: list of sentences as successive word vectors 130 | Processing: delete word which do no appear in vocabulary 131 | - Alternative: replace missing words by the mean 132 | ''' 133 | 134 | new_sentences_ids = [] 135 | new_sentences_vects = [] 136 | j = 0 137 | for sentence in sentences: 138 | j+=1 139 | if j % 5000 == 0: 140 | print ('sentence',str(j)) 141 | sentence_ids = [] 142 | sentence_vects = [] 143 | if option == 'nltk': 144 | sentence = sentence.decode('utf8', 'ignore') 145 | # print('sentence', sentence) 146 | word_list = tokenize(sentence) 147 | # print('word_list', word_list) 148 | elif option == 'simple': 149 | word_list = sentence.split(" ") 150 | 151 | for word in word_list: 152 | if word.lower() in voc_dict: # Only add word if in dictionary 153 | word_index = voc_dict[word.lower()] 154 | sentence_ids.append(word_index) 155 | sentence_vects.append(embedding_matrix[word_index]) 156 | 157 | new_sentences_ids.append(sentence_ids) 158 | #print ("added", j) 159 | new_sentences_vects.append(sentence_vects) 160 | return new_sentences_ids, new_sentences_vects 161 | 162 | def tokenize(sequence): 163 | tokens = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sequence)] 164 | # return tokens 165 | return map(lambda x:x.encode('utf8', errors = 'ignore'), tokens) 166 | 167 | # ---------- Averaging vectors for headline and truncated body --------- 168 | 169 | def avg_trunc(sentences_vects): 170 | s_vects_np = [] 171 | for sentence in sentences_vects: 172 | s_vects_np.append(np.array(sentence)) 173 | s_vects_avg = [] 174 | for sentence in s_vects_np: 175 | s_vects_avg.append(np.mean(sentence,axis=0)) 176 | return s_vects_avg 177 | 178 | def concatConvert_np(h_list, b_list): 179 | ''' 180 | 1. Concatenate headlines and bodies 181 | 2. Convert list data to numpy zero padded data 182 | 3. Also outputs sequences lengths as np vector 183 | ''' 184 | 185 | # Concatenate 186 | n_sentences = len(h_list) 187 | h_b_list = [] 188 | seqlen = [] 189 | for i in range(n_sentences): 190 | h_b_list.append(h_list[i] + b_list[i]) 191 | seqlen.append(len(h_b_list[i])) 192 | 193 | max_len = max(seqlen) 194 | 195 | # Convert to numpy with zero padding. No truncating 196 | h_b_np = np.zeros((n_sentences, max_len)) 197 | for i in range(n_sentences): 198 | h_b_np[i,:seqlen[i]] = h_b_list[i] 199 | 200 | return h_b_list, h_b_np, np.array(seqlen) 201 | 202 | def distinctConvert_np(h_list, b_list): 203 | ''' 204 | 1. Convert list data to numpy zero padded data, 2 distinct matrices for headlines and bodies 205 | 2. 
Also outputs sequences lengths as np vector 206 | ''' 207 | # Compute sequences lengths 208 | n_sentences = len(h_list) 209 | h_seqlen = [] 210 | b_seqlen = [] 211 | for i in range(n_sentences): 212 | h_seqlen.append(len(h_list[i])) 213 | b_seqlen.append(len(b_list[i])) 214 | 215 | h_max_len = max(h_seqlen) 216 | b_max_len = max(b_seqlen) 217 | 218 | # Convert to numpy 219 | h_np = np.zeros((n_sentences, h_max_len)) 220 | b_np = np.zeros((n_sentences, b_max_len)) 221 | for i in range(n_sentences): 222 | h_np[i,:h_seqlen[i]] = h_list[i] 223 | b_np[i,:b_seqlen[i]] = b_list[i] 224 | 225 | return h_np, np.array(h_seqlen), b_np, np.array(b_seqlen) 226 | 227 | #------for nn_test--------# 228 | #==========ADDED BY OSKAR============# 229 | def get_BOW_data(config, reload = None, save_data = None): 230 | ## Random seed 231 | np.random.seed(1) 232 | 233 | # Define path 234 | cwd = os.getcwd() 235 | filename_embeddings = cwd + '/../../glove/glove.6B.50d.txt' 236 | 237 | # GloVe embeddings 238 | vocab,embd = loadGloVe(filename_embeddings) 239 | vocab_size = len(vocab) 240 | embedding_dim = len(embd[0]) 241 | embedding = np.asarray(embd) 242 | 243 | if reload: 244 | # Get vocab dict 245 | voc_dict = build_vocDict(vocab) 246 | 247 | # Read and process data 248 | h, b, y = read_data(cwd + '/../../') # headline / bodies/ labels 249 | # h_ids, _ = words2ids_vects(h, voc_dict, embd) 250 | # b_ids, _ = words2ids_vects(b, voc_dict, embd) 251 | h_ids = words2ids(h, voc_dict) 252 | b_ids = words2ids(b, voc_dict) 253 | 254 | # zero padded np matrices for headlines and bodies; seq. lengths as np vector 255 | h, h_len, b, b_len = distinctConvert_np(h_ids, b_ids) 256 | 257 | # Find and delete empty headings/bodies 258 | ind_empty = [] 259 | for i in range(np.shape(h)[0]): 260 | if ((h_len[i] == 0) or (b_len[i] == 0)): 261 | ind_empty.append(i) 262 | # print(i) 263 | print('Empty sequences: ', ind_empty) 264 | if (len(ind_empty) > 0): 265 | y = np.delete(y, ind_empty) 266 | h = np.delete(h, ind_empty, 0) 267 | b = np.delete(b, ind_empty, 0) 268 | h_len = np.delete(h_len, ind_empty) 269 | b_len = np.delete(b_len, ind_empty) 270 | 271 | if save_data: 272 | # Attention: Bodies CSV is HUGE (800mb) 273 | assert(False) ## Do you REALLY want to do this? Consider saving it in a txt file instead. 
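Stepping back from the CSV-saving branch for a moment: the padding helpers above (`concatConvert_np` and `distinctConvert_np`) produce the two data layouts the execution scripts consume. A toy illustration of both, with assumed word ids:

```python
# Editor's toy illustration (assumed ids) of the two zero-padded layouts.
h_ids = [[1, 2], [3]]                  # headlines as lists of word ids
b_ids = [[4, 5, 6], [7]]               # bodies as lists of word ids

# concatConvert_np: headline and body joined into one padded matrix + one length vector
# (the layout used when get_data is called with concat = True)
h_b_np = [[1, 2, 4, 5, 6],
          [3, 7, 0, 0, 0]]
seqlen = [5, 2]

# distinctConvert_np: separate padded matrices and length vectors
# (the layout used by the bag-of-words and conditional-encoding models)
h_np = [[1, 2], [3, 0]];       h_seqlen = [2, 1]
b_np = [[4, 5, 6], [7, 0, 0]]; b_seqlen = [3, 1]
```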
274 | # Write 275 | y_pd = pd.DataFrame(y) 276 | h_pd = pd.DataFrame(h) 277 | b_pd = pd.DataFrame(b) 278 | h_len_pd = pd.DataFrame(h_len) 279 | b_len_pd = pd.DataFrame(b_len) 280 | y_pd.to_csv('saved_data/y_noempty.csv', index = False, header = False) 281 | h_pd.to_csv('saved_data/h_noempty.csv', index = False, header = False) 282 | b_pd.to_csv('saved_data/b_noempty.csv', index = False, header = False) 283 | h_len_pd.to_csv('saved_data/h_len_noempty.csv', index = False, header = False) 284 | b_len_pd.to_csv('saved_data/b_len_noempty.csv', index = False, header = False) 285 | # assert(False) 286 | 287 | if not reload: 288 | # Load 289 | # Attention: Bodies CSV is HUGE (800mb) 290 | print("Loading Data") 291 | y = np.asarray(pd.read_csv('saved_data/y_noempty.csv', header = None)) 292 | print("loaded labels") 293 | h = np.asarray(pd.read_csv('saved_data/h_noempty.csv', header = None)) 294 | print("loaded headings") 295 | b = np.asarray(pd.read_csv('saved_data/b_noempty.csv', header = None)) 296 | print("loaded headings") 297 | h_len = np.asarray(pd.read_csv('saved_data/h_len_noempty.csv', header = None)) 298 | b_len = np.asarray(pd.read_csv('saved_data/b_len_noempty.csv', header = None)) 299 | print("loaded lengths") 300 | # assert(False) 301 | 302 | # Modify the config 303 | config.embed_size = embedding_dim 304 | config.pretrained_embeddings = embedding 305 | config.vocab_size = vocab_size 306 | # finish 307 | return config, y, h, b, h_len, b_len 308 | 309 | ## Added by Stephen 310 | def save_data_pickle(outfilename, 311 | embedding_type = 'twitter.27B.50d', 312 | parserOption = 'nltk'): 313 | cwd = os.getcwd() 314 | if embedding_type == 'twitter.27B.50d': 315 | filename_embeddings = cwd + '/../../glove/glove.twitter.27B.50d.txt' 316 | else: 317 | filename_embeddings = cwd + '/../../glove/glove.6B.50d.txt' 318 | 319 | # filename_embeddings = cwd + filename_embeddings 320 | 321 | # GloVe embeddings 322 | vocab, embd = loadGloVe(filename_embeddings) 323 | vocab_size = len(vocab) 324 | embedding_dim = len(embd[0]) 325 | embedding = np.asarray(embd, dtype = np.float64) 326 | 327 | # Get vocab dict 328 | voc_dict = build_vocDict(vocab) 329 | 330 | # Read and process data 331 | h, b, y = read_data(cwd + '/../../') # headline / bodies/ labels 332 | h_ids, h_vects = words2ids_vects(h, voc_dict, embd, parserOption) 333 | b_ids, b_vects = words2ids_vects(b, voc_dict, embd, parserOption) 334 | 335 | # Concatenated headline_bodies zero padded np matrices; seq. 
lengths as np vector 336 | h_b_ids, h_b_np, seqlen = concatConvert_np(h_ids, b_ids) 337 | h_np, h_seqlen, b_np, b_seqlen = distinctConvert_np(h_ids, b_ids) 338 | 339 | data_dict = {'h_ids':h_ids, 'b_ids':b_ids, 'y':y} 340 | with open(cwd + outfilename, 'wb') as fp: 341 | pickle.dump(data_dict, fp) 342 | 343 | ## Added by Stephen 344 | def get_data(config, 345 | filename_embeddings = '/../../glove/glove.twitter.27B.50d.txt', 346 | pickle_path = '/../../glove/twitter50d_h_ids_b_ids_pickle.p', 347 | concat = True): 348 | # np.random.seed(41) 349 | 350 | # Base path 351 | cwd = os.getcwd() 352 | # filename_embeddings = cwd + '/../../glove/glove.6B.50d.txt' 353 | 354 | filename_embeddings = cwd + filename_embeddings 355 | 356 | # GloVe embeddings 357 | vocab, embd = loadGloVe(filename_embeddings) 358 | vocab_size = len(vocab) 359 | embedding_dim = len(embd[0]) 360 | embedding = np.asarray(embd, dtype = np.float64) 361 | 362 | # Get vocab dict 363 | voc_dict = build_vocDict(vocab) 364 | 365 | # Read and process data 366 | # h, b, y = read_data(cwd + '/../../') # headline / bodies/ labels 367 | 368 | print('Loading Pickle') 369 | load_path = cwd + pickle_path 370 | with open (load_path, 'rb') as fp: 371 | data_dict = pickle.load(fp) 372 | h_ids = data_dict['h_ids'] 373 | b_ids = data_dict['b_ids'] 374 | y = data_dict['y'] 375 | print('finished loading Pickle') 376 | 377 | # Concatenated headline_bodies zero padded np matrices; seq. lengths as np vector 378 | # h_b_ids, h_b_np, seqlen = concatConvert_np(h_ids, b_ids) 379 | # h_np, h_seqlen, b_np, b_seqlen = distinctConvert_np(h_ids, b_ids) 380 | 381 | if concat: 382 | h_b_ids, h_b_np, seqlen = concatConvert_np(h_ids, b_ids) 383 | output_dict = {'y':y, 384 | 'h_b_np':h_b_np, 385 | 'seqlen':seqlen} 386 | else: 387 | h_np, h_seqlen, b_np, b_seqlen = distinctConvert_np(h_ids, b_ids) 388 | # Find and delete empty 389 | ind_empty = [] 390 | for i in range(np.shape(h_np)[0]): 391 | if ((h_seqlen[i] == 0) or (b_seqlen[i] == 0)): 392 | ind_empty.append(i) 393 | print('Empty sequences: ', ind_empty) 394 | if (len(ind_empty) > 0): 395 | y = np.delete(y, ind_empty) 396 | h_np = np.delete(h_np, ind_empty, 0) 397 | b_np = np.delete(b_np, ind_empty, 0) 398 | h_seqlen = np.delete(h_seqlen, ind_empty) 399 | b_seqlen = np.delete(b_seqlen, ind_empty) 400 | output_dict = {'y':y, 401 | 'h_np':h_np, 402 | 'b_np':b_np, 403 | 'h_seqlen':h_seqlen, 404 | 'b_seqlen':b_seqlen} 405 | 406 | config.embed_size = embedding_dim 407 | config.pretrained_embeddings = embedding 408 | config.vocab_size = vocab_size 409 | return config, output_dict 410 | 411 | # if __name__ == '__main__': 412 | # # ========== YOUR OWN EMBEDDING MATRIX PATH HERE ========= 413 | # filename_embeddings = '/Users/spfohl/Documents/CS_224n/project/altfactcheckers/code/stephen_scratch/glove.6B/glove.6B.50d.txt' 414 | 415 | # # Glove 416 | # vocab,embd = loadGloVe(filename_embeddings) 417 | # vocab_size = len(vocab) 418 | # embedding_dim = len(embd[0]) 419 | # embedding = np.asarray(embd) 420 | 421 | # print(embedding[0:5, :]) 422 | # # Dictionary 423 | # voc_dict = build_vocDict(vocab) 424 | 425 | # # Read and process data 426 | # h, b, y = read_data() # headline / bodies/ labels 427 | # h_ids, h_vects = words2ids_vects(h, voc_dict, embd) 428 | # b_ids, b_vects = words2ids_vects(b, voc_dict, embd) 429 | 430 | # # Concatenated headline_bodies zero padded np matrices; seq. 
lengths as np vector 431 | # h_b_ids, h_b_np, seqlen = concatConvert_np(h_ids, b_ids) 432 | 433 | # # Distinct headline / bodies zero padded np matrices; seq lengths as np vectors 434 | # h_np, h_seqlen, b_np, b_seqlen = distinctConvert_np(h_ids, b_ids) -------------------------------------------------------------------------------- /code/test_script6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | ###### 5 | # Call all models with different hyperparameters 6 | ###### 7 | 8 | # standard libs 9 | import numpy as np 10 | 11 | # our code imports 12 | from execute_bow_config import run_bow 13 | from execute_lstm_config import run_lstm 14 | from execute_lstm_attention import run_lstm_attention 15 | from execute_lstm_conditional import run_lstm_conditional 16 | 17 | ### Parameter Overview: 18 | class Config: 19 | """Holds model hyperparams and data information. 20 | The config class is used to store various hyperparameters and dataset 21 | information parameters. Model objects are passed a Config() object at 22 | instantiation. Use self.config.? instead of Config.? 23 | """ 24 | ### Parameter Overview: 25 | ## For all models: 26 | # main params, 27 | n_epochs = 40 28 | lr = 0.001 29 | batch_size = 128 30 | n_classes = 4 31 | hidden_size = 100 32 | n_layers = 0 33 | xp = None 34 | model = None 35 | 36 | ## Determined at data loading: 37 | embed_size = None # not passed to config - assigned in get_data 38 | vocab_size = None # not passed to config - assigned in get_data 39 | pretrained_embeddings = [] # not passed to config - assigned in get_data 40 | num_samples = None # only indirectly passed to comfig, If defined, shortens the dataset, Otherwise determined at data loading, 41 | downsample = False 42 | 43 | ## LSTM specific: 44 | # main params 45 | dropout = 0.8 ## Attention: this is the keep_prob! 
# not assigned to BOW 46 | # extra_hidden_size = None 47 | trainable_embeddings = 'Variable' 48 | max_length = None # indirectly passed to config in LSTM, If defined, truncates sequences, Otherwise determined at data loading 49 | attention_length = 15 50 | 51 | ## BOW specific: 52 | # main params 53 | hidden_next = 0.6 # defines the number of hidden units in next layer 54 | # Determined at data loading: 55 | h_max_len = None # not passed to config 56 | b_max_len = None # not passed to config 57 | 58 | 59 | def run_bow_with_parameters(args): 60 | 61 | # Final test st 62 | np.random.seed(1) 63 | config = Config() 64 | config.n_layers = 1 65 | config.xp = 'final_test' 66 | config.model = 'bow' 67 | config.lr = 0.005 68 | config.trainable_embeddings = 'Variable' 69 | config.b_max_len = 600 70 | config.n_epochs = 40 71 | result = run_bow(config, final = True) 72 | 73 | ## Experiment 74 | # np.random.seed(1) 75 | # config = Config() 76 | # config.n_layers = 1 77 | # config.xp = 'layers' 78 | # config.model = 'bow' 79 | # config.lr = 0.005 80 | # config.trainable_embeddings = 'Variable' 81 | # config.b_max_len = 75 82 | # result = run_bow(config) 83 | 84 | # ## Experiment 85 | # np.random.seed(1) 86 | # config = Config() 87 | # config.n_layers = 3 88 | # config.xp = 'layers' 89 | # config.model = 'bow' 90 | # config.lr = 0.005 91 | # config.trainable_embeddings = 'Constant' 92 | # config.b_max_len = 75 93 | # result = run_bow(config) 94 | 95 | # ## Experiment 96 | # np.random.seed(1) 97 | # config = Config() 98 | # config.n_layers = 0 99 | # config.xp = 'layers' 100 | # config.model = 'bow' 101 | # config.lr = 0.005 102 | # config.trainable_embeddings = 'Variable' 103 | # config.b_max_len = 150 104 | # result = run_bow(config) 105 | 106 | # ## Experiment 107 | # np.random.seed(1) 108 | # config = Config() 109 | # config.n_layers = 1 110 | # config.xp = 'layers' 111 | # config.model = 'bow' 112 | # config.lr = 0.005 113 | # config.trainable_embeddings = 'Variable' 114 | # config.b_max_len = 150 115 | # result = run_bow(config) 116 | 117 | # ## Experiment 118 | # np.random.seed(1) 119 | # config = Config() 120 | # config.n_layers = 3 121 | # config.xp = 'layers' 122 | # config.model = 'bow' 123 | # config.lr = 0.005 124 | # config.trainable_embeddings = 'Variable' 125 | # config.b_max_len = 150 126 | # result = run_bow(config) 127 | 128 | # np.random.seed(1) 129 | # config = Config() 130 | # config.n_layers = 0 131 | # config.xp = 'layers' 132 | # config.model = 'bow' 133 | # config.lr = 0.005 134 | # config.trainable_embeddings = 'Variable' 135 | # config.b_max_len = 300 136 | # result = run_bow(config) 137 | 138 | # ## Experiment 139 | # np.random.seed(1) 140 | # config = Config() 141 | # config.n_layers = 1 142 | # config.xp = 'layers' 143 | # config.model = 'bow' 144 | # config.lr = 0.005 145 | # config.trainable_embeddings = 'Variable' 146 | # config.b_max_len = 300 147 | # result = run_bow(config) 148 | 149 | # ## Experiment 150 | # np.random.seed(1) 151 | # config = Config() 152 | # config.n_layers = 3 153 | # config.xp = 'layers' 154 | # config.model = 'bow' 155 | # config.lr = 0.005 156 | # config.trainable_embeddings = 'Constant' 157 | # config.b_max_len = 300 158 | # result = run_bow(config) 159 | 160 | # np.random.seed(1) 161 | # config = Config() 162 | # config.n_layers = 0 163 | # config.xp = 'layers' 164 | # config.model = 'bow' 165 | # config.lr = 0.005 166 | # config.trainable_embeddings = 'Variable' 167 | # config.b_max_len = 600 168 | # result = run_bow(config) 169 | 170 | # ## 
Experiment 171 | # np.random.seed(1) 172 | # config = Config() 173 | # config.n_layers = 1 174 | # config.xp = 'layers' 175 | # config.model = 'bow' 176 | # config.lr = 0.005 177 | # config.trainable_embeddings = 'Variable' 178 | # config.b_max_len = 600 179 | # result = run_bow(config) 180 | 181 | # ## Experiment 182 | # np.random.seed(1) 183 | # config = Config() 184 | # config.n_layers = 3 185 | # config.xp = 'layers' 186 | # config.model = 'bow' 187 | # config.lr = 0.005 188 | # config.trainable_embeddings = 'Constant' 189 | # config.b_max_len = 600 190 | # result = run_bow(config) 191 | 192 | ## Experiment 193 | # np.random.seed(1) 194 | # config = Config() 195 | # config.n_layers = 3 196 | # config.xp = 'layers' 197 | # config.model = 'bow' 198 | # config.lr = 0.005 199 | # config.trainable_embeddings = 'Constant' 200 | # config.b_max_len = 150 201 | # result = run_bow(config) 202 | 203 | 204 | 205 | 206 | def run_lstm_with_parameters(args): 207 | # Final test 208 | np.random.seed(1) 209 | config0 = Config() 210 | config0.max_length = 75 211 | config0.trainable_embeddings = 'Variable' 212 | config0.hidden_size = 100 213 | config0.n_epochs = 40 214 | config0.n_layers = 2 215 | config0.batch_size = 128 216 | config0.dropout = 0.8 217 | config0.lr = 0.001 218 | # config0.num_samples = 100 219 | config0.xp = 'final_test' 220 | config0.model = 'lstm_basic' 221 | result = run_lstm(config0, final = True) 222 | 223 | 224 | #### Testing Downsampling 225 | 226 | # # Experiment 1 227 | # # 2 layer, max_length = 75 228 | # np.random.seed(1) 229 | # config0 = Config() 230 | # # print('Running run_lstm_with_parameters') 231 | # # config0.n_layers = 0 232 | # config0.max_length = 75 233 | # config0.trainable_embeddings = 'Variable' 234 | # config0.hidden_size = 100 235 | # config0.n_epochs = 40 236 | # config0.n_layers = 1 237 | # config0.batch_size = 128 238 | # config0.dropout = 0.8 239 | # config0.n_layers = 1 240 | # # config0.downsample = True 241 | # config0.lr = 0.001 242 | # config0.attention_length = 15 243 | # result = run_lstm(config0) 244 | 245 | # # # # Experiment 2 246 | # # # # 2 layer, max_length = 150 247 | # np.random.seed(1) 248 | # config1 = Config() 249 | # config1.max_length = 150 250 | # config1.trainable_embeddings = 'Variable' 251 | # config1.hidden_size = 100 252 | # config1.n_epochs = 40 253 | # config1.batch_size = 128 254 | # config1.dropout = 0.8 255 | # config1.n_layers = 1 256 | # # config1.downsample = True 257 | # config1.lr = 0.001 258 | # config1.attention_length = 15 259 | # result = run_lstm(config1) 260 | 261 | # # ## Experiment 3 262 | # # # 2 layer, max_length = 300 263 | # np.random.seed(1) 264 | # config2 = Config() 265 | # config2.max_length = 250 266 | # config2.trainable_embeddings = 'Variable' 267 | # config2.hidden_size = 100 268 | # config2.n_epochs = 40 269 | # config2.batch_size = 128 270 | # config2.dropout = 0.8 271 | # config2.n_layers = 1 272 | # # config2.downsample = True 273 | # config2.lr = 0.001 274 | # config2.attention_length = 15 275 | # result = run_lstm(config2) 276 | 277 | # ## Experiment 4 278 | # # max_length = 150, n_layers = 1 279 | # np.random.seed(1) 280 | # config3 = Config() 281 | # config3.max_length = 150 282 | # config3.trainable_embeddings = 'Variable' 283 | # config3.hidden_size = 100 284 | # config3.n_epochs = 40 285 | # config3.batch_size = 128 286 | # config3.dropout = 0.8 287 | # config3.n_layers = 1 288 | # config3.downsample = True 289 | # # config3.extra_hidden_size = None 290 | # result = run_lstm(config3) 291 | 
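Several of the experiments in this block toggle `config.downsample`; under the hood this calls `our_util.downsample_label`, which keeps every example whose label is not 3 ('unrelated') and a random `1/downsample_factor` fraction of the label-3 ones. A worked example with assumed labels:

```python
import numpy as np
np.random.seed(0)

# Editor's sketch (toy labels) of the downsampling used when config.downsample = True.
y       = np.array([3, 3, 3, 3, 3, 3, 3, 3, 0, 1, 2, 0])    # 8 unrelated, 4 related
indices = np.arange(len(y))
n_keep  = int(np.floor(np.sum(y == 3) / 4))                  # downsample_factor = 4 -> keep 2
sampled = np.random.choice(indices[y == 3], size = n_keep, replace = False)
kept    = np.append(indices[y != 3], sampled)                # 4 related + 2 unrelated = 6 rows
assert len(kept) == 6
```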
223 |
224 |     #### Testing Downsampling
225 |
226 |     # # Experiment 1
227 |     # # 1 layer, max_length = 75
228 |     # np.random.seed(1)
229 |     # config0 = Config()
230 |     # # print('Running run_lstm_with_parameters')
231 |     # # config0.n_layers = 0
232 |     # config0.max_length = 75
233 |     # config0.trainable_embeddings = 'Variable'
234 |     # config0.hidden_size = 100
235 |     # config0.n_epochs = 40
236 |     # config0.n_layers = 1
237 |     # config0.batch_size = 128
238 |     # config0.dropout = 0.8
239 |     # config0.n_layers = 1
240 |     # # config0.downsample = True
241 |     # config0.lr = 0.001
242 |     # config0.attention_length = 15
243 |     # result = run_lstm(config0)
244 |
245 |     # # # # Experiment 2
246 |     # # # # 1 layer, max_length = 150
247 |     # np.random.seed(1)
248 |     # config1 = Config()
249 |     # config1.max_length = 150
250 |     # config1.trainable_embeddings = 'Variable'
251 |     # config1.hidden_size = 100
252 |     # config1.n_epochs = 40
253 |     # config1.batch_size = 128
254 |     # config1.dropout = 0.8
255 |     # config1.n_layers = 1
256 |     # # config1.downsample = True
257 |     # config1.lr = 0.001
258 |     # config1.attention_length = 15
259 |     # result = run_lstm(config1)
260 |
261 |     # # ## Experiment 3
262 |     # # # 1 layer, max_length = 250
263 |     # np.random.seed(1)
264 |     # config2 = Config()
265 |     # config2.max_length = 250
266 |     # config2.trainable_embeddings = 'Variable'
267 |     # config2.hidden_size = 100
268 |     # config2.n_epochs = 40
269 |     # config2.batch_size = 128
270 |     # config2.dropout = 0.8
271 |     # config2.n_layers = 1
272 |     # # config2.downsample = True
273 |     # config2.lr = 0.001
274 |     # config2.attention_length = 15
275 |     # result = run_lstm(config2)
276 |
277 |     # ## Experiment 4
278 |     # # max_length = 150, n_layers = 1
279 |     # np.random.seed(1)
280 |     # config3 = Config()
281 |     # config3.max_length = 150
282 |     # config3.trainable_embeddings = 'Variable'
283 |     # config3.hidden_size = 100
284 |     # config3.n_epochs = 40
285 |     # config3.batch_size = 128
286 |     # config3.dropout = 0.8
287 |     # config3.n_layers = 1
288 |     # config3.downsample = True
289 |     # # config3.extra_hidden_size = None
290 |     # result = run_lstm(config3)
291 |
292 |
293 |     # ## Experiment 5
294 |     # # max_length = 150, n_layers = 2
295 |     # np.random.seed(1)
296 |     # config4 = Config()
297 |     # config4.max_length = 150
298 |     # config4.trainable_embeddings = 'Variable'
299 |     # config4.hidden_size = 100
300 |     # config4.n_epochs = 40
301 |     # config4.batch_size = 128
302 |     # config4.dropout = 0.8
303 |     # config4.n_layers = 2
304 |     # config4.downsample = True
305 |     # result = run_lstm(config4)
306 |
307 |     # ## Experiment 6
308 |     # # max_length = 150, n_layers = 4
309 |     # np.random.seed(1)
310 |     # config5 = Config()
311 |     # config5.max_length = 150
312 |     # config5.trainable_embeddings = 'Variable'
313 |     # config5.hidden_size = 100
314 |     # config5.n_epochs = 40
315 |     # config5.batch_size = 128
316 |     # config5.dropout = 0.8
317 |     # config5.n_layers = 4
318 |     # config5.downsample = True
319 |     # result = run_lstm(config5)
320 |
321 |
322 |     # #### Testing Dropout
323 |
324 |     # # ## Experiment 1
325 |     # # # max_length = 75, n_layers = 2, dropout = 0.9
326 |     # np.random.seed(1)
327 |     # config = Config()
328 |     # config.max_length = 75
329 |     # config.trainable_embeddings = 'Variable'
330 |     # config.hidden_size = 100
331 |     # config.n_epochs = 40
332 |     # config.batch_size = 128
333 |     # config.dropout = 0.9
334 |     # config.n_layers = 2
335 |     # config.downsample = False
336 |     # config.lr = 0.005
337 |     # result = run_lstm(config)
338 |
339 |     # # ## Experiment 2
340 |     # # # max_length = 75, n_layers = 2, dropout = 0.65
341 |     # np.random.seed(1)
342 |     # config = Config()
343 |     # config.max_length = 75
344 |     # config.trainable_embeddings = 'Variable'
345 |     # config.hidden_size = 100
346 |     # config.n_epochs = 40
347 |     # config.batch_size = 128
348 |     # config.dropout = 0.65
349 |     # config.n_layers = 2
350 |     # config.downsample = False
351 |     # config.lr = 0.005
352 |     # result = run_lstm(config)
353 |
354 |     # # ## Experiment 3
355 |     # # # max_length = 75, n_layers = 2, dropout = 0.5
356 |     # np.random.seed(1)
357 |     # config = Config()
358 |     # config.max_length = 75
359 |     # config.trainable_embeddings = 'Variable'
360 |     # config.hidden_size = 100
361 |     # config.n_epochs = 40
362 |     # config.batch_size = 128
363 |     # config.dropout = 0.5
364 |     # config.n_layers = 2
365 |     # config.downsample = False
366 |     # config.lr = 0.005
367 |     # result = run_lstm(config)
368 |
369 |
370 |     # #### Testing max_length
371 |
372 |     # # ## Experiment 1
373 |     # # # max_length = 50, n_layers = 2
374 |     # np.random.seed(1)
375 |     # config = Config()
376 |     # config.max_length = 50
377 |     # config.trainable_embeddings = 'Variable'
378 |     # config.hidden_size = 100
379 |     # config.n_epochs = 40
380 |     # config.batch_size = 128
381 |     # config.dropout = 0.8
382 |     # config.n_layers = 2
383 |     # config.downsample = False
384 |     # config.lr = 0.005
385 |     # result = run_lstm(config)
386 |
387 |     # # ## Experiment 2
388 |     # # # max_length = 30, n_layers = 2
389 |     # np.random.seed(1)
390 |     # config = Config()
391 |     # config.max_length = 30
392 |     # config.trainable_embeddings = 'Variable'
393 |     # config.hidden_size = 100
394 |     # config.n_epochs = 40
395 |     # config.batch_size = 128
396 |     # config.dropout = 0.8
397 |     # config.n_layers = 2
398 |     # config.downsample = False
399 |     # config.lr = 0.005
400 |     # result = run_lstm(config)
401 |
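    # (Added summary, not part of the original script:) only the 'final_test'
    # configuration at the top of this function is active. The commented-out blocks
    # above sweep three axes for the basic LSTM: the downsampling block varies
    # max_length (75 / 150 / 250) and, in Experiments 4-6, enables downsampling with
    # n_layers in {1, 2, 4}; the dropout block tries 0.9 / 0.65 / 0.5; the max_length
    # block tries 50 and 30.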
402 | def run_lstm_attention_with_parameters(args):
403 |     # Final test
404 |     # 2 layers, max_length = 75, attention_length = 15
405 |     np.random.seed(1)
406 |     config0 = Config()
407 |     # print('Running run_lstm_attention_with_parameters')
408 |     config0.max_length = 75
409 |     config0.trainable_embeddings = 'Variable'
410 |     config0.hidden_size = 100
411 |     config0.n_epochs = 40
412 |     config0.batch_size = 128
413 |     config0.dropout = 0.8
414 |     config0.n_layers = 2
415 |     config0.lr = 0.001
416 |     config0.xp = 'final_test'
417 |     config0.model = 'lstm_attention'
418 |     # config0.num_samples = 100
419 |     config0.attention_length = 15
420 |     result = run_lstm_attention(config0, final = True)
421 |
422 |     # np.random.seed(1)
423 |     # config0 = Config()
424 |     # # print('Running run_lstm_with_parameters')
425 |     # config0.max_length = 150
426 |     # config0.trainable_embeddings = 'Variable'
427 |     # config0.hidden_size = 100
428 |     # config0.n_epochs = 40
429 |     # config0.n_layers = 2
430 |     # config0.batch_size = 128
431 |     # config0.dropout = 0.8
432 |     # config0.n_layers = 4
433 |     # # config0.downsample = False
434 |     # config0.lr = 0.001
435 |     # # config0.num_samples =
436 |     # config0.attention_length = 15
437 |     # result = run_lstm_attention(config0)
438 |
439 |
440 |
441 |     #### Testing attention_length # Experiment 1
442 |     ## 1 layer, max_length = 150, attention_length = 10
443 |     # np.random.seed(1)
444 |     # config0 = Config()
445 |     # # print('Running run_lstm_with_parameters')
446 |     # config0.max_length = 150
447 |     # config0.trainable_embeddings = 'Variable'
448 |     # config0.hidden_size = 100
449 |     # config0.n_epochs = 40
450 |     # config0.n_layers = 1
451 |     # config0.batch_size = 128
452 |     # config0.dropout = 0.8
453 |     # config0.n_layers = 1
454 |     # # config0.downsample = False
455 |     # config0.lr = 0.001
456 |     # config0.attention_length = 10
457 |     # result = run_lstm_attention(config0)
458 |
459 |     # #### Testing attention_length # Experiment 2
460 |     # ## 1 layer, max_length = 150, attention_length = 20
461 |     # np.random.seed(1)
462 |     # config0 = Config()
463 |     # # print('Running run_lstm_with_parameters')
464 |     # config0.max_length = 150
465 |     # config0.trainable_embeddings = 'Variable'
466 |     # config0.hidden_size = 100
467 |     # config0.n_epochs = 40
468 |     # config0.n_layers = 1
469 |     # config0.batch_size = 128
470 |     # config0.dropout = 0.8
471 |     # config0.n_layers = 1
472 |     # # config0.downsample = False
473 |     # config0.lr = 0.001
474 |     # config0.attention_length = 20
475 |     # result = run_lstm_attention(config0)
476 |
477 |
478 |     # #### Testing learning rate # Experiment 1
479 |     # ## 1 layer, max_length = 150, lr = 0.0005
480 |     # np.random.seed(1)
481 |     # config0 = Config()
482 |     # # print('Running run_lstm_with_parameters')
483 |     # config0.max_length = 150
484 |     # config0.trainable_embeddings = 'Variable'
485 |     # config0.hidden_size = 100
486 |     # config0.n_epochs = 40
487 |     # config0.n_layers = 1
488 |     # config0.batch_size = 128
489 |     # config0.dropout = 0.8
490 |     # config0.n_layers = 1
491 |     # # config0.downsample = False
492 |     # config0.lr = 0.0005
493 |     # config0.attention_length = 15
494 |     # result = run_lstm_attention(config0)
495 |
496 |     # #### Testing learning rate # Experiment 2
497 |     # ## 1 layer, max_length = 150, lr = 0.0002
498 |     # np.random.seed(1)
499 |     # config0 = Config()
500 |     # # print('Running run_lstm_with_parameters')
501 |     # config0.max_length = 150
502 |     # config0.trainable_embeddings = 'Variable'
503 |     # config0.hidden_size = 100
504 |     # config0.n_epochs = 40
505 |     # config0.n_layers = 1
506 |     # config0.batch_size = 128
507 |     # config0.dropout = 0.8
508 |     # config0.n_layers = 1
509 |     # # config0.downsample = False
510 |     # config0.lr = 0.0002
511 |     # config0.attention_length = 15
512 |     # result = run_lstm_attention(config0)
513 |
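    # (Added sketch, not an original experiment:) the two learning-rate experiments
    # above differ only in config0.lr; written as a loop over the same two values,
    # the sweep would look like this:
    # for lr in [0.0005, 0.0002]:
    #     np.random.seed(1)
    #     config = Config()
    #     config.max_length = 150
    #     config.trainable_embeddings = 'Variable'
    #     config.hidden_size = 100
    #     config.n_epochs = 40
    #     config.n_layers = 1
    #     config.batch_size = 128
    #     config.dropout = 0.8
    #     config.lr = lr
    #     config.attention_length = 15
    #     result = run_lstm_attention(config)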
514 | def run_lstm_conditional_with_parameters(args):
515 |     # Final test - to be refined; parameter saving not ready yet
516 |     np.random.seed(1)
517 |     config0 = Config()
518 |     # print('Running run_lstm_conditional_with_parameters')
519 |     config0.trainable_embeddings = 'Variable'
520 |     config0.hidden_size = 100
521 |     config0.n_epochs = 40
522 |     # config0.n_layers = 1
523 |     config0.batch_size = 128
524 |     config0.dropout = 0.8
525 |     config0.n_layers = 2
526 |     config0.lr = 0.001
527 |     # config0.num_samples = 100
528 |     config0.b_max_len = 75
529 |     config0.attention_length = 15
530 |     config0.xp = 'final_test'
531 |     config0.model = 'conditional_lstm'
532 |     # print 'config0' + str(config0.__dict__)
533 |     result0 = run_lstm_conditional(config0, final = True)
534 |
535 |
536 |     # np.random.seed(1)
537 |     # config0 = Config()
538 |     # # print('Running run_lstm_with_parameters')
539 |     # # config0.n_layers = 0
540 |     # # config0.max_length = 75
541 |     # config0.trainable_embeddings = 'Variable'
542 |     # config0.hidden_size = 100
543 |     # config0.n_epochs = 40
544 |     # config0.n_layers = 1
545 |     # config0.batch_size = 128
546 |     # config0.dropout = 0.8
547 |     # config0.n_layers = 4
548 |     # config0.lr = 0.001
549 |     # # config0.num_samples = 100
550 |     # config0.b_max_len = 150
551 |     # # config0.downsample = True
552 |     # config0.attention_length = 15
553 |     # config0.xp = 'layers'
554 |     # config0.model = 'conditional_lstm'
555 |     # # print 'config0' + str(config0.__dict__)
556 |     # result0 = run_lstm_conditional(config0)
557 |
558 |     # np.random.seed(1)
559 |     # config1 = Config()
560 |     # # print('Running run_lstm_with_parameters')
561 |     # # config0.n_layers = 0
562 |     # # config0.max_length = 75
563 |     # config1.trainable_embeddings = 'Variable'
564 |     # config1.hidden_size = 100
565 |     # config1.n_epochs = 40
566 |     # config1.n_layers = 1
567 |     # config1.batch_size = 128
568 |     # config1.dropout = 0.8
569 |     # config1.n_layers = 1
570 |     # config1.lr = 0.001
571 |     # # config0.num_samples = 1000
572 |     # config1.b_max_len = 150
573 |     # # config0.downsample = True
574 |     # config1.attention_length = 15
575 |     # config1.xp = 'body_length'
576 |     # config1.model = 'conditional_lstm'
577 |     # # print 'config0' + str(config0.__dict__)
578 |     # result1 = run_lstm_conditional(config1)
579 |
580 |     # np.random.seed(1)
581 |     # config2 = Config()
582 |     # # print('Running run_lstm_with_parameters')
583 |     # # config0.n_layers = 0
584 |     # # config0.max_length = 75
585 |     # config2.trainable_embeddings = 'Variable'
586 |     # config2.hidden_size = 100
587 |     # config2.n_epochs = 40
588 |     # config2.n_layers = 1
589 |     # config2.batch_size = 128
590 |     # config2.dropout = 0.8
591 |     # config2.n_layers = 1
592 |     # config2.lr = 0.001
593 |     # # config0.num_samples = 1000
594 |     # config2.b_max_len = 300
595 |     # # config0.downsample = True
596 |     # config2.attention_length = 15
597 |     # config2.xp = 'body_length'
598 |     # config2.model = 'conditional_lstm'
599 |     # # print 'config0' + str(config0.__dict__)
600 |     # result2 = run_lstm_conditional(config2)
601 |
602 | if __name__ == "__main__":
603 |     print("-- Running Test Script --")
604 |     print("-- Start BOW Experiments --")
605 |     run_bow_with_parameters('')
606 |     print("-- Start LSTM Basic Experiments --")
607 |     run_lstm_with_parameters('')
608 |     print("-- Start LSTM Attention Experiments --")
609 |     run_lstm_attention_with_parameters('')
610 |     print("-- Start LSTM Conditional Experiments --")
611 |     run_lstm_conditional_with_parameters('')
612 |     print("-- Finished Test Script --")
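# (Added sketch, not part of the original script:) the __main__ block above always
# runs all four experiment groups in sequence. A command-line selector over the same
# runner functions could look like the following; the '--experiments' flag and the
# group names are illustrative choices, not existing options of this script.
# if __name__ == "__main__":
#     import argparse
#     parser = argparse.ArgumentParser(description = 'Run stance detection experiments')
#     parser.add_argument('--experiments', nargs = '+',
#                         default = ['bow', 'lstm_basic', 'lstm_attention', 'lstm_conditional'],
#                         choices = ['bow', 'lstm_basic', 'lstm_attention', 'lstm_conditional'])
#     runners = {'bow': run_bow_with_parameters,
#                'lstm_basic': run_lstm_with_parameters,
#                'lstm_attention': run_lstm_attention_with_parameters,
#                'lstm_conditional': run_lstm_conditional_with_parameters}
#     for name in parser.parse_args().experiments:
#         print("-- Start %s Experiments --" % name)
#         runners[name]('')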
--------------------------------------------------------------------------------
/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ourownstory/stance_detection/df9f34883005d203cb6d8ee7d56d9bc82ef39b62/paper.pdf
--------------------------------------------------------------------------------
/poster.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ourownstory/stance_detection/df9f34883005d203cb6d8ee7d56d9bc82ef39b62/poster.pdf
--------------------------------------------------------------------------------