├── README.md
├── data
    ├── dev-v1.1.json
    └── train-v1.1.json
├── main.py
├── parse_data.py
├── plot.py
└── preprocess_data.py

/README.md:
--------------------------------------------------------------------------------
# RaSoR-in-Tensorflow
A TensorFlow implementation of RaSoR (Recurrent Span Representations), one of the solutions to the SQuAD question-answering task.
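
## Usage (sketch)

A rough end-to-end recipe, untested; the file names simply follow what the scripts themselves reference: `data/train-v1.1.json` and `data/dev-v1.1.json`, a `word2vec_from_glove_300.vec` embedding file and a local Stanford CoreNLP install for `preprocess_data.py`, and the `data/train_data.pkl` / `data/valid_data.pkl` pickles that `main.py` loads.

```bash
# flatten the raw SQuAD JSON into context/question/answer records
python parse_data.py data/train-v1.1.json --output_destination data/train_data.json
python parse_data.py data/dev-v1.1.json --output_destination data/valid_data.json

# tokenize with CoreNLP and replace tokens by 300-d word vectors
python preprocess_data.py data/train_data.json --output_destination data/train_data.pkl
python preprocess_data.py data/valid_data.json --output_destination data/valid_data.pkl

# train, keeping the console output so plot.py can draw the curves
python main.py --batch_size 10 --epochs 10 --lr 0.01 | tee log.txt
python plot.py --log_file log.txt
```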
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from __future__ import print_function

import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np
import cPickle as pickle
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=10, help='Batch size', type=int)
parser.add_argument('--dropout', default=0.1, help='Dropout in LSTMs', type=float)
parser.add_argument('--epochs', default=10, help='Number of epochs', type=int)
parser.add_argument('--test_every', default=100, help='Number of iterations before validation testing', type=int)
parser.add_argument('--lr', default=0.01, help='Learning rate', type=float)
args = parser.parse_args()


learning_rate = args.lr
epochs = args.epochs
dropout = args.dropout
batch_size = args.batch_size
test_iter = args.test_every

print("Reading train data...")
with open('data/train_data.pkl', 'rb') as fd:
    train_data = pickle.load(fd)
print("Done!")

print("Reading val data...")
with open('data/valid_data.pkl', 'rb') as fd:
    val_data = pickle.load(fd)
print("Done!")

###############################################################################################################

# train_data = [[train_data[0][0][:1000], train_data[0][1][:1000]], [train_data[1][0][:1000], train_data[1][1][:1000]]]
# val_data = [[val_data[0][0][:1000], val_data[0][1][:1000]], [val_data[1][0][:1000], val_data[1][1][:1000]]]

################################################################################################################

max_span_length = 30
n_hidden = 50
word_vec_size = 300

def data_generator(data, is_train=True, batch_size=batch_size, shuffle=False):
    n_samples = len(data[0][0])
    if shuffle:
        perm = np.random.permutation(n_samples)
    else:
        perm = np.arange(n_samples)
    for i in range(0, n_samples, batch_size):
        indices = perm[i:i+batch_size]
        bs = len(indices)
        max_plen = max([data[0][0][j].shape[0] for j in indices])
        max_qlen = max([data[0][1][j].shape[0] for j in indices])
        p_mask = np.ones((bs, max_plen, 1), dtype=np.float32)
        q_mask = np.ones((bs, max_qlen, 1), dtype=np.float32)
        p_s = []
        q_s = []
        for j in range(bs):
            ind = indices[j]
            l_p = data[0][0][ind].shape[0]
            l_q = data[0][1][ind].shape[0]
            p_s.append(np.lib.pad(data[0][0][ind], ((0, max_plen - l_p), (0, 0)), 'constant', constant_values=(0,0)))
            q_s.append(np.lib.pad(data[0][1][ind], ((0, max_qlen - l_q), (0, 0)), 'constant', constant_values=(0,0)))
            p_mask[j, l_p:, 0] = 0
            q_mask[j, l_q:, 0] = 0
        p = np.stack(p_s)
        q = np.stack(q_s)


        n_s = np.zeros((bs), dtype=np.int32)

        for j in range(bs):
            ind = indices[j]
            l_p = data[0][0][ind].shape[0]
            if l_p >= max_span_length:
                n_s[j] = (max_span_length + 1) * max_span_length / 2 + (l_p - max_span_length) * max_span_length
            else:
                n_s[j] = (l_p + 1) * l_p / 2
        max_n_s = n_s.max()
        y = np.zeros((bs, max_n_s))
        i_p = np.zeros((bs, max_n_s, 2), dtype=np.int32)
        i_p_mask = np.ones((bs, max_n_s, 1), dtype=np.float32)
        for j in range(bs):
            ind = indices[j]
            l_p = data[0][0][ind].shape[0]
            k = 0
            a1 = data[1][0][ind]
            a2 = data[1][1][ind]
            for m in range(l_p):
                for n in range(m, min(m+max_span_length, l_p)):
                    i_p[j, k, 0] = m
                    i_p[j, k, 1] = n
                    if is_train and m == a1 and n == a2:
                        y[j, k] = 1
                    k += 1
            assert k <= n_s[j]
            i_p_mask[j, n_s[j]:, 0] = 0
        if is_train:
            yield ((p, q, i_p), (p_mask, q_mask, i_p_mask), y)
        else:
            yield ((p, q, i_p), (p_mask, q_mask, i_p_mask))
    return

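# Shapes of the batches produced by data_generator above and fed into the
# placeholders below (B = batch size, T_p / T_q = padded passage / question
# length, S = padded number of candidate answer spans):
#   p, q              : (B, T_p, word_vec_size) and (B, T_q, word_vec_size) word vectors
#   p_mask, q_mask    : (B, T_p, 1) and (B, T_q, 1), 1.0 for real tokens, 0.0 for padding
#   index_pairs       : (B, S, 2), (start, end) token indices of each candidate span
#   index_pairs_mask  : (B, S, 1), 1.0 for real candidates, 0.0 for padding
#   y                 : (B, S), 1.0 at the gold answer span (training batches)
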
p = tf.placeholder("float", [None, None, word_vec_size])
q = tf.placeholder("float", [None, None, word_vec_size])
p_mask = tf.placeholder("float", [None, None, 1])
q_mask = tf.placeholder("float", [None, None, 1])
index_pairs = tf.placeholder("int32", [None, None, 2])
index_pairs_mask = tf.placeholder("float", [None, None, 1])
y = tf.placeholder("float", [None, None])

def softmax_with_mask(input, mask, dim=-1):
    m = tf.reduce_max(input, axis=dim, keep_dims=True)
    e = tf.exp(input - m) * mask
    s = tf.reduce_sum(e, axis=dim, keep_dims=True)
    s = tf.clip_by_value(s, np.finfo(np.float32).eps, np.finfo(np.float32).max)
    return e / s

def FFNN(input, input_mask, name, layer_shapes=[n_hidden]):
    # A feed-forward neural network with ReLU layers; padded positions are zeroed out
    x = input
    for i in range(len(layer_shapes)):
        s = layer_shapes[i]
        with tf.variable_scope('{}_{}'.format(name, i)):
            x = tf.layers.dense(inputs=x, units=s, activation=tf.nn.relu)
            x = x * input_mask

    return x


def BiLSTM(input, input_mask, name):
    with tf.variable_scope(name):
        lstm_fw_cell = rnn.LSTMCell(n_hidden, forget_bias=1.0)
        lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(lstm_fw_cell, state_keep_prob=1.0-dropout,
                                                     # input_keep_prob=1.0-dropout, input_size=tf.shape(input)[1:],
                                                     variational_recurrent=True, dtype=tf.float32)
        lstm_bw_cell = rnn.LSTMCell(n_hidden, forget_bias=1.0)
        lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(lstm_bw_cell, state_keep_prob=1.0-dropout,
                                                     # input_keep_prob=1.0-dropout, input_size=tf.shape(input)[1:],
                                                     variational_recurrent=True, dtype=tf.float32)
        outputs, states = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, input, dtype=tf.float32)
        outputs = tf.concat(outputs, axis=-1) * input_mask
    return outputs

def q_align(p, q, p_mask, q_mask):
    p_n = FFNN(p, p_mask, 'align_p')
    q_n = FFNN(q, q_mask, 'align_q')
    s = tf.matmul(p_n, q_n, transpose_b=True)  # (B, passage_len, question_len)
    # the softmax runs over the question axis, so mask out padded question
    # positions there, then zero the rows that belong to passage padding
    a = softmax_with_mask(s, tf.transpose(q_mask, perm=[0, 2, 1]))
    return tf.matmul(a, q) * p_mask

def q_indep(q, q_mask):
    q_s = q
    for i in range(2):
        q_s = BiLSTM(q_s, q_mask, 'BiLSTM_q_indep_{}'.format(i))
    w_q = tf.Variable(tf.random_normal([1, n_hidden]))
    s = tf.tensordot(FFNN(q_s, q_mask, 'FFNN_q_s'), w_q, axes=[[-1],[-1]])
    a = softmax_with_mask(s, q_mask, dim=1)
    return tf.matmul(a, q_s, transpose_a=True)

def concat(p, q_a, q_i):
    # q_i has shape (B, 1, 2*n_hidden); adding p_tmp * 0 broadcasts it across
    # the passage length so it can be concatenated to every passage position
    p_tmp = tf.reduce_sum(p, axis=-1, keep_dims=True)
    q_i = q_i + p_tmp * 0
    return tf.concat([p, q_a, q_i], axis=-1)

def question_focused_passage(p, q, p_mask, q_mask):
    q_a = q_align(p, q, p_mask, q_mask)
    q_i = q_indep(q, q_mask)
    h_a = concat(p, q_a, q_i)
    return h_a

p_qf = question_focused_passage(p, q, p_mask, q_mask)

for i in range(2):
    p_qf = BiLSTM(p_qf, p_mask, 'BiLSTM_p_qf_{}'.format(i))


# Getting answer span representation

start_indices = index_pairs[:, :, 0]
start_indices = tf.expand_dims(start_indices, -1)
end_indices = index_pairs[:, :, 1]
end_indices = tf.expand_dims(end_indices, -1)
symbolic_batch_size = tf.shape(index_pairs)[0]
b_s = tf.range(0, symbolic_batch_size, dtype=tf.int32)
b_s = tf.expand_dims(b_s, -1)
b_s = tf.expand_dims(b_s, -1)  # b_s.shape == (B, 1, 1)
b_s = start_indices * 0 + b_s  # b_s broadcasts to shape (batch_size, n_spans, 1) == shape of start_indices

start_indices = tf.concat((b_s, start_indices), axis=-1)
end_indices = tf.concat((b_s, end_indices), axis=-1)

start_vectors = tf.gather_nd(p_qf, start_indices)
end_vectors = tf.gather_nd(p_qf, end_indices)

spans = tf.concat((start_vectors, end_vectors), axis=-1)  # spans.shape == (batch_size, n_spans, 4 * n_hidden)
spans = spans * index_pairs_mask

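# Worked example of the index construction above: if candidate k of batch
# element j covers tokens m..n, then start_indices[j, k] == [j, m] and
# end_indices[j, k] == [j, n], so tf.gather_nd picks p_qf[j, m, :] and
# p_qf[j, n, :] as the boundary representations of that span.
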
def span_score_logits(spans, spans_mask):
    w_a = tf.Variable(tf.random_normal([n_hidden]))
    h_a = FFNN(spans, spans_mask, 'spans')
    s_a = tf.tensordot(h_a, w_a, axes=[[-1],[-1]])
    return s_a * spans_mask[:, :, 0]

logits = span_score_logits(spans, index_pairs_mask)
probs = softmax_with_mask(logits, index_pairs_mask[:, :, 0])

def cross_entropy(y_, y):
    y_ = tf.clip_by_value(y_, np.finfo(np.float32).eps, np.finfo(np.float32).max)
    return tf.reduce_mean(-tf.reduce_sum(y * tf.log(y_), reduction_indices=[1]))


#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
cost = cross_entropy(probs, y)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

correct_pred = tf.equal(tf.argmax(logits, -1), tf.argmax(y, -1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=epochs)

def test_on_validation_set(val_data, global_iter):
    acc = 0
    loss = 0
    iter = 0
    counter = 0.0
    data_len = len(val_data[1][1])
    print("Running on Validation Set")

    for ((batch_p, batch_q, batch_i_p), (batch_p_mask, batch_q_mask, batch_i_p_mask), batch_y) in data_generator(val_data, is_train=True, batch_size=batch_size, shuffle=False):
        f_dict = {p: batch_p, q: batch_q, index_pairs: batch_i_p, p_mask: batch_p_mask, q_mask: batch_q_mask, index_pairs_mask: batch_i_p_mask, y: batch_y}
        acc += sess.run(accuracy, feed_dict=f_dict)
        loss += sess.run(cost, feed_dict=f_dict)
        counter += len(batch_p)
        iter += 1
        print("{:.4f}%".format(counter * 100 / data_len), end='\r')

    print("\nIter: {:4d} Val Loss: {:.4f} Val Acc: {:.4f}".format(global_iter, loss/iter, acc/iter))

with tf.Session() as sess:
    sess.run(init)
    global_iter = 0

    for e in range(epochs):

        for ((batch_p, batch_q, batch_i_p), (batch_p_mask, batch_q_mask, batch_i_p_mask), batch_y) in data_generator(train_data, is_train=True, batch_size=batch_size, shuffle=True):
            f_dict = {p: batch_p, q: batch_q, index_pairs: batch_i_p, p_mask: batch_p_mask, q_mask: batch_q_mask, index_pairs_mask: batch_i_p_mask, y: batch_y}
            if global_iter % test_iter == 0:
                test_on_validation_set(val_data, global_iter)
            train_loss = sess.run(cost, feed_dict=f_dict)
            train_acc = sess.run(accuracy, feed_dict=f_dict)
            print("Iter: {:4d} Train Loss: {:.4f} Train Acc: {:.4f}".format(global_iter, train_loss, train_acc))
            sess.run(optimizer, feed_dict=f_dict)
            global_iter += 1

        save_path = saver.save(sess, "models/model", global_step=e)
        print("Model saved in file: {}".format(save_path))

    print("Optimization Finished!")
--------------------------------------------------------------------------------
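
The closed-form candidate count used in data_generator (every span of at most max_span_length tokens) can be checked against the enumeration loop with a few lines of standalone Python; this is an illustration only, not part of the repository (integer division is written as // so it also runs under Python 3):

    max_span_length = 30

    def n_spans_closed_form(l_p):
        if l_p >= max_span_length:
            return (max_span_length + 1) * max_span_length // 2 + (l_p - max_span_length) * max_span_length
        return (l_p + 1) * l_p // 2

    def n_spans_enumerated(l_p):
        return sum(1 for m in range(l_p) for n in range(m, min(m + max_span_length, l_p)))

    for l_p in range(1, 200):
        assert n_spans_closed_form(l_p) == n_spans_enumerated(l_p)

The pickle that main.py loads is simply the nested-list layout written by preprocess_data.py; a quick way to inspect it (Python 2, matching the scripts, and assuming the file was written to data/train_data.pkl):

    import cPickle as pickle

    with open('data/train_data.pkl', 'rb') as fd:
        data = pickle.load(fd)

    contexts, questions = data[0]          # lists of (n_tokens, 300) float32 arrays
    answer_starts, answer_ends = data[1]   # inclusive token indices of the answer span

    print len(contexts), contexts[0].shape, questions[0].shape
    print answer_starts[0], answer_ends[0]
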
/parse_data.py:
--------------------------------------------------------------------------------
"""
Usage: python parse_data.py dataset_file --output_destination outfile.json --train_ratio 1.
"""
# -*- coding: utf-8 -*-
import json
import argparse
import os
import random
random.seed(20)

parser = argparse.ArgumentParser()
parser.add_argument('data', help='Path to the dataset file', type=str)
parser.add_argument('--output_destination', default='data/tmp.json',
                    help='Desired path to output json', type=str)
parser.add_argument('--train_ratio', default=1., help='ratio for train/val split', type=float)
args = parser.parse_args()


file_path = args.data
outfile = args.output_destination
train_ratio = args.train_ratio

json_data = open(file_path, 'r').read()
data = json.loads(json_data)

print "Keys of json are:", data.keys()
data = data['data']
print "Dataset is a list of %d topics, each topic contains some paragraphs" % len(data)
print "Keys of topics are", data[0].keys()
topics = [data[i]['title'] for i in range(len(data))]
#print "The topics are:", topics

cnt_paragraphs_in_topic = dict([(data[i]['title'], len(data[i]['paragraphs'])) for i in range(len(data))])
print "Keys of paragraphs are:", data[0]['paragraphs'][0].keys()
print "Dataset contains %d paragraphs in total" % sum(cnt_paragraphs_in_topic[x] for x in cnt_paragraphs_in_topic)
print "Each paragraph has some questions and answers associated with it"
print "Keys of qas sections are:", data[0]['paragraphs'][0]['qas'][0].keys()
print "Keys of answers are:", data[0]['paragraphs'][0]['qas'][0]['answers'][0].keys()

train_cqas = []  # ContextQuestionAnswer
val_cqas = []

for topic_id in range(len(data)):
    paragraphs = data[topic_id]['paragraphs']
    if random.random() < train_ratio:
        train = True
    else:
        train = False
    for paragraph in paragraphs:
        context = paragraph['context']
        for qa in paragraph['qas']:
            # assert len(qa['answers']) == 1  # holds for the train set; the dev set has several answers per question

            question = qa['question']
            _id = qa['id']
            answer = qa['answers'][0]['text']
            answer_start = qa['answers'][0]['answer_start']
            answer_end = answer_start + len(answer) - 1  # answer == context[answer_start : answer_end + 1]
            if train:
                train_cqas.append({"context": context, "question": question, "answer": answer,
                                   'answer_start': answer_start, 'answer_end': answer_end,
                                   'id': _id, 'topic': topics[topic_id]
                                   })
            else:
                val_cqas.append({"context": context, "question": question, "answer": answer,
                                 'answer_start': answer_start, 'answer_end': answer_end,
                                 'id': _id, 'topic': topics[topic_id]
                                 })


print "Saving dataset to outfile..."
if train_ratio == 1.:
    with open(outfile, 'w') as fd:
        json.dump(train_cqas, fd)
else:
    print "Train/Val ratio is %f" % (1. * len(train_cqas) / len(val_cqas))
    # prepend the split name to the file name, keeping any directory prefix
    train_file = os.path.join(os.path.dirname(outfile), 'train_' + os.path.basename(outfile))
    val_file = os.path.join(os.path.dirname(outfile), 'val_' + os.path.basename(outfile))
    with open(train_file, 'w') as fd:
        json.dump(train_cqas, fd)
    with open(val_file, 'w') as fd:
        json.dump(val_cqas, fd)
--------------------------------------------------------------------------------
/plot.py:
--------------------------------------------------------------------------------
from __future__ import print_function

import numpy as np
import argparse
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

parser = argparse.ArgumentParser()
parser.add_argument('--log_file', default='log.txt', help='Log file', type=str)
parser.add_argument('--loss_lim', default=10, help='Upper limit for the loss plot', type=float)
parser.add_argument('--acc_lim', default=0, help='Lower limit for the accuracy plot', type=float)
parser.add_argument('--window_size', default=15, help='The size of the smoothing window', type=int)
args = parser.parse_args()

file_name = args.log_file
ws = args.window_size
loss_lim = args.loss_lim
acc_lim = args.acc_lim

def smoothen(arr, window):
    # running mean over a trailing window; the first points are averaged over
    # however many values are available so far
    to_return = []
    s = 0.
    for i in range(min(len(arr), window)):
        s += arr[i]
        to_return.append(s/(i+1))
    for i in range(window, len(arr)):
        s += arr[i]
        s -= arr[i-window]
        to_return.append(s/window)
    return to_return

with open(file_name, 'r') as log:
    lines = log.readlines()
    train_iters = [float(line.split()[1]) for line in lines if "Train Loss" in line]
    train_losses = [float(line.split()[4]) for line in lines if "Train Loss" in line]
    train_acc = [float(line.split()[7]) for line in lines if "Train Acc" in line]

    val_iters = [float(line.split()[1]) for line in lines if "Val Loss" in line]
    val_losses = [float(line.split()[4]) for line in lines if "Val Loss" in line]
    val_acc = [float(line.split()[7]) for line in lines if "Val Acc" in line]


plt.plot(train_iters, smoothen(train_losses, window=ws))
plt.plot(np.array(val_iters), val_losses)
plt.ylim([0, loss_lim])
#plt.show()
plt.savefig('loss.png')

plt.figure()  # start a fresh figure so the accuracy curves are not drawn on top of the loss plot
plt.plot(train_iters, smoothen(train_acc, window=ws))
plt.plot(np.array(val_iters), val_acc)
plt.ylim([acc_lim, 1])
#plt.show()
plt.savefig('acc.png')
--------------------------------------------------------------------------------
/preprocess_data.py:
--------------------------------------------------------------------------------
"""
Usage: python preprocess_data.py parsed_file.json --output_destination outfile
"""
# -*- coding: utf-8 -*-
import json
import argparse
import gensim
import numpy as np
import random
import cPickle as pickle
from tqdm import tqdm

random.seed(20)

parser = argparse.ArgumentParser()
parser.add_argument('data', help='Data json', type=str)
parser.add_argument('--output_destination', default='data/tmp.pkl', help='Desired path to output pickle', type=str)
args = parser.parse_args()

file_path = args.data
outfile = args.output_destination
if not outfile.endswith('.pkl'):
    outfile += '.pkl'

print "Reading SQuAD data... ",
with open(file_path) as fd:
    samples = json.load(fd)
print "Done!"

print "Reading word2vec data... ",
word_vec_size = 300
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('word2vec_from_glove_300.vec')
vocab = w2v_model.vocab
print "Done!"

def get_word_vector(word):
    if word in vocab:
        return w2v_model[word]
    else:
        return np.zeros(word_vec_size)

print "Initiating CoreNLP service connection... ",
from stanford_corenlp_pywrapper import CoreNLP
proc = CoreNLP(configdict={'annotators': "tokenize,ssplit"}, corenlp_jars=["/home/tigrann/Documents/stanford-corenlp-full-2017-06-09/*"])
print "Done!"

def parse_sample(context, question, answer_start, answer_end, **kwargs):
    context = proc.parse_doc(context)
    tokens = []
    char_offsets = []
    for s in context['sentences']:
        tokens += s['tokens']
        char_offsets += s['char_offsets']

    try:
        # map the character-level answer span onto token indices
        answer_start = [answer_start >= s and answer_start < e for s, e in char_offsets].index(True)
        answer_end = [answer_end >= s and answer_end < e for s, e in char_offsets].index(True)
    except ValueError:
        # print(char_offsets)
        # print(answer_start, answer_end)
        return None

    # print('context', tokens)
    context_vecs = [get_word_vector(token) for token in tokens]
    context_vecs = np.vstack(context_vecs).astype(np.float32)

    question = proc.parse_doc(question)
    tokens = []
    for s in question['sentences']:
        tokens += s['tokens']
    # print('question', tokens)
    question_vecs = [get_word_vector(token) for token in tokens]
    question_vecs = np.vstack(question_vecs).astype(np.float32)
    # print('ans', answer_start, answer_end)
    # exit(0)
    return [[context_vecs, question_vecs],
            [answer_start, answer_end]]

print "Parsing samples... ",
samples = [parse_sample(**sample) for sample in tqdm(samples)]
#samples = [sample for sample in samples if sample is not None]


print len(samples), "=>",
samples = [sample for sample in samples if sample is not None]
print len(samples)

print "Done!"

# Transpose into [[contexts, questions], [answer_starts, answer_ends]]
data = [[[], []],
        [[], []]]
for sample in samples:
    data[0][0].append(sample[0][0])
    data[0][1].append(sample[0][1])
    data[1][0].append(sample[1][0])
    data[1][1].append(sample[1][1])

print "Writing data to file '{}'... ".format(outfile)
with open(outfile, 'wb') as fd:
    pickle.dump(data, fd, protocol=pickle.HIGHEST_PROTOCOL)
print "Done!"

print "Bye!"
--------------------------------------------------------------------------------