├── README.md
├── get_started.sh
├── lib
│   └── recurrence.py
├── notebook.ipynb
├── preprocessing
│   ├── dwr.py
│   └── squad_preprocess.py
└── qa_data.py
/README.md:
--------------------------------------------------------------------------------
1 | # Implementation of models for the Conversational Intelligence Challenge by ai-guild
2 | 
3 | This is an implementation of additional models, namely R-NET and a dilated CNN, for The Conversation Intelligence Challenge, beyond those implemented in the [initial repository](https://github.com/ai-guild/convai/tree/master) of ai-guild's ConvAI challenge.
4 | 
5 | ## Reference Papers
6 | 
7 | - [R-NET: Machine Reading Comprehension with Self-matching Networks](https://www.microsoft.com/en-us/research/publication/mrc/#)
8 | - [Multi-Scale Context Aggregation by Dilated Convolutions](https://arxiv.org/abs/1511.07122)
9 | 
10 | ## Datasets
11 | 
12 | - [SQuAD Explorer](https://rajpurkar.github.io/SQuAD-explorer/)
13 | - [MS MARCO](http://www.msmarco.org/)
14 | - [SNLI](https://nlp.stanford.edu/projects/snli/)
15 | - [CBT](https://research.fb.com/projects/babi/)
16 | 
17 | 
--------------------------------------------------------------------------------
/get_started.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Downloads raw data into ./download
3 | # and saves preprocessed data into ./data
4 | # Get the directory containing this script
5 | 
6 | CODE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
7 | echo $CODE_DIR
8 | 
9 | export PYTHONPATH=$PYTHONPATH:$CODE_DIR
10 | 
11 | pip install -r $CODE_DIR/requirements.txt --user
12 | 
13 | # download punkt, perluniprops
14 | if [ ! -d "/usr/local/share/nltk_data/tokenizers/punkt" ]; then
15 |     python2 -m nltk.downloader punkt
16 | fi
17 | 
18 | 
19 | # SQuAD preprocess is in charge of downloading
20 | # and formatting the data to be consumed later
21 | DATA_DIR=data
22 | DOWNLOAD_DIR=download
23 | rm -rf $DATA_DIR
24 | mkdir -p $DATA_DIR
25 | python2 $CODE_DIR/preprocessing/squad_preprocess.py
26 | 
27 | # Download distributed word representations (GloVe)
28 | python2 $CODE_DIR/preprocessing/dwr.py
29 | 
30 | # Data processing for TensorFlow
31 | python2 $CODE_DIR/qa_data.py --glove_dim 100
--------------------------------------------------------------------------------
/lib/recurrence.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.contrib import rnn
3 | 
4 | # Gated recurrent unit cell
5 | def gru(num_units):
6 |     gru_cell = rnn.GRUCell(num_units)
7 |     return gru_cell
8 | 
9 | # Create a stack of gated recurrent unit cells, num_layers deep
10 | def gru_n(num_units, num_layers):
11 |     stacked_gru_cells = rnn.MultiRNNCell(
12 |         [gru(num_units) for _ in range(num_layers)])
13 |     return stacked_gru_cells
14 | 
15 | 
16 | def get_variables(n, shape, name='W'):
17 |     return (tf.get_variable(name+str(i), dtype=tf.float32, shape=shape)
18 |             for i in range(n))
19 | 
20 | 
21 | '''
22 | Uni-directional RNN
23 | 
24 | [usage]
25 | cell_ = gru_n(hdim, 3)
26 | outputs, states = uni_net(cell = cell_,
27 |                           inputs= inputs_emb,
28 |                           init_state= cell_.zero_state(batch_size, tf.float32),
29 |                           timesteps = L)
30 | '''
31 | def uni_net(cell, inputs, init_state, timesteps, time_major=False, scope='uni_net_0'):
32 |     # convert inputs to time-major format (leave untouched if already time-major)
33 |     inputs_tm = inputs if time_major else \
34 |         tf.transpose(inputs, [1, 0, 2], name="input_time_major")
35 |     # collection of states and outputs
36 |     states, outputs = [init_state], []
37 | 
38 |     with tf.variable_scope(scope):
39 | 
40 |         for i in
range(timesteps): 41 | if i > 0: 42 | tf.get_variable_scope().reuse_variables() 43 | output, state = cell(inputs_tm[i], states[-1]) 44 | outputs.append(output) 45 | states.append(state) 46 | 47 | return tf.stack(outputs), tf.stack(states[1:]) 48 | 49 | 50 | ''' 51 | Bi-directional RNN 52 | 53 | [usage] 54 | (states_f, states_b), outputs = bi_net(cell_f= gru_n(hdim,3), 55 | cell_b= gru_n(hdim,3), 56 | inputs= inputs_emb, 57 | batch_size= batch_size, 58 | timesteps=L, 59 | scope='bi_net_0', 60 | project_outputs=True) 61 | ''' 62 | def bi_net(cell_f, cell_b, inputs, batch_size, timesteps, 63 | scope= 'bi_net_0', 64 | project_outputs=False): 65 | 66 | # forward 67 | _, states_f = uni_net(cell_f, 68 | inputs, 69 | cell_f.zero_state(batch_size, tf.float32), 70 | timesteps, 71 | scope=scope + '_f') 72 | # backward 73 | _, states_b = uni_net(cell_b, 74 | tf.reverse(inputs, axis=[1]), 75 | cell_b.zero_state(batch_size, tf.float32), 76 | timesteps, 77 | scope=scope + '_b') 78 | 79 | # infer num_layers and hidden_dim from cell 80 | num_layers = len(cell_f.state_size) 81 | hidden_dim = cell_f.state_size[0] 82 | 83 | outputs = None 84 | # outputs 85 | if project_outputs: 86 | #chain both forward and backword states together 87 | states = tf.concat([states_f, states_b], axis=-1) 88 | 89 | if len(states.shape) == 4 and num_layers: 90 | states = tf.reshape(tf.transpose(states, [-2, 0, 1, -1]), [-1, hidden_dim*2*num_layers]) 91 | Wo = tf.get_variable(scope+'/Wo', dtype=tf.float32, shape=[num_layers*2*hidden_dim, hidden_dim]) 92 | elif len(states.shape) == 3: 93 | states = tf.reshape(tf.transpose(states, [-2, 0, -1]), [-1, hidden_dim*2]) 94 | Wo = tf.get_variable(scope+'/Wo', dtype=tf.float32, shape=[2*hidden_dim, hidden_dim]) 95 | else: 96 | print('>> ERR : Unable to handle state reshape') 97 | return None 98 | 99 | outputs = tf.reshape(tf.matmul(states, Wo), [batch_size, timesteps, hidden_dim]) 100 | 101 | return (states_f, states_b), outputs 102 | 103 | 104 | ''' 105 | Attention Mechanism 106 | 107 | based on "Neural Machine Translation by Jointly Learning to Align and Translate" 108 | https://arxiv.org/abs/1409.0473 109 | 110 | [usage] 111 | ci = attention(enc_states, dec_state, params= { 112 | 'Wa' : Wa, # [d,d] 113 | 'Ua' : Ua, # [d,d] 114 | 'Va' : Va # [d,1] 115 | }) 116 | shape(enc_states) : [B, L, d] 117 | shape(dec_state) : [B, d] 118 | shape(ci) : [B,d] 119 | 120 | ''' 121 | def attention(enc_states, dec_state, params, d, timesteps): 122 | Wa, Ua = params['Wa'], params['Ua'] 123 | # s_ij -> [B,L,d] 124 | a = tf.tanh(tf.expand_dims(tf.matmul(dec_state, Wa), axis=1) + 125 | tf.reshape(tf.matmul(tf.reshape(enc_states,[-1, d]), Ua), [-1, timesteps, d])) 126 | Va = params['Va'] # [d, 1] 127 | # e_ij -> softmax(aV_a) : [B, L] 128 | scores = tf.nn.softmax(tf.reshape(tf.matmul(tf.reshape(a, [-1, d]), Va), [-1, timesteps])) 129 | # c_i -> weighted sum of encoder states 130 | return tf.reduce_sum(enc_states*tf.expand_dims(scores, axis=-1), axis=1) # [B, d] 131 | 132 | 133 | ''' 134 | Attentive Decoder 135 | 136 | [usage] 137 | dec_outputs, dec_states = attentive_decoder(enc_states, 138 | tf.zeros(dtype=tf.float32, shape=[B,d]), 139 | batch_size=B,timesteps=L,feed_previous=True, 140 | inputs = inputs) 141 | shape(enc_states) : [B, L, d] 142 | shape(inputs) : [[B, d]] if feed_previous else [L, B, d] 143 | 144 | 145 | ''' 146 | def attentive_decoder(enc_states, init_state, batch_size, 147 | d, timesteps, 148 | inputs = [], 149 | scope='attentive_decoder_0', 150 | num_layers=1, 151 | feed_previous=False): 152 
| # get parameters 153 | U,W,C,Ur,Wr,Cr,Uz,Wz,Cz,Uo,Vo,Co = get_variables(12, [d,d], name='decoder_param') 154 | Wa, Ua = get_variables(2, [d,d], 'att') 155 | Va = tf.get_variable('Va', shape=[d, 1], dtype=tf.float32) 156 | att_params = { 157 | 'Wa' : Wa, 'Ua' : Ua, 'Va' : Va 158 | } 159 | 160 | 161 | def step(input_, state, ci): 162 | z = tf.nn.sigmoid(tf.matmul(input_, Wz)+tf.matmul(state, Uz)+tf.matmul(ci, Cz)) 163 | r = tf.nn.sigmoid(tf.matmul(input_, Wr)+tf.matmul(state, Ur)+tf.matmul(ci, Cr)) 164 | si = tf.nn.tanh(tf.matmul(input_, W)+tf.matmul(ci, C)+tf.matmul(r*state, U)) 165 | 166 | state = (1-z)*state + z*si 167 | output = tf.matmul(state, Uo) + tf.matmul(input_, Vo) + tf.matmul(ci, Co) 168 | 169 | return output, state 170 | 171 | outputs = [inputs[0]] # include GO token as init input 172 | states = [init_state] 173 | for i in range(timesteps): 174 | input_ = outputs[-1] if feed_previous else inputs[i] 175 | output, state = step(input_, states[-1], 176 | attention(enc_states, states[-1], att_params, d, timesteps)) 177 | 178 | outputs.append(output) 179 | states.append(state) 180 | # time major -> batch major 181 | states_bm = tf.transpose(tf.stack(states[1:]), [1, 0, 2]) 182 | outputs_bm = tf.transpose(tf.stack(outputs[1:]), [1, 0, 2]) 183 | return outputs_bm, states_bm 184 | 185 | 186 | ''' 187 | Gated Attention Network 188 | 189 | based on "R-NET: Machine Reading Comprehension with Self-matching Networks" 190 | https://www.microsoft.com/en-us/research/publication/mrc/ 191 | 192 | [usage] 193 | dec_outputs, dec_states = gated_attention_net(enc_states, # encoded representation of text 194 | tf.zeros(dtype=tf.float32, shape=[B,d*2]), # notice d*2 195 | batch_size=B,timesteps=L,feed_previous=False, 196 | inputs = inputs) 197 | shape(enc_states) : [B, L, d] 198 | shape(inputs) : [[B, d]] if feed_previous else [L, B, d] 199 | 200 | For reading comprehension, inputs is same as enc_states; feed_previous doesn't apply 201 | 202 | 203 | ''' 204 | def gated_attention_net(states_a, states_b, init_state_c, 205 | batch_size, d, La, Lb, 206 | scope='gated_attention_net_0'): 207 | 208 | with tf.variable_scope(scope, reuse=False): 209 | # define attention parameters 210 | Wa = tf.get_variable('Wa', shape=[d, d], dtype=tf.float32) 211 | Wb = tf.get_variable('Wb', shape=[d, d], dtype=tf.float32) 212 | Wc = tf.get_variable('Wc', shape=[d, d], dtype=tf.float32) 213 | Va = tf.get_variable('Va', shape=[d, 1], dtype=tf.float32) 214 | 215 | att_params = { 216 | 'Wa' : Wa, 'Wb' : Wb, 'Wc' : Wc, 'Va' : Va 217 | } 218 | 219 | # input gate/projection 220 | Wg = tf.get_variable('Wg', shape=[d, d], dtype=tf.float32) 221 | Wi = tf.get_variable('Wi', shape=[d*2, d], dtype=tf.float32) 222 | 223 | # define rnn cell 224 | cell = gru(num_units=d) 225 | 226 | # convert states_a to batch_major 227 | states_a = tf.transpose(states_a, [1,0,2]) 228 | 229 | def step(input_, state): 230 | # define input gate 231 | gi = tf.nn.sigmoid(tf.matmul(input_, Wg)) 232 | # apply gate to input 233 | input_ = gi * input_ 234 | # recurrent step 235 | output, state = cell(input_, state) 236 | return output, state 237 | 238 | states = [init_state_c] 239 | for i in range(Lb): 240 | if i>0: 241 | tf.get_variable_scope().reuse_variables() 242 | 243 | # get match for current word 244 | ci = attention_pooling(states_a, states_b[i], states[-1], 245 | att_params, d, La) 246 | # combine ci and input(i) 247 | input_ = tf.matmul(tf.concat([states_b[i], ci], axis=-1), Wi) 248 | _, state = step(input_, states[-1]) 249 | 250 | states.append(state) 
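# (note) at this point `states` holds Lb+1 tensors of shape [B, d]:
# the initial state plus one recurrent state per word of sequence b;
# the initial state is dropped via states[1:] before stacking back
# into batch-major form below.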
251 | 252 | # time major -> batch major 253 | states_bm = tf.transpose(tf.stack(states[1:]), [1, 0, 2]) 254 | #outputs_bm = tf.transpose(tf.stack(outputs[1:]), [1, 0, 2]) 255 | 256 | return states_bm 257 | 258 | ''' 259 | Attention Pooling Mechanism 260 | 261 | based on "R-NET: Machine Reading Comprehension with Self-matching Networks" 262 | https://www.microsoft.com/en-us/research/publication/mrc/ 263 | 264 | [usage] 265 | ci = attention(qstates, pstates_i, state, params= { 266 | 'Wa' : Wa, # [d,d] 267 | 'Wb' : Wb, # [d,d] 268 | 'Wc' : Wc, # [d,d] 269 | 'Va' : Va # [d,1] 270 | }) 271 | shape(qstates) : [B, L, d] 272 | shape(pstates_i) : [B, d] 273 | shape(state) : [B, d] 274 | shape(ci) : [B, d] 275 | 276 | ''' 277 | def attention_pooling(states_a, states_b_i, state_c, params, d, timesteps): 278 | Wa, Wb, Wc = params['Wa'], params['Wb'], params['Wc'] 279 | # s_ij -> [B,L,d] 280 | a = tf.tanh(tf.expand_dims(tf.matmul(states_b_i, Wb), axis=1) + 281 | tf.reshape(tf.matmul(tf.reshape(states_a,[-1, d]), Wa), [-1, timesteps, d]) + 282 | tf.expand_dims(tf.matmul(state_c, Wc), axis=1)) 283 | Va = params['Va'] # [d, 1] 284 | # e_ij -> softmax(aV_a) : [B, L] 285 | scores = tf.nn.softmax(tf.reshape(tf.matmul(tf.reshape(a, [-1, d]), Va), [-1, timesteps])) 286 | # c_i -> weighted sum of encoder states 287 | return tf.reduce_sum(states_a*tf.expand_dims(scores, axis=-1), axis=1) # [B, d] 288 | 289 | -------------------------------------------------------------------------------- /notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# R-NET" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import tensorflow as tf\n", 19 | "from tensorflow.contrib.layers import xavier_initializer as xinit" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Placeholders" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "Lp = 100\n", 38 | "Lq = 20\n", 39 | "B = 8\n", 40 | "embed_dim = 150\n", 41 | "hdim = 2*embed_dim\n", 42 | "vocab_size = 10000" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "tf.reset_default_graph()\n", 54 | "p = tf.placeholder(tf.int32, shape=[None, Lp], name='passage')\n", 55 | "q = tf.placeholder(tf.int32, shape=[None, Lq], name='question')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Embedding\n", 63 | "\n", 64 | "**TODO**\n", 65 | "- [ ] Include character embedding for OOV tokens" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "E = tf.get_variable('E', dtype=tf.float32, shape=[vocab_size, embed_dim], initializer=xinit())\n", 77 | "qe = tf.nn.embedding_lookup(E, q)\n", 78 | "pe = tf.nn.embedding_lookup(E, p)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Question Encoder" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "from 
lib.recurrence import *" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "(qstates_f, qstates_b), _ = bi_net(cell_f= gru_n(embed_dim,3), cell_b=gru_n(embed_dim,3),\n", 108 | " inputs=qe, batch_size=B, timesteps=Lq,\n", 109 | " scope='q_enc')\n", 110 | "qstates = tf.concat([qstates_f, qstates_b], axis=-1)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Passage Encoder" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 7, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "(pstates_f, pstates_b), _ = bi_net(cell_f= gru_n(embed_dim,3), cell_b=gru_n(embed_dim,3),\n", 129 | " inputs=pe, batch_size=B, timesteps=Lp,\n", 130 | " scope='a_enc')\n", 131 | "pstates = tf.concat([pstates_f, pstates_b], axis=-1)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### Question-aware Passage Representation" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "with tf.variable_scope('qp'):\n", 150 | " qp_states = gated_attention_net(qstates, pstates,# encoded representation of text\n", 151 | " tf.zeros(dtype=tf.float32, shape=[B,hdim]), # notice d*2\n", 152 | " batch_size=B, d=hdim, La=Lq, Lb=Lp,\n", 153 | " scope='qp') " 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "source": [ 162 | "## Self-matching Representation" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 9, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "with tf.variable_scope('pp'):\n", 172 | " # convert qp_states to time-major\n", 173 | " qp_states = tf.transpose(qp_states, [1,0,-1])\n", 174 | " pp_states = gated_attention_net(pstates, qp_states,# encoded representation of text\n", 175 | " tf.zeros(dtype=tf.float32, shape=[B,hdim]), # notice d*2\n", 176 | " batch_size=B, d=hdim, La=Lp, Lb=Lp,\n", 177 | " scope='pp') " 178 | ] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "Python 3", 184 | "language": "python", 185 | "name": "python3" 186 | }, 187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 3 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython3", 197 | "version": "3.5.2" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 2 202 | } 203 | -------------------------------------------------------------------------------- /preprocessing/dwr.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | from squad_preprocess import * 3 | import os 4 | 5 | if __name__ == '__main__': 6 | glove_base_url = "http://nlp.stanford.edu/data/" 7 | glove_filename = "glove.6B.zip" 8 | prefix = os.path.join("download", "dwr") 9 | 10 | print("Storing datasets in {}".format(prefix)) 11 | 12 | if not os.path.exists(prefix): 13 | os.makedirs(prefix) 14 | 15 | glove_zip = maybe_download(glove_base_url, glove_filename, prefix, 862182613L) 16 | glove_zip_ref = zipfile.ZipFile(os.path.join(prefix, glove_filename), 'r') 17 | 18 | glove_zip_ref.extractall(prefix) 19 | 
glove_zip_ref.close() 20 | -------------------------------------------------------------------------------- /preprocessing/squad_preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import json 4 | import linecache 5 | import nltk 6 | import numpy as np 7 | import os 8 | import sys 9 | from tqdm import tqdm 10 | import random 11 | 12 | from collections import Counter 13 | from six.moves.urllib.request import urlretrieve 14 | 15 | reload(sys) 16 | sys.setdefaultencoding('utf8') 17 | random.seed(42) 18 | np.random.seed(42) 19 | 20 | squad_base_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/" 21 | 22 | # Size train: 30288272 23 | # size dev: 4854279 24 | 25 | def reporthook(t): 26 | """https://github.com/tqdm/tqdm""" 27 | last_b = [0] 28 | 29 | def inner(b=1, bsize=1, tsize=None): 30 | """ 31 | b: int, optional 32 | Number of blocks just transferred [default: 1]. 33 | bsize: int, optional 34 | Size of each block (in tqdm units) [default: 1]. 35 | tsize: int, optional 36 | Total size (in tqdm units). If [default: None] remains unchanged. 37 | """ 38 | if tsize is not None: 39 | t.total = tsize 40 | t.update((b - last_b[0]) * bsize) 41 | last_b[0] = b 42 | return inner 43 | 44 | def maybe_download(url, filename, prefix, num_bytes=None): 45 | """Takes an URL, a filename, and the expected bytes, download 46 | the contents and returns the filename 47 | num_bytes=None disables the file size check.""" 48 | local_filename = None 49 | if not os.path.exists(os.path.join(prefix, filename)): 50 | try: 51 | print("Downloading file {}...".format(url + filename)) 52 | with tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t: 53 | local_filename, _ = urlretrieve(url + filename, os.path.join(prefix,filename), reporthook=reporthook(t)) 54 | except AttributeError as e: 55 | print("An error occurred when downloading the file! Please get the dataset using a browser.") 56 | raise e 57 | # We have a downloaded file 58 | # Check the stats and make sure they are ok 59 | file_stats = os.stat(os.path.join(prefix,filename)) 60 | if num_bytes is None or file_stats.st_size == num_bytes: 61 | print("File {} successfully loaded".format(filename)) 62 | else: 63 | raise Exception("Unexpected dataset size. 
Please get the dataset using a browser.") 64 | 65 | return local_filename 66 | 67 | 68 | def data_from_json(filename): 69 | with open(filename) as data_file: 70 | data = json.load(data_file) 71 | return data 72 | 73 | 74 | def list_topics(data): 75 | list_topics = [data['data'][idx]['title'] for idx in range(0,len(data['data']))] 76 | return list_topics 77 | 78 | 79 | def tokenize(sequence): 80 | tokens = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sequence)] 81 | return map(lambda x:x.encode('utf8'), tokens) 82 | 83 | 84 | def token_idx_map(context, context_tokens): 85 | acc = '' 86 | current_token_idx = 0 87 | token_map = dict() 88 | 89 | for char_idx, char in enumerate(context): 90 | if char != u' ': 91 | acc += char 92 | context_token = unicode(context_tokens[current_token_idx]) 93 | if acc == context_token: 94 | syn_start = char_idx - len(acc) + 1 95 | token_map[syn_start] = [acc, current_token_idx] 96 | acc = '' 97 | current_token_idx += 1 98 | return token_map 99 | 100 | 101 | def invert_map(answer_map): 102 | return {v[1]: [v[0], k] for k, v in answer_map.iteritems()} 103 | 104 | 105 | def read_write_dataset(dataset, tier, prefix): 106 | """Reads the dataset, extracts context, question, answer, 107 | and answer pointer in their own file. Returns the number 108 | of questions and answers processed for the dataset""" 109 | qn, an = 0, 0 110 | skipped = 0 111 | 112 | with open(os.path.join(prefix, tier +'.context'), 'w') as context_file, \ 113 | open(os.path.join(prefix, tier +'.question'), 'w') as question_file,\ 114 | open(os.path.join(prefix, tier +'.answer'), 'w') as text_file, \ 115 | open(os.path.join(prefix, tier +'.span'), 'w') as span_file: 116 | 117 | for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)): 118 | article_paragraphs = dataset['data'][articles_id]['paragraphs'] 119 | for pid in range(len(article_paragraphs)): 120 | context = article_paragraphs[pid]['context'] 121 | # The following replacements are suggested in the paper 122 | # BidAF (Seo et al., 2016) 123 | context = context.replace("''", '" ') 124 | context = context.replace("``", '" ') 125 | 126 | context_tokens = tokenize(context) 127 | answer_map = token_idx_map(context, context_tokens) 128 | 129 | qas = article_paragraphs[pid]['qas'] 130 | for qid in range(len(qas)): 131 | question = qas[qid]['question'] 132 | question_tokens = tokenize(question) 133 | 134 | answers = qas[qid]['answers'] 135 | qn += 1 136 | 137 | num_answers = range(1) 138 | 139 | for ans_id in num_answers: 140 | # it contains answer_start, text 141 | text = qas[qid]['answers'][ans_id]['text'] 142 | a_s = qas[qid]['answers'][ans_id]['answer_start'] 143 | 144 | text_tokens = tokenize(text) 145 | 146 | answer_start = qas[qid]['answers'][ans_id]['answer_start'] 147 | 148 | answer_end = answer_start + len(text) 149 | 150 | last_word_answer = len(text_tokens[-1]) # add one to get the first char 151 | 152 | try: 153 | a_start_idx = answer_map[answer_start][1] 154 | 155 | a_end_idx = answer_map[answer_end - last_word_answer][1] 156 | 157 | # remove length restraint since we deal with it later 158 | context_file.write(' '.join(context_tokens) + '\n') 159 | question_file.write(' '.join(question_tokens) + '\n') 160 | text_file.write(' '.join(text_tokens) + '\n') 161 | span_file.write(' '.join([str(a_start_idx), str(a_end_idx)]) + '\n') 162 | 163 | except Exception as e: 164 | skipped += 1 165 | 166 | an += 1 167 | 168 | print("Skipped {} question/answer pairs in 
{}".format(skipped, tier)) 169 | return qn,an 170 | 171 | 172 | def save_files(prefix, tier, indices): 173 | with open(os.path.join(prefix, tier + '.context'), 'w') as context_file, \ 174 | open(os.path.join(prefix, tier + '.question'), 'w') as question_file,\ 175 | open(os.path.join(prefix, tier + '.answer'), 'w') as text_file, \ 176 | open(os.path.join(prefix, tier + '.span'), 'w') as span_file: 177 | 178 | for i in indices: 179 | context_file.write(linecache.getline(os.path.join(prefix, 'train.context'), i)) 180 | question_file.write(linecache.getline(os.path.join(prefix, 'train.question'), i)) 181 | text_file.write(linecache.getline(os.path.join(prefix, 'train.answer'), i)) 182 | span_file.write(linecache.getline(os.path.join(prefix, 'train.span'), i)) 183 | 184 | 185 | def split_tier(prefix, train_percentage = 0.9, shuffle=False): 186 | # Get number of lines in file 187 | context_filename = os.path.join(prefix, 'train' + '.context') 188 | # Get the number of lines 189 | with open(context_filename) as current_file: 190 | num_lines = sum(1 for line in current_file) 191 | # Get indices and split into two files 192 | indices_dev = range(num_lines)[int(num_lines * train_percentage)::] 193 | if shuffle: 194 | np.random.shuffle(indices_dev) 195 | print("Shuffling...") 196 | save_files(prefix, 'val', indices_dev) 197 | indices_train = range(num_lines)[:int(num_lines * train_percentage)] 198 | if shuffle: 199 | np.random.shuffle(indices_train) 200 | save_files(prefix, 'train', indices_train) 201 | 202 | 203 | if __name__ == '__main__': 204 | 205 | download_prefix = os.path.join("download", "squad") 206 | data_prefix = os.path.join("data", "squad") 207 | 208 | print("Downloading datasets into {}".format(download_prefix)) 209 | print("Preprocessing datasets into {}".format(data_prefix)) 210 | 211 | if not os.path.exists(download_prefix): 212 | os.makedirs(download_prefix) 213 | if not os.path.exists(data_prefix): 214 | os.makedirs(data_prefix) 215 | 216 | train_filename = "train-v1.1.json" 217 | dev_filename = "dev-v1.1.json" 218 | 219 | maybe_download(squad_base_url, train_filename, download_prefix, 30288272L) 220 | 221 | train_data = data_from_json(os.path.join(download_prefix, train_filename)) 222 | 223 | train_num_questions, train_num_answers = read_write_dataset(train_data, 'train', data_prefix) 224 | 225 | # In train we have 87k+ questions, and one answer per question. 226 | # The answer start range is also indicated 227 | 228 | # 1. Split train into train and validation into 95-5 229 | # 2. Shuffle train, validation 230 | print("Splitting the dataset into train and validation") 231 | split_tier(data_prefix, 0.95, shuffle=True) 232 | 233 | print("Processed {} questions and {} answers in train".format(train_num_questions, train_num_answers)) 234 | 235 | print("Downloading {}".format(dev_filename)) 236 | dev_dataset = maybe_download(squad_base_url, dev_filename, download_prefix, 4854279L) 237 | 238 | # In dev, we have 10k+ questions, and around 3 answers per question (totaling 239 | # around 34k+ answers). 
240 | # dev_data = data_from_json(os.path.join(download_prefix, dev_filename)) 241 | # list_topics(dev_data) 242 | # dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', data_prefix) 243 | # print("Processed {} questions and {} answers in dev".format(dev_num_questions, dev_num_answers)) 244 | -------------------------------------------------------------------------------- /qa_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import gzip 6 | import os 7 | import re 8 | import tarfile 9 | import argparse 10 | 11 | from six.moves import urllib 12 | 13 | from tensorflow.python.platform import gfile 14 | from tqdm import * 15 | import numpy as np 16 | from os.path import join as pjoin 17 | 18 | _PAD = b"" 19 | _SOS = b"" 20 | _UNK = b"" 21 | _START_VOCAB = [_PAD, _SOS, _UNK] 22 | 23 | PAD_ID = 0 24 | SOS_ID = 1 25 | UNK_ID = 2 26 | 27 | def setup_args(): 28 | parser = argparse.ArgumentParser() 29 | #home = os.path.join(os.path.dirname(os.path.realpath(__file__))) 30 | home = os.getcwd() 31 | vocab_dir = os.path.join(home, "data", "squad") 32 | glove_dir = os.path.join(home, "download", "dwr") 33 | source_dir = os.path.join(home, "data", "squad") 34 | parser.add_argument("--source_dir", default=source_dir) 35 | parser.add_argument("--glove_dir", default=glove_dir) 36 | parser.add_argument("--vocab_dir", default=vocab_dir) 37 | parser.add_argument("--glove_dim", default=100, type=int) 38 | parser.add_argument("--random_init", default=True, type=bool) 39 | return parser.parse_args() 40 | 41 | 42 | def basic_tokenizer(sentence): 43 | words = [] 44 | for space_separated_fragment in sentence.strip().split(): 45 | words.extend(re.split(" ", space_separated_fragment)) 46 | return [w for w in words if w] 47 | 48 | 49 | def initialize_vocabulary(vocabulary_path): 50 | # map vocab to word embeddings 51 | if gfile.Exists(vocabulary_path): 52 | rev_vocab = [] 53 | with gfile.GFile(vocabulary_path, mode="r") as f: 54 | rev_vocab.extend(f.readlines()) 55 | rev_vocab = [line.strip('\n') for line in rev_vocab] 56 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 57 | return vocab, rev_vocab 58 | else: 59 | raise ValueError("Vocabulary file %s not found.", vocabulary_path) 60 | 61 | 62 | def process_glove(args, vocab_list, save_path, size=4e5, random_init=True): 63 | """ 64 | :param vocab_list: [vocab] 65 | :return: 66 | """ 67 | 68 | if not gfile.Exists(save_path + ".npz"): 69 | glove_path = os.path.join(args.glove_dir, "glove.6B.{}d.txt".format(args.glove_dim)) 70 | if random_init: 71 | glove = np.random.randn(len(vocab_list), args.glove_dim) 72 | else: 73 | glove = np.zeros((len(vocab_list), args.glove_dim)) 74 | found = 0 75 | 76 | vocab_dict = dict(zip(vocab_list, range(len(vocab_list)))) 77 | 78 | 79 | with open(glove_path, 'r') as fh: 80 | for line in tqdm(fh, total=size): 81 | array = line.lstrip().rstrip().split(" ") 82 | word = array[0] 83 | vector = list(map(float, array[1:])) 84 | if word in vocab_dict: 85 | idx = vocab_dict[word] 86 | glove[idx, :] = vector 87 | found += 1 88 | if word.capitalize() in vocab_dict: 89 | idx = vocab_dict[word.capitalize()] 90 | glove[idx, :] = vector 91 | found += 1 92 | if word.upper() in vocab_dict: 93 | idx = vocab_dict[word.upper()] 94 | glove[idx, :] = vector 95 | found += 1 96 | 97 | print("{}/{} of word vocab have corresponding vectors in {}".format(found, 
len(vocab_list), glove_path)) 98 | np.savez_compressed(save_path, glove=glove) 99 | print("saved trimmed glove matrix at: {}".format(save_path)) 100 | 101 | 102 | def create_vocabulary(vocabulary_path, data_paths, tokenizer=None): 103 | if not gfile.Exists(vocabulary_path): 104 | print("Creating vocabulary %s from data %s" % (vocabulary_path, str(data_paths))) 105 | vocab = {} 106 | for path in data_paths: 107 | with open(path, mode="rb") as f: 108 | counter = 0 109 | for line in f: 110 | counter += 1 111 | if counter % 100000 == 0: 112 | print("processing line %d" % counter) 113 | tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) 114 | for w in tokens: 115 | if w in vocab: 116 | vocab[w] += 1 117 | else: 118 | vocab[w] = 1 119 | vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) 120 | print("Vocabulary size: %d" % len(vocab_list)) 121 | with gfile.GFile(vocabulary_path, mode="wb") as vocab_file: 122 | for w in vocab_list: 123 | vocab_file.write(w + b"\n") 124 | 125 | 126 | def sentence_to_token_ids(sentence, vocabulary, tokenizer=None): 127 | if tokenizer: 128 | words = tokenizer(sentence) 129 | else: 130 | words = basic_tokenizer(sentence) 131 | return [vocabulary.get(w, UNK_ID) for w in words] 132 | 133 | 134 | def data_to_token_ids(data_path, target_path, vocabulary_path, 135 | tokenizer=None): 136 | if not gfile.Exists(target_path): 137 | print("Tokenizing data in %s" % data_path) 138 | vocab, _ = initialize_vocabulary(vocabulary_path) 139 | with gfile.GFile(data_path, mode="rb") as data_file: 140 | with gfile.GFile(target_path, mode="w") as tokens_file: 141 | counter = 0 142 | for line in data_file: 143 | counter += 1 144 | if counter % 5000 == 0: 145 | print("tokenizing line %d" % counter) 146 | token_ids = sentence_to_token_ids(line, vocab, tokenizer) 147 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n") 148 | 149 | 150 | if __name__ == '__main__': 151 | args = setup_args() 152 | vocab_path = pjoin(args.vocab_dir, "vocab.dat") 153 | 154 | train_path = pjoin(args.source_dir, "train") 155 | valid_path = pjoin(args.source_dir, "val") 156 | dev_path = pjoin(args.source_dir, "dev") 157 | 158 | create_vocabulary(vocab_path, 159 | [pjoin(args.source_dir, "train.context"), 160 | pjoin(args.source_dir, "train.question"), 161 | pjoin(args.source_dir, "val.context"), 162 | pjoin(args.source_dir, "val.question"), 163 | #pjoin(args.source_dir, "dev-v1.1.json") 164 | ]) 165 | vocab, rev_vocab = initialize_vocabulary(pjoin(args.vocab_dir, "vocab.dat")) 166 | 167 | # ======== Trim Distributed Word Representation ======= 168 | # If you use other word representations, you should change the code below 169 | 170 | process_glove(args, rev_vocab, args.source_dir + "/glove.trimmed.{}".format(args.glove_dim), 171 | random_init=args.random_init) 172 | 173 | # ======== Creating Dataset ========= 174 | # We created our data files seperately 175 | # If your model loads data differently (like in bulk) 176 | # You should change the below code 177 | 178 | x_train_dis_path = train_path + ".ids.context" 179 | y_train_ids_path = train_path + ".ids.question" 180 | data_to_token_ids(train_path + ".context", x_train_dis_path, vocab_path) 181 | data_to_token_ids(train_path + ".question", y_train_ids_path, vocab_path) 182 | 183 | x_dis_path = valid_path + ".ids.context" 184 | y_ids_path = valid_path + ".ids.question" 185 | data_to_token_ids(valid_path + ".context", x_dis_path, vocab_path) 186 | data_to_token_ids(valid_path + ".question", y_ids_path, vocab_path) 187 
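# Usage note: with the default arguments, this script leaves vocab.dat,
# glove.trimmed.100.npz and the *.ids.context / *.ids.question files in
# data/squad. The trimmed matrix can be restored with
# np.load("data/squad/glove.trimmed.100.npz")["glove"], which yields a
# [len(vocab), glove_dim] array whose rows are aligned with the ids in vocab.dat.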
| --------------------------------------------------------------------------------
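The pipeline above stops once qa_data.py has written the vocabulary, the trimmed GloVe matrix and the id files into data/squad, and the repository does not show how those files reach the graph built in notebook.ipynb. The sketch below is one minimal way to bridge that gap, assuming the default data/squad paths from qa_data.py and the Lp, Lq and B constants from the notebook; the load_ids helper and its clip-then-pad policy are illustrative assumptions, not code from the repository.

```python
import numpy as np

PAD_ID = 0                 # pad id defined in qa_data.py
Lp, Lq, B = 100, 20, 8     # passage length, question length, batch size from notebook.ipynb

def load_ids(path, max_len):
    """Read one *.ids.* file written by qa_data.py; clip and pad each line to max_len."""
    rows = []
    with open(path) as f:
        for line in f:
            ids = [int(tok) for tok in line.split()][:max_len]
            rows.append(ids + [PAD_ID] * (max_len - len(ids)))
    return np.array(rows, dtype=np.int32)

passages = load_ids("data/squad/train.ids.context", Lp)    # [N, Lp]
questions = load_ids("data/squad/train.ids.question", Lq)  # [N, Lq]

# first mini-batch for the notebook placeholders p [B, Lp] and q [B, Lq]
batch_p, batch_q = passages[:B], questions[:B]
```

The resulting batch_p and batch_q arrays match the [B, Lp] and [B, Lq] shapes of the p and q placeholders in notebook.ipynb, so they can be passed directly through feed_dict when running the graph.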
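The README also lists a dilated CNN (Yu and Koltun, 2016) among the implemented models, but no convolutional encoder appears in the code. Purely as an illustration of what such an encoder block could look like in the same TF 1.x style as lib/recurrence.py, here is a sketch; the kernel size, filter count and depth are assumptions rather than values taken from the repository.

```python
import tensorflow as tf

def dilated_conv_encoder(inputs, num_filters=300, num_layers=4, scope='dilated_enc'):
    """Stack of 1-D convolutions with exponentially growing dilation.

    inputs: [batch, timesteps, embed_dim]; returns [batch, timesteps, num_filters].
    """
    h = inputs
    with tf.variable_scope(scope):
        for i in range(num_layers):
            # dilation doubles at every layer: 1, 2, 4, 8, ...
            h = tf.layers.conv1d(h,
                                 filters=num_filters,
                                 kernel_size=3,
                                 padding='same',
                                 dilation_rate=2 ** i,
                                 activation=tf.nn.relu,
                                 name='dilated_conv_%d' % i)
    return h
```

Because the dilation rate doubles at every layer, the receptive field grows exponentially with depth while the sequence length is preserved, which is the context-aggregation property the cited paper relies on.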