├── README.md
├── get_started.sh
├── lib
│   └── recurrence.py
├── notebook.ipynb
├── preprocessing
│   ├── dwr.py
│   └── squad_preprocess.py
└── qa_data.py
/README.md:
--------------------------------------------------------------------------------
1 | # Implementation of models for the Conversational Intelligence Challenge by ai-guild
2 | 
3 | This is an implementation of additional models, namely R-NET and a dilated CNN, for The Conversation Intelligence Challenge, beyond those implemented in the [initial repository](https://github.com/ai-guild/convai/tree/master) of ai-guild's ConvAI challenge.
4 | 
5 | ## Reference Papers
6 | 
7 | - [R-NET: Machine Reading Comprehension with Self-matching Networks](https://www.microsoft.com/en-us/research/publication/mrc/#)
8 | - [Multi-Scale Context Aggregation by Dilated Convolutions](https://arxiv.org/abs/1511.07122)
9 | 
10 | ## Datasets
11 | 
12 | - [SQuAD Explorer](https://rajpurkar.github.io/SQuAD-explorer/)
13 | - [MS MARCO](http://www.msmarco.org/)
14 | - [SNLI](https://nlp.stanford.edu/projects/snli/)
15 | - [CBT](https://research.fb.com/projects/babi/)
16 | 
17 | 
--------------------------------------------------------------------------------
/get_started.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Downloads raw data into ./download
3 | # and saves preprocessed data into ./data
4 | # Get the directory containing this script
5 | 
6 | CODE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
7 | echo $CODE_DIR
8 | 
9 | export PYTHONPATH=$PYTHONPATH:$CODE_DIR
10 | 
11 | pip install -r $CODE_DIR/requirements.txt --user
12 | 
13 | # download punkt, perluniprops
14 | if [ ! -d "/usr/local/share/nltk_data/tokenizers/punkt" ]; then
15 |     python2 -m nltk.downloader punkt
16 | fi
17 | 
18 | 
19 | # SQuAD preprocess is in charge of downloading
20 | # and formatting the data to be consumed later
21 | DATA_DIR=data
22 | DOWNLOAD_DIR=download
23 | rm -rf $DATA_DIR
24 | mkdir -p $DATA_DIR
25 | python2 $CODE_DIR/preprocessing/squad_preprocess.py
26 | 
27 | # Download distributed word representations (GloVe)
28 | python2 $CODE_DIR/preprocessing/dwr.py
29 | 
30 | # Data processing for TensorFlow
31 | python2 $CODE_DIR/qa_data.py --glove_dim 100
--------------------------------------------------------------------------------
/lib/recurrence.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.contrib import rnn
3 | 
4 | # Gated recurrent unit cell
5 | def gru(num_units):
6 |     gru_cell = rnn.GRUCell(num_units)
7 |     return gru_cell
8 | 
9 | # Create a stack of gated recurrent unit cells, num_layers deep
10 | def gru_n(num_units, num_layers):
11 |     stacked_gru_cells = rnn.MultiRNNCell(
12 |         [gru(num_units) for _ in range(num_layers)])
13 |     return stacked_gru_cells
14 | 
15 | 
16 | def get_variables(n, shape, name='W'):
17 |     return (tf.get_variable(name+str(i), dtype=tf.float32, shape=shape)
18 |             for i in range(n))
19 | 
20 | 
21 | '''
22 | Uni-directional RNN
23 | 
24 | [usage]
25 | cell_ = gru_n(hdim, 3)
26 | outputs, states = uni_net(cell = cell_,
27 |                           inputs= inputs_emb,
28 |                           init_state= cell_.zero_state(batch_size, tf.float32),
29 |                           timesteps = L)
30 | '''
31 | def uni_net(cell, inputs, init_state, timesteps, time_major=False, scope='uni_net_0'):
32 |     # convert inputs to time-major format (leave untouched if already time-major)
33 |     inputs_tm = inputs if time_major else \
34 |         tf.transpose(inputs, [1, 0, 2], name="input_time_major")
35 |     # collection of states and outputs
36 |     states, outputs = [init_state], []
37 | 
38 |     with tf.variable_scope(scope):
39 | 
40 |         for i in
range(timesteps): 41 | if i > 0: 42 | tf.get_variable_scope().reuse_variables() 43 | output, state = cell(inputs_tm[i], states[-1]) 44 | outputs.append(output) 45 | states.append(state) 46 | 47 | return tf.stack(outputs), tf.stack(states[1:]) 48 | 49 | 50 | ''' 51 | Bi-directional RNN 52 | 53 | [usage] 54 | (states_f, states_b), outputs = bi_net(cell_f= gru_n(hdim,3), 55 | cell_b= gru_n(hdim,3), 56 | inputs= inputs_emb, 57 | batch_size= batch_size, 58 | timesteps=L, 59 | scope='bi_net_0', 60 | project_outputs=True) 61 | ''' 62 | def bi_net(cell_f, cell_b, inputs, batch_size, timesteps, 63 | scope= 'bi_net_0', 64 | project_outputs=False): 65 | 66 | # forward 67 | _, states_f = uni_net(cell_f, 68 | inputs, 69 | cell_f.zero_state(batch_size, tf.float32), 70 | timesteps, 71 | scope=scope + '_f') 72 | # backward 73 | _, states_b = uni_net(cell_b, 74 | tf.reverse(inputs, axis=[1]), 75 | cell_b.zero_state(batch_size, tf.float32), 76 | timesteps, 77 | scope=scope + '_b') 78 | 79 | # infer num_layers and hidden_dim from cell 80 | num_layers = len(cell_f.state_size) 81 | hidden_dim = cell_f.state_size[0] 82 | 83 | outputs = None 84 | # outputs 85 | if project_outputs: 86 | #chain both forward and backword states together 87 | states = tf.concat([states_f, states_b], axis=-1) 88 | 89 | if len(states.shape) == 4 and num_layers: 90 | states = tf.reshape(tf.transpose(states, [-2, 0, 1, -1]), [-1, hidden_dim*2*num_layers]) 91 | Wo = tf.get_variable(scope+'/Wo', dtype=tf.float32, shape=[num_layers*2*hidden_dim, hidden_dim]) 92 | elif len(states.shape) == 3: 93 | states = tf.reshape(tf.transpose(states, [-2, 0, -1]), [-1, hidden_dim*2]) 94 | Wo = tf.get_variable(scope+'/Wo', dtype=tf.float32, shape=[2*hidden_dim, hidden_dim]) 95 | else: 96 | print('>> ERR : Unable to handle state reshape') 97 | return None 98 | 99 | outputs = tf.reshape(tf.matmul(states, Wo), [batch_size, timesteps, hidden_dim]) 100 | 101 | return (states_f, states_b), outputs 102 | 103 | 104 | ''' 105 | Attention Mechanism 106 | 107 | based on "Neural Machine Translation by Jointly Learning to Align and Translate" 108 | https://arxiv.org/abs/1409.0473 109 | 110 | [usage] 111 | ci = attention(enc_states, dec_state, params= { 112 | 'Wa' : Wa, # [d,d] 113 | 'Ua' : Ua, # [d,d] 114 | 'Va' : Va # [d,1] 115 | }) 116 | shape(enc_states) : [B, L, d] 117 | shape(dec_state) : [B, d] 118 | shape(ci) : [B,d] 119 | 120 | ''' 121 | def attention(enc_states, dec_state, params, d, timesteps): 122 | Wa, Ua = params['Wa'], params['Ua'] 123 | # s_ij -> [B,L,d] 124 | a = tf.tanh(tf.expand_dims(tf.matmul(dec_state, Wa), axis=1) + 125 | tf.reshape(tf.matmul(tf.reshape(enc_states,[-1, d]), Ua), [-1, timesteps, d])) 126 | Va = params['Va'] # [d, 1] 127 | # e_ij -> softmax(aV_a) : [B, L] 128 | scores = tf.nn.softmax(tf.reshape(tf.matmul(tf.reshape(a, [-1, d]), Va), [-1, timesteps])) 129 | # c_i -> weighted sum of encoder states 130 | return tf.reduce_sum(enc_states*tf.expand_dims(scores, axis=-1), axis=1) # [B, d] 131 | 132 | 133 | ''' 134 | Attentive Decoder 135 | 136 | [usage] 137 | dec_outputs, dec_states = attentive_decoder(enc_states, 138 | tf.zeros(dtype=tf.float32, shape=[B,d]), 139 | batch_size=B,timesteps=L,feed_previous=True, 140 | inputs = inputs) 141 | shape(enc_states) : [B, L, d] 142 | shape(inputs) : [[B, d]] if feed_previous else [L, B, d] 143 | 144 | 145 | ''' 146 | def attentive_decoder(enc_states, init_state, batch_size, 147 | d, timesteps, 148 | inputs = [], 149 | scope='attentive_decoder_0', 150 | num_layers=1, 151 | feed_previous=False): 152 
| # get parameters 153 | U,W,C,Ur,Wr,Cr,Uz,Wz,Cz,Uo,Vo,Co = get_variables(12, [d,d], name='decoder_param') 154 | Wa, Ua = get_variables(2, [d,d], 'att') 155 | Va = tf.get_variable('Va', shape=[d, 1], dtype=tf.float32) 156 | att_params = { 157 | 'Wa' : Wa, 'Ua' : Ua, 'Va' : Va 158 | } 159 | 160 | 161 | def step(input_, state, ci): 162 | z = tf.nn.sigmoid(tf.matmul(input_, Wz)+tf.matmul(state, Uz)+tf.matmul(ci, Cz)) 163 | r = tf.nn.sigmoid(tf.matmul(input_, Wr)+tf.matmul(state, Ur)+tf.matmul(ci, Cr)) 164 | si = tf.nn.tanh(tf.matmul(input_, W)+tf.matmul(ci, C)+tf.matmul(r*state, U)) 165 | 166 | state = (1-z)*state + z*si 167 | output = tf.matmul(state, Uo) + tf.matmul(input_, Vo) + tf.matmul(ci, Co) 168 | 169 | return output, state 170 | 171 | outputs = [inputs[0]] # include GO token as init input 172 | states = [init_state] 173 | for i in range(timesteps): 174 | input_ = outputs[-1] if feed_previous else inputs[i] 175 | output, state = step(input_, states[-1], 176 | attention(enc_states, states[-1], att_params, d, timesteps)) 177 | 178 | outputs.append(output) 179 | states.append(state) 180 | # time major -> batch major 181 | states_bm = tf.transpose(tf.stack(states[1:]), [1, 0, 2]) 182 | outputs_bm = tf.transpose(tf.stack(outputs[1:]), [1, 0, 2]) 183 | return outputs_bm, states_bm 184 | 185 | 186 | ''' 187 | Gated Attention Network 188 | 189 | based on "R-NET: Machine Reading Comprehension with Self-matching Networks" 190 | https://www.microsoft.com/en-us/research/publication/mrc/ 191 | 192 | [usage] 193 | dec_outputs, dec_states = gated_attention_net(enc_states, # encoded representation of text 194 | tf.zeros(dtype=tf.float32, shape=[B,d*2]), # notice d*2 195 | batch_size=B,timesteps=L,feed_previous=False, 196 | inputs = inputs) 197 | shape(enc_states) : [B, L, d] 198 | shape(inputs) : [[B, d]] if feed_previous else [L, B, d] 199 | 200 | For reading comprehension, inputs is same as enc_states; feed_previous doesn't apply 201 | 202 | 203 | ''' 204 | def gated_attention_net(states_a, states_b, init_state_c, 205 | batch_size, d, La, Lb, 206 | scope='gated_attention_net_0'): 207 | 208 | with tf.variable_scope(scope, reuse=False): 209 | # define attention parameters 210 | Wa = tf.get_variable('Wa', shape=[d, d], dtype=tf.float32) 211 | Wb = tf.get_variable('Wb', shape=[d, d], dtype=tf.float32) 212 | Wc = tf.get_variable('Wc', shape=[d, d], dtype=tf.float32) 213 | Va = tf.get_variable('Va', shape=[d, 1], dtype=tf.float32) 214 | 215 | att_params = { 216 | 'Wa' : Wa, 'Wb' : Wb, 'Wc' : Wc, 'Va' : Va 217 | } 218 | 219 | # input gate/projection 220 | Wg = tf.get_variable('Wg', shape=[d, d], dtype=tf.float32) 221 | Wi = tf.get_variable('Wi', shape=[d*2, d], dtype=tf.float32) 222 | 223 | # define rnn cell 224 | cell = gru(num_units=d) 225 | 226 | # convert states_a to batch_major 227 | states_a = tf.transpose(states_a, [1,0,2]) 228 | 229 | def step(input_, state): 230 | # define input gate 231 | gi = tf.nn.sigmoid(tf.matmul(input_, Wg)) 232 | # apply gate to input 233 | input_ = gi * input_ 234 | # recurrent step 235 | output, state = cell(input_, state) 236 | return output, state 237 | 238 | states = [init_state_c] 239 | for i in range(Lb): 240 | if i>0: 241 | tf.get_variable_scope().reuse_variables() 242 | 243 | # get match for current word 244 | ci = attention_pooling(states_a, states_b[i], states[-1], 245 | att_params, d, La) 246 | # combine ci and input(i) 247 | input_ = tf.matmul(tf.concat([states_b[i], ci], axis=-1), Wi) 248 | _, state = step(input_, states[-1]) 249 | 250 | states.append(state) 
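# (note) at this point `states` holds Lb+1 tensors of shape [B, d]:
# the initial state plus one recurrent state per word of sequence b;
# the initial state is dropped via states[1:] before stacking back
# into batch-major form below.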
251 | 252 | # time major -> batch major 253 | states_bm = tf.transpose(tf.stack(states[1:]), [1, 0, 2]) 254 | #outputs_bm = tf.transpose(tf.stack(outputs[1:]), [1, 0, 2]) 255 | 256 | return states_bm 257 | 258 | ''' 259 | Attention Pooling Mechanism 260 | 261 | based on "R-NET: Machine Reading Comprehension with Self-matching Networks" 262 | https://www.microsoft.com/en-us/research/publication/mrc/ 263 | 264 | [usage] 265 | ci = attention(qstates, pstates_i, state, params= { 266 | 'Wa' : Wa, # [d,d] 267 | 'Wb' : Wb, # [d,d] 268 | 'Wc' : Wc, # [d,d] 269 | 'Va' : Va # [d,1] 270 | }) 271 | shape(qstates) : [B, L, d] 272 | shape(pstates_i) : [B, d] 273 | shape(state) : [B, d] 274 | shape(ci) : [B, d] 275 | 276 | ''' 277 | def attention_pooling(states_a, states_b_i, state_c, params, d, timesteps): 278 | Wa, Wb, Wc = params['Wa'], params['Wb'], params['Wc'] 279 | # s_ij -> [B,L,d] 280 | a = tf.tanh(tf.expand_dims(tf.matmul(states_b_i, Wb), axis=1) + 281 | tf.reshape(tf.matmul(tf.reshape(states_a,[-1, d]), Wa), [-1, timesteps, d]) + 282 | tf.expand_dims(tf.matmul(state_c, Wc), axis=1)) 283 | Va = params['Va'] # [d, 1] 284 | # e_ij -> softmax(aV_a) : [B, L] 285 | scores = tf.nn.softmax(tf.reshape(tf.matmul(tf.reshape(a, [-1, d]), Va), [-1, timesteps])) 286 | # c_i -> weighted sum of encoder states 287 | return tf.reduce_sum(states_a*tf.expand_dims(scores, axis=-1), axis=1) # [B, d] 288 | 289 | -------------------------------------------------------------------------------- /notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# R-NET" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import tensorflow as tf\n", 19 | "from tensorflow.contrib.layers import xavier_initializer as xinit" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Placeholders" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "Lp = 100\n", 38 | "Lq = 20\n", 39 | "B = 8\n", 40 | "embed_dim = 150\n", 41 | "hdim = 2*embed_dim\n", 42 | "vocab_size = 10000" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "tf.reset_default_graph()\n", 54 | "p = tf.placeholder(tf.int32, shape=[None, Lp], name='passage')\n", 55 | "q = tf.placeholder(tf.int32, shape=[None, Lq], name='question')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Embedding\n", 63 | "\n", 64 | "**TODO**\n", 65 | "- [ ] Include character embedding for OOV tokens" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "E = tf.get_variable('E', dtype=tf.float32, shape=[vocab_size, embed_dim], initializer=xinit())\n", 77 | "qe = tf.nn.embedding_lookup(E, q)\n", 78 | "pe = tf.nn.embedding_lookup(E, p)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Question Encoder" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "from 
lib.recurrence import *" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "(qstates_f, qstates_b), _ = bi_net(cell_f= gru_n(embed_dim,3), cell_b=gru_n(embed_dim,3),\n", 108 | " inputs=qe, batch_size=B, timesteps=Lq,\n", 109 | " scope='q_enc')\n", 110 | "qstates = tf.concat([qstates_f, qstates_b], axis=-1)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Passage Encoder" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 7, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "(pstates_f, pstates_b), _ = bi_net(cell_f= gru_n(embed_dim,3), cell_b=gru_n(embed_dim,3),\n", 129 | " inputs=pe, batch_size=B, timesteps=Lp,\n", 130 | " scope='a_enc')\n", 131 | "pstates = tf.concat([pstates_f, pstates_b], axis=-1)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### Question-aware Passage Representation" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "with tf.variable_scope('qp'):\n", 150 | " qp_states = gated_attention_net(qstates, pstates,# encoded representation of text\n", 151 | " tf.zeros(dtype=tf.float32, shape=[B,hdim]), # notice d*2\n", 152 | " batch_size=B, d=hdim, La=Lq, Lb=Lp,\n", 153 | " scope='qp') " 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "source": [ 162 | "## Self-matching Representation" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 9, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "with tf.variable_scope('pp'):\n", 172 | " # convert qp_states to time-major\n", 173 | " qp_states = tf.transpose(qp_states, [1,0,-1])\n", 174 | " pp_states = gated_attention_net(pstates, qp_states,# encoded representation of text\n", 175 | " tf.zeros(dtype=tf.float32, shape=[B,hdim]), # notice d*2\n", 176 | " batch_size=B, d=hdim, La=Lp, Lb=Lp,\n", 177 | " scope='pp') " 178 | ] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "Python 3", 184 | "language": "python", 185 | "name": "python3" 186 | }, 187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 3 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython3", 197 | "version": "3.5.2" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 2 202 | } 203 | -------------------------------------------------------------------------------- /preprocessing/dwr.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | from squad_preprocess import * 3 | import os 4 | 5 | if __name__ == '__main__': 6 | glove_base_url = "http://nlp.stanford.edu/data/" 7 | glove_filename = "glove.6B.zip" 8 | prefix = os.path.join("download", "dwr") 9 | 10 | print("Storing datasets in {}".format(prefix)) 11 | 12 | if not os.path.exists(prefix): 13 | os.makedirs(prefix) 14 | 15 | glove_zip = maybe_download(glove_base_url, glove_filename, prefix, 862182613L) 16 | glove_zip_ref = zipfile.ZipFile(os.path.join(prefix, glove_filename), 'r') 17 | 18 | glove_zip_ref.extractall(prefix) 19 | 
glove_zip_ref.close() 20 | -------------------------------------------------------------------------------- /preprocessing/squad_preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import json 4 | import linecache 5 | import nltk 6 | import numpy as np 7 | import os 8 | import sys 9 | from tqdm import tqdm 10 | import random 11 | 12 | from collections import Counter 13 | from six.moves.urllib.request import urlretrieve 14 | 15 | reload(sys) 16 | sys.setdefaultencoding('utf8') 17 | random.seed(42) 18 | np.random.seed(42) 19 | 20 | squad_base_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/" 21 | 22 | # Size train: 30288272 23 | # size dev: 4854279 24 | 25 | def reporthook(t): 26 | """https://github.com/tqdm/tqdm""" 27 | last_b = [0] 28 | 29 | def inner(b=1, bsize=1, tsize=None): 30 | """ 31 | b: int, optional 32 | Number of blocks just transferred [default: 1]. 33 | bsize: int, optional 34 | Size of each block (in tqdm units) [default: 1]. 35 | tsize: int, optional 36 | Total size (in tqdm units). If [default: None] remains unchanged. 37 | """ 38 | if tsize is not None: 39 | t.total = tsize 40 | t.update((b - last_b[0]) * bsize) 41 | last_b[0] = b 42 | return inner 43 | 44 | def maybe_download(url, filename, prefix, num_bytes=None): 45 | """Takes an URL, a filename, and the expected bytes, download 46 | the contents and returns the filename 47 | num_bytes=None disables the file size check.""" 48 | local_filename = None 49 | if not os.path.exists(os.path.join(prefix, filename)): 50 | try: 51 | print("Downloading file {}...".format(url + filename)) 52 | with tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t: 53 | local_filename, _ = urlretrieve(url + filename, os.path.join(prefix,filename), reporthook=reporthook(t)) 54 | except AttributeError as e: 55 | print("An error occurred when downloading the file! Please get the dataset using a browser.") 56 | raise e 57 | # We have a downloaded file 58 | # Check the stats and make sure they are ok 59 | file_stats = os.stat(os.path.join(prefix,filename)) 60 | if num_bytes is None or file_stats.st_size == num_bytes: 61 | print("File {} successfully loaded".format(filename)) 62 | else: 63 | raise Exception("Unexpected dataset size. 
Please get the dataset using a browser.") 64 | 65 | return local_filename 66 | 67 | 68 | def data_from_json(filename): 69 | with open(filename) as data_file: 70 | data = json.load(data_file) 71 | return data 72 | 73 | 74 | def list_topics(data): 75 | list_topics = [data['data'][idx]['title'] for idx in range(0,len(data['data']))] 76 | return list_topics 77 | 78 | 79 | def tokenize(sequence): 80 | tokens = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sequence)] 81 | return map(lambda x:x.encode('utf8'), tokens) 82 | 83 | 84 | def token_idx_map(context, context_tokens): 85 | acc = '' 86 | current_token_idx = 0 87 | token_map = dict() 88 | 89 | for char_idx, char in enumerate(context): 90 | if char != u' ': 91 | acc += char 92 | context_token = unicode(context_tokens[current_token_idx]) 93 | if acc == context_token: 94 | syn_start = char_idx - len(acc) + 1 95 | token_map[syn_start] = [acc, current_token_idx] 96 | acc = '' 97 | current_token_idx += 1 98 | return token_map 99 | 100 | 101 | def invert_map(answer_map): 102 | return {v[1]: [v[0], k] for k, v in answer_map.iteritems()} 103 | 104 | 105 | def read_write_dataset(dataset, tier, prefix): 106 | """Reads the dataset, extracts context, question, answer, 107 | and answer pointer in their own file. Returns the number 108 | of questions and answers processed for the dataset""" 109 | qn, an = 0, 0 110 | skipped = 0 111 | 112 | with open(os.path.join(prefix, tier +'.context'), 'w') as context_file, \ 113 | open(os.path.join(prefix, tier +'.question'), 'w') as question_file,\ 114 | open(os.path.join(prefix, tier +'.answer'), 'w') as text_file, \ 115 | open(os.path.join(prefix, tier +'.span'), 'w') as span_file: 116 | 117 | for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)): 118 | article_paragraphs = dataset['data'][articles_id]['paragraphs'] 119 | for pid in range(len(article_paragraphs)): 120 | context = article_paragraphs[pid]['context'] 121 | # The following replacements are suggested in the paper 122 | # BidAF (Seo et al., 2016) 123 | context = context.replace("''", '" ') 124 | context = context.replace("``", '" ') 125 | 126 | context_tokens = tokenize(context) 127 | answer_map = token_idx_map(context, context_tokens) 128 | 129 | qas = article_paragraphs[pid]['qas'] 130 | for qid in range(len(qas)): 131 | question = qas[qid]['question'] 132 | question_tokens = tokenize(question) 133 | 134 | answers = qas[qid]['answers'] 135 | qn += 1 136 | 137 | num_answers = range(1) 138 | 139 | for ans_id in num_answers: 140 | # it contains answer_start, text 141 | text = qas[qid]['answers'][ans_id]['text'] 142 | a_s = qas[qid]['answers'][ans_id]['answer_start'] 143 | 144 | text_tokens = tokenize(text) 145 | 146 | answer_start = qas[qid]['answers'][ans_id]['answer_start'] 147 | 148 | answer_end = answer_start + len(text) 149 | 150 | last_word_answer = len(text_tokens[-1]) # add one to get the first char 151 | 152 | try: 153 | a_start_idx = answer_map[answer_start][1] 154 | 155 | a_end_idx = answer_map[answer_end - last_word_answer][1] 156 | 157 | # remove length restraint since we deal with it later 158 | context_file.write(' '.join(context_tokens) + '\n') 159 | question_file.write(' '.join(question_tokens) + '\n') 160 | text_file.write(' '.join(text_tokens) + '\n') 161 | span_file.write(' '.join([str(a_start_idx), str(a_end_idx)]) + '\n') 162 | 163 | except Exception as e: 164 | skipped += 1 165 | 166 | an += 1 167 | 168 | print("Skipped {} question/answer pairs in 
{}".format(skipped, tier)) 169 | return qn,an 170 | 171 | 172 | def save_files(prefix, tier, indices): 173 | with open(os.path.join(prefix, tier + '.context'), 'w') as context_file, \ 174 | open(os.path.join(prefix, tier + '.question'), 'w') as question_file,\ 175 | open(os.path.join(prefix, tier + '.answer'), 'w') as text_file, \ 176 | open(os.path.join(prefix, tier + '.span'), 'w') as span_file: 177 | 178 | for i in indices: 179 | context_file.write(linecache.getline(os.path.join(prefix, 'train.context'), i)) 180 | question_file.write(linecache.getline(os.path.join(prefix, 'train.question'), i)) 181 | text_file.write(linecache.getline(os.path.join(prefix, 'train.answer'), i)) 182 | span_file.write(linecache.getline(os.path.join(prefix, 'train.span'), i)) 183 | 184 | 185 | def split_tier(prefix, train_percentage = 0.9, shuffle=False): 186 | # Get number of lines in file 187 | context_filename = os.path.join(prefix, 'train' + '.context') 188 | # Get the number of lines 189 | with open(context_filename) as current_file: 190 | num_lines = sum(1 for line in current_file) 191 | # Get indices and split into two files 192 | indices_dev = range(num_lines)[int(num_lines * train_percentage)::] 193 | if shuffle: 194 | np.random.shuffle(indices_dev) 195 | print("Shuffling...") 196 | save_files(prefix, 'val', indices_dev) 197 | indices_train = range(num_lines)[:int(num_lines * train_percentage)] 198 | if shuffle: 199 | np.random.shuffle(indices_train) 200 | save_files(prefix, 'train', indices_train) 201 | 202 | 203 | if __name__ == '__main__': 204 | 205 | download_prefix = os.path.join("download", "squad") 206 | data_prefix = os.path.join("data", "squad") 207 | 208 | print("Downloading datasets into {}".format(download_prefix)) 209 | print("Preprocessing datasets into {}".format(data_prefix)) 210 | 211 | if not os.path.exists(download_prefix): 212 | os.makedirs(download_prefix) 213 | if not os.path.exists(data_prefix): 214 | os.makedirs(data_prefix) 215 | 216 | train_filename = "train-v1.1.json" 217 | dev_filename = "dev-v1.1.json" 218 | 219 | maybe_download(squad_base_url, train_filename, download_prefix, 30288272L) 220 | 221 | train_data = data_from_json(os.path.join(download_prefix, train_filename)) 222 | 223 | train_num_questions, train_num_answers = read_write_dataset(train_data, 'train', data_prefix) 224 | 225 | # In train we have 87k+ questions, and one answer per question. 226 | # The answer start range is also indicated 227 | 228 | # 1. Split train into train and validation into 95-5 229 | # 2. Shuffle train, validation 230 | print("Splitting the dataset into train and validation") 231 | split_tier(data_prefix, 0.95, shuffle=True) 232 | 233 | print("Processed {} questions and {} answers in train".format(train_num_questions, train_num_answers)) 234 | 235 | print("Downloading {}".format(dev_filename)) 236 | dev_dataset = maybe_download(squad_base_url, dev_filename, download_prefix, 4854279L) 237 | 238 | # In dev, we have 10k+ questions, and around 3 answers per question (totaling 239 | # around 34k+ answers). 
240 | # dev_data = data_from_json(os.path.join(download_prefix, dev_filename)) 241 | # list_topics(dev_data) 242 | # dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', data_prefix) 243 | # print("Processed {} questions and {} answers in dev".format(dev_num_questions, dev_num_answers)) 244 | -------------------------------------------------------------------------------- /qa_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import gzip 6 | import os 7 | import re 8 | import tarfile 9 | import argparse 10 | 11 | from six.moves import urllib 12 | 13 | from tensorflow.python.platform import gfile 14 | from tqdm import * 15 | import numpy as np 16 | from os.path import join as pjoin 17 | 18 | _PAD = b"" 19 | _SOS = b"" 20 | _UNK = b"" 21 | _START_VOCAB = [_PAD, _SOS, _UNK] 22 | 23 | PAD_ID = 0 24 | SOS_ID = 1 25 | UNK_ID = 2 26 | 27 | def setup_args(): 28 | parser = argparse.ArgumentParser() 29 | #home = os.path.join(os.path.dirname(os.path.realpath(__file__))) 30 | home = os.getcwd() 31 | vocab_dir = os.path.join(home, "data", "squad") 32 | glove_dir = os.path.join(home, "download", "dwr") 33 | source_dir = os.path.join(home, "data", "squad") 34 | parser.add_argument("--source_dir", default=source_dir) 35 | parser.add_argument("--glove_dir", default=glove_dir) 36 | parser.add_argument("--vocab_dir", default=vocab_dir) 37 | parser.add_argument("--glove_dim", default=100, type=int) 38 | parser.add_argument("--random_init", default=True, type=bool) 39 | return parser.parse_args() 40 | 41 | 42 | def basic_tokenizer(sentence): 43 | words = [] 44 | for space_separated_fragment in sentence.strip().split(): 45 | words.extend(re.split(" ", space_separated_fragment)) 46 | return [w for w in words if w] 47 | 48 | 49 | def initialize_vocabulary(vocabulary_path): 50 | # map vocab to word embeddings 51 | if gfile.Exists(vocabulary_path): 52 | rev_vocab = [] 53 | with gfile.GFile(vocabulary_path, mode="r") as f: 54 | rev_vocab.extend(f.readlines()) 55 | rev_vocab = [line.strip('\n') for line in rev_vocab] 56 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 57 | return vocab, rev_vocab 58 | else: 59 | raise ValueError("Vocabulary file %s not found.", vocabulary_path) 60 | 61 | 62 | def process_glove(args, vocab_list, save_path, size=4e5, random_init=True): 63 | """ 64 | :param vocab_list: [vocab] 65 | :return: 66 | """ 67 | 68 | if not gfile.Exists(save_path + ".npz"): 69 | glove_path = os.path.join(args.glove_dir, "glove.6B.{}d.txt".format(args.glove_dim)) 70 | if random_init: 71 | glove = np.random.randn(len(vocab_list), args.glove_dim) 72 | else: 73 | glove = np.zeros((len(vocab_list), args.glove_dim)) 74 | found = 0 75 | 76 | vocab_dict = dict(zip(vocab_list, range(len(vocab_list)))) 77 | 78 | 79 | with open(glove_path, 'r') as fh: 80 | for line in tqdm(fh, total=size): 81 | array = line.lstrip().rstrip().split(" ") 82 | word = array[0] 83 | vector = list(map(float, array[1:])) 84 | if word in vocab_dict: 85 | idx = vocab_dict[word] 86 | glove[idx, :] = vector 87 | found += 1 88 | if word.capitalize() in vocab_dict: 89 | idx = vocab_dict[word.capitalize()] 90 | glove[idx, :] = vector 91 | found += 1 92 | if word.upper() in vocab_dict: 93 | idx = vocab_dict[word.upper()] 94 | glove[idx, :] = vector 95 | found += 1 96 | 97 | print("{}/{} of word vocab have corresponding vectors in {}".format(found, 
len(vocab_list), glove_path)) 98 | np.savez_compressed(save_path, glove=glove) 99 | print("saved trimmed glove matrix at: {}".format(save_path)) 100 | 101 | 102 | def create_vocabulary(vocabulary_path, data_paths, tokenizer=None): 103 | if not gfile.Exists(vocabulary_path): 104 | print("Creating vocabulary %s from data %s" % (vocabulary_path, str(data_paths))) 105 | vocab = {} 106 | for path in data_paths: 107 | with open(path, mode="rb") as f: 108 | counter = 0 109 | for line in f: 110 | counter += 1 111 | if counter % 100000 == 0: 112 | print("processing line %d" % counter) 113 | tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) 114 | for w in tokens: 115 | if w in vocab: 116 | vocab[w] += 1 117 | else: 118 | vocab[w] = 1 119 | vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) 120 | print("Vocabulary size: %d" % len(vocab_list)) 121 | with gfile.GFile(vocabulary_path, mode="wb") as vocab_file: 122 | for w in vocab_list: 123 | vocab_file.write(w + b"\n") 124 | 125 | 126 | def sentence_to_token_ids(sentence, vocabulary, tokenizer=None): 127 | if tokenizer: 128 | words = tokenizer(sentence) 129 | else: 130 | words = basic_tokenizer(sentence) 131 | return [vocabulary.get(w, UNK_ID) for w in words] 132 | 133 | 134 | def data_to_token_ids(data_path, target_path, vocabulary_path, 135 | tokenizer=None): 136 | if not gfile.Exists(target_path): 137 | print("Tokenizing data in %s" % data_path) 138 | vocab, _ = initialize_vocabulary(vocabulary_path) 139 | with gfile.GFile(data_path, mode="rb") as data_file: 140 | with gfile.GFile(target_path, mode="w") as tokens_file: 141 | counter = 0 142 | for line in data_file: 143 | counter += 1 144 | if counter % 5000 == 0: 145 | print("tokenizing line %d" % counter) 146 | token_ids = sentence_to_token_ids(line, vocab, tokenizer) 147 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n") 148 | 149 | 150 | if __name__ == '__main__': 151 | args = setup_args() 152 | vocab_path = pjoin(args.vocab_dir, "vocab.dat") 153 | 154 | train_path = pjoin(args.source_dir, "train") 155 | valid_path = pjoin(args.source_dir, "val") 156 | dev_path = pjoin(args.source_dir, "dev") 157 | 158 | create_vocabulary(vocab_path, 159 | [pjoin(args.source_dir, "train.context"), 160 | pjoin(args.source_dir, "train.question"), 161 | pjoin(args.source_dir, "val.context"), 162 | pjoin(args.source_dir, "val.question"), 163 | #pjoin(args.source_dir, "dev-v1.1.json") 164 | ]) 165 | vocab, rev_vocab = initialize_vocabulary(pjoin(args.vocab_dir, "vocab.dat")) 166 | 167 | # ======== Trim Distributed Word Representation ======= 168 | # If you use other word representations, you should change the code below 169 | 170 | process_glove(args, rev_vocab, args.source_dir + "/glove.trimmed.{}".format(args.glove_dim), 171 | random_init=args.random_init) 172 | 173 | # ======== Creating Dataset ========= 174 | # We created our data files seperately 175 | # If your model loads data differently (like in bulk) 176 | # You should change the below code 177 | 178 | x_train_dis_path = train_path + ".ids.context" 179 | y_train_ids_path = train_path + ".ids.question" 180 | data_to_token_ids(train_path + ".context", x_train_dis_path, vocab_path) 181 | data_to_token_ids(train_path + ".question", y_train_ids_path, vocab_path) 182 | 183 | x_dis_path = valid_path + ".ids.context" 184 | y_ids_path = valid_path + ".ids.question" 185 | data_to_token_ids(valid_path + ".context", x_dis_path, vocab_path) 186 | data_to_token_ids(valid_path + ".question", y_ids_path, vocab_path) 187 
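# Usage note: with the default arguments, this script leaves vocab.dat,
# glove.trimmed.100.npz and the *.ids.context / *.ids.question files in
# data/squad. The trimmed matrix can be restored with
# np.load("data/squad/glove.trimmed.100.npz")["glove"], which yields a
# [len(vocab), glove_dim] array whose rows are aligned with the ids in vocab.dat.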
| --------------------------------------------------------------------------------
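The pipeline above stops once qa_data.py has written the vocabulary, the trimmed GloVe matrix and the id files into data/squad, and the repository does not show how those files reach the graph built in notebook.ipynb. The sketch below is one minimal way to bridge that gap, assuming the default data/squad paths from qa_data.py and the Lp, Lq and B constants from the notebook; the load_ids helper and its clip-then-pad policy are illustrative assumptions, not code from the repository.

```python
import numpy as np

PAD_ID = 0                 # pad id defined in qa_data.py
Lp, Lq, B = 100, 20, 8     # passage length, question length, batch size from notebook.ipynb

def load_ids(path, max_len):
    """Read one *.ids.* file written by qa_data.py; clip and pad each line to max_len."""
    rows = []
    with open(path) as f:
        for line in f:
            ids = [int(tok) for tok in line.split()][:max_len]
            rows.append(ids + [PAD_ID] * (max_len - len(ids)))
    return np.array(rows, dtype=np.int32)

passages = load_ids("data/squad/train.ids.context", Lp)    # [N, Lp]
questions = load_ids("data/squad/train.ids.question", Lq)  # [N, Lq]

# first mini-batch for the notebook placeholders p [B, Lp] and q [B, Lq]
batch_p, batch_q = passages[:B], questions[:B]
```

The resulting batch_p and batch_q arrays match the [B, Lp] and [B, Lq] shapes of the p and q placeholders in notebook.ipynb, so they can be passed directly through feed_dict when running the graph.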
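The README also lists a dilated CNN (Yu and Koltun, 2016) among the implemented models, but no convolutional encoder appears in the code. Purely as an illustration of what such an encoder block could look like in the same TF 1.x style as lib/recurrence.py, here is a sketch; the kernel size, filter count and depth are assumptions rather than values taken from the repository.

```python
import tensorflow as tf

def dilated_conv_encoder(inputs, num_filters=300, num_layers=4, scope='dilated_enc'):
    """Stack of 1-D convolutions with exponentially growing dilation.

    inputs: [batch, timesteps, embed_dim]; returns [batch, timesteps, num_filters].
    """
    h = inputs
    with tf.variable_scope(scope):
        for i in range(num_layers):
            # dilation doubles at every layer: 1, 2, 4, 8, ...
            h = tf.layers.conv1d(h,
                                 filters=num_filters,
                                 kernel_size=3,
                                 padding='same',
                                 dilation_rate=2 ** i,
                                 activation=tf.nn.relu,
                                 name='dilated_conv_%d' % i)
    return h
```

Because the dilation rate doubles at every layer, the receptive field grows exponentially with depth while the sequence length is preserved, which is the context-aggregation property the cited paper relies on.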