├── BiDAF ├── BiDAFF++_with_glove+bert.jsonnet ├── BiDAFF++_with_glove+elmo.jsonnet ├── Figures │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── Arch.png │ ├── baseline.png │ ├── enhenced.png │ └── enhenced_elmo.png ├── README.md ├── bert_token_embedder.py └── wordpiece_indexer.py ├── FlowQA_Attention ├── README.md ├── code │ ├── QA_model │ │ ├── README.md │ │ ├── detail_model.py │ │ ├── layers.py │ │ ├── model_QuAC.py │ │ └── utils.py │ ├── README.md │ ├── general_utils.py │ ├── predict_QuAC.py │ ├── preprocess_QuAC.py │ ├── requirements.txt │ └── train_QuAC.py └── figure │ ├── Attention Over Flow.png │ ├── Readme.me │ ├── Vanilla Flow Operation.png │ └── result.png ├── FlowQA_Coreference └── README.md ├── README.md ├── SDNet ├── Figures │ ├── f1.jpg │ └── loss.jpg ├── README.md └── code │ ├── SDNet │ ├── CONTRIBUTING.md │ ├── LICENSE │ ├── Models │ │ ├── BaseTrainer.py │ │ ├── Bert │ │ │ ├── Bert.py │ │ │ ├── __pycache__ │ │ │ │ ├── Bert.cpython-37.pyc │ │ │ │ ├── modeling.cpython-37.pyc │ │ │ │ └── tokenization.cpython-37.pyc │ │ │ ├── modeling.py │ │ │ ├── optimization.py │ │ │ └── tokenization.py │ │ ├── Layers.py │ │ ├── SDNet.py │ │ ├── SDNetTrainer.py │ │ └── __pycache__ │ │ │ ├── BaseTrainer.cpython-37.pyc │ │ │ ├── Layers.cpython-37.pyc │ │ │ ├── SDNet.cpython-37.pyc │ │ │ └── SDNetTrainer.cpython-37.pyc │ ├── NOTICE.txt │ ├── README.md │ ├── Scratch │ │ └── SQuAD_to_CoQA.py │ ├── Utils │ │ ├── Arguments.py │ │ ├── CoQAPreprocess.py │ │ ├── CoQAUtils.py │ │ ├── Constants.py │ │ ├── GeneralUtils.py │ │ ├── Timing.py │ │ └── __pycache__ │ │ │ ├── Arguments.cpython-37.pyc │ │ │ ├── CoQAPreprocess.cpython-37.pyc │ │ │ ├── CoQAUtils.cpython-37.pyc │ │ │ ├── Constants.cpython-37.pyc │ │ │ └── GeneralUtils.cpython-37.pyc │ ├── bert_vocab_files │ │ ├── bert-base-uncased-vocab.txt │ │ └── bert-large-uncased-vocab.txt │ ├── conf │ └── main.py │ └── preprocess │ └── prepro.py └── figure ├── 1 ├── QA.png ├── coref-F1.png ├── flow-coref.png ├── quac.png └── task.png /BiDAF/BiDAFF++_with_glove+bert.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "type": "quac", 4 | "lazy": true, 5 | "num_context_answers": 2, 6 | 7 | "token_indexers": { 8 | "glove": { 9 | "type": "single_id", 10 | "lowercase_tokens": false 11 | }, 12 | "bert": { 13 | "type": "bert-pretrained", 14 | "pretrained_model": "bert-large-cased", 15 | "do_lowercase": false, 16 | "use_starting_offsets": true, 17 | "truncate_long_sequences": false 18 | }, 19 | "token_characters": { 20 | "type": "characters", 21 | "min_padding_length": 3, 22 | "character_tokenizer": { 23 | "byte_encoding": "utf-8", 24 | "end_tokens": [ 25 | 260 26 | ], 27 | "start_tokens": [ 28 | 259 29 | ] 30 | } 31 | } 32 | } 33 | }, 34 | "iterator": { 35 | "type": "bucket", 36 | "batch_size": 10, 37 | "max_instances_in_memory": 1000, 38 | "sorting_keys": [ 39 | [ 40 | "question", 41 | "num_fields" 42 | ], 43 | [ 44 | "passage", 45 | "num_tokens" 46 | ] 47 | ] 48 | }, 49 | "model": { 50 | "type": "dialog_qa", 51 | "dropout": 0.2, 52 | "initializer": [], 53 | "marker_embedding_dim": 10, 54 | "num_context_answers": 2, 55 | "phrase_layer": { 56 | "type": "gru", 57 | "bidirectional": true, 58 | "hidden_size": 100, 59 | "input_size": 1472, // glove (300) + bert (1024) + cnn (128) + num_context_answers (2) * marker_embedding_dim (10) 60 | "num_layers": 1 61 | }, 62 | "residual_encoder": { 63 | "type": "gru", 64 | "bidirectional": true, 65 | "hidden_size": 100, 66 | "input_size": 200, 67 | "num_layers": 1 68 | }, 69 | 
"span_end_encoder": { 70 | "type": "gru", 71 | "bidirectional": true, 72 | "hidden_size": 100, 73 | "input_size": 400, 74 | "num_layers": 1 75 | }, 76 | "span_start_encoder": { 77 | "type": "gru", 78 | "bidirectional": true, 79 | "hidden_size": 100, 80 | "input_size": 200, 81 | "num_layers": 1 82 | }, 83 | 84 | "text_field_embedder": { 85 | "allow_unmatched_keys": true, 86 | "embedder_to_indexer_map": { 87 | "glove": ["glove"], 88 | "bert": ["bert", "bert-offsets"], 89 | "token_characters": ["token_characters"] 90 | }, 91 | "token_embedders": { 92 | "glove": { 93 | "type": "embedding", 94 | "embedding_dim": 300, 95 | "pretrained_file": "http://nlp.stanford.edu/data/glove.840B.300d.zip", 96 | "trainable": true 97 | }, 98 | "bert": { 99 | "type": "bert-pretrained", 100 | "pretrained_model": "bert-large-cased" 101 | }, 102 | "token_characters": { 103 | "type": "character_encoding", 104 | "embedding": { 105 | "embedding_dim": 16, 106 | "num_embeddings": 262 107 | }, 108 | "encoder": { 109 | "type": "cnn", 110 | "embedding_dim": 16, 111 | "num_filters": 128, 112 | "ngram_filter_sizes": [3], 113 | "conv_layer_activation": "relu" 114 | } 115 | } 116 | } 117 | } 118 | }, 119 | "train_data_path": "https://s3.amazonaws.com/my89public/quac/train_5000.json", 120 | "validation_data_path": "https://s3.amazonaws.com/my89public/quac/val.json", 121 | "trainer": { 122 | "cuda_device": 0, 123 | "learning_rate_scheduler": { 124 | "type": "reduce_on_plateau", 125 | "factor": 0.5, 126 | "mode": "max", 127 | "patience": 3 128 | }, 129 | "num_epochs": 30, 130 | "optimizer": { 131 | "type": "sgd", 132 | "lr": 0.01, 133 | "momentum": 0.9 134 | }, 135 | "patience": 10, 136 | "validation_metric": "+f1" 137 | }, 138 | "validation_iterator": { 139 | "type": "bucket", 140 | "batch_size": 3, 141 | "max_instances_in_memory": 1000, 142 | "sorting_keys": [ 143 | [ 144 | "question", 145 | "num_fields" 146 | ], 147 | [ 148 | "passage", 149 | "num_tokens" 150 | ] 151 | ] 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /BiDAF/BiDAFF++_with_glove+elmo.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "type": "quac", 4 | "lazy": true, 5 | "num_context_answers": 2, 6 | "token_indexers": { 7 | "glove": { 8 | "type": "single_id", 9 | "lowercase_tokens": false 10 | }, 11 | "elmo": { 12 | "type": "elmo_characters" 13 | }, 14 | "token_characters": { 15 | "type": "characters", 16 | "character_tokenizer": { 17 | "byte_encoding": "utf-8", 18 | "end_tokens": [ 19 | 260 20 | ], 21 | "start_tokens": [ 22 | 259 23 | ] 24 | }, 25 | "min_padding_length": 5 26 | } 27 | } 28 | }, 29 | "iterator": { 30 | "type": "bucket", 31 | "batch_size": 10, 32 | "max_instances_in_memory": 1000, 33 | "sorting_keys": [ 34 | [ 35 | "question", 36 | "num_fields" 37 | ], 38 | [ 39 | "passage", 40 | "num_tokens" 41 | ] 42 | ] 43 | }, 44 | "model": { 45 | "type": "dialog_qa", 46 | "dropout": 0.2, 47 | "initializer": [], 48 | "marker_embedding_dim": 10, 49 | "num_context_answers": 2, 50 | "phrase_layer": { 51 | "type": "gru", 52 | "bidirectional": true, 53 | "hidden_size": 100, 54 | "input_size": 1444, // elmo (1024) + cnn (100) + num_context_answers (2) * marker_embedding_dim (10) 55 | "num_layers": 1 56 | }, 57 | "residual_encoder": { 58 | "type": "gru", 59 | "bidirectional": true, 60 | "hidden_size": 100, 61 | "input_size": 200, 62 | "num_layers": 1 63 | }, 64 | "span_end_encoder": { 65 | "type": "gru", 66 | "bidirectional": true, 67 | 
"hidden_size": 100, 68 | "input_size": 400, 69 | "num_layers": 1 70 | }, 71 | "span_start_encoder": { 72 | "type": "gru", 73 | "bidirectional": true, 74 | "hidden_size": 100, 75 | "input_size": 200, 76 | "num_layers": 1 77 | }, 78 | "text_field_embedder": { 79 | "glove": { 80 | "type": "embedding", 81 | "embedding_dim": 300, 82 | "pretrained_file": "http://nlp.stanford.edu/data/glove.840B.300d.zip", 83 | "trainable": true 84 | }, 85 | "elmo": { 86 | "type": "elmo_token_embedder", 87 | "do_layer_norm": false, 88 | "dropout": 0.2, 89 | "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json", 90 | "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" 91 | }, 92 | "token_characters": { 93 | "type": "character_encoding", 94 | "dropout": 0.2, 95 | "embedding": { 96 | "embedding_dim": 20, 97 | "num_embeddings": 262 98 | }, 99 | "encoder": { 100 | "type": "cnn", 101 | "embedding_dim": 20, 102 | "ngram_filter_sizes": [ 103 | 5 104 | ], 105 | "num_filters": 100 106 | } 107 | } 108 | } 109 | }, 110 | "train_data_path": "https://s3.amazonaws.com/my89public/quac/train_5000.json", 111 | "validation_data_path": "https://s3.amazonaws.com/my89public/quac/val.json", 112 | "trainer": { 113 | "cuda_device": 0, 114 | "learning_rate_scheduler": { 115 | "type": "reduce_on_plateau", 116 | "factor": 0.5, 117 | "mode": "max", 118 | "patience": 3 119 | }, 120 | "num_epochs": 30, 121 | "optimizer": { 122 | "type": "sgd", 123 | "lr": 0.01, 124 | "momentum": 0.9 125 | }, 126 | "patience": 10, 127 | "validation_metric": "+f1" 128 | }, 129 | "validation_iterator": { 130 | "type": "bucket", 131 | "batch_size": 3, 132 | "max_instances_in_memory": 1000, 133 | "sorting_keys": [ 134 | [ 135 | "question", 136 | "num_fields" 137 | ], 138 | [ 139 | "passage", 140 | "num_tokens" 141 | ] 142 | ] 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /BiDAF/Figures/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/1.png -------------------------------------------------------------------------------- /BiDAF/Figures/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/2.png -------------------------------------------------------------------------------- /BiDAF/Figures/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/3.png -------------------------------------------------------------------------------- /BiDAF/Figures/Arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/Arch.png -------------------------------------------------------------------------------- /BiDAF/Figures/baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/baseline.png 
-------------------------------------------------------------------------------- /BiDAF/Figures/enhenced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/enhenced.png -------------------------------------------------------------------------------- /BiDAF/Figures/enhenced_elmo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/enhenced_elmo.png -------------------------------------------------------------------------------- /BiDAF/README.md: -------------------------------------------------------------------------------- 1 | # Using enhanced BiDAF++ on QuAC 2 | 3 | ## Descriptions 4 | The original BiDAF++ model uses a Char-CNN for character embedding and GloVe for word embedding. It is also equipped with contextualized embeddings and self-attention. In this model, marker embeddings corresponding to previous answer words are used, while question turn numbers are encoded into question embeddings. 5 | 6 | We've used the [AllenNLP](https://github.com/allenai/allennlp) library to modify BiDAF++ and trained the enhanced models on the [QuAC](https://quac.ai/) dataset. 7 | 8 | ## Architecture of the enhanced BiDAF++ 9 | (architecture figure; see Figures/Arch.png) 10 | 11 |
12 | 13 | ### Embedding layers from BiDAF++ 14 | 15 | We aimed to apply ELMo or BERT embeddings on top of the original embeddings. 16 | 17 | 18 | Let ELMok be an ELMo vector, let xk denote an original embedding vector, and let hk denote a contextual vector generated by the bi-LSTM layer. Then we can use the ELMo-enhanced representation [xk; ELMok] instead of xk, and likewise replace hk with [hk; ELMok] (e.g., concatenating a 300-d GloVe vector with a 1024-d ELMo vector yields a 1324-d representation). 19 | We do the same thing when using BERT embeddings. 20 | 21 | ELMo ver: [options_file](https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json) 22 | and [weight_file](https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5) 23 | BERT ver: [bert-large-cased](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt) 24 | ### Self-Attention layers from BiDAF++ 25 | 26 | First, the input is passed through a linear layer with ReLU activations. Then, it passes through a bi-directional GRU or LSTM. After that, we apply an attention mechanism between the context and itself, as follows: 27 | 28 | Let hi and hj be the vectors for context words i and j, and let nc be the length of the context. Then compute the attention between the two words as: 29 | aij = w1 · hi + w2 · hj + w3 · (hi ⊙ hj) 30 | 31 |
32 | 33 | where w1, w2, and w3 are learned vectors and ⊙ is element-wise multiplication. In this case, we set aij = −inf if i = j. Then, compute an attended vector ci for each context token as: 34 | ci = Σj pij hj, where pij = exp(aij) / Σk exp(aik) 35 | 36 |
37 | 38 | Compute a context-to-context vector tc: 39 | tc = Σi pi hi, where pi = exp(maxj aij) / Σk exp(maxj akj) 40 | 41 |
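Putting the three equations above together (including the output concatenation described in the next paragraph), here is a minimal PyTorch sketch of this self-attention block. It is an illustration only: the tensor shapes and variable names are our assumptions, not the implementation actually used here, which is AllenNLP's `dialog_qa` model.

```python
import torch
import torch.nn.functional as F

def self_attention(h, w1, w2, w3):
    """Self-attention over the context, following the equations above.

    h          -- (n_c, d) matrix of context vectors, one row per token
    w1, w2, w3 -- (d,) learned vectors
    Returns the attended vectors c (n_c, d) and the context-to-context
    vector tc (d,).
    """
    # a_ij = w1·h_i + w2·h_j + w3·(h_i ⊙ h_j), with a_ij = -inf when i == j
    a = (h @ w1).unsqueeze(1) + (h @ w2).unsqueeze(0) + (h * w3) @ h.t()
    a = a.masked_fill(torch.eye(h.size(0), dtype=torch.bool), float("-inf"))
    c = F.softmax(a, dim=1) @ h                 # attended vector c_i per token
    tc = F.softmax(a.max(dim=1)[0], dim=0) @ h  # context-to-context vector
    return c, tc

# Toy usage: 12 context tokens with d = 200. The layer output concatenates
# [h_i; c_i; h_i ⊙ c_i; tc ⊙ c_i], as described in the next paragraph.
h = torch.randn(12, 200)
w1, w2, w3 = torch.randn(200), torch.randn(200), torch.randn(200)
c, tc = self_attention(h, w1, w2, w3)
out = torch.cat([h, c, h * c, tc * c], dim=1)   # shape (12, 800)
```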
42 | 43 | The output of this layer is built by concatenating hi, ci, hi ⊙ ci, and tc ⊙ ci, 44 | and is additionally summed with the input before the ReLU. 45 | ### Modification 46 | 47 | We modified the wordpiece indexer and the BERT token embedder in order to train the model with BERT embeddings, because the original two files cannot handle sequences with more than 500 tokens. The modified files do not affect the training process of models with other embeddings. 48 | 49 | ## Platform and tools 50 | ### Launching a Deep Learning VM on GCP 51 | - 2 vCPUs with 13GB memory 52 | - 1 NVIDIA Tesla V100 GPU 53 | - PyTorch 1.0 + fast.ai 1.0 (CUDA 10.0) 54 | - Install NVIDIA GPU driver automatically on first startup 55 | 56 | ### Setting up an AllenNLP virtual environment 57 | 58 | 1. Create a Conda environment with Python 3.6 59 | 60 | ```bash 61 | conda create -n allennlp python=3.6 62 | ``` 63 | 64 | 2. Activate the Conda environment. You will need to activate the Conda environment in each terminal in which you want to use AllenNLP. 65 | 66 | ```bash 67 | source activate allennlp 68 | ``` 69 | 70 | 71 | ## Usage 72 | 73 | ### Dataset 74 | [Train data](https://s3.amazonaws.com/my89public/quac/train_5000.json) 75 | [Validation data](https://s3.amazonaws.com/my89public/quac/val.json) 76 | 77 | ### Training 78 | Train an enhanced model on the QuAC dataset (training and validation sets): 79 | ``` 80 | nohup allennlp train <config_file> --serialization-dir <output_dir> > output.log & 81 | ``` 82 | As the [QuAC](https://arxiv.org/pdf/1808.07036.pdf) paper mentions, questions in the training set have one reference answer, while validation and test questions have five references each, which is why the F1 score on the validation set is higher than that on the training set. 83 | 84 | The best model's configuration is [here](https://github.com/deepnlp-cs599-usc/quac/blob/master/BiDAF/BiDAFF%2B%2B_with_glove%2Bbert.jsonnet). 85 | ## Results 86 | ### F1 score 87 | Enhanced model by BERT 88 | (F1 curves; see Figures/enhenced.png) 89 | 90 |
91 | Enhanced model by ELMo 92 | (F1 curves; see Figures/enhenced_elmo.png) 93 | 94 |
95 | Baseline model 96 | (F1 curves; see Figures/baseline.png) 97 | 98 |
99 | 100 | ### Performance on baseline and enhanced models 101 | 102 | | | F1 score on training set | F1 score on validation set | 103 | | --- | --- | --- | 104 | | Baseline model | 49.40 | 55.59 | 105 | | Enhanced by ELMo | 49.40 | **58.40** | 106 | | Enhanced by BERT | **53.05** | **60.04** | 107 | 108 | ## Related works 109 | 110 | * [Bidirectional attention flow for machine comprehension](https://arxiv.org/abs/1611.01603) by Minjoon Seo et al. 111 | * [Simple and effective multi-paragraph reading comprehension](https://arxiv.org/abs/1710.10723) by Christopher Clark et al. 112 | * [Deep contextualized word representations](https://arxiv.org/abs/1802.05365) by Matthew E. Peters et al. 113 | * [BERT: Pre-training of deep bidirectional transformers for language understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin et al. 114 | -------------------------------------------------------------------------------- /BiDAF/bert_token_embedder.py: -------------------------------------------------------------------------------- 1 | """ 2 | A ``TokenEmbedder`` which uses one of the BERT models 3 | (https://github.com/google-research/bert) 4 | to produce embeddings. 5 | At its core it uses Hugging Face's PyTorch implementation 6 | (https://github.com/huggingface/pytorch-pretrained-BERT), 7 | so thanks to them! 8 | """ 9 | import logging 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | from pytorch_pretrained_bert.modeling import BertModel 15 | 16 | from allennlp.modules.scalar_mix import ScalarMix 17 | from allennlp.modules.token_embedders.token_embedder import TokenEmbedder 18 | from allennlp.nn import util 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | class BertEmbedder(TokenEmbedder): 24 | """ 25 | A ``TokenEmbedder`` that produces BERT embeddings for your tokens. 26 | Should be paired with a ``BertIndexer``, which produces wordpiece ids. 27 | Most likely you want to use ``PretrainedBertEmbedder`` 28 | for one of the named pretrained models, not this base class. 29 | Parameters 30 | ---------- 31 | bert_model: ``BertModel`` 32 | The BERT model being wrapped. 33 | top_layer_only: ``bool``, optional (default = ``False``) 34 | If ``True``, then only return the top layer instead of applying the scalar mix. 35 | max_pieces : int, optional (default: 512) 36 | The BERT embedder uses positional embeddings and so has a corresponding 37 | maximum length for its input ids. Assuming the inputs are windowed 38 | and padded appropriately by this length, the embedder will split them into a 39 | large batch, feed them into BERT, and recombine the output as if it were a 40 | longer sequence. 
41 | start_tokens : int, optional (default: 1) 42 | The number of starting special tokens input to BERT (usually 1, i.e., [CLS]) 43 | end_tokens : int, optional (default: 1) 44 | The number of ending tokens input to BERT (usually 1, i.e., [SEP]) 45 | """ 46 | def __init__(self, 47 | bert_model: BertModel, 48 | top_layer_only: bool = False, 49 | max_pieces: int = 512, 50 | start_tokens: int = 1, 51 | end_tokens: int = 1) -> None: 52 | super().__init__() 53 | self.bert_model = bert_model 54 | self.output_dim = bert_model.config.hidden_size 55 | self.max_pieces = max_pieces 56 | self.start_tokens = start_tokens 57 | self.end_tokens = end_tokens 58 | 59 | if not top_layer_only: 60 | self._scalar_mix = ScalarMix(bert_model.config.num_hidden_layers, 61 | do_layer_norm=False) 62 | else: 63 | self._scalar_mix = None 64 | 65 | def get_output_dim(self) -> int: 66 | return self.output_dim 67 | 68 | def forward(self, 69 | input_ids: torch.LongTensor, 70 | offsets: torch.LongTensor = None, 71 | token_type_ids: torch.LongTensor = None) -> torch.Tensor: 72 | """ 73 | Parameters 74 | ---------- 75 | input_ids : ``torch.LongTensor`` 76 | The (batch_size, ..., max_sequence_length) tensor of wordpiece ids. 77 | offsets : ``torch.LongTensor``, optional 78 | The BERT embeddings are one per wordpiece. However it's possible/likely 79 | you might want one per original token. In that case, ``offsets`` 80 | represents the indices of the desired wordpiece for each original token. 81 | Depending on how your token indexer is configured, this could be the 82 | position of the last wordpiece for each token, or it could be the position 83 | of the first wordpiece for each token. 84 | For example, if you had the sentence "Definitely not", and if the corresponding 85 | wordpieces were ["Def", "##in", "##ite", "##ly", "not"], then the input_ids 86 | would be 5 wordpiece ids, and the "last wordpiece" offsets would be [3, 4]. 87 | If offsets are provided, the returned tensor will contain only the wordpiece 88 | embeddings at those positions, and (in particular) will contain one embedding 89 | per token. If offsets are not provided, the entire tensor of wordpiece embeddings 90 | will be returned. 91 | token_type_ids : ``torch.LongTensor``, optional 92 | If an input consists of two sentences (as in the BERT paper), 93 | tokens from the first sentence should have type 0 and tokens from 94 | the second sentence should have type 1. If you don't provide this 95 | (the default BertIndexer doesn't) then it's assumed to be all 0s. 96 | """ 97 | # pylint: disable=arguments-differ 98 | batch_size, full_seq_len = input_ids.size(0), input_ids.size(-1) 99 | initial_dims = list(input_ids.shape[:-1]) 100 | 101 | # The embedder may receive an input tensor that has a sequence length longer than can 102 | # be fit. In that case, we should expect the wordpiece indexer to create padded windows 103 | # of length `self.max_pieces` for us, and have them concatenated into one long sequence. 104 | # E.g., "[CLS] I went to the [SEP] [CLS] to the store to [SEP] ..." 105 | # We can then split the sequence into sub-sequences of that length, and concatenate them 106 | # along the batch dimension so we effectively have one huge batch of partial sentences. 107 | # This can then be fed into BERT without any sentence length issues. Keep in mind 108 | # that the memory consumption can dramatically increase for large batches with extremely 109 | # long sentences. 
110 | needs_split = full_seq_len > self.max_pieces 111 | last_window_size = 0 112 | if needs_split: 113 | # Split the flattened list by the window size, `max_pieces` 114 | split_input_ids = list(input_ids.split(self.max_pieces, dim=-1)) 115 | 116 | # We want all sequences to be the same length, so pad the last sequence 117 | last_window_size = split_input_ids[-1].size(-1) 118 | padding_amount = self.max_pieces - last_window_size 119 | split_input_ids[-1] = F.pad(split_input_ids[-1], pad=[0, padding_amount], value=0) 120 | 121 | # Now combine the sequences along the batch dimension 122 | input_ids = torch.cat(split_input_ids, dim=0) 123 | 124 | if token_type_ids is None: 125 | token_type_ids = torch.zeros_like(input_ids) 126 | 127 | input_mask = (input_ids != 0).long() 128 | 129 | # input_ids may have extra dimensions, so we reshape down to 2-d 130 | # before calling the BERT model and then reshape back at the end. 131 | all_encoder_layers, _ = self.bert_model(input_ids=util.combine_initial_dims(input_ids), 132 | token_type_ids=util.combine_initial_dims(token_type_ids), 133 | attention_mask=util.combine_initial_dims(input_mask)) 134 | all_encoder_layers = torch.stack(all_encoder_layers) 135 | 136 | if needs_split: 137 | # First, unpack the output embeddings into one long sequence again 138 | unpacked_embeddings = torch.split(all_encoder_layers, batch_size, dim=1) 139 | unpacked_embeddings = torch.cat(unpacked_embeddings, dim=2) 140 | 141 | # Next, select indices of the sequence such that it will result in embeddings representing the original 142 | # sentence. To do this, the indices will be the left half of each embedded window sub-sequence. E.g., 143 | # 0 1 2 3 4 5 6 7 8 9 10 11 144 | # "[CLS] I went to the [SEP] [CLS] to the store to [SEP]" 145 | # should now have indices [0, 1, 2, 7, 8]. The remaining code adds the final window indices (so the 146 | # final indices should be [0, 1, 2, 7, 8, 9, 10, 11]) 147 | 148 | full_seq_len = unpacked_embeddings.size(-2) - self.max_pieces 149 | stride = self.max_pieces // 2 150 | select_indices = [0] 151 | select_indices.extend(i for i in range(full_seq_len) 152 | if self.start_tokens - 1 < i % self.max_pieces < stride) 153 | 154 | # Add the final window indices 155 | final_window_size = last_window_size - self.start_tokens + self.end_tokens 156 | select_indices.extend(full_seq_len + i for i in range(self.start_tokens, final_window_size)) 157 | 158 | initial_dims.append(len(select_indices)) 159 | 160 | recombined_embeddings = unpacked_embeddings[:, :, select_indices] 161 | else: 162 | recombined_embeddings = all_encoder_layers 163 | 164 | # Recombine the outputs of all layers 165 | # (layers, batch_size * d1 * ... * dn, sequence_length, embedding_dim) 166 | # recombined = torch.cat(combined, dim=2) 167 | input_mask = (recombined_embeddings != 0).long() 168 | 169 | if self._scalar_mix is not None: 170 | mix = self._scalar_mix(recombined_embeddings, input_mask) 171 | else: 172 | mix = recombined_embeddings[-1] 173 | 174 | # At this point, mix is (batch_size * d1 * ... * dn, sequence_length, embedding_dim) 175 | 176 | if offsets is None: 177 | # Resize to (batch_size, d1, ..., dn, sequence_length, embedding_dim) 178 | dims = initial_dims if needs_split else input_ids.size() 179 | return util.uncombine_initial_dims(mix, dims) 180 | else: 181 | # offsets is (batch_size, d1, ..., dn, orig_sequence_length) 182 | offsets2d = util.combine_initial_dims(offsets) 183 | # now offsets is (batch_size * d1 * ... 
* dn, orig_sequence_length) 184 | range_vector = util.get_range_vector(offsets2d.size(0), 185 | device=util.get_device_of(mix)).unsqueeze(1) 186 | # selected embeddings is also (batch_size * d1 * ... * dn, orig_sequence_length) 187 | selected_embeddings = mix[range_vector, offsets2d] 188 | 189 | return util.uncombine_initial_dims(selected_embeddings, offsets.size()) 190 | 191 | 192 | @TokenEmbedder.register("bert-pretrained") 193 | class PretrainedBertEmbedder(BertEmbedder): 194 | # pylint: disable=line-too-long 195 | """ 196 | Parameters 197 | ---------- 198 | pretrained_model: ``str`` 199 | Either the name of the pretrained model to use (e.g. 'bert-base-uncased'), 200 | or the path to the .tar.gz file with the model weights. 201 | If the name is a key in the list of pretrained models at 202 | https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L41 203 | the corresponding path will be used; otherwise it will be interpreted as a path or URL. 204 | requires_grad : ``bool``, optional (default = False) 205 | If True, compute gradient of BERT parameters for fine tuning. 206 | top_layer_only: ``bool``, optional (default = ``False``) 207 | If ``True``, then only return the top layer instead of applying the scalar mix. 208 | """ 209 | def __init__(self, pretrained_model: str, requires_grad: bool = False, top_layer_only: bool = False) -> None: 210 | model = BertModel.from_pretrained(pretrained_model) 211 | 212 | for param in model.parameters(): 213 | param.requires_grad = requires_grad 214 | 215 | super().__init__(bert_model=model, top_layer_only=top_layer_only) -------------------------------------------------------------------------------- /FlowQA_Attention/README.md: -------------------------------------------------------------------------------- 1 | # FlowQA + Attention Over Flow 2 | 3 | 4 | 5 | ## How we improve FlowQA -- Adding Self-attention 6 | 7 | We noticed that the "flow" operation uses no attention mechanism. We believe attention is worth applying here, because not every turn in the conversation history is equally important: when a new question is posed, it is likely that only a few previous questions are relevant, rather than all of them. By adding attention to the "flow" operation, we can align new representations with the useful previous ones. We call this "attention-over-flow". 8 | 9 | ![image](https://github.com/deepnlp-cs599-usc/quac/blob/master/FlowQA_Attention/figure/Attention%20Over%20Flow.png) 10 | 11 | 12 | ## Experiments 13 | 14 | We conducted experiments on Google Cloud, using a VM with 2 vCPUs, one Tesla K80 GPU with 11 GB of GPU memory, and 60 GB of disk. We use Python 3.6.8 and PyTorch 1.0.1. The batch size is set to 2. The code is based on [momohuang's code on GitHub](https://github.com/momohuang/FlowQA). 15 | 16 | Running the code is the same as for FlowQA, except that "attention over flow" is enabled by default. 17 | 18 | From https://github.com/momohuang/FlowQA we borrow their instructions, as follows. 19 | 20 | Step 1: 21 | perform the following: 22 | 23 | > pip install -r requirements.txt 24 | 25 | to install all dependent Python packages. 
26 | 27 | Step 2: 28 | download necessary files using: 29 | 30 | > ./download.sh 31 | 32 | Step 3: 33 | preprocess the data files using: 34 | 35 | > python preprocess_QuAC.py 36 | 37 | Step 4: 38 | run the training code using: 39 | 40 | > python train_QuAC.py 41 | 42 | To specify not using "attention over flow", run: 43 | 44 | > python train_QuAC.py --flow_attention=0 45 | 46 | 47 | ## Results 48 | 49 | The results show that our attempt slightly improves FlowQA, by 0.1 F1 on the dev set. We would also like to mention that the model converges much faster: only 10 epochs are used instead of 20. 50 | 51 | ![image](https://github.com/deepnlp-cs599-usc/quac/blob/master/FlowQA_Attention/figure/result.png) 52 | 53 | 54 | ## References 55 | 56 | [FlowQA: Grasping flow in history for conversational machine comprehension.](https://arxiv.org/abs/1810.06683) By Huang H Y, Choi E, Yih W. 57 | 58 | [QuAC: Question answering in context.](https://arxiv.org/abs/1808.07036) By Choi E, He H, Iyyer M, et al. 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/QA_model/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/QA_model/model_QuAC.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | 8 | from torch.nn import Parameter 9 | from torch.autograd import Variable 10 | from .utils import AverageMeter 11 | from .detail_model import FlowQA 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class QAModel(object): 17 | """ 18 | High level model that handles initializing the underlying network 19 | architecture, saving, updating examples, and predicting examples. 20 | """ 21 | 22 | def __init__(self, opt, embedding=None, state_dict=None): 23 | # Book-keeping. 24 | self.opt = opt 25 | self.updates = state_dict['updates'] if state_dict else 0 26 | self.eval_embed_transfer = True 27 | self.train_loss = AverageMeter() 28 | 29 | # Building network. 30 | self.network = FlowQA(opt, embedding) 31 | if state_dict: 32 | new_state = set(self.network.state_dict().keys()) 33 | for k in list(state_dict['network'].keys()): 34 | if k not in new_state: 35 | del state_dict['network'][k] 36 | self.network.load_state_dict(state_dict['network']) 37 | 38 | # Building optimizer. 
39 | parameters = [p for p in self.network.parameters() if p.requires_grad] 40 | if opt['optimizer'] == 'sgd': 41 | self.optimizer = optim.SGD(parameters, opt['learning_rate'], 42 | momentum=opt['momentum'], 43 | weight_decay=opt['weight_decay']) 44 | elif opt['optimizer'] == 'adamax': 45 | self.optimizer = optim.Adamax(parameters, 46 | weight_decay=opt['weight_decay']) 47 | elif opt['optimizer'] == 'adadelta': 48 | self.optimizer = optim.Adadelta(parameters, rho=0.95, weight_decay=opt['weight_decay']) 49 | else: 50 | raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer']) 51 | if state_dict: 52 | self.optimizer.load_state_dict(state_dict['optimizer']) 53 | 54 | if opt['fix_embeddings']: 55 | wvec_size = 0 56 | else: 57 | wvec_size = (opt['vocab_size'] - opt['tune_partial']) * opt['embedding_dim'] 58 | self.total_param = sum([p.nelement() for p in parameters]) - wvec_size 59 | 60 | def update(self, batch): 61 | # Train mode 62 | self.network.train() 63 | torch.set_grad_enabled(True) 64 | 65 | # Transfer to GPU 66 | if self.opt['cuda']: 67 | inputs = [e.cuda(non_blocking=True) for e in batch[:9] + batch[17:]] 68 | overall_mask = batch[9].cuda(non_blocking=True) 69 | 70 | answer_s = batch[10].cuda(non_blocking=True) 71 | answer_e = batch[11].cuda(non_blocking=True) 72 | answer_c = batch[12].cuda(non_blocking=True) 73 | else: 74 | inputs = [e for e in batch[:9] + batch[17:]] 75 | overall_mask = batch[9] 76 | 77 | answer_s = batch[10] 78 | answer_e = batch[11] 79 | answer_c = batch[12] 80 | 81 | # Run forward 82 | # output: [batch_size, question_num, context_len], [batch_size, question_num] 83 | score_s, score_e, score_no_answ = self.network(*inputs) 84 | 85 | # Compute loss and accuracies 86 | loss = self.opt['elmo_lambda'] * (self.network.elmo.scalar_mix_0.scalar_parameters[0] ** 2 87 | + self.network.elmo.scalar_mix_0.scalar_parameters[1] ** 2 88 | + self.network.elmo.scalar_mix_0.scalar_parameters[2] ** 2) # ELMo L2 regularization 89 | all_no_answ = (answer_c == 0) 90 | answer_s.masked_fill_(all_no_answ, -100) # ignore_index is -100 in F.cross_entropy 91 | answer_e.masked_fill_(all_no_answ, -100) 92 | 93 | for i in range(overall_mask.size(0)): 94 | q_num = sum(overall_mask[i]) # the true question number for this sampled context 95 | 96 | target_s = answer_s[i, :q_num] # Size: q_num 97 | target_e = answer_e[i, :q_num] 98 | target_c = answer_c[i, :q_num] 99 | target_no_answ = all_no_answ[i, :q_num] 100 | 101 | # single_loss is averaged across q_num 102 | if self.opt['question_normalize']: 103 | single_loss = F.binary_cross_entropy_with_logits(score_no_answ[i, :q_num], target_no_answ.float()) * q_num.item() / 8.0 104 | single_loss = single_loss + F.cross_entropy(score_s[i, :q_num], target_s) * (q_num - sum(target_no_answ)).item() / 7.0 105 | single_loss = single_loss + F.cross_entropy(score_e[i, :q_num], target_e) * (q_num - sum(target_no_answ)).item() / 7.0 106 | else: 107 | single_loss = F.binary_cross_entropy_with_logits(score_no_answ[i, :q_num], target_no_answ.float()) \ 108 | + F.cross_entropy(score_s[i, :q_num], target_s) + F.cross_entropy(score_e[i, :q_num], target_e) 109 | 110 | loss = loss + (single_loss / overall_mask.size(0)) 111 | self.train_loss.update(loss.item(), overall_mask.size(0)) 112 | 113 | # Clear gradients and run backward 114 | self.optimizer.zero_grad() 115 | loss.backward() 116 | 117 | # Clip gradients 118 | torch.nn.utils.clip_grad_norm_(self.network.parameters(), 119 | self.opt['grad_clipping']) 120 | 121 | # Update parameters 122 | self.optimizer.step() 
123 | self.updates += 1 124 | 125 | # Reset any partially fixed parameters (e.g. rare words) 126 | self.reset_embeddings() 127 | self.eval_embed_transfer = True 128 | 129 | def predict(self, batch, No_Ans_Threshold=None): 130 | # Eval mode 131 | self.network.eval() 132 | torch.set_grad_enabled(False) 133 | 134 | # Transfer trained embedding to evaluation embedding 135 | if self.eval_embed_transfer: 136 | self.update_eval_embed() 137 | self.eval_embed_transfer = False 138 | 139 | # Transfer to GPU 140 | if self.opt['cuda']: 141 | inputs = [e.cuda(non_blocking=True) for e in batch[:9] + batch[17:]] 142 | else: 143 | inputs = [e for e in batch[:9] + batch[17:]] 144 | 145 | # Run forward 146 | # output: [batch_size, question_num, context_len], [batch_size, question_num] 147 | score_s, score_e, score_no_answ = self.network(*inputs) 148 | score_s = F.softmax(score_s, dim=2) 149 | score_e = F.softmax(score_e, dim=2) 150 | 151 | # Transfer to CPU/normal tensors for numpy ops 152 | score_s = score_s.data.cpu() 153 | score_e = score_e.data.cpu() 154 | score_no_answ = score_no_answ.data.cpu() 155 | 156 | # Get argmax text spans 157 | text = batch[13] 158 | spans = batch[14] 159 | overall_mask = batch[9] 160 | 161 | predictions, no_ans_scores = [], [] 162 | max_len = self.opt['max_len'] or score_s.size(2) 163 | 164 | for i in range(overall_mask.size(0)): 165 | dialog_pred, dialog_noans = [], [] 166 | 167 | for j in range(overall_mask.size(1)): 168 | if overall_mask[i, j] == 0: # this dialog has ended 169 | break 170 | 171 | dialog_noans.append(score_no_answ[i, j].item()) 172 | if No_Ans_Threshold is not None and score_no_answ[i, j] > No_Ans_Threshold: 173 | dialog_pred.append("CANNOTANSWER") 174 | else: 175 | scores = torch.ger(score_s[i, j], score_e[i, j]) 176 | scores.triu_().tril_(max_len - 1) 177 | scores = scores.numpy() 178 | s_idx, e_idx = np.unravel_index(np.argmax(scores), scores.shape) 179 | 180 | s_offset, e_offset = spans[i][s_idx][0], spans[i][e_idx][1] 181 | dialog_pred.append(text[i][s_offset:e_offset]) 182 | 183 | predictions.append(dialog_pred) 184 | no_ans_scores.append(dialog_noans) 185 | 186 | return predictions, no_ans_scores # list of (list of strings), list of (list of floats) 187 | 188 | # allow the evaluation embedding be larger than training embedding 189 | # this is helpful if we have pretrained word embeddings 190 | def setup_eval_embed(self, eval_embed, padding_idx = 0): 191 | # eval_embed should be a supermatrix of training embedding 192 | self.network.eval_embed = nn.Embedding(eval_embed.size(0), 193 | eval_embed.size(1), 194 | padding_idx = padding_idx) 195 | self.network.eval_embed.weight.data = eval_embed 196 | for p in self.network.eval_embed.parameters(): 197 | p.requires_grad = False 198 | self.eval_embed_transfer = True 199 | 200 | if hasattr(self.network, 'CoVe'): 201 | self.network.CoVe.setup_eval_embed(eval_embed) 202 | 203 | def update_eval_embed(self): 204 | # update evaluation embedding to trained embedding 205 | if self.opt['tune_partial'] > 0: 206 | offset = self.opt['tune_partial'] 207 | self.network.eval_embed.weight.data[0:offset] \ 208 | = self.network.embedding.weight.data[0:offset] 209 | else: 210 | offset = 10 211 | self.network.eval_embed.weight.data[0:offset] \ 212 | = self.network.embedding.weight.data[0:offset] 213 | 214 | def reset_embeddings(self): 215 | # Reset fixed embeddings to original value 216 | if self.opt['tune_partial'] > 0: 217 | offset = self.opt['tune_partial'] 218 | if offset < self.network.embedding.weight.data.size(0): 219 | 
self.network.embedding.weight.data[offset:] \ 220 | = self.network.fixed_embedding 221 | 222 | def get_pretrain(self, state_dict): 223 | own_state = self.network.state_dict() 224 | for name, param in state_dict.items(): 225 | if name not in own_state: 226 | continue 227 | if isinstance(param, Parameter): 228 | param = param.data 229 | try: 230 | own_state[name].copy_(param) 231 | except: 232 | print("Skip", name) 233 | continue 234 | 235 | def save(self, filename, epoch): 236 | params = { 237 | 'state_dict': { 238 | 'network': self.network.state_dict(), 239 | 'optimizer': self.optimizer.state_dict(), 240 | 'updates': self.updates # how many updates 241 | }, 242 | 'config': self.opt, 243 | 'epoch': epoch 244 | } 245 | try: 246 | torch.save(params, filename) 247 | logger.info('model saved to {}'.format(filename)) 248 | except BaseException: 249 | logger.warn('[ WARN: Saving failed... continuing anyway. ]') 250 | 251 | def save_for_predict(self, filename, epoch): 252 | network_state = dict([(k, v) for k, v in self.network.state_dict().items() if k[0:4] != 'CoVe']) 253 | if 'eval_embed.weight' in network_state: 254 | del network_state['eval_embed.weight'] 255 | if 'fixed_embedding' in network_state: 256 | del network_state['fixed_embedding'] 257 | params = { 258 | 'state_dict': {'network': network_state}, 259 | 'config': self.opt, 260 | } 261 | try: 262 | torch.save(params, filename) 263 | logger.info('model saved to {}'.format(filename)) 264 | except BaseException: 265 | logger.warn('[ WARN: Saving failed... continuing anyway. ]') 266 | 267 | def cuda(self): 268 | self.network.cuda() 269 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/QA_model/utils.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value.""" 3 | def __init__(self): 4 | self.reset() 5 | 6 | def reset(self): 7 | self.val = 0 8 | self.avg = 0 9 | self.sum = 0 10 | self.count = 0 11 | 12 | def update(self, val, n=1): 13 | self.val = val 14 | self.sum += val * n 15 | self.count += n 16 | self.avg = self.sum / self.count 17 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/predict_QuAC.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | import random 5 | import string 6 | import logging 7 | import argparse 8 | from os.path import basename 9 | from shutil import copyfile 10 | from datetime import datetime 11 | from collections import Counter 12 | import torch 13 | import msgpack 14 | import pickle 15 | import pandas as pd 16 | import numpy as np 17 | from QA_model.model_QuAC import QAModel 18 | from general_utils import score, BatchGen_QuAC, find_best_score_and_thresh 19 | 20 | parser = argparse.ArgumentParser( 21 | description='Predict using a Dialog QA model.' 22 | ) 23 | parser.add_argument('--dev_dir', default='QuAC_data/') 24 | 25 | parser.add_argument('-o', '--output_dir', default='pred_out/') 26 | parser.add_argument('--number', type=int, default=-1, help='id of the current prediction') 27 | parser.add_argument('-m', '--model', default='', 28 | help='testing model pathname, e.g. 
"models/checkpoint_epoch_11.pt"') 29 | 30 | parser.add_argument('-bs', '--batch_size', type=int, default=4) 31 | parser.add_argument('--no_ans', type=float, default=0) 32 | parser.add_argument('--min_f1', type=float, default=0.4) 33 | 34 | parser.add_argument('--show', type=int, default=3) 35 | parser.add_argument('--seed', type=int, default=1023, 36 | help='random seed for data shuffling, dropout, etc.') 37 | parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available(), 38 | help='whether to use GPU acceleration.') 39 | 40 | args = parser.parse_args() 41 | if args.model == '': 42 | print("model file is not provided") 43 | sys.exit(-1) 44 | if args.model[-3:] != '.pt': 45 | print("does not recognize the model file") 46 | sys.exit(-1) 47 | 48 | # create prediction output dir 49 | os.makedirs(args.output_dir, exist_ok=True) 50 | # count the number of prediction files 51 | if args.number == -1: 52 | args.number = len(os.listdir(args.output_dir))+1 53 | args.output = args.output_dir + 'pred' + str(args.number) + '.pckl' 54 | 55 | random.seed(args.seed) 56 | np.random.seed(args.seed) 57 | torch.manual_seed(args.seed) 58 | if args.cuda: 59 | torch.cuda.manual_seed_all(args.seed) 60 | 61 | log = logging.getLogger(__name__) 62 | log.setLevel(logging.DEBUG) 63 | ch = logging.StreamHandler(sys.stdout) 64 | ch.setLevel(logging.INFO) 65 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S') 66 | ch.setFormatter(formatter) 67 | log.addHandler(ch) 68 | 69 | def main(): 70 | log.info('[program starts.]') 71 | checkpoint = torch.load(args.model) 72 | opt = checkpoint['config'] 73 | opt['task_name'] = 'QuAC' 74 | opt['cuda'] = args.cuda 75 | opt['seed'] = args.seed 76 | if opt.get('disperse_flow') is None: 77 | opt['disperse_flow'] = False 78 | if opt.get('rationale_lambda') is None: 79 | opt['rationale_lambda'] = 0.0 80 | if opt.get('no_dialog_flow') is None: 81 | opt['no_dialog_flow'] = False 82 | if opt.get('do_hierarchical_query') is None: 83 | opt['do_hierarchical_query'] = False 84 | state_dict = checkpoint['state_dict'] 85 | log.info('[model loaded.]') 86 | 87 | test, test_embedding, test_answer = load_dev_data(opt) 88 | model = QAModel(opt, state_dict = state_dict) 89 | log.info('[Data loaded.]') 90 | 91 | model.setup_eval_embed(test_embedding) 92 | 93 | if args.cuda: 94 | model.cuda() 95 | 96 | batches = BatchGen_QuAC(test, batch_size=args.batch_size, evaluation=True, gpu=args.cuda, dialog_ctx=opt['explicit_dialog_ctx'], use_dialog_act=opt['use_dialog_act'], precompute_elmo=opt['elmo_batch_size'] // args.batch_size) 97 | sample_idx = random.sample(range(len(batches)), args.show) 98 | 99 | predictions = [] 100 | no_ans_scores = [] 101 | for i, batch in enumerate(batches): 102 | prediction, noans = model.predict(batch, No_Ans_Threshold=args.no_ans) 103 | predictions.extend(prediction) 104 | no_ans_scores.extend(noans) 105 | 106 | if not (i in sample_idx): 107 | continue 108 | 109 | print("Context: ", batch[-4][0]) 110 | for j in range(len(batch[-2][0])): 111 | print("Q: ", batch[-2][0][j]) 112 | print("A: ", prediction[0][j]) 113 | print(" True A: ", batch[-1][0][j], "| Follow up" if batch[-6][0][j].item() // 10 else "| Don't follow up") 114 | print(" Val. 
A: ", test_answer[args.batch_size * i][j]) 115 | print("") 116 | 117 | 118 | pred_out = {'predictions': predictions, 'no_ans_scores': no_ans_scores} 119 | with open(args.output, 'wb') as f: 120 | pickle.dump(pred_out, f) 121 | 122 | f1, h_f1, HEQ_Q, HEQ_D = score(predictions, test_answer, min_F1=args.min_f1) 123 | log.warning("Test F1: {:.2f}, HEQ_Q: {:.2f}, HEQ_D: {:.2f}".format(f1, HEQ_Q, HEQ_D)) 124 | 125 | def load_dev_data(opt): # can be extended to true test set 126 | with open(os.path.join(args.dev_dir, 'dev_meta.msgpack'), 'rb') as f: 127 | meta = msgpack.load(f, encoding='utf8') 128 | embedding = torch.Tensor(meta['embedding']) 129 | assert opt['embedding_dim'] == embedding.size(1) 130 | 131 | with open(os.path.join(args.dev_dir, 'dev_data.msgpack'), 'rb') as f: 132 | data = msgpack.load(f, encoding='utf8') 133 | 134 | assert opt['num_features'] == len(data['context_features'][0][0]) + opt['explicit_dialog_ctx'] * (opt['use_dialog_act']*3 + 2) 135 | 136 | dev = {'context': list(zip( 137 | data['context_ids'], 138 | data['context_tags'], 139 | data['context_ents'], 140 | data['context'], 141 | data['context_span'], 142 | data['1st_question'], 143 | data['context_tokenized'])), 144 | 'qa': list(zip( 145 | data['question_CID'], 146 | data['question_ids'], 147 | data['context_features'], 148 | data['answer_start'], 149 | data['answer_end'], 150 | data['answer_choice'], 151 | data['question'], 152 | data['answer'], 153 | data['question_tokenized'])) 154 | } 155 | 156 | dev_answer = [] 157 | for i, CID in enumerate(data['question_CID']): 158 | if len(dev_answer) <= CID: 159 | dev_answer.append([]) 160 | dev_answer[CID].append(data['all_answer'][i]) 161 | 162 | return dev, embedding, dev_answer 163 | 164 | if __name__ == '__main__': 165 | main() 166 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/preprocess_QuAC.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import spacy 4 | import msgpack 5 | import unicodedata 6 | import numpy as np 7 | import pandas as pd 8 | import argparse 9 | import collections 10 | import multiprocessing 11 | import logging 12 | import random 13 | from allennlp.modules.elmo import batch_to_ids 14 | from general_utils import flatten_json, normalize_text, build_embedding, load_glove_vocab, pre_proc, get_context_span, find_answer_span, feature_gen, token2id 15 | 16 | parser = argparse.ArgumentParser( 17 | description='Preprocessing train + dev files, about 20 minutes to run on Servers.' 18 | ) 19 | parser.add_argument('--wv_file', default='glove/glove.840B.300d.txt', 20 | help='path to word vector file.') 21 | parser.add_argument('--wv_dim', type=int, default=300, 22 | help='word vector dimension.') 23 | parser.add_argument('--sort_all', action='store_true', 24 | help='sort the vocabulary by frequencies of all words.' 
25 | 'Otherwise consider question words first.') 26 | parser.add_argument('--threads', type=int, default=multiprocessing.cpu_count(), 27 | help='number of threads for preprocessing.') 28 | parser.add_argument('--no_match', action='store_true', 29 | help='do not extract the three exact matching features.') 30 | parser.add_argument('--seed', type=int, default=1023, 31 | help='random seed for data shuffling, embedding init, etc.') 32 | 33 | 34 | args = parser.parse_args() 35 | trn_file = 'QuAC_data/train.json' 36 | dev_file = 'QuAC_data/dev.json' 37 | wv_file = args.wv_file 38 | wv_dim = args.wv_dim 39 | nlp = spacy.load('en', disable=['parser']) 40 | 41 | random.seed(args.seed) 42 | np.random.seed(args.seed) 43 | 44 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG, 45 | datefmt='%m/%d/%Y %I:%M:%S') 46 | log = logging.getLogger(__name__) 47 | 48 | log.info('start data preparing... (using {} threads)'.format(args.threads)) 49 | 50 | glove_vocab = load_glove_vocab(wv_file, wv_dim) # return a "set" of vocabulary 51 | log.info('glove loaded.') 52 | 53 | #=============================================================== 54 | #=================== Work on training data ===================== 55 | #=============================================================== 56 | 57 | def proc_train(ith, article): 58 | rows = [] 59 | 60 | for paragraph in article['paragraphs']: 61 | context = paragraph['context'] 62 | for qa in paragraph['qas']: 63 | question = qa['question'] 64 | answers = qa['orig_answer'] 65 | 66 | answer = answers['text'] 67 | answer_start = answers['answer_start'] 68 | answer_end = answers['answer_start'] + len(answers['text']) 69 | answer_choice = 0 if answer == 'CANNOTANSWER' else\ 70 | 1 if qa['yesno'] == 'y' else\ 71 | 2 if qa['yesno'] == 'n' else\ 72 | 3 # Not a yes/no question 73 | if answer_choice != 0: 74 | """ 75 | 0: Do not ask a follow up question! 76 | 1: Definitely ask a follow up question! 77 | 2: Not too important, but you can ask a follow up. 
78 | """ 79 | answer_choice += 10 * (0 if qa['followup'] == "n" else\ 80 | 1 if qa['followup'] == "y" else\ 81 | 2) 82 | else: 83 | answer_start, answer_end = -1, -1 84 | rows.append((ith, question, answer, answer_start, answer_end, answer_choice)) 85 | return rows, context 86 | 87 | train, train_context = flatten_json(trn_file, proc_train) 88 | train = pd.DataFrame(train, columns=['context_idx', 'question', 'answer', 89 | 'answer_start', 'answer_end', 'answer_choice']) 90 | log.info('train json data flattened.') 91 | 92 | print(train) 93 | 94 | trC_iter = (pre_proc(c) for c in train_context) 95 | trQ_iter = (pre_proc(q) for q in train.question) 96 | trC_docs = [doc for doc in nlp.pipe(trC_iter, batch_size=64, n_threads=args.threads)] 97 | trQ_docs = [doc for doc in nlp.pipe(trQ_iter, batch_size=64, n_threads=args.threads)] 98 | 99 | # tokens 100 | trC_tokens = [[normalize_text(w.text) for w in doc] for doc in trC_docs] 101 | trQ_tokens = [[normalize_text(w.text) for w in doc] for doc in trQ_docs] 102 | trC_unnorm_tokens = [[w.text for w in doc] for doc in trC_docs] 103 | log.info('All tokens for training are obtained.') 104 | 105 | train_context_span = [get_context_span(a, b) for a, b in zip(train_context, trC_unnorm_tokens)] 106 | 107 | ans_st_token_ls, ans_end_token_ls = [], [] 108 | for ans_st, ans_end, idx in zip(train.answer_start, train.answer_end, train.context_idx): 109 | ans_st_token, ans_end_token = find_answer_span(train_context_span[idx], ans_st, ans_end) 110 | ans_st_token_ls.append(ans_st_token) 111 | ans_end_token_ls.append(ans_end_token) 112 | 113 | train['answer_start_token'], train['answer_end_token'] = ans_st_token_ls, ans_end_token_ls 114 | initial_len = len(train) 115 | train.dropna(inplace=True) # modify self DataFrame 116 | log.info('drop {0}/{1} inconsistent samples.'.format(initial_len - len(train), initial_len)) 117 | log.info('answer span for training is generated.') 118 | 119 | # features 120 | trC_tags, trC_ents, trC_features = feature_gen(trC_docs, train.context_idx, trQ_docs, args.no_match) 121 | log.info('features for training is generated: {}, {}, {}'.format(len(trC_tags), len(trC_ents), len(trC_features))) 122 | 123 | def build_train_vocab(questions, contexts): # vocabulary will also be sorted accordingly 124 | if args.sort_all: 125 | counter = collections.Counter(w for doc in questions + contexts for w in doc) 126 | vocab = sorted([t for t in counter if t in glove_vocab], key=counter.get, reverse=True) 127 | else: 128 | counter_c = collections.Counter(w for doc in contexts for w in doc) 129 | counter_q = collections.Counter(w for doc in questions for w in doc) 130 | counter = counter_c + counter_q 131 | vocab = sorted([t for t in counter_q if t in glove_vocab], key=counter_q.get, reverse=True) 132 | vocab += sorted([t for t in counter_c.keys() - counter_q.keys() if t in glove_vocab], 133 | key=counter.get, reverse=True) 134 | total = sum(counter.values()) 135 | matched = sum(counter[t] for t in vocab) 136 | log.info('vocab {1}/{0} OOV {2}/{3} ({4:.4f}%)'.format( 137 | len(counter), len(vocab), (total - matched), total, (total - matched) / total * 100)) 138 | vocab.insert(0, "") 139 | vocab.insert(1, "") 140 | vocab.insert(2, "") 141 | vocab.insert(3, "") 142 | return vocab 143 | 144 | # vocab 145 | tr_vocab = build_train_vocab(trQ_tokens, trC_tokens) 146 | trC_ids = token2id(trC_tokens, tr_vocab, unk_id=1) 147 | trQ_ids = token2id(trQ_tokens, tr_vocab, unk_id=1) 148 | trQ_tokens = [[""] + doc + [""] for doc in trQ_tokens] 149 | trQ_ids = [[2] + qsent + 
[3] for qsent in trQ_ids] 150 | print(trQ_ids[:10]) 151 | # tags 152 | vocab_tag = [''] + list(nlp.tagger.labels) 153 | trC_tag_ids = token2id(trC_tags, vocab_tag) 154 | # entities 155 | vocab_ent = list(set([ent for sent in trC_ents for ent in sent])) 156 | trC_ent_ids = token2id(trC_ents, vocab_ent, unk_id=0) 157 | 158 | log.info('Found {} POS tags.'.format(len(vocab_tag))) 159 | log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent)) 160 | log.info('vocabulary for training is built.') 161 | 162 | tr_embedding = build_embedding(wv_file, tr_vocab, wv_dim) 163 | log.info('got embedding matrix for training.') 164 | 165 | # don't store row name in csv 166 | #train.to_csv('QuAC_data/train.csv', index=False, encoding='utf8') 167 | 168 | meta = { 169 | 'vocab': tr_vocab, 170 | 'embedding': tr_embedding.tolist() 171 | } 172 | with open('QuAC_data/train_meta.msgpack', 'wb') as f: 173 | msgpack.dump(meta, f) 174 | 175 | prev_CID, first_question = -1, [] 176 | for i, CID in enumerate(train.context_idx): 177 | if not (CID == prev_CID): 178 | first_question.append(i) 179 | prev_CID = CID 180 | 181 | result = { 182 | 'question_ids': trQ_ids, 183 | 'context_ids': trC_ids, 184 | 'context_features': trC_features, # exact match, tf 185 | 'context_tags': trC_tag_ids, # POS tagging 186 | 'context_ents': trC_ent_ids, # Entity recognition 187 | 'context': train_context, 188 | 'context_span': train_context_span, 189 | '1st_question': first_question, 190 | 'question_CID': train.context_idx.tolist(), 191 | 'question': train.question.tolist(), 192 | 'answer': train.answer.tolist(), 193 | 'answer_start': train.answer_start_token.tolist(), 194 | 'answer_end': train.answer_end_token.tolist(), 195 | 'answer_choice': train.answer_choice.tolist(), 196 | 'context_tokenized': trC_tokens, 197 | 'question_tokenized': trQ_tokens 198 | } 199 | with open('QuAC_data/train_data.msgpack', 'wb') as f: 200 | msgpack.dump(result, f) 201 | 202 | log.info('saved training to disk.') 203 | 204 | #========================================================== 205 | #=================== Work on dev data ===================== 206 | #========================================================== 207 | 208 | def proc_dev(ith, article): 209 | rows = [] 210 | 211 | for paragraph in article['paragraphs']: 212 | context = paragraph['context'] 213 | for qa in paragraph['qas']: 214 | question = qa['question'] 215 | answers = qa['orig_answer'] 216 | 217 | answer = answers['text'] 218 | answer_start = answers['answer_start'] 219 | answer_end = answers['answer_start'] + len(answers['text']) 220 | answer_choice = 0 if answer == 'CANNOTANSWER' else\ 221 | 1 if qa['yesno'] == 'y' else\ 222 | 2 if qa['yesno'] == 'n' else\ 223 | 3 # Not a yes/no question 224 | if answer_choice != 0: 225 | """ 226 | 0: Do not ask a follow up question! 227 | 1: Definitely ask a follow up question! 228 | 2: Not too important, but you can ask a follow up. 
229 | """ 230 | answer_choice += 10 * (0 if qa['followup'] == "n" else\ 231 | 1 if qa['followup'] == "y" else\ 232 | 2) 233 | else: 234 | answer_start, answer_end = -1, -1 235 | 236 | ans_ls = [] 237 | for ans in qa['answers']: 238 | ans_ls.append(ans['text']) 239 | 240 | rows.append((ith, question, answer, answer_start, answer_end, answer_choice, ans_ls)) 241 | return rows, context 242 | 243 | dev, dev_context = flatten_json(dev_file, proc_dev) 244 | dev = pd.DataFrame(dev, columns=['context_idx', 'question', 'answer', 245 | 'answer_start', 'answer_end', 'answer_choice', 'all_answer']) 246 | log.info('dev json data flattened.') 247 | 248 | print(dev) 249 | 250 | devC_iter = (pre_proc(c) for c in dev_context) 251 | devQ_iter = (pre_proc(q) for q in dev.question) 252 | devC_docs = [doc for doc in nlp.pipe( 253 | devC_iter, batch_size=64, n_threads=args.threads)] 254 | devQ_docs = [doc for doc in nlp.pipe( 255 | devQ_iter, batch_size=64, n_threads=args.threads)] 256 | 257 | # tokens 258 | devC_tokens = [[normalize_text(w.text) for w in doc] for doc in devC_docs] 259 | devQ_tokens = [[normalize_text(w.text) for w in doc] for doc in devQ_docs] 260 | devC_unnorm_tokens = [[w.text for w in doc] for doc in devC_docs] 261 | log.info('All tokens for dev are obtained.') 262 | 263 | dev_context_span = [get_context_span(a, b) for a, b in zip(dev_context, devC_unnorm_tokens)] 264 | log.info('context span for dev is generated.') 265 | 266 | ans_st_token_ls, ans_end_token_ls = [], [] 267 | for ans_st, ans_end, idx in zip(dev.answer_start, dev.answer_end, dev.context_idx): 268 | ans_st_token, ans_end_token = find_answer_span(dev_context_span[idx], ans_st, ans_end) 269 | ans_st_token_ls.append(ans_st_token) 270 | ans_end_token_ls.append(ans_end_token) 271 | 272 | dev['answer_start_token'], dev['answer_end_token'] = ans_st_token_ls, ans_end_token_ls 273 | initial_len = len(dev) 274 | dev.dropna(inplace=True) # modify self DataFrame 275 | log.info('drop {0}/{1} inconsistent samples.'.format(initial_len - len(dev), initial_len)) 276 | log.info('answer span for dev is generated.') 277 | 278 | # features 279 | devC_tags, devC_ents, devC_features = feature_gen(devC_docs, dev.context_idx, devQ_docs, args.no_match) 280 | log.info('features for dev is generated: {}, {}, {}'.format(len(devC_tags), len(devC_ents), len(devC_features))) 281 | 282 | def build_dev_vocab(questions, contexts): # most vocabulary comes from tr_vocab 283 | existing_vocab = set(tr_vocab) 284 | new_vocab = list(set([w for doc in questions + contexts for w in doc if w not in existing_vocab and w in glove_vocab])) 285 | vocab = tr_vocab + new_vocab 286 | log.info('train vocab {0}, total vocab {1}'.format(len(tr_vocab), len(vocab))) 287 | return vocab 288 | 289 | # vocab 290 | dev_vocab = build_dev_vocab(devQ_tokens, devC_tokens) # tr_vocab is a subset of dev_vocab 291 | devC_ids = token2id(devC_tokens, dev_vocab, unk_id=1) 292 | devQ_ids = token2id(devQ_tokens, dev_vocab, unk_id=1) 293 | devQ_tokens = [[""] + doc + [""] for doc in devQ_tokens] 294 | devQ_ids = [[2] + qsent + [3] for qsent in devQ_ids] 295 | print(devQ_ids[:10]) 296 | # tags 297 | devC_tag_ids = token2id(devC_tags, vocab_tag) # vocab_tag same as training 298 | # entities 299 | devC_ent_ids = token2id(devC_ents, vocab_ent, unk_id=0) # vocab_ent same as training 300 | log.info('vocabulary for dev is built.') 301 | 302 | dev_embedding = build_embedding(wv_file, dev_vocab, wv_dim) 303 | # tr_embedding is a submatrix of dev_embedding 304 | log.info('got embedding matrix for dev.') 305 
| 306 | # don't store row name in csv 307 | #dev.to_csv('QuAC_data/dev.csv', index=False, encoding='utf8') 308 | 309 | meta = { 310 | 'vocab': dev_vocab, 311 | 'embedding': dev_embedding.tolist() 312 | } 313 | with open('QuAC_data/dev_meta.msgpack', 'wb') as f: 314 | msgpack.dump(meta, f) 315 | 316 | prev_CID, first_question = -1, [] 317 | for i, CID in enumerate(dev.context_idx): 318 | if not (CID == prev_CID): 319 | first_question.append(i) 320 | prev_CID = CID 321 | 322 | result = { 323 | 'question_ids': devQ_ids, 324 | 'context_ids': devC_ids, 325 | 'context_features': devC_features, # exact match, tf 326 | 'context_tags': devC_tag_ids, # POS tagging 327 | 'context_ents': devC_ent_ids, # Entity recognition 328 | 'context': dev_context, 329 | 'context_span': dev_context_span, 330 | '1st_question': first_question, 331 | 'question_CID': dev.context_idx.tolist(), 332 | 'question': dev.question.tolist(), 333 | 'answer': dev.answer.tolist(), 334 | 'answer_start': dev.answer_start_token.tolist(), 335 | 'answer_end': dev.answer_end_token.tolist(), 336 | 'answer_choice': dev.answer_choice.tolist(), 337 | 'all_answer': dev.all_answer.tolist(), 338 | 'context_tokenized': devC_tokens, 339 | 'question_tokenized': devQ_tokens 340 | } 341 | with open('QuAC_data/dev_data.msgpack', 'wb') as f: 342 | msgpack.dump(result, f) 343 | 344 | log.info('saved dev to disk.') 345 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | msgpack-python 4 | spacy 5 | allennlp 6 | torch 7 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/train_QuAC.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import random 5 | import string 6 | import logging 7 | import argparse 8 | from shutil import copyfile 9 | from datetime import datetime 10 | from collections import Counter 11 | import torch 12 | import msgpack 13 | import pandas as pd 14 | import numpy as np 15 | from QA_model.model_QuAC import QAModel 16 | from general_utils import find_best_score_and_thresh, BatchGen_QuAC 17 | 18 | parser = argparse.ArgumentParser( 19 | description='Train a Dialog QA model.' 
20 | ) 21 | 22 | # system 23 | parser.add_argument('--task_name', default='QuAC') 24 | parser.add_argument('--name', default='', help='additional name of the current run') 25 | parser.add_argument('--log_file', default='output.log', 26 | help='path for log file.') 27 | parser.add_argument('--log_per_updates', type=int, default=20, 28 | help='log model loss per x updates (mini-batches).') 29 | 30 | parser.add_argument('--train_dir', default='QuAC_data/') 31 | parser.add_argument('--dev_dir', default='QuAC_data/') 32 | parser.add_argument('--answer_type_num', type=int, default=1) 33 | 34 | parser.add_argument('--model_dir', default='models', 35 | help='path to store saved models.') 36 | parser.add_argument('--eval_per_epoch', type=int, default=1, 37 | help='perform evaluation per x epoches.') 38 | parser.add_argument('--MTLSTM_path', default='glove/MT-LSTM.pth') 39 | parser.add_argument('--save_all', dest='save_best_only', action='store_false', help='save all models.') 40 | parser.add_argument('--do_not_save', action='store_true', help='don\'t save any model') 41 | parser.add_argument('--save_for_predict', action='store_true') 42 | parser.add_argument('--seed', type=int, default=1023, 43 | help='random seed for data shuffling, dropout, etc.') 44 | parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available(), 45 | help='whether to use GPU acceleration.') 46 | # training 47 | parser.add_argument('-e', '--epoches', type=int, default=30) 48 | parser.add_argument('-bs', '--batch_size', type=int, default=3) 49 | parser.add_argument('-ebs', '--elmo_batch_size', type=int, default=12) 50 | parser.add_argument('-rs', '--resume', default='', 51 | help='previous model pathname. ' 52 | 'e.g. "models/checkpoint_epoch_11.pt"') 53 | parser.add_argument('-ro', '--resume_options', action='store_true', 54 | help='use previous model options, ignore the cli and defaults.') 55 | parser.add_argument('-rlr', '--reduce_lr', type=float, default=0., 56 | help='reduce initial (resumed) learning rate by this factor.') 57 | parser.add_argument('-op', '--optimizer', default='adamax', 58 | help='supported optimizer: adamax, sgd, adadelta, adam') 59 | parser.add_argument('-gc', '--grad_clipping', type=float, default=10) 60 | parser.add_argument('-wd', '--weight_decay', type=float, default=0) 61 | parser.add_argument('-lr', '--learning_rate', type=float, default=0.1, 62 | help='only applied to SGD.') 63 | parser.add_argument('-mm', '--momentum', type=float, default=0, 64 | help='only applied to SGD.') 65 | parser.add_argument('-tp', '--tune_partial', type=int, default=1000, 66 | help='finetune top-x embeddings (including , ).') 67 | parser.add_argument('--fix_embeddings', action='store_true', 68 | help='if true, `tune_partial` will be ignored.') 69 | parser.add_argument('--elmo_lambda', type=float, default=0.0) 70 | parser.add_argument('--no_question_normalize', dest='question_normalize', action='store_false') # when set, do dialog normalize 71 | parser.add_argument('--pretrain', default='') 72 | 73 | # model 74 | parser.add_argument('--explicit_dialog_ctx', type=int, default=2) 75 | parser.add_argument('--use_dialog_act', action='store_true') 76 | parser.add_argument('--no_dialog_flow', action='store_true') 77 | parser.add_argument('--no_hierarchical_query', dest='do_hierarchical_query', action='store_false') 78 | parser.add_argument('--no_prealign', dest='do_prealign', action='store_false') 79 | 80 | parser.add_argument('--final_output_att_hidden', type=int, default=250) 81 | 
parser.add_argument('--question_merge', default='linear_self_attn') 82 | parser.add_argument('--no_ptr_update', dest='do_ptr_update', action='store_false') 83 | parser.add_argument('--no_ptr_net_indep_attn', dest='ptr_net_indep_attn', action='store_false') 84 | parser.add_argument('--ptr_net_attn_type', default='Bilinear', help="Attention for answer span output: Bilinear, MLP or Default") 85 | 86 | parser.add_argument('--do_residual_rnn', dest='do_residual_rnn', action='store_true') 87 | parser.add_argument('--do_residual_everything', dest='do_residual_everything', action='store_true') 88 | parser.add_argument('--do_residual', dest='do_residual', action='store_true') 89 | parser.add_argument('--rnn_layers', type=int, default=1, help="Default number of RNN layers") 90 | parser.add_argument('--rnn_type', default='lstm', 91 | help='supported types: rnn, gru, lstm') 92 | parser.add_argument('--concat_rnn', dest='concat_rnn', action='store_true') 93 | 94 | parser.add_argument('--hidden_size', type=int, default=125) 95 | parser.add_argument('--self_attention_opt', type=int, default=1) # 0: no self attention 96 | 97 | parser.add_argument('--deep_inter_att_do_similar', type=int, default=0) 98 | parser.add_argument('--deep_att_hidden_size_per_abstr', type=int, default=250) 99 | 100 | parser.add_argument('--no_elmo', dest='use_elmo', action='store_false') 101 | parser.add_argument('--no_em', action='store_true') 102 | 103 | parser.add_argument('--no_wemb', dest='use_wemb', action='store_false') # word embedding 104 | parser.add_argument('--CoVe_opt', type=int, default=1) # contexualized embedding option 105 | parser.add_argument('--no_pos', dest='use_pos', action='store_false') # pos tagging 106 | parser.add_argument('--pos_size', type=int, default=51, help='how many kinds of POS tags.') 107 | parser.add_argument('--pos_dim', type=int, default=12, help='the embedding dimension for POS tags.') 108 | parser.add_argument('--no_ner', dest='use_ner', action='store_false') # named entity 109 | parser.add_argument('--ner_size', type=int, default=19, help='how many kinds of named entity tags.') 110 | parser.add_argument('--ner_dim', type=int, default=8, help='the embedding dimension for named entity tags.') 111 | 112 | parser.add_argument('--prealign_hidden', type=int, default=300) 113 | parser.add_argument('--prealign_option', type=int, default=2, help='0: No prealign, 1, 2, ...: Different options') 114 | 115 | parser.add_argument('--no_seq_dropout', dest='do_seq_dropout', action='store_false') 116 | parser.add_argument('--my_dropout_p', type=float, default=0.4) 117 | parser.add_argument('--dropout_emb', type=float, default=0.4) 118 | 119 | parser.add_argument('--max_len', type=int, default=35) 120 | 121 | parser.add_argument('--flow_attention', type=int, default = 1) 122 | parser.add_argument('--use_bert', help="Whether to use BERT or not, and which version to use") 123 | parser.add_argument('--bert_path', default='bert_vocab_files/') 124 | args = parser.parse_args() 125 | 126 | if args.name != '': 127 | args.model_dir = args.model_dir + '_' + args.name 128 | args.log_file = os.path.dirname(args.log_file) + 'output_' + args.name + '.log' 129 | 130 | # set model dir 131 | model_dir = args.model_dir 132 | os.makedirs(model_dir, exist_ok=True) 133 | model_dir = os.path.abspath(model_dir) 134 | 135 | # set random seed 136 | random.seed(args.seed) 137 | np.random.seed(args.seed) 138 | torch.manual_seed(args.seed) 139 | if args.cuda: 140 | torch.cuda.manual_seed_all(args.seed) 141 | 142 | # setup logger 143 | 
log = logging.getLogger(__name__) 144 | log.setLevel(logging.DEBUG) 145 | fh = logging.FileHandler(args.log_file) 146 | fh.setLevel(logging.DEBUG) 147 | ch = logging.StreamHandler(sys.stdout) 148 | ch.setLevel(logging.INFO) 149 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S') 150 | fh.setFormatter(formatter) 151 | ch.setFormatter(formatter) 152 | log.addHandler(fh) 153 | log.addHandler(ch) 154 | 155 | def main(): 156 | log.info('[program starts.]') 157 | opt = vars(args) # changing opt will change args 158 | train, train_embedding, opt = load_train_data(opt) 159 | dev, dev_embedding, dev_answer = load_dev_data(opt) 160 | opt['num_features'] += args.explicit_dialog_ctx * (args.use_dialog_act*3 + 2) # dialog_act + previous answer 161 | if opt['use_elmo'] == False: 162 | opt['elmo_batch_size'] = 0 163 | log.info('[Data loaded.]') 164 | 165 | if args.resume: 166 | log.info('[loading previous model...]') 167 | checkpoint = torch.load(args.resume) 168 | if args.resume_options: 169 | opt = checkpoint['config'] 170 | state_dict = checkpoint['state_dict'] 171 | model = QAModel(opt, train_embedding, state_dict) 172 | epoch_0 = checkpoint['epoch'] + 1 173 | for i in range(checkpoint['epoch']): 174 | random.shuffle(list(range(len(train)))) # synchronize random seed 175 | if args.reduce_lr: 176 | lr_decay(model.optimizer, lr_decay=args.reduce_lr) 177 | else: 178 | model = QAModel(opt, train_embedding) 179 | epoch_0 = 1 180 | 181 | if args.pretrain: 182 | pretrain_model = torch.load(args.pretrain) 183 | state_dict = pretrain_model['state_dict']['network'] 184 | 185 | model.get_pretrain(state_dict) 186 | 187 | model.setup_eval_embed(dev_embedding) 188 | log.info("[dev] Total number of params: {}".format(model.total_param)) 189 | 190 | if args.cuda: 191 | model.cuda() 192 | 193 | if args.resume: 194 | batches = BatchGen_QuAC(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda, dialog_ctx=args.explicit_dialog_ctx, use_dialog_act=args.use_dialog_act, use_bert=args.use_bert, bert_path=args.bert_path) 195 | predictions, no_ans_scores = [], [] 196 | for batch in batches: 197 | phrases, noans = model.predict(batch) 198 | predictions.extend(phrases) 199 | no_ans_scores.extend(noans) 200 | f1, na, thresh = find_best_score_and_thresh(predictions, dev_answer, no_ans_scores) 201 | log.info("[dev F1: {} NA: {} TH: {}]".format(f1, na, thresh)) 202 | best_val_score, best_na, best_thresh = f1, na, thresh 203 | else: 204 | best_val_score, best_na, best_thresh = 0.0, 0.0, 0.0 205 | 206 | for epoch in range(epoch_0, epoch_0 + args.epoches): 207 | log.warning('Epoch {}'.format(epoch)) 208 | # train 209 | batches = BatchGen_QuAC(train, batch_size=args.batch_size, gpu=args.cuda, dialog_ctx=args.explicit_dialog_ctx, use_dialog_act=args.use_dialog_act, precompute_elmo=args.elmo_batch_size // args.batch_size, use_bert = args.use_bert, bert_path=args.bert_path) 210 | start = datetime.now() 211 | for i, batch in enumerate(batches): 212 | model.update(batch) 213 | if i % args.log_per_updates == 0: 214 | log.info('updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format( 215 | model.updates, model.train_loss.avg, 216 | str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0])) 217 | 218 | # eval 219 | if epoch % args.eval_per_epoch == 0: 220 | batches = BatchGen_QuAC(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda, dialog_ctx=args.explicit_dialog_ctx, use_dialog_act=args.use_dialog_act, precompute_elmo=args.elmo_batch_size // 
args.batch_size, use_bert = args.use_bert, bert_path=args.bert_path) 221 | predictions, no_ans_scores = [], [] 222 | for batch in batches: 223 | phrases, noans = model.predict(batch) 224 | predictions.extend(phrases) 225 | no_ans_scores.extend(noans) 226 | f1, na, thresh = find_best_score_and_thresh(predictions, dev_answer, no_ans_scores) 227 | 228 | # save 229 | if args.save_best_only: 230 | if f1 > best_val_score: 231 | best_val_score, best_na, best_thresh = f1, na, thresh 232 | model_file = os.path.join(model_dir, 'best_model.pt') 233 | model.save(model_file, epoch) 234 | log.info('[new best model saved.]') 235 | else: 236 | model_file = os.path.join(model_dir, 'checkpoint_epoch_{}.pt'.format(epoch)) 237 | model.save(model_file, epoch) 238 | if f1 > best_val_score: 239 | best_val_score, best_na, best_thresh = f1, na, thresh 240 | copyfile(os.path.join(model_dir, model_file), 241 | os.path.join(model_dir, 'best_model.pt')) 242 | log.info('[new best model saved.]') 243 | 244 | log.warning("Epoch {} - dev F1: {:.3f} NA: {:.3f} TH: {:.3f} (best F1: {:.3f} NA: {:.3f} TH: {:.3f})".format(epoch, f1, na, thresh, best_val_score, best_na, best_thresh)) 245 | 246 | def lr_decay(optimizer, lr_decay): 247 | for param_group in optimizer.param_groups: 248 | param_group['lr'] *= lr_decay 249 | log.info('[learning rate reduced by {}]'.format(lr_decay)) 250 | return optimizer 251 | 252 | def load_train_data(opt): 253 | with open(os.path.join(args.train_dir, 'train_meta.msgpack'), 'rb') as f: 254 | meta = msgpack.load(f, encoding='utf8') 255 | embedding = torch.Tensor(meta['embedding']) 256 | opt['vocab_size'] = embedding.size(0) 257 | opt['embedding_dim'] = embedding.size(1) 258 | 259 | with open(os.path.join(args.train_dir, 'train_data.msgpack'), 'rb') as f: 260 | data = msgpack.load(f, encoding='utf8') 261 | #data_orig = pd.read_csv(os.path.join(args.train_dir, 'train.csv')) 262 | 263 | opt['num_features'] = len(data['context_features'][0][0]) 264 | 265 | train = {'context': list(zip( 266 | data['context_ids'], 267 | data['context_tags'], 268 | data['context_ents'], 269 | data['context'], 270 | data['context_span'], 271 | data['1st_question'], 272 | data['context_tokenized'])), 273 | 'qa': list(zip( 274 | data['question_CID'], 275 | data['question_ids'], 276 | data['context_features'], 277 | data['answer_start'], 278 | data['answer_end'], 279 | data['answer_choice'], 280 | data['question'], 281 | data['answer'], 282 | data['question_tokenized'])) 283 | } 284 | return train, embedding, opt 285 | 286 | def load_dev_data(opt): # can be extended to true test set 287 | with open(os.path.join(args.dev_dir, 'dev_meta.msgpack'), 'rb') as f: 288 | meta = msgpack.load(f, encoding='utf8') 289 | embedding = torch.Tensor(meta['embedding']) 290 | assert opt['embedding_dim'] == embedding.size(1) 291 | 292 | with open(os.path.join(args.dev_dir, 'dev_data.msgpack'), 'rb') as f: 293 | data = msgpack.load(f, encoding='utf8') 294 | #data_orig = pd.read_csv(os.path.join(args.dev_dir, 'dev.csv')) 295 | 296 | assert opt['num_features'] == len(data['context_features'][0][0]) 297 | 298 | dev = {'context': list(zip( 299 | data['context_ids'], 300 | data['context_tags'], 301 | data['context_ents'], 302 | data['context'], 303 | data['context_span'], 304 | data['1st_question'], 305 | data['context_tokenized'])), 306 | 'qa': list(zip( 307 | data['question_CID'], 308 | data['question_ids'], 309 | data['context_features'], 310 | data['answer_start'], 311 | data['answer_end'], 312 | data['answer_choice'], 313 | data['question'], 314 | 
data['answer'], 315 | data['question_tokenized'])) 316 | } 317 | 318 | dev_answer = [] 319 | for i, CID in enumerate(data['question_CID']): 320 | if len(dev_answer) <= CID: 321 | dev_answer.append([]) 322 | dev_answer[CID].append(data['all_answer'][i]) 323 | 324 | return dev, embedding, dev_answer 325 | 326 | if __name__ == '__main__': 327 | main() 328 | -------------------------------------------------------------------------------- /FlowQA_Attention/figure/Attention Over Flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/FlowQA_Attention/figure/Attention Over Flow.png -------------------------------------------------------------------------------- /FlowQA_Attention/figure/Readme.me: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /FlowQA_Attention/figure/Vanilla Flow Operation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/FlowQA_Attention/figure/Vanilla Flow Operation.png -------------------------------------------------------------------------------- /FlowQA_Attention/figure/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/FlowQA_Attention/figure/result.png -------------------------------------------------------------------------------- /FlowQA_Coreference/README.md: -------------------------------------------------------------------------------- 1 | # FlowQA + Coreference 2 | 3 | We used the [coreference model](https://github.com/kentonl/e2e-coref) proposed by Lee et al. to improve the [FlowQA](https://github.com/momohuang/FlowQA) model. 4 | In short, this model clusters spans of text that corefer with each other. 5 | For example, consider the conversation below: 6 | 7 | A: What is the story about? 8 | 9 | B: young girl and her dog 10 | 11 | A: What were they doing? 12 | 13 | B: set out a trip 14 | 15 | In this conversation, the phrase "young girl and her dog" said by B is referred to by "they" said by A. 16 | Also, the pronoun "her" in B's answer refers to "young girl", which appears earlier. 17 | Therefore, the model will cluster the following spans: 18 | 19 | [["young girl", "her"], ["young girl and her dog", "they"]] 20 | 21 | 22 | In order to feed this coreference information to the FlowQA model, we can 23 | 24 | * Feed the result from the coreference model to the FlowQA model 25 | * Feed the internal representation of each token in the coreference model to the FlowQA model. 26 | 27 | We concatenate these two new representations with the existing input of FlowQA, which consists of word embeddings, part-of-speech embeddings and named-entity embeddings. 28 | 29 | ## Feed the result from the coreference model 30 | 31 | We use one-hot encoding to represent the results from the coreference model. 32 | Each token corresponds to a vector whose length is the number of clusters. 33 | If a token belongs to the i-th cluster, the i-th element of its one-hot vector is set to 1 (see the sketch below).
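As a minimal sketch (our own illustration, not code from this repository; `cluster_onehot` is a hypothetical helper), the per-token one-hot features can be built from the predicted clusters like this:

```python
# Minimal sketch (not from this repo): build per-token one-hot cluster
# features from predicted coreference clusters.
import numpy as np

def cluster_onehot(tokens, clusters):
    # tokens: list of words
    # clusters: list of clusters, each a list of (start, end) token spans
    feats = np.zeros((len(tokens), len(clusters)), dtype=np.float32)
    for c, spans in enumerate(clusters):
        for start, end in spans:       # end is exclusive
            feats[start:end, c] = 1.0  # every token in the span joins cluster c
    return feats

tokens = ["young", "girl", "and", "her", "dog", "What", "were", "they", "doing"]
clusters = [[(0, 2), (3, 4)],   # ["young girl", "her"]
            [(0, 5), (7, 8)]]   # ["young girl and her dog", "they"]
onehot = cluster_onehot(tokens, clusters)  # shape (9, 2)
```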
34 | Here is the one-hot encoding for the example above: 35 | 36 | "young" [1, 1] 37 | "girl" [1, 1] 38 | "and" [0, 1] 39 | "her" [1, 1] 40 | "dog" [0, 1] 41 | "What" [0, 0] 42 | "were" [0, 0] 43 | "they" [0, 1] 44 | "doing" [0, 0] 45 | 46 | ## Feed the internal representation 47 | 48 | We use the latent representation from the coreference model as an input to the FlowQA model. To be precise, we use X* from Figure 3 of [this paper](https://arxiv.org/abs/1707.07045). Since the model is trained to predict coreference, we hypothesize that its latent representation encodes coreference information as well. 49 | 50 | # Experiment 51 | 52 | First, we set up the coreference model according to its [instructions](https://github.com/kentonl/e2e-coref). 53 | Then we use the pre-trained coreference model to predict the coreferences in the QuAC dataset. 54 | We extract the latent representation of each token and save it. 55 | After that, we convert the clustering results of the coreference model to one-hot encodings. 56 | Finally, we concatenate the latent representations with the one-hot encodings to obtain the augmented input vectors that are fed to FlowQA. 57 | The augmented vector's size is 450: 400 dimensions for the X* latent representation and 50 for the one-hot encoding. 58 | 59 | Second, we set up the FlowQA model according to its [instructions](https://github.com/momohuang/FlowQA). 60 | We have to modify ```train_QuAC.py```, ```general_utils.py``` and ```detail_model.py``` to support the augmented representation. 61 | Then we train the model with this new input representation. 62 | 63 | 64 | We use the [Google Cloud Platform](https://cloud.google.com/) with 4 vCPUs, 26 GB of memory and 1 NVIDIA Tesla K80 to run both the FlowQA and coreference models. 65 | It takes around 50 hours to predict the coreferences for the entire QuAC dataset, and around 4 hours per epoch to train the FlowQA model. 66 | 67 | 68 | # Result 69 | 70 | Below is the comparison of F1 between the original FlowQA model and our modification. We can see that the improved model performs slightly better. 71 | 72 | ![ ](/figure/coref-F1.png) 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Question Answering In Context 2 | Question Answering (QA) has long been a promising yet challenging task, and a large body of work has been done in this area. However, simple QA tasks such as extracting answer spans from a given text have been criticized as "shallow pattern recognition", which does not equip machines with the capability of reading. Recently, Conversational Question Answering has been proposed to address this issue. In this task, a program needs to not only answer questions, but also answer them in a conversational style. 3 | 4 | In this project, we use deep neural networks to address [QuAC](https://quac.ai/), one of these Conversational QA datasets. We train existing models on QuAC and compare their pros and cons. We also apply models that are effective on other related datasets and devise new models involving different network architectures or mechanisms. We hope to obtain better, and possibly state-of-the-art, performance. 5 | 6 | This project is a part of CS 599 Deep Learning and Its Applications in Spring 2019 at the Department of Computer Science, University of Southern California. 7 | 8 | ## QuAC 9 |

10 | [figure] 11 | 12 | Figure 1: Task setup of QuAC. (Pics from [QuAC](https://quac.ai)) 13 |

14 | 15 | [Question Answering in Context (QuAC)](https://quac.ai/) is a newly proposed conversational QA dataset. The task is to answer a question in a conversation, given a context passage and the historical questions and answers in the conversation. Compared with earlier QA datasets, its questions are often highly context-dependent, elliptical, and even unanswerable. The architecture of a typical QA model for QuAC is shown below: 16 | 17 |

18 | [figure] 19 | 20 | Figure 2: Baseline QA Model for QuAC. 21 |

22 | 23 | ## Problem Formulation 24 | The input of the problem is a context passage as background and a sequence of free-text questions; each question has two binary properties, “yes/no” and “followup”. The first property indicates whether the question can be answered with an affirmation (“yes” or “no”). The second property indicates whether the question is a follow-up to previous questions. 25 | 26 | The output is the answer to each question. The answer must be a span of the context unless the question is a “yes/no” question, and the answer should be “cannot answer” if it cannot be found in the given context. 27 | 28 |
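For concreteness, a single QuAC example, as read by the preprocessing scripts in this repository, has roughly the following shape (a hand-made illustration; the field values are invented, but the field names follow the QuAC JSON format):

```python
# Illustrative QuAC-style example (hand-made values; the schema follows the
# QuAC JSON format: answers are character spans into the context).
example = {
    "context": "The story is about a young girl and her dog. They set out a trip.",
    "qas": [
        {
            "question": "What is the story about?",
            "answers": [{"text": "young girl and her dog", "answer_start": 21}],
            "yesno": "x",     # "y" / "n" / "x" (not a yes/no question)
            "followup": "n",  # "y" / "n" / "m", as used in preprocess_QuAC.py
        },
        {
            "question": "What were they doing?",
            "answers": [{"text": "set out a trip", "answer_start": 50}],
            "yesno": "x",
            "followup": "y",  # follows up on the previous question
        },
    ],
}
```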


32 | 33 | ## Approaches 34 | 35 | We try to enhance two baseline models: FlowQA and BiDAF++. We also try to apply an existing model, SDNet, which works on another QA dataset. 36 | 37 | ### FlowQA 38 | 39 | [FlowQA](https://github.com/momohuang/FlowQA) has been shown to have considerably good performance on conversational question-answering tasks such as CoQA and QuAC. First, we re-run the FlowQA model and investigate why it performs so well and what makes it special. 40 | 41 | 42 | Why does FlowQA work? We believe it is because of its "flow" operation, which uses an LSTM (or GRU) to represent context words across question turns. By using the "flow" operation, FlowQA can capture and retain conversational information. Since 43 | conversations are in fact sequences, it is a natural yet clever idea to encode conversational information in this way (see the sketch below). 44 | 45 | 46 |
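As a minimal sketch of this operation (our own illustration, not the repository's actual implementation; the 125-dim hidden size mirrors the `--hidden_size` default in `train_QuAC.py`):

```python
# Minimal sketch of FlowQA's flow operation (illustrative, not the repo's code).
# For every context position, an RNN is run across dialog turns, so information
# from earlier turns can "flow" into the representations of later turns.
import torch
import torch.nn as nn

batch, turns, ctx_len, dim = 2, 4, 50, 125
hidden = torch.randn(batch, turns, ctx_len, dim)  # per-turn context representations

flow_rnn = nn.GRU(dim, dim, batch_first=True)

# Fold the context-position axis into the batch so the GRU's sequence axis is turns.
x = hidden.permute(0, 2, 1, 3).reshape(batch * ctx_len, turns, dim)
flow, _ = flow_rnn(x)                              # (batch * ctx_len, turns, dim)
flow = flow.reshape(batch, ctx_len, turns, dim).permute(0, 2, 1, 3)
print(flow.shape)                                  # torch.Size([2, 4, 50, 125])
```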


50 | 51 | #### [FlowQA + Attention Over Flow](FlowQA_Attention) 52 | 53 | We believe attention is worth applying here, because not every turn of the conversation history is equally important. 54 | When a new question is posed, it is highly likely that only a few previous questions are involved, rather than all of them. By adding attention to the "flow" operation, we can align new representations with the useful previous ones. We call this "attention-over-flow". 55 | 56 | 57 |
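A rough sketch of what this could look like (a hypothetical rendering of the idea, not the repository's exact code; `attention_over_flow` is an invented name, and `flow` is the tensor from the previous sketch):

```python
# Sketch of attention over flow states (illustrative). Each turn attends over
# the flow states of its previous turns; the attended summary is mixed back in.
import torch
import torch.nn.functional as F

def attention_over_flow(flow):
    batch, turns, ctx_len, dim = flow.shape
    # Similarity between every pair of turns, per context position.
    scores = torch.einsum('btcd,bscd->bcts', flow, flow) / dim ** 0.5
    # Mask out current and future turns so each turn only sees its history.
    mask = torch.tril(torch.ones(turns, turns, dtype=torch.bool), diagonal=-1)
    scores = scores.masked_fill(~mask, float('-inf'))
    attn = F.softmax(scores, dim=-1).nan_to_num()   # turn 0 has no history
    attended = torch.einsum('bcts,bscd->btcd', attn, flow)
    return flow + attended  # residual mix of attended history into each turn
```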


61 | 62 | #### [FlowQA + Coreference](FlowQA_Coreference) 63 | 64 | Since the QuAC dataset may contain many coreferences, both within the context and across the conversation, our intuition is that we can exploit a coreference resolution model to improve the FlowQA model. 65 | The figure below shows the improved model, where the gray blocks are the existing FlowQA model and the blue blocks are the coreference components we added (a sketch of the augmented input follows the figure). 66 | Our experiments show that, with the coreference model, F1 is slightly better. 67 |

68 | [figure: FlowQA (gray blocks) with added coreference components (blue blocks)] 69 |
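To make the feature concatenation concrete, here is a minimal sketch (illustrative shapes only; the 400/50 split follows the experiment description in `FlowQA_Coreference/README.md`, and the POS/NER dimensions follow the defaults in `train_QuAC.py`):

```python
# Sketch of augmenting FlowQA's per-token input with coreference features
# (illustrative; dimensions follow the experiment description above).
import torch

seq_len  = 30
word_emb = torch.randn(seq_len, 300)   # e.g. GloVe word vectors
pos_emb  = torch.randn(seq_len, 12)    # POS tag embedding
ner_emb  = torch.randn(seq_len, 8)     # named-entity embedding
coref_x  = torch.randn(seq_len, 400)   # X* latent representation per token
coref_1h = torch.zeros(seq_len, 50)    # one-hot cluster membership (padded to 50)

augmented = torch.cat([word_emb, pos_emb, ner_emb, coref_x, coref_1h], dim=-1)
print(augmented.shape)  # torch.Size([30, 770])
```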

70 | 71 | ### [SD-Net](SDNet) 72 | A contextualized attention-based deep neural network developed by Microsoft. It was originally evaluated on another question answering dataset, CoQA, and was the first model to reach an in-domain F1 score above 80% (80.7%) on CoQA. Here we apply this model to the QuAC dataset. Since the formats of the two datasets differ, we need to preprocess the data carefully. 73 | 74 | ### [BiDAF++](BiDAF) 75 | The original [BiDAF++](https://arxiv.org/abs/1710.10723) model uses a character-level CNN for character embeddings and GloVe for word embeddings. It is also equipped with contextualized embeddings and self-attention. In this model, marker embeddings corresponding to previous answer words are used, while question turn numbers are encoded into the question embeddings. We intend to append ELMo or BERT embeddings to the word and contextualized embeddings to get better performance (see the sketch below). 76 |
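A minimal sketch of this embedding augmentation (illustrative dimensions only; the actual configurations live in the jsonnet files under `BiDAF/`):

```python
# Sketch of appending a contextual embedding (ELMo or BERT) to BiDAF++'s
# GloVe + char-CNN token representation (illustrative dimensions).
import torch

seq_len = 40
glove   = torch.randn(seq_len, 300)   # pretrained GloVe word vectors
charcnn = torch.randn(seq_len, 128)   # char-CNN output per token
bert    = torch.randn(seq_len, 1024)  # e.g. BERT-large, aligned to tokens

token_repr = torch.cat([glove, charcnn, bert], dim=-1)
print(token_repr.shape)  # torch.Size([40, 1452])
```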


81 | ## Results 82 | | Model | F1 | Remark | 83 | | ------------- | ------------- | ------------- | 84 | | **FlowQA Baseline** | 64.24 | Baseline in one experiment | 85 | | FlowQA + Coreference | 62.50 | Baseline = 62.26 in another experiment | 86 | | FlowQA + Attention Over Flow | 64.35 | | 87 | | **BiDAF++ Baseline** | 55.59 | | 88 | | ELMo + BiDAF++ | 58.40 | | 89 | | BERT + BiDAF++ | 60.04 | | 90 | | **SDNet** | 33.13 | | 91 | 92 | ## Conclusion 93 | * **FlowQA + Attention Over Flow**: Adding attention layers over the flow operation slightly improves the FlowQA model. We believe this is because the representations generated in this way focus more on recent dialog turns and help resolve coreferences. 94 | 95 | * **FlowQA + Coreference**: Adding a vector representation that encodes coreference in a long context can increase the performance of the model on tasks involving complex dependencies (context and dialogue). 96 | 97 | * **Enhancing BiDAF++**: Appending ELMo or BERT embeddings to the word and contextualized embeddings helps. Because ELMo extracts context features from a language model, and BERT is a pre-trained token embedding model that can perform better than GloVe on the QuAC dataset, these two embeddings enhance the model's performance. 98 | 99 | * **SDNet**: SDNet was not originally applied to the QuAC dataset, so we adjusted the dataset format to fit SDNet. However, the result is not as good as we expected. We will try to figure out the issues and improve this model in the future. 100 | 101 | ## References 102 | 103 | * [FlowQA: Grasping Flow in History for Conversational Machine Comprehension](https://arxiv.org/abs/1810.06683) by Huang et al. 104 | * [Bidirectional Attention Flow for Machine Comprehension](https://arxiv.org/abs/1611.01603) by Minjoon Seo et al. 105 | * [Simple and Effective Multi-Paragraph Reading Comprehension](https://arxiv.org/abs/1710.10723) by Christopher Clark et al. 106 | * [Deep Contextualized Word Representations](https://arxiv.org/abs/1802.05365) by Matthew E. Peters et al. 107 | * [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin et al. 108 | * [SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering](https://arxiv.org/abs/1812.03593) by Chenguang Zhu et al. 109 | * [End-to-end Neural Coreference Resolution](https://arxiv.org/abs/1707.07045) by Lee et al. 110 | -------------------------------------------------------------------------------- /SDNet/Figures/f1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/Figures/f1.jpg -------------------------------------------------------------------------------- /SDNet/Figures/loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/Figures/loss.jpg -------------------------------------------------------------------------------- /SDNet/README.md: -------------------------------------------------------------------------------- 1 | 2 | # SDNet 3 | [SDNet](https://arxiv.org/abs/1812.03593) is a contextualized attention-based deep neural network developed by Microsoft.
It was originally evaluated on another question answering dataset, [CoQA](https://stanfordnlp.github.io/coqa/), and was the first model to reach an in-domain F1 score higher than 80% (80.7%) on CoQA. 4 | 5 | ## Difference between CoQA and QuAC 6 | Since CoQA and QuAC use different data formats, we first need to convert the format to let SDNet work on QuAC. Unlike QuAC's, the answers in CoQA are free-form rather than spans of the context. However, SDNet still needs to generate an answer span from the text, so we can take this span as the output on QuAC. Moreover, questions in CoQA don't have properties such as "followup" and "yes/no", but every question may be related to previous questions and answers. The original SDNet model prepends 2 rounds of previous questions and answers to obtain its best F1 score, so we keep this behavior and ignore the "followup" property. Also, SDNet already computes the probabilities of the answer being the affirmation "yes" or "no", or "no answer" ("unknown" in CoQA), which we can use directly on QuAC. 7 | 8 | ## Usage 9 | The preprocess folder contains a script to convert QuAC data to the CoQA format. 10 | 11 | For detailed usage, please refer to the [SDNet GitHub page](https://github.com/Microsoft/SDNet). 12 | 13 | ## Result and conclusion 14 | Training loss and development set F1 score are shown below: 15 | 16 |

17 | [figures: training loss and development set F1 curves] 18 | 19 | 20 | 21 | 22 |

23 | 24 | Unfortunately, the result of SDNet is not as good as we expected. We only get a 33.13 F1 score on the development set. We tried to figure out the reason why this happens (such as format and preprocessing issues) but couldn't make any significant improvement. We leave this as future work and will continue working on it. 25 | 26 | ## References 27 | 28 | * [SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering](https://arxiv.org/abs/1812.03593) by Chenguang Zhu et al. 29 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to 4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 5 | and actually do, grant us the rights to use your contribution. For details, visit 6 | https://cla.microsoft.com. 7 | 8 | When you submit a pull request, a CLA-bot will automatically determine whether you need 9 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 10 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 14 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -------------------------------------------------------------------------------- /SDNet/code/SDNet/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/BaseTrainer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license.
3 | 4 | import os 5 | 6 | class BaseTrainer(): 7 | def __init__(self, opt): 8 | self.opt = opt 9 | self.isTrain = False; 10 | if self.opt['cuda'] == True: 11 | self.use_cuda = True 12 | print('Using Cuda\n') 13 | else: 14 | self.use_cuda = False 15 | print('Using CPU\n') 16 | 17 | self.is_official = 'OFFICIAL' in self.opt 18 | # Make sure raw text feature files are ready 19 | self.use_spacy = 'SPACY_FEATURE' in self.opt 20 | self.opt['logFile'] = 'log.txt' 21 | 22 | opt['FEATURE_FOLDER'] = 'conf~/' + ('spacy_intermediate_feature~/' if self.use_spacy else 'intermediate_feature~/') 23 | opt['FEATURE_FOLDER'] = os.path.join(opt['datadir'], opt['FEATURE_FOLDER']) 24 | 25 | def log(self, s): 26 | # In official case, the program does not output logs 27 | if self.is_official: 28 | return 29 | 30 | with open(os.path.join(self.saveFolder, self.opt['logFile']), 'a') as f: 31 | f.write(s + '\n') 32 | print(s) 33 | 34 | def getSaveFolder(self): 35 | runid = 1 36 | while True: 37 | saveFolder = os.path.join(self.opt['datadir'], 'conf~', 'run_' + str(runid)) 38 | if not os.path.exists(saveFolder): 39 | self.saveFolder = saveFolder 40 | os.makedirs(self.saveFolder) 41 | print('Saving logs, model and evaluation in ' + self.saveFolder) 42 | return 43 | runid = runid + 1 44 | 45 | # save copy of conf file 46 | def saveConf(self): 47 | with open(self.opt['confFile'], encoding='utf-8') as f: 48 | with open(os.path.join(self.saveFolder, 'conf_copy'), 'w', encoding='utf-8') as fw: 49 | for line in f: 50 | fw.write(line + '\n') 51 | 52 | def train(self): 53 | pass 54 | 55 | def load(self): 56 | pass 57 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/Bert.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | import os 5 | import torch 6 | import torch.nn as nn 7 | from torch.autograd import Variable 8 | from Models.Bert.modeling import BertModel 9 | 10 | ''' 11 | BERT 12 | ''' 13 | class Bert(nn.Module): 14 | def __init__(self, opt): 15 | super(Bert, self).__init__() 16 | print('Loading BERT model...') 17 | self.BERT_MAX_LEN = 512 18 | self.linear_combine = 'BERT_LINEAR_COMBINE' in opt 19 | 20 | if 'BERT_LARGE' in opt: 21 | print('Using BERT Large model') 22 | model_file = os.path.join(opt['datadir'], opt['BERT_large_model_file']) 23 | print('Loading BERT model from', model_file) 24 | self.bert_model = BertModel.from_pretrained(model_file) 25 | #self.bert_model = BertModel.from_pretrained('bert-large-uncased') 26 | self.bert_dim = 1024 27 | self.bert_layer = 24 28 | else: 29 | print('Using BERT base model') 30 | model_file = os.path.join(opt['datadir'], opt['BERT_model_file']) 31 | print('Loading BERT model from', model_file) 32 | self.bert_model = BertModel.from_pretrained(model_file) 33 | #self.bert_model = BertModel.from_pretrained('bert-base-cased') 34 | self.bert_dim = 768 35 | self.bert_layer = 12 36 | self.bert_model.cuda() 37 | self.bert_model.eval() 38 | 39 | print('Finished loading') 40 | 41 | ''' 42 | Input: 43 | x_bert: batch * max_bert_sent_len (ids) 44 | x_bert_mask: batch * max_bert_sent_len (0/1) 45 | x_bert_offset: batch * max_real_word_num * 2 46 | x_mask: batch * max_real_word_num 47 | Output: 48 | embedding: batch * max_real_word_num * bert_dim 49 | ''' 50 | def forward(self, x_bert, x_bert_mask, x_bert_offset, x_mask): 51 | if self.linear_combine: 52 | return self.combine_forward(x_bert, x_bert_mask, x_bert_offset, x_mask) 53 | 54 | last_layers = [] 55 | bert_sent_len = x_bert.shape[1] 56 | p = 0 57 | while p < bert_sent_len: 58 | all_encoder_layers, _ = self.bert_model(x_bert[:, p:(p + self.BERT_MAX_LEN)], token_type_ids=None, attention_mask=x_bert_mask[:, p:(p + self.BERT_MAX_LEN)]) # bert_layer * batch * max_bert_sent_len * bert_dim 59 | last_layers.append(all_encoder_layers[-1]) # batch * up_to_512 * bert_dim 60 | p += self.BERT_MAX_LEN 61 | 62 | bert_embedding = torch.cat(last_layers, 1) 63 | 64 | batch_size = x_mask.shape[0] 65 | max_word_num = x_mask.shape[1] 66 | output = Variable(torch.zeros(batch_size, max_word_num, self.bert_dim)) 67 | for i in range(batch_size): 68 | for j in range(max_word_num): 69 | if x_mask[i, j] == 0: 70 | continue 71 | st = x_bert_offset[i, j, 0] 72 | ed = x_bert_offset[i, j, 1] 73 | # we can also try using st only, ed only 74 | if st + 1 == ed: # including st==ed 75 | output[i, j, :] = bert_embedding[i, st, :] 76 | else: 77 | subword_ebd_sum = torch.sum(bert_embedding[i, st:ed, :], dim = 0) 78 | if st < ed: 79 | output[i, j, :] = subword_ebd_sum / float(ed - st) # dim 0 is st:ed 80 | 81 | output = output.cuda() 82 | return output 83 | 84 | def combine_forward(self, x_bert, x_bert_mask, x_bert_offset, x_mask): 85 | all_layers = [] 86 | 87 | bert_sent_len = x_bert.shape[1] 88 | p = 0 89 | while p < bert_sent_len: 90 | all_encoder_layers, _ = self.bert_model(x_bert[:, p:(p + self.BERT_MAX_LEN)], token_type_ids=None, attention_mask=x_bert_mask[:, p:(p + self.BERT_MAX_LEN)]) # bert_layer * batch * max_bert_sent_len * bert_dim 91 | all_layers.append(torch.cat(all_encoder_layers, dim = 2)) # batch * up_to_512 * (bert_dim * layer) 92 | p += self.BERT_MAX_LEN 93 | 94 | bert_embedding = torch.cat(all_layers, dim = 1) # batch * up_to_512 * (bert_dim * layer) 95 | batch_size = x_mask.shape[0] 96 | max_word_num = x_mask.shape[1] 97 | tot_dim 
= bert_embedding.shape[2] 98 | output = Variable(torch.zeros(batch_size, max_word_num, tot_dim)) 99 | for i in range(batch_size): 100 | for j in range(max_word_num): 101 | if x_mask[i, j] == 0: 102 | continue 103 | st = x_bert_offset[i, j, 0] 104 | ed = x_bert_offset[i, j, 1] 105 | # we can also try using st only, ed only 106 | if st + 1 == ed: # including st==ed 107 | output[i, j, :] = bert_embedding[i, st, :] 108 | else: 109 | subword_ebd_sum = torch.sum(bert_embedding[i, st:ed, :], dim = 0) 110 | if st < ed: 111 | output[i, j, :] = subword_ebd_sum / float(ed - st) # dim 0 is st:ed 112 | 113 | outputs = [] 114 | for i in range(self.bert_layer): 115 | now = output[:, :, (i * self.bert_dim) : ((i + 1) * self.bert_dim)] 116 | now = now.cuda() 117 | outputs.append(now) 118 | 119 | return outputs 120 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/__pycache__/Bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/Bert/__pycache__/Bert.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/__pycache__/modeling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/Bert/__pycache__/modeling.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/__pycache__/tokenization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/Bert/__pycache__/tokenization.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """PyTorch optimization for BERT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.nn.utils import clip_grad_norm_ 21 | 22 | def warmup_cosine(x, warmup=0.002): 23 | if x < warmup: 24 | return x/warmup 25 | return 0.5 * (1.0 + torch.cos(math.pi * x)) 26 | 27 | def warmup_constant(x, warmup=0.002): 28 | if x < warmup: 29 | return x/warmup 30 | return 1.0 31 | 32 | def warmup_linear(x, warmup=0.002): 33 | if x < warmup: 34 | return x/warmup 35 | return 1.0 - x 36 | 37 | SCHEDULES = { 38 | 'warmup_cosine':warmup_cosine, 39 | 'warmup_constant':warmup_constant, 40 | 'warmup_linear':warmup_linear, 41 | } 42 | 43 | 44 | class BertAdam(Optimizer): 45 | """Implements BERT version of Adam algorithm with weight decay fix. 46 | Params: 47 | lr: learning rate 48 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 49 | t_total: total number of training steps for the learning 50 | rate schedule, -1 means constant learning rate. Default: -1 51 | schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' 52 | b1: Adams b1. Default: 0.9 53 | b2: Adams b2. Default: 0.999 54 | e: Adams epsilon. Default: 1e-6 55 | weight_decay_rate: Weight decay. Default: 0.01 56 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 57 | """ 58 | def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear', 59 | b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01, 60 | max_grad_norm=1.0): 61 | if not lr >= 0.0: 62 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 63 | if schedule not in SCHEDULES: 64 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 65 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 66 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 67 | if not 0.0 <= b1 < 1.0: 68 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 69 | if not 0.0 <= b2 < 1.0: 70 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 71 | if not e >= 0.0: 72 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 73 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 74 | b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate, 75 | max_grad_norm=max_grad_norm) 76 | super(BertAdam, self).__init__(params, defaults) 77 | 78 | def get_lr(self): 79 | lr = [] 80 | for group in self.param_groups: 81 | for p in group['params']: 82 | state = self.state[p] 83 | if len(state) == 0: 84 | return [0] 85 | if group['t_total'] != -1: 86 | schedule_fct = SCHEDULES[group['schedule']] 87 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 88 | else: 89 | lr_scheduled = group['lr'] 90 | lr.append(lr_scheduled) 91 | return lr 92 | 93 | def step(self, closure=None): 94 | """Performs a single optimization step. 95 | 96 | Arguments: 97 | closure (callable, optional): A closure that reevaluates the model 98 | and returns the loss. 
99 | """ 100 | loss = None 101 | if closure is not None: 102 | loss = closure() 103 | 104 | for group in self.param_groups: 105 | for p in group['params']: 106 | if p.grad is None: 107 | continue 108 | grad = p.grad.data 109 | if grad.is_sparse: 110 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 111 | 112 | state = self.state[p] 113 | 114 | # State initialization 115 | if len(state) == 0: 116 | state['step'] = 0 117 | # Exponential moving average of gradient values 118 | state['next_m'] = torch.zeros_like(p.data) 119 | # Exponential moving average of squared gradient values 120 | state['next_v'] = torch.zeros_like(p.data) 121 | 122 | next_m, next_v = state['next_m'], state['next_v'] 123 | beta1, beta2 = group['b1'], group['b2'] 124 | 125 | # Add grad clipping 126 | if group['max_grad_norm'] > 0: 127 | clip_grad_norm_(p, group['max_grad_norm']) 128 | 129 | # Decay the first and second moment running average coefficient 130 | # In-place operations to update the averages at the same time 131 | next_m.mul_(beta1).add_(1 - beta1, grad) 132 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 133 | update = next_m / (next_v.sqrt() + group['e']) 134 | 135 | # Just adding the square of the weights to the loss function is *not* 136 | # the correct way of using L2 regularization/weight decay with Adam, 137 | # since that will interact with the m and v parameters in strange ways. 138 | # 139 | # Instead we want to decay the weights in a manner that doesn't interact 140 | # with the m/v parameters. This is equivalent to adding the square 141 | # of the weights to the loss with plain (non-momentum) SGD. 142 | if group['weight_decay_rate'] > 0.0: 143 | update += group['weight_decay_rate'] * p.data 144 | 145 | if group['t_total'] != -1: 146 | schedule_fct = SCHEDULES[group['schedule']] 147 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 148 | else: 149 | lr_scheduled = group['lr'] 150 | 151 | update_with_lr = lr_scheduled * update 152 | p.data.add_(-update_with_lr) 153 | 154 | state['step'] += 1 155 | 156 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 157 | # No bias correction 158 | # bias_correction1 = 1 - beta1 ** state['step'] 159 | # bias_correction2 = 1 - beta2 ** state['step'] 160 | 161 | return loss 162 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import os 24 | import logging 25 | 26 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 27 | datefmt = '%m/%d/%Y %H:%M:%S', 28 | level = logging.INFO) 29 | logger = logging.getLogger(__name__) 30 | 31 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 32 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 33 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 34 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 35 | 'bert-base-multilingual': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-vocab.txt", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", 37 | } 38 | 39 | def convert_to_unicode(text): 40 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 41 | if isinstance(text, str): 42 | return text 43 | elif isinstance(text, bytes): 44 | return text.decode("utf-8", "ignore") 45 | else: 46 | raise ValueError("Unsupported string type: %s" % (type(text))) 47 | 48 | 49 | def printable_text(text): 50 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 51 | 52 | # These functions want `str` for both Python2 and Python3, but in one case 53 | # it's a Unicode string and in the other it's a byte string. 54 | if isinstance(text, str): 55 | return text 56 | elif isinstance(text, bytes): 57 | return text.decode("utf-8", "ignore") 58 | else: 59 | raise ValueError("Unsupported string type: %s" % (type(text))) 60 | 61 | 62 | def load_vocab(vocab_file): 63 | """Loads a vocabulary file into a dictionary.""" 64 | vocab = collections.OrderedDict() 65 | index = 0 66 | with open(vocab_file, "r", encoding="utf8") as reader: 67 | while True: 68 | token = convert_to_unicode(reader.readline()) 69 | if not token: 70 | break 71 | token = token.strip() 72 | vocab[token] = index 73 | index += 1 74 | return vocab 75 | 76 | 77 | def whitespace_tokenize(text): 78 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 79 | text = text.strip() 80 | if not text: 81 | return [] 82 | tokens = text.split() 83 | return tokens 84 | 85 | 86 | class BertTokenizer(object): 87 | """Runs end-to-end tokenization: punctuation splitting + wordpiece""" 88 | def __init__(self, vocab_file, do_lower_case=True): 89 | if not os.path.isfile(vocab_file): 90 | raise ValueError( 91 | "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " 92 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 93 | self.vocab = load_vocab(vocab_file) 94 | self.ids_to_tokens = collections.OrderedDict( 95 | [(ids, tok) for tok, ids in self.vocab.items()]) 96 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 97 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 98 | 99 | def tokenize(self, text): 100 | split_tokens = [] 101 | for token in self.basic_tokenizer.tokenize(text): 102 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 103 | split_tokens.append(sub_token) 104 | return split_tokens 105 | 106 | def convert_tokens_to_ids(self, tokens): 107 | """Converts a sequence of tokens into ids using the vocab.""" 108 | ids = [] 109 | for token in tokens: 110 | ids.append(self.vocab[token]) 111 | return ids 112 | 113 | def convert_ids_to_tokens(self, ids): 114 | """Converts a sequence of ids in wordpiece tokens using the vocab.""" 115 | tokens = [] 116 | for i in ids: 117 | tokens.append(self.ids_to_tokens[i]) 118 | return tokens 119 | 120 | @classmethod 121 | def from_pretrained(cls, pretrained_model_name, do_lower_case=True): 122 | """ 123 | Instantiate a PreTrainedBertModel from a pre-trained model file. 124 | Download and cache the pre-trained model file if needed. 125 | """ 126 | if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP: 127 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name] 128 | else: 129 | vocab_file = pretrained_model_name 130 | # redirect to the cache, if necessary 131 | try: 132 | resolved_vocab_file = vocab_file 133 | if resolved_vocab_file == vocab_file: 134 | logger.info("loading vocabulary file {}".format(vocab_file)) 135 | else: 136 | logger.info("loading vocabulary file {} from cache at {}".format( 137 | vocab_file, resolved_vocab_file)) 138 | # Instantiate tokenizer. 139 | tokenizer = cls(resolved_vocab_file, do_lower_case) 140 | except FileNotFoundError: 141 | logger.error( 142 | "Model name '{}' was not found in model name list ({}). " 143 | "We assumed '{}' was a path or url but couldn't find any file " 144 | "associated to this path or url.".format( 145 | pretrained_model_name, 146 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 147 | pretrained_model_name)) 148 | tokenizer = None 149 | return tokenizer 150 | 151 | 152 | class BasicTokenizer(object): 153 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 154 | 155 | def __init__(self, do_lower_case=True): 156 | """Constructs a BasicTokenizer. 157 | 158 | Args: 159 | do_lower_case: Whether to lower case the input. 160 | """ 161 | self.do_lower_case = do_lower_case 162 | 163 | def tokenize(self, text): 164 | """Tokenizes a piece of text.""" 165 | text = convert_to_unicode(text) 166 | text = self._clean_text(text) 167 | # This was added on November 1st, 2018 for the multilingual and Chinese 168 | # models. This is also applied to the English models now, but it doesn't 169 | # matter since the English models were not trained on any Chinese data 170 | # and generally don't have any Chinese data in them (there are Chinese 171 | # characters in the vocabulary because Wikipedia does have some Chinese 172 | # words in the English Wikipedia.). 
173 | text = self._tokenize_chinese_chars(text) 174 | orig_tokens = whitespace_tokenize(text) 175 | split_tokens = [] 176 | for token in orig_tokens: 177 | if self.do_lower_case: 178 | token = token.lower() 179 | token = self._run_strip_accents(token) 180 | split_tokens.extend(self._run_split_on_punc(token)) 181 | 182 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 183 | return output_tokens 184 | 185 | def _run_strip_accents(self, text): 186 | """Strips accents from a piece of text.""" 187 | text = unicodedata.normalize("NFD", text) 188 | output = [] 189 | for char in text: 190 | cat = unicodedata.category(char) 191 | if cat == "Mn": 192 | continue 193 | output.append(char) 194 | return "".join(output) 195 | 196 | def _run_split_on_punc(self, text): 197 | """Splits punctuation on a piece of text.""" 198 | chars = list(text) 199 | i = 0 200 | start_new_word = True 201 | output = [] 202 | while i < len(chars): 203 | char = chars[i] 204 | if _is_punctuation(char): 205 | output.append([char]) 206 | start_new_word = True 207 | else: 208 | if start_new_word: 209 | output.append([]) 210 | start_new_word = False 211 | output[-1].append(char) 212 | i += 1 213 | 214 | return ["".join(x) for x in output] 215 | 216 | def _tokenize_chinese_chars(self, text): 217 | """Adds whitespace around any CJK character.""" 218 | output = [] 219 | for char in text: 220 | cp = ord(char) 221 | if self._is_chinese_char(cp): 222 | output.append(" ") 223 | output.append(char) 224 | output.append(" ") 225 | else: 226 | output.append(char) 227 | return "".join(output) 228 | 229 | def _is_chinese_char(self, cp): 230 | """Checks whether CP is the codepoint of a CJK character.""" 231 | # This defines a "chinese character" as anything in the CJK Unicode block: 232 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 233 | # 234 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 235 | # despite its name. The modern Korean Hangul alphabet is a different block, 236 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 237 | # space-separated words, so they are not treated specially and handled 238 | # like the all of the other languages. 239 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 240 | (cp >= 0x3400 and cp <= 0x4DBF) or # 241 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 242 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 243 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 244 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 245 | (cp >= 0xF900 and cp <= 0xFAFF) or # 246 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 247 | return True 248 | 249 | return False 250 | 251 | def _clean_text(self, text): 252 | """Performs invalid character removal and whitespace cleanup on text.""" 253 | output = [] 254 | for char in text: 255 | cp = ord(char) 256 | if cp == 0 or cp == 0xfffd or _is_control(char): 257 | continue 258 | if _is_whitespace(char): 259 | output.append(" ") 260 | else: 261 | output.append(char) 262 | return "".join(output) 263 | 264 | 265 | class WordpieceTokenizer(object): 266 | """Runs WordPiece tokenization.""" 267 | 268 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 269 | self.vocab = vocab 270 | self.unk_token = unk_token 271 | self.max_input_chars_per_word = max_input_chars_per_word 272 | 273 | def tokenize(self, text): 274 | """Tokenizes a piece of text into its word pieces. 275 | 276 | This uses a greedy longest-match-first algorithm to perform tokenization 277 | using the given vocabulary. 
278 | 279 | For example: 280 | input = "unaffable" 281 | output = ["un", "##aff", "##able"] 282 | 283 | Args: 284 | text: A single token or whitespace-separated tokens. This should have 285 | already been passed through `BasicTokenizer`. 286 | 287 | Returns: 288 | A list of wordpiece tokens. 289 | """ 290 | 291 | text = convert_to_unicode(text) 292 | 293 | output_tokens = [] 294 | for token in whitespace_tokenize(text): 295 | chars = list(token) 296 | if len(chars) > self.max_input_chars_per_word: 297 | output_tokens.append(self.unk_token) 298 | continue 299 | 300 | is_bad = False 301 | start = 0 302 | sub_tokens = [] 303 | while start < len(chars): 304 | end = len(chars) 305 | cur_substr = None 306 | while start < end: 307 | substr = "".join(chars[start:end]) 308 | if start > 0: 309 | substr = "##" + substr 310 | if substr in self.vocab: 311 | cur_substr = substr 312 | break 313 | end -= 1 314 | if cur_substr is None: 315 | is_bad = True 316 | break 317 | sub_tokens.append(cur_substr) 318 | start = end 319 | 320 | if is_bad: 321 | output_tokens.append(self.unk_token) 322 | else: 323 | output_tokens.extend(sub_tokens) 324 | return output_tokens 325 | 326 | 327 | def _is_whitespace(char): 328 | """Checks whether `char` is a whitespace character.""" 329 | # \t, \n, and \r are technically control characters but we treat them 330 | # as whitespace since they are generally considered as such. 331 | if char == " " or char == "\t" or char == "\n" or char == "\r": 332 | return True 333 | cat = unicodedata.category(char) 334 | if cat == "Zs": 335 | return True 336 | return False 337 | 338 | 339 | def _is_control(char): 340 | """Checks whether `char` is a control character.""" 341 | # These are technically control characters but we count them as whitespace 342 | # characters. 343 | if char == "\t" or char == "\n" or char == "\r": 344 | return False 345 | cat = unicodedata.category(char) 346 | if cat.startswith("C"): 347 | return True 348 | return False 349 | 350 | 351 | def _is_punctuation(char): 352 | """Checks whether `char` is a punctuation character.""" 353 | cp = ord(char) 354 | # We treat all non-letter/number ASCII as punctuation. 355 | # Characters such as "^", "$", and "`" are not in the Unicode 356 | # Punctuation class but we treat them as punctuation anyway, for 357 | # consistency. 358 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 359 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 360 | return True 361 | cat = unicodedata.category(char) 362 | if cat.startswith("P"): 363 | return True 364 | return False 365 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/SDNet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
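# ---------------------------------------------------------------------------
# Annotation (summary added for readability, not in the original file): the
# data flow of SDNet below, as reconstructed from the code, is roughly:
#   1. Embed both context and question with GloVe plus optional char-CNN and
#      BERT (optionally a learned softmax-weighted mix of all BERT layers);
#      the context additionally gets POS/NER embeddings, hand-crafted
#      features, and a word-level pre-alignment attention over the question.
#   2. Encode both with multi-layer RNNs, fuse them with deep inter-attention
#      over all layers, then apply context self-attention and a final RNN.
#   3. Compress the question into a single vector with linear self-attention
#      and score span start/end plus yes/no/no-answer via GetFinalScores.
# ---------------------------------------------------------------------------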
3 | 4 | import math 5 | import random 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.autograd import Variable 11 | import torch.nn.init as init 12 | from torch.nn.parameter import Parameter 13 | from Models.Bert.Bert import Bert 14 | from Models.Layers import MaxPooling, CNN, dropout, RNN_from_opt, set_dropout_prob, weighted_avg, set_seq_dropout, Attention, DeepAttention, LinearSelfAttn, GetFinalScores 15 | from Utils.CoQAUtils import POS, ENT 16 | 17 | ''' 18 | SDNet 19 | ''' 20 | class SDNet(nn.Module): 21 | def __init__(self, opt, word_embedding): 22 | super(SDNet, self).__init__() 23 | print('SDNet model\n') 24 | 25 | self.opt = opt 26 | self.use_cuda = (self.opt['cuda'] == True) 27 | set_dropout_prob(0.0 if not 'DROPOUT' in opt else float(opt['DROPOUT'])) 28 | set_seq_dropout('VARIATIONAL_DROPOUT' in self.opt) 29 | 30 | x_input_size = 0 31 | ques_input_size = 0 32 | 33 | self.vocab_size = int(opt['vocab_size']) 34 | vocab_dim = int(opt['vocab_dim']) 35 | self.vocab_embed = nn.Embedding(self.vocab_size, vocab_dim, padding_idx = 1) 36 | self.vocab_embed.weight.data = word_embedding 37 | 38 | x_input_size += vocab_dim 39 | ques_input_size += vocab_dim 40 | 41 | if 'CHAR_CNN' in self.opt: 42 | print('CHAR_CNN') 43 | char_vocab_size = int(opt['char_vocab_size']) 44 | char_dim = int(opt['char_emb_size']) 45 | char_hidden_size = int(opt['char_hidden_size']) 46 | self.char_embed = nn.Embedding(char_vocab_size, char_dim, padding_idx = 1) 47 | self.char_cnn = CNN(char_dim, 3, char_hidden_size) 48 | self.maxpooling = MaxPooling() 49 | x_input_size += char_hidden_size 50 | ques_input_size += char_hidden_size 51 | 52 | if 'TUNE_PARTIAL' in self.opt: 53 | print('TUNE_PARTIAL') 54 | self.fixed_embedding = word_embedding[opt['tune_partial']:] 55 | else: 56 | self.vocab_embed.weight.requires_grad = False 57 | 58 | cdim = 0 59 | self.use_contextual = False 60 | 61 | if 'BERT' in self.opt: 62 | print('Using BERT') 63 | self.Bert = Bert(self.opt) 64 | if 'LOCK_BERT' in self.opt: 65 | print('Lock BERT\'s weights') 66 | for p in self.Bert.parameters(): 67 | p.requires_grad = False 68 | if 'BERT_LARGE' in self.opt: 69 | print('BERT_LARGE') 70 | bert_dim = 1024 71 | bert_layers = 24 72 | else: 73 | bert_dim = 768 74 | bert_layers = 12 75 | 76 | print('BERT dim:', bert_dim, 'BERT_LAYERS:', bert_layers) 77 | 78 | if 'BERT_LINEAR_COMBINE' in self.opt: 79 | print('BERT_LINEAR_COMBINE') 80 | self.alphaBERT = nn.Parameter(torch.Tensor(bert_layers), requires_grad=True) 81 | self.gammaBERT = nn.Parameter(torch.Tensor(1, 1), requires_grad=True) 82 | torch.nn.init.constant(self.alphaBERT, 1.0) 83 | torch.nn.init.constant(self.gammaBERT, 1.0) 84 | 85 | cdim = bert_dim 86 | x_input_size += bert_dim 87 | ques_input_size += bert_dim 88 | 89 | self.pre_align = Attention(vocab_dim, opt['prealign_hidden'], correlation_func = 3, do_similarity = True) 90 | x_input_size += vocab_dim 91 | 92 | pos_dim = opt['pos_dim'] 93 | ent_dim = opt['ent_dim'] 94 | self.pos_embedding = nn.Embedding(len(POS), pos_dim) 95 | self.ent_embedding = nn.Embedding(len(ENT), ent_dim) 96 | 97 | x_feat_len = 4 98 | if 'ANSWER_SPAN_IN_CONTEXT_FEATURE' in self.opt: 99 | print('ANSWER_SPAN_IN_CONTEXT_FEATURE') 100 | x_feat_len += 1 101 | 102 | x_input_size += pos_dim + ent_dim + x_feat_len 103 | 104 | print('Initially, the vector_sizes [doc, query] are', x_input_size, ques_input_size) 105 | 106 | addtional_feat = cdim if self.use_contextual else 0 107 | 108 | # RNN context encoder 109 | 
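        # Annotation (assumes a typical conf with CHAR_CNN and BERT-large both
        # enabled; other configurations change the numbers): at this point
        #   x_input_size    = 300 (GloVe) + char_hidden_size + 1024 (BERT)
        #                     + 300 (pre-align) + pos_dim + ent_dim + x_feat_len
        #   ques_input_size = 300 (GloVe) + char_hidden_size + 1024 (BERT)
        # The print statement above reports the exact values for the active conf.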
self.context_rnn, context_rnn_output_size = RNN_from_opt(x_input_size, opt['hidden_size'], 110 | num_layers=opt['in_rnn_layers'], concat_rnn=opt['concat_rnn'], add_feat=addtional_feat) 111 | # RNN question encoder 112 | self.ques_rnn, ques_rnn_output_size = RNN_from_opt(ques_input_size, opt['hidden_size'], 113 | num_layers=opt['in_rnn_layers'], concat_rnn=opt['concat_rnn'], add_feat=addtional_feat) 114 | 115 | # Output sizes of rnn encoders 116 | print('After Input LSTM, the vector_sizes [doc, query] are [', context_rnn_output_size, ques_rnn_output_size, '] *', opt['in_rnn_layers']) 117 | 118 | # Deep inter-attention 119 | self.deep_attn = DeepAttention(opt, abstr_list_cnt=opt['in_rnn_layers'], 120 | deep_att_hidden_size_per_abstr=opt['deep_att_hidden_size_per_abstr'], correlation_func=3, word_hidden_size=vocab_dim + addtional_feat) 121 | self.deep_attn_input_size = self.deep_attn.rnn_input_size 122 | self.deep_attn_output_size = self.deep_attn.output_size 123 | 124 | # Question understanding and compression 125 | self.high_lvl_ques_rnn , high_lvl_ques_rnn_output_size = RNN_from_opt(ques_rnn_output_size * opt['in_rnn_layers'], 126 | opt['highlvl_hidden_size'], num_layers = opt['question_high_lvl_rnn_layers'], concat_rnn = True) 127 | 128 | self.after_deep_attn_size = self.deep_attn_output_size + self.deep_attn_input_size + addtional_feat + vocab_dim 129 | self.self_attn_input_size = self.after_deep_attn_size 130 | self_attn_output_size = self.deep_attn_output_size 131 | 132 | # Self attention on context 133 | self.highlvl_self_att = Attention(self.self_attn_input_size, opt['deep_att_hidden_size_per_abstr'], correlation_func=3) 134 | print('Self deep-attention input is {}-dim'.format(self.self_attn_input_size)) 135 | 136 | self.high_lvl_context_rnn, high_lvl_context_rnn_output_size = RNN_from_opt(self.deep_attn_output_size + self_attn_output_size, 137 | opt['highlvl_hidden_size'], num_layers = 1, concat_rnn = False) 138 | context_final_size = high_lvl_context_rnn_output_size 139 | 140 | print('Do Question self attention') 141 | self.ques_self_attn = Attention(high_lvl_ques_rnn_output_size, opt['query_self_attn_hidden_size'], correlation_func=3) 142 | 143 | ques_final_size = high_lvl_ques_rnn_output_size 144 | print('Before answer span finding, hidden size are', context_final_size, ques_final_size) 145 | 146 | # Question merging 147 | self.ques_merger = LinearSelfAttn(ques_final_size) 148 | self.get_answer = GetFinalScores(context_final_size, ques_final_size) 149 | 150 | ''' 151 | x: 1 x x_len (word_ids) 152 | x_single_mask: 1 x x_len 153 | x_char: 1 x x_len x char_len (char_ids) 154 | x_char_mask: 1 x x_len x char_len 155 | x_features: batch_size x x_len x feature_len (5, if answer_span_in_context_feature; 4 otherwise) 156 | x_pos: 1 x x_len (POS id) 157 | x_ent: 1 x x_len (entity id) 158 | x_bert: 1 x x_bert_token_len 159 | x_bert_mask: 1 x x_bert_token_len 160 | x_bert_offsets: 1 x x_len x 2 161 | q: batch x q_len (word_ids) 162 | q_mask: batch x q_len 163 | q_char: batch x q_len x char_len (char ids) 164 | q_char_mask: batch x q_len x char_len 165 | q_bert: 1 x q_bert_token_len 166 | q_bert_mask: 1 x q_bert_token_len 167 | q_bert_offsets: 1 x q_len x 2 168 | context_len: number of words in context (only one per batch) 169 | return: 170 | score_s: batch x context_len 171 | score_e: batch x context_len 172 | score_no: batch x 1 173 | score_yes: batch x 1 174 | score_noanswer: batch x 1 175 | ''' 176 | def forward(self, x, x_single_mask, x_char, x_char_mask, x_features, x_pos, x_ent, 
x_bert, x_bert_mask, x_bert_offsets, q, q_mask, q_char, q_char_mask, q_bert, q_bert_mask, q_bert_offsets, context_len): 177 | batch_size = q.shape[0] 178 | x_mask = x_single_mask.expand(batch_size, -1) 179 | x_word_embed = self.vocab_embed(x).expand(batch_size, -1, -1) # batch x x_len x vocab_dim 180 | ques_word_embed = self.vocab_embed(q) # batch x q_len x vocab_dim 181 | 182 | x_input_list = [dropout(x_word_embed, p=self.opt['dropout_emb'], training=self.drop_emb)] # batch x x_len x vocab_dim 183 | ques_input_list = [dropout(ques_word_embed, p=self.opt['dropout_emb'], training=self.drop_emb)] # batch x q_len x vocab_dim 184 | 185 | # contextualized embedding 186 | x_cemb = ques_cemb = None 187 | if 'BERT' in self.opt: 188 | x_cemb = ques_cemb = None 189 | 190 | if 'BERT_LINEAR_COMBINE' in self.opt: 191 | x_bert_output = self.Bert(x_bert, x_bert_mask, x_bert_offsets, x_single_mask) 192 | x_cemb_mid = self.linear_sum(x_bert_output, self.alphaBERT, self.gammaBERT) 193 | ques_bert_output = self.Bert(q_bert, q_bert_mask, q_bert_offsets, q_mask) 194 | ques_cemb_mid = self.linear_sum(ques_bert_output, self.alphaBERT, self.gammaBERT) 195 | x_cemb_mid = x_cemb_mid.expand(batch_size, -1, -1) 196 | else: 197 | x_cemb_mid = self.Bert(x_bert, x_bert_mask, x_bert_offsets, x_single_mask) 198 | x_cemb_mid = x_cemb_mid.expand(batch_size, -1, -1) 199 | ques_cemb_mid = self.Bert(q_bert, q_bert_mask, q_bert_offsets, q_mask) 200 | 201 | x_input_list.append(x_cemb_mid) 202 | ques_input_list.append(ques_cemb_mid) 203 | 204 | if 'CHAR_CNN' in self.opt: 205 | x_char_final = self.character_cnn(x_char, x_char_mask) 206 | x_char_final = x_char_final.expand(batch_size, -1, -1) 207 | ques_char_final = self.character_cnn(q_char, q_char_mask) 208 | x_input_list.append(x_char_final) 209 | ques_input_list.append(ques_char_final) 210 | 211 | x_prealign = self.pre_align(x_word_embed, ques_word_embed, q_mask) 212 | x_input_list.append(x_prealign) # batch x x_len x (vocab_dim + cdim + vocab_dim) 213 | 214 | x_pos_emb = self.pos_embedding(x_pos).expand(batch_size, -1, -1) # batch x x_len x pos_dim 215 | x_ent_emb = self.ent_embedding(x_ent).expand(batch_size, -1, -1) # batch x x_len x ent_dim 216 | x_input_list.append(x_pos_emb) 217 | x_input_list.append(x_ent_emb) 218 | x_input_list.append(x_features) # batch x x_len x (vocab_dim + cdim + vocab_dim + pos_dim + ent_dim + feature_dim) 219 | 220 | x_input = torch.cat(x_input_list, 2) # batch x x_len x (vocab_dim + cdim + vocab_dim + pos_dim + ent_dim + feature_dim) 221 | ques_input = torch.cat(ques_input_list, 2) # batch x q_len x (vocab_dim + cdim) 222 | 223 | # Multi-layer RNN 224 | _, x_rnn_layers = self.context_rnn(x_input, x_mask, return_list=True, x_additional=x_cemb) # layer x batch x x_len x context_rnn_output_size 225 | _, ques_rnn_layers = self.ques_rnn(ques_input, q_mask, return_list=True, x_additional=ques_cemb) # layer x batch x q_len x ques_rnn_output_size 226 | 227 | # rnn with question only 228 | ques_highlvl = self.high_lvl_ques_rnn(torch.cat(ques_rnn_layers, 2), q_mask) # batch x q_len x high_lvl_ques_rnn_output_size 229 | ques_rnn_layers.append(ques_highlvl) # (layer + 1) layers 230 | 231 | # deep multilevel inter-attention 232 | if x_cemb is None: 233 | x_long = x_word_embed 234 | ques_long = ques_word_embed 235 | else: 236 | x_long = torch.cat([x_word_embed, x_cemb], 2) # batch x x_len x (vocab_dim + cdim) 237 | ques_long = torch.cat([ques_word_embed, ques_cemb], 2) # batch x q_len x (vocab_dim + cdim) 238 | 239 | x_rnn_after_inter_attn, x_inter_attn = 
self.deep_attn([x_long], x_rnn_layers, [ques_long], ques_rnn_layers, x_mask, q_mask, return_bef_rnn=True) 240 | # x_rnn_after_inter_attn: batch x x_len x deep_attn_output_size 241 | # x_inter_attn: batch x x_len x deep_attn_input_size 242 | 243 | # deep self attention 244 | if x_cemb is None: 245 | x_self_attn_input = torch.cat([x_rnn_after_inter_attn, x_inter_attn, x_word_embed], 2) 246 | else: 247 | x_self_attn_input = torch.cat([x_rnn_after_inter_attn, x_inter_attn, x_cemb, x_word_embed], 2) 248 | # batch x x_len x (deep_attn_output_size + deep_attn_input_size + cdim + vocab_dim) 249 | 250 | x_self_attn_output = self.highlvl_self_att(x_self_attn_input, x_self_attn_input, x_mask, x3=x_rnn_after_inter_attn, drop_diagonal=True) 251 | # batch x x_len x deep_attn_output_size 252 | 253 | x_highlvl_output = self.high_lvl_context_rnn(torch.cat([x_rnn_after_inter_attn, x_self_attn_output], 2), x_mask) 254 | # bach x x_len x high_lvl_context_rnn.output_size 255 | x_final = x_highlvl_output 256 | 257 | # question self attention 258 | ques_final = self.ques_self_attn(ques_highlvl, ques_highlvl, q_mask, x3=None, drop_diagonal=True) # batch x q_len x high_lvl_ques_rnn_output_size 259 | 260 | # merge questions 261 | q_merge_weights = self.ques_merger(ques_final, q_mask) 262 | ques_merged = weighted_avg(ques_final, q_merge_weights) # batch x ques_final_size 263 | 264 | # predict scores 265 | score_s, score_e, score_no, score_yes, score_noanswer = self.get_answer(x_final, ques_merged, x_mask) 266 | return score_s, score_e, score_no, score_yes, score_noanswer 267 | 268 | ''' 269 | input: 270 | x_char: batch x word_num x char_num 271 | x_char_mask: batch x word_num x char_num 272 | output: 273 | x_char_cnn_final: batch x word_num x char_cnn_hidden_size 274 | ''' 275 | def character_cnn(self, x_char, x_char_mask): 276 | x_char_embed = self.char_embed(x_char) # batch x word_num x char_num x char_dim 277 | batch_size = x_char_embed.shape[0] 278 | word_num = x_char_embed.shape[1] 279 | char_num = x_char_embed.shape[2] 280 | char_dim = x_char_embed.shape[3] 281 | x_char_cnn = self.char_cnn(x_char_embed.contiguous().view(-1, char_num, char_dim), x_char_mask) # (batch x word_num) x char_num x char_cnn_hidden_size 282 | x_char_cnn_final = self.maxpooling(x_char_cnn, x_char_mask.contiguous().view(-1, char_num)).contiguous().view(batch_size, word_num, -1) # batch x word_num x char_cnn_hidden_size 283 | return x_char_cnn_final 284 | 285 | def linear_sum(self, output, alpha, gamma): 286 | alpha_softmax = F.softmax(alpha) 287 | for i in range(len(output)): 288 | t = output[i] * alpha_softmax[i] * gamma 289 | if i == 0: 290 | res = t 291 | else: 292 | res += t 293 | 294 | res = dropout(res, p=self.opt['dropout_emb'], training=self.drop_emb) 295 | return res 296 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/SDNetTrainer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
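# Annotation (not in the original file): update() and predict() below flatten
# answer selection into a single classification over L*L + 3 classes for a
# context of L words. gen_upper_triangle lays out index i*L + j as the score of
# the span (start=i, end=j), and the three trailing indices score "no" (L*L),
# "yes" (L*L + 1) and no-answer/"unknown" (L*L + 2), matching the
# torch.cat((expand_score, score_no, score_yes, score_no_answer)) order.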
3 | 4 | from datetime import datetime 5 | import json 6 | import numpy as np 7 | import os 8 | import random 9 | import sys 10 | import time 11 | import torch 12 | from torch.autograd import Variable 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | import torch.optim as optim 16 | from Utils.CoQAPreprocess import CoQAPreprocess 17 | from Models.Layers import MaxPooling, set_dropout_prob 18 | from Models.SDNet import SDNet 19 | from Models.BaseTrainer import BaseTrainer 20 | from Utils.CoQAUtils import BatchGen, AverageMeter, gen_upper_triangle, score 21 | 22 | class SDNetTrainer(BaseTrainer): 23 | def __init__(self, opt): 24 | super(SDNetTrainer, self).__init__(opt) 25 | print('SDNet Model Trainer') 26 | set_dropout_prob(0.0 if not 'DROPOUT' in opt else float(opt['DROPOUT'])) 27 | self.seed = int(opt['SEED']) 28 | self.data_prefix = 'coqa-' 29 | random.seed(self.seed) 30 | np.random.seed(self.seed) 31 | torch.manual_seed(self.seed) 32 | self.preproc = CoQAPreprocess(self.opt) 33 | if self.use_cuda: 34 | torch.cuda.manual_seed_all(self.seed) 35 | 36 | def official(self, model_path, test_data): 37 | print('-----------------------------------------------') 38 | print("Initializing model...") 39 | self.setup_model(self.preproc.train_embedding) 40 | self.load_model(model_path) 41 | 42 | print("Predicting in batches...") 43 | test_batches = BatchGen(self.opt, test_data['data'], self.use_cuda, self.preproc.train_vocab, self.preproc.train_char_vocab, evaluation=True) 44 | predictions = [] 45 | confidence = [] 46 | final_json = [] 47 | cnt = 0 48 | for j, test_batch in enumerate(test_batches): 49 | cnt += 1 50 | if cnt % 50 == 0: 51 | print(cnt, '/', len(test_batches)) 52 | phrase, phrase_score, pred_json = self.predict(test_batch) 53 | predictions.extend(phrase) 54 | confidence.extend(phrase_score) 55 | final_json.extend(pred_json) 56 | 57 | return predictions, confidence, final_json 58 | 59 | def train(self): 60 | self.isTrain = True 61 | self.getSaveFolder() 62 | self.saveConf() 63 | self.vocab, self.char_vocab, vocab_embedding = self.preproc.load_data() 64 | self.log('-----------------------------------------------') 65 | self.log("Initializing model...") 66 | self.setup_model(vocab_embedding) 67 | 68 | if 'RESUME' in self.opt: 69 | model_path = os.path.join(self.opt['datadir'], self.opt['MODEL_PATH']) 70 | self.load_model(model_path) 71 | 72 | print('Loading train json...') 73 | with open(os.path.join(self.opt['FEATURE_FOLDER'], self.data_prefix + 'train-preprocessed.json'), 'r') as f: 74 | train_data = json.load(f) 75 | 76 | print('Loading dev json...') 77 | with open(os.path.join(self.opt['FEATURE_FOLDER'], self.data_prefix + 'dev-preprocessed.json'), 'r') as f: 78 | dev_data = json.load(f) 79 | 80 | best_f1_score = 0.0 81 | numEpochs = self.opt['EPOCH'] 82 | for epoch in range(self.epoch_start, numEpochs): 83 | self.log('Epoch {}'.format(epoch)) 84 | self.network.train() 85 | startTime = datetime.now() 86 | train_batches = BatchGen(self.opt, train_data['data'], self.use_cuda, self.vocab, self.char_vocab) 87 | dev_batches = BatchGen(self.opt, dev_data['data'], self.use_cuda, self.vocab, self.char_vocab, evaluation=True) 88 | for i, batch in enumerate(train_batches): 89 | if i == len(train_batches) - 1 or (epoch == 0 and i == 0 and ('RESUME' in self.opt)) or (i > 0 and i % 1500 == 0): 90 | print('Saving folder is', self.saveFolder) 91 | print('Evaluating on dev set...') 92 | predictions = [] 93 | confidence = [] 94 | dev_answer = [] 95 | final_json = [] 96 | for j, 
dev_batch in enumerate(dev_batches): 97 | phrase, phrase_score, pred_json = self.predict(dev_batch) 98 | final_json.extend(pred_json) 99 | predictions.extend(phrase) 100 | confidence.extend(phrase_score) 101 | dev_answer.extend(dev_batch[-3]) # answer_str 102 | result, all_f1s = score(predictions, dev_answer, final_json) 103 | f1 = result['f1'] 104 | 105 | if f1 > best_f1_score: 106 | model_file = os.path.join(self.saveFolder, 'best_model.pt') 107 | self.save_for_predict(model_file, epoch) 108 | best_f1_score = f1 109 | pred_json_file = os.path.join(self.saveFolder, 'prediction.json') 110 | with open(pred_json_file, 'w') as output_file: 111 | json.dump(final_json, output_file) 112 | score_per_instance = [] 113 | for instance, s in zip(final_json, all_f1s): 114 | score_per_instance.append({ 115 | 'id': instance['id'], 116 | 'turn_id': instance['turn_id'], 117 | 'f1': s 118 | }) 119 | score_per_instance_json_file = os.path.join(self.saveFolder, 'score_per_instance.json') 120 | with open(score_per_instance_json_file, 'w') as output_file: 121 | json.dump(score_per_instance, output_file) 122 | 123 | self.log("Epoch {0} - dev: F1: {1:.3f} (best F1: {2:.3f})".format(epoch, f1, best_f1_score)) 124 | self.log("Results breakdown\n{0}".format(result)) 125 | 126 | self.update(batch) 127 | if i % 100 == 0: 128 | self.log('updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format( 129 | self.updates, self.train_loss.avg, 130 | str((datetime.now() - startTime) / (i + 1) * (len(train_batches) - i - 1)).split('.')[0])) 131 | 132 | print("PROGRESS: {0:.2f}%".format(100.0 * (epoch + 1) / numEpochs)) 133 | print('Config file is at ' + self.opt['confFile']) 134 | 135 | def setup_model(self, vocab_embedding): 136 | self.train_loss = AverageMeter() 137 | self.network = SDNet(self.opt, vocab_embedding) 138 | if self.use_cuda: 139 | self.log('Putting model into GPU') 140 | self.network.cuda() 141 | 142 | parameters = [p for p in self.network.parameters() if p.requires_grad] 143 | self.optimizer = optim.Adamax(parameters) 144 | if 'ADAM2' in self.opt: 145 | print('ADAM2') 146 | self.optimizer = optim.Adam(parameters, lr = 0.0001) 147 | 148 | self.updates = 0 149 | self.epoch_start = 0 150 | self.loss_func = F.cross_entropy 151 | 152 | def update(self, batch): 153 | # Train mode 154 | self.network.train() 155 | self.network.drop_emb = True 156 | 157 | x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, query, query_mask, \ 158 | query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, ground_truth, context_str, context_words, _, _, _, _ = batch 159 | 160 | # Run forward 161 | # score_s, score_e: batch x context_word_num 162 | # score_yes, score_no, score_no_answer: batch x 1 163 | score_s, score_e, score_yes, score_no, score_no_answer = self.network(x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, 164 | query, query_mask, query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, len(context_words)) 165 | max_len = self.opt['max_len'] or score_s.size(1) 166 | batch_size = score_s.shape[0] 167 | context_len = score_s.size(1) 168 | expand_score = gen_upper_triangle(score_s, score_e, max_len, self.use_cuda) 169 | scores = torch.cat((expand_score, score_no, score_yes, score_no_answer), dim=1) # batch x (context_len * context_len + 3) 170 | targets = [] 171 | span_idx = int(context_len * context_len) 172 | for i in range(ground_truth.shape[0]): 173 | if ground_truth[i][0] == -1 and 
ground_truth[i][1] == -1: # no answer 174 | targets.append(span_idx + 2) 175 | if ground_truth[i][0] == 0 and ground_truth[i][1] == -1: # no 176 | targets.append(span_idx) 177 | if ground_truth[i][0] == -1 and ground_truth[i][1] == 0: # yes 178 | targets.append(span_idx + 1) 179 | if ground_truth[i][0] != -1 and ground_truth[i][1] != -1: # normal span 180 | targets.append(ground_truth[i][0] * context_len + ground_truth[i][1]) 181 | 182 | targets = torch.LongTensor(np.array(targets)) 183 | if self.use_cuda: 184 | targets = targets.cuda() 185 | loss = self.loss_func(scores, targets) 186 | self.train_loss.update(loss.data[0], 1) 187 | self.optimizer.zero_grad() 188 | loss.backward() 189 | torch.nn.utils.clip_grad_norm(self.network.parameters(), self.opt['grad_clipping']) 190 | self.optimizer.step() 191 | self.updates += 1 192 | if 'TUNE_PARTIAL' in self.opt: 193 | self.network.vocab_embed.weight.data[self.opt['tune_partial']:] = self.network.fixed_embedding 194 | 195 | def predict(self, batch): 196 | self.network.eval() 197 | self.network.drop_emb = False 198 | 199 | # Run forward 200 | x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, query, query_mask, \ 201 | query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, ground_truth, context_str, context_words, \ 202 | context_word_offsets, answers, context_id, turn_ids = batch 203 | 204 | context_len = len(context_words) 205 | score_s, score_e, score_yes, score_no, score_no_answer = self.network(x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, 206 | query, query_mask, query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, len(context_words)) 207 | batch_size = score_s.shape[0] 208 | max_len = self.opt['max_len'] or score_s.size(1) 209 | 210 | expand_score = gen_upper_triangle(score_s, score_e, max_len, self.use_cuda) 211 | scores = torch.cat((expand_score, score_no, score_yes, score_no_answer), dim=1) # batch x (context_len * context_len + 3) 212 | prob = F.softmax(scores, dim = 1).data.cpu() # Transfer to CPU/normal tensors for numpy ops 213 | 214 | # Get argmax text spans 215 | predictions = [] 216 | confidence = [] 217 | 218 | pred_json = [] 219 | for i in range(batch_size): 220 | _, ids = torch.sort(prob[i, :], descending=True) 221 | idx = 0 222 | best_id = ids[idx] 223 | 224 | confidence.append(float(prob[i, best_id])) 225 | if best_id < context_len * context_len: 226 | st = best_id / context_len 227 | ed = best_id % context_len 228 | st = context_word_offsets[st][0] 229 | ed = context_word_offsets[ed][1] 230 | predictions.append(context_str[st:ed]) 231 | 232 | if best_id == context_len * context_len: 233 | predictions.append('no') 234 | 235 | if best_id == context_len * context_len + 1: 236 | predictions.append('yes') 237 | 238 | if best_id == context_len * context_len + 2: 239 | predictions.append('unknown') 240 | 241 | pred_json.append({ 242 | 'id': context_id, 243 | 'turn_id': turn_ids[i], 244 | 'answer': predictions[-1] 245 | }) 246 | 247 | return (predictions, confidence, pred_json) # list of strings, list of floats, list of jsons 248 | 249 | def load_model(self, model_path): 250 | print('Loading model from', model_path) 251 | checkpoint = torch.load(model_path) 252 | state_dict = checkpoint['state_dict'] 253 | new_state = set(self.network.state_dict().keys()) 254 | for k in list(state_dict['network'].keys()): 255 | if k not in new_state: 256 | del state_dict['network'][k] 257 | for k, v in 
list(self.network.state_dict().items()): 258 | if k not in state_dict['network']: 259 | state_dict['network'][k] = v 260 | self.network.load_state_dict(state_dict['network']) 261 | 262 | print('Loading finished', model_path) 263 | 264 | def save(self, filename, epoch, prev_filename): 265 | params = { 266 | 'state_dict': { 267 | 'network': self.network.state_dict(), 268 | 'optimizer': self.optimizer.state_dict(), 269 | 'updates': self.updates # how many updates 270 | }, 271 | 'train_loss': { 272 | 'val': self.train_loss.val, 273 | 'avg': self.train_loss.avg, 274 | 'sum': self.train_loss.sum, 275 | 'count': self.train_loss.count 276 | }, 277 | 'config': self.opt, 278 | 'epoch': epoch 279 | } 280 | try: 281 | torch.save(params, filename) 282 | self.log('model saved to {}'.format(filename)) 283 | if os.path.exists(prev_filename): 284 | os.remove(prev_filename) 285 | except BaseException: 286 | self.log('[ WARN: Saving failed... continuing anyway. ]') 287 | 288 | def save_for_predict(self, filename, epoch): 289 | network_state = dict([(k, v) for k, v in self.network.state_dict().items() if k[0:4] != 'CoVe' and k[0:4] != 'ELMo' and k[0:9] != 'AllenELMo' and k[0:4] != 'Bert']) 290 | 291 | if 'eval_embed.weight' in network_state: 292 | del network_state['eval_embed.weight'] 293 | if 'fixed_embedding' in network_state: 294 | del network_state['fixed_embedding'] 295 | params = { 296 | 'state_dict': {'network': network_state}, 297 | 'config': self.opt, 298 | } 299 | try: 300 | torch.save(params, filename) 301 | self.log('model saved to {}'.format(filename)) 302 | except BaseException: 303 | self.log('[ WARN: Saving failed... continuing anyway. ]') 304 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/__pycache__/BaseTrainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/BaseTrainer.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/__pycache__/Layers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/Layers.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/__pycache__/SDNet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/SDNet.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/__pycache__/SDNetTrainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/SDNetTrainer.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/NOTICE.txt: -------------------------------------------------------------------------------- 1 | NOTICES AND INFORMATION 2 | Do Not Translate or Localize 3 | 4 | This software incorporates material from third parties. 
Microsoft makes certain 5 | open source code available at http://3rdpartysource.microsoft.com, or you may 6 | send a check or money order for US $5.00, including the product name, the open 7 | source component name, and version number, to: 8 | 9 | Source Code Compliance Team 10 | Microsoft Corporation 11 | One Microsoft Way 12 | Redmond, WA 98052 13 | USA 14 | 15 | Notwithstanding any other terms, you may reverse engineer this software to the 16 | extent required to debug changes to any libraries licensed under the GNU Lesser 17 | General Public License. 18 | 19 | ________________________ 20 | 21 | huggingface/pytorch-pretrained-BERT v0.6.1 (Source) 22 | 23 | Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 24 | Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 25 | 26 | 27 | Apache License 28 | Version 2.0, January 2004 29 | http://www.apache.org/licenses/ 30 | 31 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 32 | 33 | 1. Definitions. 34 | 35 | "License" shall mean the terms and conditions for use, reproduction, 36 | and distribution as defined by Sections 1 through 9 of this document. 37 | 38 | "Licensor" shall mean the copyright owner or entity authorized by 39 | the copyright owner that is granting the License. 40 | 41 | "Legal Entity" shall mean the union of the acting entity and all 42 | other entities that control, are controlled by, or are under common 43 | control with that entity. For the purposes of this definition, 44 | "control" means (i) the power, direct or indirect, to cause the 45 | direction or management of such entity, whether by contract or 46 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 47 | outstanding shares, or (iii) beneficial ownership of such entity. 48 | 49 | "You" (or "Your") shall mean an individual or Legal Entity 50 | exercising permissions granted by this License. 51 | 52 | "Source" form shall mean the preferred form for making modifications, 53 | including but not limited to software source code, documentation 54 | source, and configuration files. 55 | 56 | "Object" form shall mean any form resulting from mechanical 57 | transformation or translation of a Source form, including but 58 | not limited to compiled object code, generated documentation, 59 | and conversions to other media types. 60 | 61 | "Work" shall mean the work of authorship, whether in Source or 62 | Object form, made available under the License, as indicated by a 63 | copyright notice that is included in or attached to the work 64 | (an example is provided in the Appendix below). 65 | 66 | "Derivative Works" shall mean any work, whether in Source or Object 67 | form, that is based on (or derived from) the Work and for which the 68 | editorial revisions, annotations, elaborations, or other modifications 69 | represent, as a whole, an original work of authorship. For the purposes 70 | of this License, Derivative Works shall not include works that remain 71 | separable from, or merely link (or bind by name) to the interfaces of, 72 | the Work and Derivative Works thereof. 73 | 74 | "Contribution" shall mean any work of authorship, including 75 | the original version of the Work and any modifications or additions 76 | to that Work or Derivative Works thereof, that is intentionally 77 | submitted to Licensor for inclusion in the Work by the copyright owner 78 | or by an individual or Legal Entity authorized to submit on behalf of 79 | the copyright owner. 
For the purposes of this definition, "submitted" 80 | means any form of electronic, verbal, or written communication sent 81 | to the Licensor or its representatives, including but not limited to 82 | communication on electronic mailing lists, source code control systems, 83 | and issue tracking systems that are managed by, or on behalf of, the 84 | Licensor for the purpose of discussing and improving the Work, but 85 | excluding communication that is conspicuously marked or otherwise 86 | designated in writing by the copyright owner as "Not a Contribution." 87 | 88 | "Contributor" shall mean Licensor and any individual or Legal Entity 89 | on behalf of whom a Contribution has been received by Licensor and 90 | subsequently incorporated within the Work. 91 | 92 | 2. Grant of Copyright License. Subject to the terms and conditions of 93 | this License, each Contributor hereby grants to You a perpetual, 94 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 95 | copyright license to reproduce, prepare Derivative Works of, 96 | publicly display, publicly perform, sublicense, and distribute the 97 | Work and such Derivative Works in Source or Object form. 98 | 99 | 3. Grant of Patent License. Subject to the terms and conditions of 100 | this License, each Contributor hereby grants to You a perpetual, 101 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 102 | (except as stated in this section) patent license to make, have made, 103 | use, offer to sell, sell, import, and otherwise transfer the Work, 104 | where such license applies only to those patent claims licensable 105 | by such Contributor that are necessarily infringed by their 106 | Contribution(s) alone or by combination of their Contribution(s) 107 | with the Work to which such Contribution(s) was submitted. If You 108 | institute patent litigation against any entity (including a 109 | cross-claim or counterclaim in a lawsuit) alleging that the Work 110 | or a Contribution incorporated within the Work constitutes direct 111 | or contributory patent infringement, then any patent licenses 112 | granted to You under this License for that Work shall terminate 113 | as of the date such litigation is filed. 114 | 115 | 4. Redistribution. 
You may reproduce and distribute copies of the 116 | Work or Derivative Works thereof in any medium, with or without 117 | modifications, and in Source or Object form, provided that You 118 | meet the following conditions: 119 | 120 | (a) You must give any other recipients of the Work or 121 | Derivative Works a copy of this License; and 122 | 123 | (b) You must cause any modified files to carry prominent notices 124 | stating that You changed the files; and 125 | 126 | (c) You must retain, in the Source form of any Derivative Works 127 | that You distribute, all copyright, patent, trademark, and 128 | attribution notices from the Source form of the Work, 129 | excluding those notices that do not pertain to any part of 130 | the Derivative Works; and 131 | 132 | (d) If the Work includes a "NOTICE" text file as part of its 133 | distribution, then any Derivative Works that You distribute must 134 | include a readable copy of the attribution notices contained 135 | within such NOTICE file, excluding those notices that do not 136 | pertain to any part of the Derivative Works, in at least one 137 | of the following places: within a NOTICE text file distributed 138 | as part of the Derivative Works; within the Source form or 139 | documentation, if provided along with the Derivative Works; or, 140 | within a display generated by the Derivative Works, if and 141 | wherever such third-party notices normally appear. The contents 142 | of the NOTICE file are for informational purposes only and 143 | do not modify the License. You may add Your own attribution 144 | notices within Derivative Works that You distribute, alongside 145 | or as an addendum to the NOTICE text from the Work, provided 146 | that such additional attribution notices cannot be construed 147 | as modifying the License. 148 | 149 | You may add Your own copyright statement to Your modifications and 150 | may provide additional or different license terms and conditions 151 | for use, reproduction, or distribution of Your modifications, or 152 | for any such Derivative Works as a whole, provided Your use, 153 | reproduction, and distribution of the Work otherwise complies with 154 | the conditions stated in this License. 155 | 156 | 5. Submission of Contributions. Unless You explicitly state otherwise, 157 | any Contribution intentionally submitted for inclusion in the Work 158 | by You to the Licensor shall be under the terms and conditions of 159 | this License, without any additional terms or conditions. 160 | Notwithstanding the above, nothing herein shall supersede or modify 161 | the terms of any separate license agreement you may have executed 162 | with Licensor regarding such Contributions. 163 | 164 | 6. Trademarks. This License does not grant permission to use the trade 165 | names, trademarks, service marks, or product names of the Licensor, 166 | except as required for reasonable and customary use in describing the 167 | origin of the Work and reproducing the content of the NOTICE file. 168 | 169 | 7. Disclaimer of Warranty. Unless required by applicable law or 170 | agreed to in writing, Licensor provides the Work (and each 171 | Contributor provides its Contributions) on an "AS IS" BASIS, 172 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 173 | implied, including, without limitation, any warranties or conditions 174 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 175 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 176 | appropriateness of using or redistributing the Work and assume any 177 | risks associated with Your exercise of permissions under this License. 178 | 179 | 8. Limitation of Liability. In no event and under no legal theory, 180 | whether in tort (including negligence), contract, or otherwise, 181 | unless required by applicable law (such as deliberate and grossly 182 | negligent acts) or agreed to in writing, shall any Contributor be 183 | liable to You for damages, including any direct, indirect, special, 184 | incidental, or consequential damages of any character arising as a 185 | result of this License or out of the use or inability to use the 186 | Work (including but not limited to damages for loss of goodwill, 187 | work stoppage, computer failure or malfunction, or any and all 188 | other commercial damages or losses), even if such Contributor 189 | has been advised of the possibility of such damages. 190 | 191 | 9. Accepting Warranty or Additional Liability. While redistributing 192 | the Work or Derivative Works thereof, You may choose to offer, 193 | and charge a fee for, acceptance of support, warranty, indemnity, 194 | or other liability obligations and/or rights consistent with this 195 | License. However, in accepting such obligations, You may act only 196 | on Your own behalf and on Your sole responsibility, not on behalf 197 | of any other Contributor, and only if You agree to indemnify, 198 | defend, and hold each Contributor harmless for any liability 199 | incurred by, or claims asserted against, such Contributor by reason 200 | of your accepting any such warranty or additional liability. 201 | 202 | END OF TERMS AND CONDITIONS 203 | 204 | APPENDIX: How to apply the Apache License to your work. 205 | 206 | To apply the Apache License to your work, attach the following 207 | boilerplate notice, with the fields enclosed by brackets "[]" 208 | replaced with your own identifying information. (Don't include 209 | the brackets!) The text should be enclosed in the appropriate 210 | comment syntax for the file format. We also recommend that a 211 | file or class name and description of purpose be included on the 212 | same "printed page" as the copyright notice for easier 213 | identification within third-party archives. 214 | 215 | Copyright [yyyy] [name of copyright owner] 216 | 217 | Licensed under the Apache License, Version 2.0 (the "License"); 218 | you may not use this file except in compliance with the License. 219 | You may obtain a copy of the License at 220 | 221 | http://www.apache.org/licenses/LICENSE-2.0 222 | 223 | Unless required by applicable law or agreed to in writing, software 224 | distributed under the License is distributed on an "AS IS" BASIS, 225 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 226 | See the License for the specific language governing permissions and 227 | limitations under the License. 228 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/README.md: -------------------------------------------------------------------------------- 1 | # SDNet 2 | 3 | This is the official code for the Microsoft's submission of SDNet model to [CoQA](https://stanfordnlp.github.io/coqa/) leaderboard. It is implemented under PyTorch framework. 
The related paper to cite is: 4 | 5 | **SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering**, by Chenguang Zhu, Michael Zeng and Xuedong Huang, at https://arxiv.org/abs/1812.03593. 6 | 7 | For usage of this code, please follow the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct). 8 | 9 | # Directory structure: 10 | * main.py: the starter code 11 | 12 | * Models/ 13 | * BaseTrainer.py: Base class for trainer 14 | * SDNetTrainer.py: Trainer for SDNet, including training and predicting procedures 15 | * SDNet.py: The SDNet network structure 16 | * Layers.py: Related network layer functions 17 | * Bert/ 18 | * Bert.py: Customized class to compute BERT contextualized embeddings 19 | * modeling.py, optimization.py, tokenization.py: From Huggingface's PyTorch implementation of BERT 20 | * Utils/ 21 | * Arguments.py: Process the argument configuration file 22 | * Constants.py: Define constants used 23 | * CoQAPreprocess.py: Preprocess CoQA raw data into intermediate binary/JSON files, including tokenization and history prepending 24 | * CoQAUtils.py, GeneralUtils.py: Utility functions used in SDNet 25 | * Timing.py: Logging time 26 | 27 | # How to run 28 | Requirements: PyTorch 0.4.0, spaCy 2.0. 29 | The Docker image we used is available on Docker Hub: https://hub.docker.com/r/zcgzcgzcg/squadv2/tags. Please use v3.0 or v4.0. 30 | 1. Create a folder (e.g. **coqa**) to contain data and running logs; 31 | 2. Create a folder **coqa/data** to store the CoQA raw data: **coqa-train-v1.0.json** and **coqa-dev-v1.0.json**; 32 | 3. Copy the file **conf** from the repo into the folder **coqa**; 33 | 4. If you want to use BERT-Large, download the model into **coqa/bert-large-uncased**; if you want to use BERT-base, download the model into **coqa/bert-base-cased**; 34 | * The models can be downloaded from Huggingface: 35 | * 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", 36 | * 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz" 37 | * bert-large-uncased-vocab.txt can be downloaded from Google's BERT repository 38 | 5. Create a folder **glove** in the same directory as **coqa** and download the GloVe embedding **glove.840B.300d.txt** into the folder. 39 | 40 | Your directory should look like this: 41 | * coqa/ 42 | * data/ 43 | * coqa-train-v1.0.json 44 | * coqa-dev-v1.0.json 45 | * bert-large-uncased/ 46 | * bert-large-uncased-vocab.txt 47 | * bert_config.json 48 | * pytorch_model.bin 49 | * conf 50 | * glove/ 51 | * glove.840B.300d.txt 52 | 53 | Then, execute `python main.py train path_to_coqa/conf`. 54 | 55 | The first time you run the code, CoQAPreprocess.py will automatically create the folders **conf~/spacy_intermediate_features~** inside **coqa** to store intermediate tokenization results, which takes a few hours. 56 | 57 | Every time you run the code, a new running folder **run_idx** will be created inside **coqa/conf~**, which contains the running logs, the prediction results on the dev set, and the best model. 58 | 59 | # Contact 60 | If you have any questions, please contact Chenguang Zhu at chezhu@microsoft.com. 61 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Scratch/SQuAD_to_CoQA.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
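# Annotation (not in the original file): the conversion below turns each SQuAD
# paragraph into one CoQA "story" whose questions are independent turns
# numbered from 1. Unanswerable questions become the answer 'unknown' with
# span_start = span_end = -1, and reference answers beyond the first are kept
# under additional_answers with keys '0', '1', ...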
3 | 4 | # Scratch code to convert SQuAD data into CoQA format 5 | 6 | import json 7 | 8 | data = {'version': 0.2, 'data': []} 9 | with open('train-v2.0.json', 'r') as f: 10 | squad = json.load(f) 11 | 12 | cnt = 0 13 | for d in squad['data']: 14 | for p in d['paragraphs']: 15 | cnt += 1 16 | a = {'source': 'source', 'filename': 'filename', 'id': str(cnt), 'story': p['context']} 17 | ques = [] 18 | ans = [] 19 | 20 | additional = [] 21 | turn_id = 1 22 | for qa in p['qas']: 23 | ques.append({ 24 | 'input_text': qa['question'], 25 | 'turn_id': turn_id 26 | }) 27 | 28 | if qa['is_impossible']: 29 | if len(ans) == 0: 30 | ans.append([]) 31 | ans[0].append({ 32 | 'input_text': 'unknown', 33 | 'span_text': 'unknown', 34 | 'span_start': -1, 35 | 'span_end': -1, 36 | 'turn_id': turn_id 37 | }) 38 | 39 | for j in range(len(qa['answers'])): 40 | if j >= len(ans): 41 | ans.append([]) 42 | ans[j].append({ 43 | 'input_text': qa['answers'][j]['text'], 44 | 'span_text': qa['answers'][j]['text'], 45 | 'span_start': qa['answers'][j]['answer_start'], 46 | 'span_end': qa['answers'][j]['answer_start'] + len(qa['answers'][j]['text']), 47 | 'turn_id': turn_id 48 | }) 49 | 50 | turn_id += 1 51 | 52 | a['questions'] = ques 53 | a['answers'] = ans[0] 54 | a['additional_answers'] = {} 55 | for j in range(1, len(ans)): 56 | a['additional_answers'][str(j-1)] = ans[j] 57 | 58 | data['data'].append(a) 59 | 60 | 61 | with open('squad_in_coqa_format.json', 'w') as output_file: 62 | json.dump(data, output_file, indent=4) 63 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Utils/Arguments.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
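# Annotation (not in the original file): readArguments() below parses a
# line-oriented conf file. A hypothetical fragment and the resulting opt dict:
#   # lines starting with '#' are comments
#   BERT                      ->  opt['BERT'] = True        (bare key = flag)
#   DROPOUT 0.3               ->  opt['DROPOUT'] = 0.3      (coerced to float)
#   EPOCH 30                  ->  opt['EPOCH'] = 30         (coerced to int)
#   MODEL_PATH best_model.pt  ->  opt['MODEL_PATH'] = 'best_model.pt'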
3 | 4 | import os 5 | 6 | class Arguments: 7 | def __init__(self, confFile): 8 | if not os.path.exists(confFile): 9 | raise Exception("The argument file does not exist: " + confFile) 10 | self.confFile = confFile 11 | 12 | def is_int(self, s): 13 | try: 14 | int(s) 15 | return True 16 | except ValueError: 17 | return False 18 | 19 | def is_float(self, s): 20 | try: 21 | float(s) 22 | return True 23 | except ValueError: 24 | return False 25 | 26 | def is_bool(self, s): 27 | return s.lower() == 'true' or s.lower() == 'false' 28 | 29 | def readHyperDriveArguments(self, arguments): 30 | hyperdrive_opts = {} 31 | for i in range(0, len(arguments), 2): 32 | hp_name, hp_value = arguments[i:i+2] 33 | hp_name = hp_name.replace("--", "") 34 | if self.is_int(hp_value): 35 | hp_value = int(hp_value) 36 | elif self.is_float(hp_value): 37 | hp_value = float(hp_value) 38 | hyperdrive_opts[hp_name] = hp_value 39 | return hyperdrive_opts 40 | 41 | def readArguments(self): 42 | opt = {} 43 | with open(self.confFile, encoding='utf-8') as f: 44 | for line in f: 45 | l = line.replace('\t', ' ').strip() 46 | if l.startswith("#"): 47 | continue 48 | parts = l.split() 49 | if len(parts) == 1: 50 | key = parts[0] 51 | if not key in opt: 52 | opt[key] = True 53 | if len(parts) == 2: 54 | key = parts[0] 55 | value = parts[1] 56 | if not key in opt: 57 | opt[key] = value 58 | if self.is_int(value): 59 | opt[key] = int(value) 60 | elif self.is_float(value): 61 | opt[key] = float(value) 62 | elif self.is_bool(value): 63 | opt[key] = value.lower() == 'true' 64 | else: 65 | print('Warning: key %s already exists' % key) 66 | return opt 67 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Utils/CoQAPreprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | """ 5 | This file takes a CoQA data file as input and generates the input files for training the QA model. 
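Annotation (added, not part of the original docstring): preprocessing tokenizes
each story, question and answer with spaCy, aligns character-level answer spans
to word offsets (falling back to a best-F1 span search when the annotated span
does not literally contain the answer text), builds a GloVe-filtered vocabulary
from the training split, and caches coqa-{train,dev}-preprocessed.json plus
train_meta.msgpack under FEATURE_FOLDER.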
6 | """ 7 | import json 8 | import msgpack 9 | import multiprocessing 10 | import re 11 | import string 12 | import torch 13 | from tqdm import tqdm 14 | from collections import Counter 15 | from Utils.GeneralUtils import nlp, load_glove_vocab, pre_proc 16 | from Utils.CoQAUtils import token2id, token2id_sent, char2id_sent, build_embedding, feature_gen, POS, ENT 17 | import os 18 | 19 | class CoQAPreprocess(): 20 | def __init__(self, opt): 21 | print('CoQA Preprocessing') 22 | self.opt = opt 23 | self.spacyDir = opt['FEATURE_FOLDER'] 24 | self.train_file = os.path.join(opt['datadir'], opt['CoQA_TRAIN_FILE']) 25 | self.dev_file = os.path.join(opt['datadir'], opt['CoQA_DEV_FILE']) 26 | self.glove_file = os.path.join(opt['datadir'], opt['INIT_WORD_EMBEDDING_FILE']) 27 | self.glove_dim = 300 28 | self.official = 'OFFICIAL' in opt 29 | self.data_prefix = 'coqa-' 30 | 31 | if self.official: 32 | self.glove_vocab = load_glove_vocab(self.glove_file, self.glove_dim, to_lower = False) 33 | print('Official prediction initializes...') 34 | print('Loading training vocab and vocab char...') 35 | self.train_vocab, self.train_char_vocab, self.train_embedding = self.load_data() 36 | self.test_file = self.opt['OFFICIAL_TEST_FILE'] 37 | return 38 | 39 | dataset_labels = ['train', 'dev'] 40 | allExist = True 41 | for dataset_label in dataset_labels: 42 | if not os.path.exists(os.path.join(self.spacyDir, self.data_prefix + dataset_label + '-preprocessed.json')): 43 | allExist = False 44 | 45 | if allExist: 46 | return 47 | 48 | print('Previously result not found, creating preprocessed files now...') 49 | self.glove_vocab = load_glove_vocab(self.glove_file, self.glove_dim, to_lower = False) 50 | if not os.path.isdir(self.spacyDir): 51 | os.makedirs(self.spacyDir) 52 | print('Directory created: ' + self.spacyDir) 53 | 54 | for dataset_label in dataset_labels: 55 | self.preprocess(dataset_label) 56 | 57 | # dataset_label can be 'train' or 'dev' or 'test' 58 | def preprocess(self, dataset_label): 59 | file_name = self.train_file if dataset_label == 'train' else (self.dev_file if dataset_label == 'dev' else self.test_file) 60 | output_file_name = os.path.join(self.spacyDir, self.data_prefix + dataset_label + '-preprocessed.json') 61 | 62 | print('Preprocessing', dataset_label, 'file:', file_name) 63 | print('Loading json...') 64 | with open(file_name, 'r') as f: 65 | dataset = json.load(f) 66 | 67 | print('Processing json...') 68 | 69 | data = [] 70 | tot = len(dataset['data']) 71 | for data_idx in tqdm(range(tot)): 72 | datum = dataset['data'][data_idx] 73 | context_str = datum['story'] 74 | _datum = {'context': context_str, 75 | 'source': datum['source'], 76 | 'id': datum['id'], 77 | 'filename': datum['filename']} 78 | 79 | nlp_context = nlp(pre_proc(context_str)) 80 | _datum['annotated_context'] = self.process(nlp_context) 81 | _datum['raw_context_offsets'] = self.get_raw_context_offsets(_datum['annotated_context']['word'], context_str) 82 | _datum['qas'] = [] 83 | assert len(datum['questions']) == len(datum['answers']) 84 | 85 | additional_answers = {} 86 | if 'additional_answers' in datum: 87 | for k, answer in datum['additional_answers'].items(): 88 | if len(answer) == len(datum['answers']): 89 | for ex in answer: 90 | idx = ex['turn_id'] 91 | if idx not in additional_answers: 92 | additional_answers[idx] = [] 93 | additional_answers[idx].append(ex['input_text']) # additional_answer is only used to eval, so raw_text is fine 94 | 95 | for i in range(len(datum['questions'])): 96 | question, answer = 
datum['questions'][i], datum['answers'][i] 97 | assert question['turn_id'] == answer['turn_id'] 98 | 99 | idx = question['turn_id'] 100 | _qas = {'turn_id': idx, 101 | 'question': question['input_text'], 102 | 'answer': answer['input_text']} 103 | if idx in additional_answers: 104 | _qas['additional_answers'] = additional_answers[idx] 105 | 106 | _qas['annotated_question'] = self.process(nlp(pre_proc(question['input_text']))) 107 | _qas['annotated_answer'] = self.process(nlp(pre_proc(answer['input_text']))) 108 | _qas['raw_answer'] = answer['input_text'] 109 | _qas['answer_span_start'] = answer['span_start'] 110 | _qas['answer_span_end'] = answer['span_end'] 111 | 112 | start = answer['span_start'] 113 | end = answer['span_end'] 114 | chosen_text = _datum['context'][start: end].lower() 115 | while len(chosen_text) > 0 and chosen_text[0] in string.whitespace: 116 | chosen_text = chosen_text[1:] 117 | start += 1 118 | while len(chosen_text) > 0 and chosen_text[-1] in string.whitespace: 119 | chosen_text = chosen_text[:-1] 120 | end -= 1 121 | input_text = _qas['answer'].strip().lower() 122 | if input_text in chosen_text: 123 | p = chosen_text.find(input_text) 124 | _qas['answer_span'] = self.find_span(_datum['raw_context_offsets'], 125 | start + p, start + p + len(input_text)) 126 | else: 127 | _qas['answer_span'] = self.find_span_with_gt(_datum['context'], 128 | _datum['raw_context_offsets'], input_text) 129 | long_question = '' 130 | for j in range(i - 2, i + 1): 131 | if j < 0: 132 | continue 133 | long_question += ' ' + datum['questions'][j]['input_text'] 134 | if j < i: 135 | long_question += ' ' + datum['answers'][j]['input_text'] 136 | 137 | long_question = long_question.strip() 138 | nlp_long_question = nlp(long_question) 139 | _qas['context_features'] = feature_gen(nlp_context, nlp_long_question) 140 | 141 | _datum['qas'].append(_qas) 142 | data.append(_datum) 143 | 144 | # build vocabulary 145 | if dataset_label == 'train': 146 | print('Build vocabulary from training data...') 147 | contexts = [_datum['annotated_context']['word'] for _datum in data] 148 | qas = [qa['annotated_question']['word'] + qa['annotated_answer']['word'] for qa in _datum['qas'] for _datum in data] 149 | self.train_vocab = self.build_vocab(contexts, qas) 150 | self.train_char_vocab = self.build_char_vocab(self.train_vocab) 151 | 152 | print('Getting word ids...') 153 | w2id = {w: i for i, w in enumerate(self.train_vocab)} 154 | c2id = {c: i for i, c in enumerate(self.train_char_vocab)} 155 | for _datum in data: 156 | _datum['annotated_context']['wordid'] = token2id_sent(_datum['annotated_context']['word'], w2id, unk_id = 1, to_lower = False) 157 | _datum['annotated_context']['charid'] = char2id_sent(_datum['annotated_context']['word'], c2id, unk_id = 1, to_lower = False) 158 | for qa in _datum['qas']: 159 | qa['annotated_question']['wordid'] = token2id_sent(qa['annotated_question']['word'], w2id, unk_id = 1, to_lower = False) 160 | qa['annotated_question']['charid'] = char2id_sent(qa['annotated_question']['word'], c2id, unk_id = 1, to_lower = False) 161 | qa['annotated_answer']['wordid'] = token2id_sent(qa['annotated_answer']['word'], w2id, unk_id = 1, to_lower = False) 162 | qa['annotated_answer']['charid'] = char2id_sent(qa['annotated_answer']['word'], c2id, unk_id = 1, to_lower = False) 163 | 164 | if dataset_label == 'train': 165 | # get the condensed dictionary embedding 166 | print('Getting embedding matrix for ' + dataset_label) 167 | embedding = build_embedding(self.glove_file, self.train_vocab, 
        print('Getting word ids...')
        w2id = {w: i for i, w in enumerate(self.train_vocab)}
        c2id = {c: i for i, c in enumerate(self.train_char_vocab)}
        for _datum in data:
            _datum['annotated_context']['wordid'] = token2id_sent(_datum['annotated_context']['word'], w2id, unk_id=1, to_lower=False)
            _datum['annotated_context']['charid'] = char2id_sent(_datum['annotated_context']['word'], c2id, unk_id=1, to_lower=False)
            for qa in _datum['qas']:
                qa['annotated_question']['wordid'] = token2id_sent(qa['annotated_question']['word'], w2id, unk_id=1, to_lower=False)
                qa['annotated_question']['charid'] = char2id_sent(qa['annotated_question']['word'], c2id, unk_id=1, to_lower=False)
                qa['annotated_answer']['wordid'] = token2id_sent(qa['annotated_answer']['word'], w2id, unk_id=1, to_lower=False)
                qa['annotated_answer']['charid'] = char2id_sent(qa['annotated_answer']['word'], c2id, unk_id=1, to_lower=False)

        if dataset_label == 'train':
            # get the condensed dictionary embedding
            print('Getting embedding matrix for ' + dataset_label)
            embedding = build_embedding(self.glove_file, self.train_vocab, self.glove_dim)
            meta = {'vocab': self.train_vocab, 'char_vocab': self.train_char_vocab, 'embedding': embedding.tolist()}
            meta_file_name = os.path.join(self.spacyDir, dataset_label + '_meta.msgpack')
            print('Saving meta information to', meta_file_name)
            with open(meta_file_name, 'wb') as f:
                msgpack.dump(meta, f, encoding='utf8')

        dataset['data'] = data

        if dataset_label == 'test':
            return dataset

        with open(output_file_name, 'w') as output_file:
            json.dump(dataset, output_file, sort_keys=True, indent=4)

    '''
    Return train vocab, train char vocab and embedding
    '''
    def load_data(self):
        print('Loading train_meta.msgpack...')
        meta_file_name = os.path.join(self.spacyDir, 'train_meta.msgpack')
        with open(meta_file_name, 'rb') as f:
            meta = msgpack.load(f, encoding='utf8')
        embedding = torch.Tensor(meta['embedding'])
        self.opt['vocab_size'] = embedding.size(0)
        self.opt['vocab_dim'] = embedding.size(1)
        self.opt['char_vocab_size'] = len(meta['char_vocab'])
        return meta['vocab'], meta['char_vocab'], embedding

    def build_vocab(self, contexts, qas):  # vocabulary will also be sorted accordingly
        counter_c = Counter(w for doc in contexts for w in doc)
        counter_qa = Counter(w for doc in qas for w in doc)
        counter = counter_c + counter_qa
        vocab = sorted([t for t in counter_qa if t in self.glove_vocab], key=counter_qa.get, reverse=True)
        vocab += sorted([t for t in counter_c.keys() - counter_qa.keys() if t in self.glove_vocab],
                        key=counter.get, reverse=True)
        total = sum(counter.values())
        matched = sum(counter[t] for t in vocab)
        print('vocab {1}/{0} OOV {2}/{3} ({4:.4f}%)'.format(
            len(counter), len(vocab), (total - matched), total, (total - matched) / total * 100))
        # special entries; ids 0/1 match PAD_WORD_ID/UNK_WORD_ID in Utils/Constants.py
        # (the "<Q>"/"<A>" names are assumed)
        vocab.insert(0, "<PAD>")
        vocab.insert(1, "<UNK>")
        vocab.insert(2, "<Q>")
        vocab.insert(3, "<A>")
        return vocab

    def build_char_vocab(self, words):
        counter = Counter(c for w in words for c in w)
        print('All characters: {0}'.format(len(counter)))
        char_vocab = [c for c, cnt in counter.items() if cnt > 3]
        print('Occurrence > 3 characters: {0}'.format(len(char_vocab)))

        # special characters ("<STA>"/"<END>" names assumed; cf. BOW_CHAR/EOW_CHAR in Utils/Constants.py)
        char_vocab.insert(0, "<PAD>")
        char_vocab.insert(1, "<UNK>")
        char_vocab.insert(2, "<STA>")
        char_vocab.insert(3, "<END>")
        return char_vocab

    def _str(self, s):
        """ Convert PTB tokens to normal tokens """
        if s.lower() == '-lrb-':
            s = '('
        elif s.lower() == '-rrb-':
            s = ')'
        elif s.lower() == '-lsb-':
            s = '['
        elif s.lower() == '-rsb-':
            s = ']'
        elif s.lower() == '-lcb-':
            s = '{'
        elif s.lower() == '-rcb-':
            s = '}'
        return s
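    # process() below returns one annotation dict per spaCy doc. Illustrative
    # shape (values abbreviated; tag inventories depend on the spaCy model):
    #   {'word': ['Bob', 'went', ...], 'lemma': ['bob', 'go', ...],
    #    'pos': ['NNP', 'VBD', ...], 'pos_id': [...],
    #    'ent': ['B-PERSON', 'O', ...], 'ent_id': [...],
    #    'offsets': [(0, 3), (4, 8), ...], 'sentences': [(0, 9), ...]}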
    def process(self, parsed_text):
        output = {'word': [],
                  'lemma': [],
                  'pos': [],
                  'pos_id': [],
                  'ent': [],
                  'ent_id': [],
                  'offsets': [],
                  'sentences': []}

        for token in parsed_text:
            # [(token.text, token.idx) for token in parsed_sentence]
            output['word'].append(self._str(token.text))
            pos = token.tag_
            output['pos'].append(pos)
            output['pos_id'].append(token2id(pos, POS, 0))

            ent = 'O' if token.ent_iob_ == 'O' else (token.ent_iob_ + '-' + token.ent_type_)
            output['ent'].append(ent)
            output['ent_id'].append(token2id(ent, ENT, 0))

            output['lemma'].append(token.lemma_ if token.lemma_ != '-PRON-' else token.text.lower())
            output['offsets'].append((token.idx, token.idx + len(token.text)))

        word_idx = 0
        for sent in parsed_text.sents:
            output['sentences'].append((word_idx, word_idx + len(sent)))
            word_idx += len(sent)

        assert word_idx == len(output['word'])
        return output

    '''
    Offsets based on raw_text.
    This solves the mismatch where raw_text contains "a-b" but the parsed text contains "a - b".
    '''
    def get_raw_context_offsets(self, words, raw_text):
        raw_context_offsets = []
        p = 0
        for token in words:
            while p < len(raw_text) and re.match(r'\s', raw_text[p]):
                p += 1
            if raw_text[p:p + len(token)] != token:
                print('something is wrong! token:', token, 'raw_text:', raw_text)

            raw_context_offsets.append((p, p + len(token)))
            p += len(token)

        return raw_context_offsets

    def normalize_answer(self, s):
        """Lower text and remove punctuation, articles and extra whitespace."""

        def remove_articles(text):
            regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
            return re.sub(regex, ' ', text)

        def white_space_fix(text):
            return ' '.join(text.split())

        def remove_punc(text):
            exclude = set(string.punctuation)
            return ''.join(ch for ch in text if ch not in exclude)

        def lower(text):
            return text.lower()

        return white_space_fix(remove_articles(remove_punc(lower(s))))

    # find the start and end word ids
    def find_span_with_gt(self, context, offsets, ground_truth):
        best_f1 = 0.0
        best_span = (len(offsets) - 1, len(offsets) - 1)
        gt = self.normalize_answer(pre_proc(ground_truth)).split()

        ls = [i for i in range(len(offsets)) if context[offsets[i][0]:offsets[i][1]].lower() in gt]

        for i in range(len(ls)):
            for j in range(i, len(ls)):
                pred = self.normalize_answer(pre_proc(context[offsets[ls[i]][0]:offsets[ls[j]][1]])).split()
                common = Counter(pred) & Counter(gt)
                num_same = sum(common.values())
                if num_same > 0:
                    precision = 1.0 * num_same / len(pred)
                    recall = 1.0 * num_same / len(gt)
                    f1 = (2 * precision * recall) / (precision + recall)
                    if f1 > best_f1:
                        best_f1 = f1
                        best_span = (ls[i], ls[j])
        return best_span

    # find the start and end word ids
    def find_span(self, offsets, start, end):
        start_index = -1
        end_index = -1
        for i, offset in enumerate(offsets):
            if (start_index < 0) or (start >= offset[0]):
                start_index = i
            if (end_index < 0) and (end <= offset[1]):
                end_index = i
        return (start_index, end_index)
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/Constants.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

PAD_WORD_ID = 0
UNK_WORD_ID = 1
END_WORD_ID = 2

PAD_CHAR = 261
BOW_CHAR = 259
EOW_CHAR = 260
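Constants.py pins the special ids used throughout the pipeline: word ids 0/1 for padding and unknowns, and character ids 259-261 sitting just above the 0-255 UTF-8 byte range. That layout suggests byte-level character encodings with begin/end-of-word sentinels; below is a minimal sketch of that convention (the helper name is illustrative, not part of the repo):

    BOW_CHAR, EOW_CHAR = 259, 260  # as in Utils/Constants.py

    def encode_word_chars(word):
        # UTF-8 bytes occupy ids 0-255; the begin/end-of-word sentinels sit above that range
        return [BOW_CHAR] + list(word.encode('utf-8')) + [EOW_CHAR]

    # encode_word_chars('hi') -> [259, 104, 105, 260]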
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/GeneralUtils.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import math
import re
from Utils.Constants import *
import spacy
import torch
import torch.nn.functional as F
import unicodedata
import sys
from torch.autograd import Variable

nlp = spacy.load('en', parser=False)

# normalize sentence
def normalize_text(text):
    return unicodedata.normalize('NFD', text)

def space_extend(matchobj):
    return ' ' + matchobj.group(0) + ' '

# pad punctuation with spaces and collapse extra whitespace
def pre_proc(text):
    text = re.sub(u'-|\u2010|\u2011|\u2012|\u2013|\u2014|\u2015|%|\[|\]|:|\(|\)|/|\t', space_extend, text)
    text = text.strip(' \n')
    text = re.sub(r'\s+', ' ', text)
    return text

# get the set of GloVe vocabulary words
def load_glove_vocab(file, wv_dim, to_lower=True):
    glove_vocab = set()
    print('Loading glove vocabulary from ' + file)
    lineCnt = 0
    with open(file, encoding='utf-8') as f:
        for line in f:
            lineCnt = lineCnt + 1
            if lineCnt % 100000 == 0:
                print('.', end='', flush=True)
            elems = line.split()
            token = normalize_text(''.join(elems[0:-wv_dim]))
            if to_lower:
                token = token.lower()
            glove_vocab.add(token)

    print('\n')
    print('%d words loaded from GloVe\n' % len(glove_vocab))
    return glove_vocab

def token2id(docs, vocab, unk_id=None):
    w2id = {w: i for i, w in enumerate(vocab)}
    ids = [[w2id[w] if w in w2id else unk_id for w in doc] for doc in docs]
    return ids

def char2id(docs, char_vocab, unk_id=None):
    c2id = {c: i for i, c in enumerate(char_vocab)}
    # wrap every word with the start/end-of-word sentinels; the "<STA>"/"<END>"
    # names are assumed to match the special entries in the char vocabulary
    ids = [[[c2id["<STA>"]] + [c2id[c] if c in c2id else unk_id for c in w] + [c2id["<END>"]] for w in doc] for doc in docs]
    return ids

def removeInvalidChar(sentence):
    ordId = list(sentence.encode('utf-8', errors='ignore'))
    ordId = [x for x in ordId if x >= 0 and x < 256]
    return ''.join([chr(x) for x in ordId])

def makeVariable(x, use_cuda):
    if use_cuda:
        x = x.pin_memory()
        return Variable(x.cuda(non_blocking=True), requires_grad=False)
    else:
        return Variable(x, requires_grad=False)

'''
Input:
    nlp is an instance of spacy
    sentence is a string

Output:
    A list of tokens, plus entity and POS tags
'''
def spacyTokenize(sentence, vocab_ent=None, vocab_tag=None):
    sentence = sentence.lower()
    sentence = pre_proc(sentence)
    raw_tokens = nlp(sentence)
    tokens = [normalize_text(token.text) for token in raw_tokens if not (token.is_punct or token.is_space)]
    ent = None
    if vocab_ent is not None:
        ent = [token2id(token.ent_type_, vocab_ent) + 1 for token in raw_tokens if not (token.is_punct or token.is_space)]

    tag = None
    if vocab_tag is not None:
        tag = [token2id(token.tag_, vocab_tag) + 1 for token in raw_tokens if not (token.is_punct or token.is_space)]

    return tokens, ent, tag
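pre_proc above is what keeps token/raw-text alignment feasible downstream: hyphens, brackets and similar characters are padded with spaces before spaCy runs, and repeated whitespace is collapsed. A self-contained check of that behaviour (expected outputs shown as comments):

    # assuming: from Utils.GeneralUtils import pre_proc
    print(pre_proc('a-b (c)'))     # 'a - b ( c )'
    print(pre_proc('50% [sure]'))  # '50 % [ sure ]'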
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/Timing.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from datetime import datetime

timeelapsed = {}
startTime = {}
endTime = {}

def timerstart(name):
    startTime[name] = datetime.now()

def timerstop(name):
    endTime[name] = datetime.now()
    if name not in timeelapsed:
        timeelapsed[name] = endTime[name] - startTime[name]
    else:
        timeelapsed[name] += endTime[name] - startTime[name]

def timerreport():
    total = 0
    for name in timeelapsed:
        total += timeelapsed[name].total_seconds()
    print('')
    print('----------------Timer Report----------------------')
    for name, value in sorted(timeelapsed.items(), key=lambda item: -item[1].total_seconds()):
        print('%s: used time %s, %f%% ' % ('{:20}'.format(name), str(value).split('.')[0], value.total_seconds() / total * 100.0))
    print('--------------------------------------------------')
    print('')
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/Arguments.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/Arguments.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/CoQAPreprocess.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/CoQAPreprocess.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/CoQAUtils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/CoQAUtils.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/Constants.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/Constants.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/GeneralUtils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/GeneralUtils.cpython-37.pyc
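The conf file that follows mixes bare switches (BERT, LOCK_BERT, ...) with whitespace-separated key-value pairs. Utils/Arguments.py is not reproduced in this dump, so the sketch below is an assumption about the format rather than the repo's actual parser: switches become True, everything else keeps its raw string value.

    def read_conf(path):
        # minimal reader for 'KEY value' / bare-switch lines; hypothetical, not Utils/Arguments.py
        opt = {}
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split(None, 1)
                opt[parts[0]] = parts[1] if len(parts) > 1 else True
        return opt

    # read_conf('conf')['MINI_BATCH'] -> '32' (a string; type conversion is up to the caller)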
--------------------------------------------------------------------------------
/SDNet/code/SDNet/conf:
--------------------------------------------------------------------------------
CoQA_TRAIN_FILE data/coqa-train-v1.0.json
CoQA_DEV_FILE data/coqa-dev-v1.0.json

PREV_ANS 2
PREV_QUES 2

DROPOUT 0.3
my_dropout_p 0.3
VARIATIONAL_DROPOUT

BERT
BERT_LARGE
dropout_emb 0.4

LOCK_BERT
BERT_LINEAR_COMBINE
BERT_tokenizer_file bert-base-cased/bert-base-cased-vocab.txt
BERT_model_file bert-base-cased/
BERT_large_tokenizer_file bert-large-uncased/bert-large-uncased-vocab.txt
BERT_large_model_file bert-large-uncased/

SEED 1033
SPACY_FEATURE
CONTEXT_RNN_HIDDEN_DIM 300

MAX_WORD_PER_SENTENCE 30
INIT_WORD_EMBEDDING_FILE ../glove/glove.840B.300d.txt
MINI_BATCH 32
EPOCH 30

QUES_SELF_ATTN
max_len 15
concat_rnn False
grad_clipping 10
do_seq_dropout
tune_partial 1000
embedding_dim 300
prealign_hidden 300
flow_hidden_size 300
query_self_attn_hidden_size 300
pos_dim 12
ent_dim 8
hidden_size 125
deep_att_hidden_size_per_abstr 250
deep_inter_att_use_CoVe 1
in_rnn_layers 2
highlvl_hidden_size 125
question_high_lvl_rnn_layers 1
char_emb_size 8
char_hidden_size 50
--------------------------------------------------------------------------------
/SDNet/code/SDNet/main.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import argparse
import os
import sys
import torch
from Models.SDNetTrainer import SDNetTrainer
from Utils.Arguments import Arguments

opt = None

parser = argparse.ArgumentParser(description='SDNet')
parser.add_argument('command', help='Command: train')
parser.add_argument('conf_file', help='Path to conf file.')

cmdline_args = parser.parse_args()
command = cmdline_args.command
conf_file = cmdline_args.conf_file
conf_args = Arguments(conf_file)
opt = conf_args.readArguments()
opt['cuda'] = torch.cuda.is_available()
opt['confFile'] = conf_file
opt['datadir'] = os.path.dirname(conf_file)  # conf_file specifies where the data folder is

for key, val in cmdline_args.__dict__.items():
    if val is not None and key not in ['command', 'conf_file']:
        opt[key] = val

model = SDNetTrainer(opt)

print('Selected command: ' + command)
model.train()
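Given main.py's argparse signature, training is presumably launched as "python main.py train conf" from SDNet/code/SDNet. The prepro.py file below converts QuAC's SQuAD-style json into the CoQA layout this pipeline expects; one converted record looks roughly like this (field names taken from the code below, values illustrative):

    example = {
        "source": "wikipedia",
        "filename": "Some_Title.txt",
        "name": "Some_Title.txt",
        "id": "C_abc123_0",
        "story": "...passage text with the trailing CANNOTANSWER marker removed...",
        "questions": [{"turn_id": 1, "input_text": "What was his first album?"}],
        "answers": [{"turn_id": 1, "span_start": 10, "span_end": 25,
                     "span_text": "his first album", "input_text": "his first album"}],
    }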
--------------------------------------------------------------------------------
/SDNet/code/preprocess/prepro.py:
--------------------------------------------------------------------------------
import json

def convert(in_file, out_file):
    """Convert a QuAC json file into the CoQA format expected by SDNet."""
    with open(in_file, 'r') as f:
        quacdata = json.load(f)
    data = []
    for i in range(len(quacdata['data'])):
        temp = {}
        temp["source"] = "wikipedia"
        temp["filename"] = quacdata['data'][i]["title"] + ".txt"
        temp["name"] = temp["filename"]
        temp["id"] = quacdata['data'][i]["paragraphs"][0]["id"]
        # every QuAC context ends with " CANNOTANSWER" (13 characters), which is dropped here
        temp["story"] = quacdata['data'][i]["paragraphs"][0]["context"][:-13]
        temp["questions"] = []
        temp["answers"] = []
        turn = 0
        for qa in quacdata['data'][i]["paragraphs"][0]["qas"]:
            turn += 1
            tempq = {}
            tempa = {}
            tempq["turn_id"] = turn
            tempq["input_text"] = qa["question"]
            temp["questions"].append(tempq)
            tempa["span_start"] = qa["orig_answer"]["answer_start"]
            tempa["span_end"] = qa["orig_answer"]["answer_start"] + len(qa["orig_answer"]["text"])
            answertext = qa["orig_answer"]["text"]
            if answertext == "CANNOTANSWER":
                # unanswerable questions become CoQA-style "unknown" answers without a span
                answertext = "unknown"
                tempa["span_start"] = -1
                tempa["span_end"] = -1
            tempa["span_text"] = answertext
            if qa["yesno"] == "y":
                tempa["input_text"] = "Yes"
            elif qa["yesno"] == "n":
                tempa["input_text"] = "No"
            else:
                tempa["input_text"] = answertext
            tempa["turn_id"] = turn
            temp["answers"].append(tempa)
        data.append(temp)
    output = {"data": data, "version": "1.0"}
    with open(out_file, 'w') as outf:
        json.dump(output, outf, indent=2)

convert('train_v0.2.json', 'quac_train_processed.json')
convert('val_v0.2.json', 'quac_val_processed.json')
--------------------------------------------------------------------------------
/figure/1:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/figure/QA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/QA.png
--------------------------------------------------------------------------------
/figure/coref-F1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/coref-F1.png
--------------------------------------------------------------------------------
/figure/flow-coref.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/flow-coref.png
--------------------------------------------------------------------------------
/figure/quac.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/quac.png
--------------------------------------------------------------------------------
/figure/task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/task.png
--------------------------------------------------------------------------------
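Since the conversion above only truncates the trailing CANNOTANSWER marker from each context, every non-negative span should still index the story correctly. A quick sanity check over the generated file (file name as written by prepro.py above):

    import json

    with open('quac_train_processed.json') as f:
        converted = json.load(f)

    for article in converted['data']:
        for ans in article['answers']:
            s, e = ans['span_start'], ans['span_end']
            if s >= 0:
                # the recorded span text must equal the corresponding story slice
                assert article['story'][s:e] == ans['span_text'], (article['id'], ans['turn_id'])
    print('all spans verified')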