├── BiDAF
│   ├── BiDAFF++_with_glove+bert.jsonnet
│   ├── BiDAFF++_with_glove+elmo.jsonnet
│   ├── Figures
│   │   ├── 1.png
│   │   ├── 2.png
│   │   ├── 3.png
│   │   ├── Arch.png
│   │   ├── baseline.png
│   │   ├── enhenced.png
│   │   └── enhenced_elmo.png
│   ├── README.md
│   ├── bert_token_embedder.py
│   └── wordpiece_indexer.py
├── FlowQA_Attention
│   ├── README.md
│   ├── code
│   │   ├── QA_model
│   │   │   ├── README.md
│   │   │   ├── detail_model.py
│   │   │   ├── layers.py
│   │   │   ├── model_QuAC.py
│   │   │   └── utils.py
│   │   ├── README.md
│   │   ├── general_utils.py
│   │   ├── predict_QuAC.py
│   │   ├── preprocess_QuAC.py
│   │   ├── requirements.txt
│   │   └── train_QuAC.py
│   └── figure
│       ├── Attention Over Flow.png
│       ├── Readme.me
│       ├── Vanilla Flow Operation.png
│       └── result.png
├── FlowQA_Coreference
│   └── README.md
├── README.md
└── SDNet
    ├── Figures
    │   ├── f1.jpg
    │   └── loss.jpg
    ├── README.md
    ├── code
    │   ├── SDNet
    │   │   ├── CONTRIBUTING.md
    │   │   ├── LICENSE
    │   │   ├── Models
    │   │   │   ├── BaseTrainer.py
    │   │   │   ├── Bert
    │   │   │   │   ├── Bert.py
    │   │   │   │   ├── __pycache__
    │   │   │   │   │   ├── Bert.cpython-37.pyc
    │   │   │   │   │   ├── modeling.cpython-37.pyc
    │   │   │   │   │   └── tokenization.cpython-37.pyc
    │   │   │   │   ├── modeling.py
    │   │   │   │   ├── optimization.py
    │   │   │   │   └── tokenization.py
    │   │   │   ├── Layers.py
    │   │   │   ├── SDNet.py
    │   │   │   ├── SDNetTrainer.py
    │   │   │   └── __pycache__
    │   │   │       ├── BaseTrainer.cpython-37.pyc
    │   │   │       ├── Layers.cpython-37.pyc
    │   │   │       ├── SDNet.cpython-37.pyc
    │   │   │       └── SDNetTrainer.cpython-37.pyc
    │   │   ├── NOTICE.txt
    │   │   ├── README.md
    │   │   ├── Scratch
    │   │   │   └── SQuAD_to_CoQA.py
    │   │   ├── Utils
    │   │   │   ├── Arguments.py
    │   │   │   ├── CoQAPreprocess.py
    │   │   │   ├── CoQAUtils.py
    │   │   │   ├── Constants.py
    │   │   │   ├── GeneralUtils.py
    │   │   │   ├── Timing.py
    │   │   │   └── __pycache__
    │   │   │       ├── Arguments.cpython-37.pyc
    │   │   │       ├── CoQAPreprocess.cpython-37.pyc
    │   │   │       ├── CoQAUtils.cpython-37.pyc
    │   │   │       ├── Constants.cpython-37.pyc
    │   │   │       └── GeneralUtils.cpython-37.pyc
    │   │   ├── bert_vocab_files
    │   │   │   ├── bert-base-uncased-vocab.txt
    │   │   │   └── bert-large-uncased-vocab.txt
    │   │   ├── conf
    │   │   └── main.py
    │   └── preprocess
    │       └── prepro.py
    └── figure
        ├── 1
        ├── QA.png
        ├── coref-F1.png
        ├── flow-coref.png
        ├── quac.png
        └── task.png
/BiDAF/BiDAFF++_with_glove+bert.jsonnet:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "type": "quac",
4 | "lazy": true,
5 | "num_context_answers": 2,
6 |
7 | "token_indexers": {
8 | "glove": {
9 | "type": "single_id",
10 | "lowercase_tokens": false
11 | },
12 | "bert": {
13 | "type": "bert-pretrained",
14 | "pretrained_model": "bert-large-cased",
15 | "do_lowercase": false,
16 | "use_starting_offsets": true,
17 | "truncate_long_sequences": false
18 | },
19 | "token_characters": {
20 | "type": "characters",
21 | "min_padding_length": 3,
22 | "character_tokenizer": {
23 | "byte_encoding": "utf-8",
24 | "end_tokens": [
25 | 260
26 | ],
27 | "start_tokens": [
28 | 259
29 | ]
30 | }
31 | }
32 | }
33 | },
34 | "iterator": {
35 | "type": "bucket",
36 | "batch_size": 10,
37 | "max_instances_in_memory": 1000,
38 | "sorting_keys": [
39 | [
40 | "question",
41 | "num_fields"
42 | ],
43 | [
44 | "passage",
45 | "num_tokens"
46 | ]
47 | ]
48 | },
49 | "model": {
50 | "type": "dialog_qa",
51 | "dropout": 0.2,
52 | "initializer": [],
53 | "marker_embedding_dim": 10,
54 | "num_context_answers": 2,
55 | "phrase_layer": {
56 | "type": "gru",
57 | "bidirectional": true,
58 | "hidden_size": 100,
59 | "input_size": 1472, // glove (300) + bert (1024) + cnn (128) + num_context_answers (2) * marker_embedding_dim (10)
60 | "num_layers": 1
61 | },
62 | "residual_encoder": {
63 | "type": "gru",
64 | "bidirectional": true,
65 | "hidden_size": 100,
66 | "input_size": 200,
67 | "num_layers": 1
68 | },
69 | "span_end_encoder": {
70 | "type": "gru",
71 | "bidirectional": true,
72 | "hidden_size": 100,
73 | "input_size": 400,
74 | "num_layers": 1
75 | },
76 | "span_start_encoder": {
77 | "type": "gru",
78 | "bidirectional": true,
79 | "hidden_size": 100,
80 | "input_size": 200,
81 | "num_layers": 1
82 | },
83 |
84 | "text_field_embedder": {
85 | "allow_unmatched_keys": true,
86 | "embedder_to_indexer_map": {
87 | "glove": ["glove"],
88 | "bert": ["bert", "bert-offsets"],
89 | "token_characters": ["token_characters"]
90 | },
91 | "token_embedders": {
92 | "glove": {
93 | "type": "embedding",
94 | "embedding_dim": 300,
95 | "pretrained_file": "http://nlp.stanford.edu/data/glove.840B.300d.zip",
96 | "trainable": true
97 | },
98 | "bert": {
99 | "type": "bert-pretrained",
100 | "pretrained_model": "bert-large-cased"
101 | },
102 | "token_characters": {
103 | "type": "character_encoding",
104 | "embedding": {
105 | "embedding_dim": 16,
106 | "num_embeddings": 262
107 | },
108 | "encoder": {
109 | "type": "cnn",
110 | "embedding_dim": 16,
111 | "num_filters": 128,
112 | "ngram_filter_sizes": [3],
113 | "conv_layer_activation": "relu"
114 | }
115 | }
116 | }
117 | }
118 | },
119 | "train_data_path": "https://s3.amazonaws.com/my89public/quac/train_5000.json",
120 | "validation_data_path": "https://s3.amazonaws.com/my89public/quac/val.json",
121 | "trainer": {
122 | "cuda_device": 0,
123 | "learning_rate_scheduler": {
124 | "type": "reduce_on_plateau",
125 | "factor": 0.5,
126 | "mode": "max",
127 | "patience": 3
128 | },
129 | "num_epochs": 30,
130 | "optimizer": {
131 | "type": "sgd",
132 | "lr": 0.01,
133 | "momentum": 0.9
134 | },
135 | "patience": 10,
136 | "validation_metric": "+f1"
137 | },
138 | "validation_iterator": {
139 | "type": "bucket",
140 | "batch_size": 3,
141 | "max_instances_in_memory": 1000,
142 | "sorting_keys": [
143 | [
144 | "question",
145 | "num_fields"
146 | ],
147 | [
148 | "passage",
149 | "num_tokens"
150 | ]
151 | ]
152 | }
153 | }
154 |
--------------------------------------------------------------------------------
/BiDAF/BiDAFF++_with_glove+elmo.jsonnet:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "type": "quac",
4 | "lazy": true,
5 | "num_context_answers": 2,
6 | "token_indexers": {
7 | "glove": {
8 | "type": "single_id",
9 | "lowercase_tokens": false
10 | },
11 | "elmo": {
12 | "type": "elmo_characters"
13 | },
14 | "token_characters": {
15 | "type": "characters",
16 | "character_tokenizer": {
17 | "byte_encoding": "utf-8",
18 | "end_tokens": [
19 | 260
20 | ],
21 | "start_tokens": [
22 | 259
23 | ]
24 | },
25 | "min_padding_length": 5
26 | }
27 | }
28 | },
29 | "iterator": {
30 | "type": "bucket",
31 | "batch_size": 10,
32 | "max_instances_in_memory": 1000,
33 | "sorting_keys": [
34 | [
35 | "question",
36 | "num_fields"
37 | ],
38 | [
39 | "passage",
40 | "num_tokens"
41 | ]
42 | ]
43 | },
44 | "model": {
45 | "type": "dialog_qa",
46 | "dropout": 0.2,
47 | "initializer": [],
48 | "marker_embedding_dim": 10,
49 | "num_context_answers": 2,
50 | "phrase_layer": {
51 | "type": "gru",
52 | "bidirectional": true,
53 | "hidden_size": 100,
54 | "input_size": 1444, // glove (300) + elmo (1024) + cnn (100) + num_context_answers (2) * marker_embedding_dim (10)
55 | "num_layers": 1
56 | },
57 | "residual_encoder": {
58 | "type": "gru",
59 | "bidirectional": true,
60 | "hidden_size": 100,
61 | "input_size": 200,
62 | "num_layers": 1
63 | },
64 | "span_end_encoder": {
65 | "type": "gru",
66 | "bidirectional": true,
67 | "hidden_size": 100,
68 | "input_size": 400,
69 | "num_layers": 1
70 | },
71 | "span_start_encoder": {
72 | "type": "gru",
73 | "bidirectional": true,
74 | "hidden_size": 100,
75 | "input_size": 200,
76 | "num_layers": 1
77 | },
78 | "text_field_embedder": {
79 | "glove": {
80 | "type": "embedding",
81 | "embedding_dim": 300,
82 | "pretrained_file": "http://nlp.stanford.edu/data/glove.840B.300d.zip",
83 | "trainable": true
84 | },
85 | "elmo": {
86 | "type": "elmo_token_embedder",
87 | "do_layer_norm": false,
88 | "dropout": 0.2,
89 | "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
90 | "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
91 | },
92 | "token_characters": {
93 | "type": "character_encoding",
94 | "dropout": 0.2,
95 | "embedding": {
96 | "embedding_dim": 20,
97 | "num_embeddings": 262
98 | },
99 | "encoder": {
100 | "type": "cnn",
101 | "embedding_dim": 20,
102 | "ngram_filter_sizes": [
103 | 5
104 | ],
105 | "num_filters": 100
106 | }
107 | }
108 | }
109 | },
110 | "train_data_path": "https://s3.amazonaws.com/my89public/quac/train_5000.json",
111 | "validation_data_path": "https://s3.amazonaws.com/my89public/quac/val.json",
112 | "trainer": {
113 | "cuda_device": 0,
114 | "learning_rate_scheduler": {
115 | "type": "reduce_on_plateau",
116 | "factor": 0.5,
117 | "mode": "max",
118 | "patience": 3
119 | },
120 | "num_epochs": 30,
121 | "optimizer": {
122 | "type": "sgd",
123 | "lr": 0.01,
124 | "momentum": 0.9
125 | },
126 | "patience": 10,
127 | "validation_metric": "+f1"
128 | },
129 | "validation_iterator": {
130 | "type": "bucket",
131 | "batch_size": 3,
132 | "max_instances_in_memory": 1000,
133 | "sorting_keys": [
134 | [
135 | "question",
136 | "num_fields"
137 | ],
138 | [
139 | "passage",
140 | "num_tokens"
141 | ]
142 | ]
143 | }
144 | }
145 |
--------------------------------------------------------------------------------
/BiDAF/Figures/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/1.png
--------------------------------------------------------------------------------
/BiDAF/Figures/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/2.png
--------------------------------------------------------------------------------
/BiDAF/Figures/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/3.png
--------------------------------------------------------------------------------
/BiDAF/Figures/Arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/Arch.png
--------------------------------------------------------------------------------
/BiDAF/Figures/baseline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/baseline.png
--------------------------------------------------------------------------------
/BiDAF/Figures/enhenced.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/enhenced.png
--------------------------------------------------------------------------------
/BiDAF/Figures/enhenced_elmo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/enhenced_elmo.png
--------------------------------------------------------------------------------
/BiDAF/README.md:
--------------------------------------------------------------------------------
1 | # Using enhanced BiDAF++ on QuAC
2 |
3 | ## Descriptions
4 | The original BiDAF++ model uses a Char-CNN for character embeddings and GloVe for word embeddings, and is additionally equipped with contextualized embeddings and self-attention. In this model, marker embeddings mark words that appeared in previous answers, while question turn numbers are encoded into the question embeddings.
5 |
6 | We used the [AllenNLP](https://github.com/allenai/allennlp) library to modify BiDAF++ and trained the enhanced models on the [QuAC](https://quac.ai/) dataset.
7 |
8 | ## Architecture of the enhanced BiDAF++
9 |
10 |
11 | ![](Figures/Arch.png)
12 |
13 | ### Embedding layers from BiDAF++
14 |
15 | We aimed to add ELMo or BERT embeddings on top of the original embeddings.
16 |
17 |
18 | Let ELMo_k be an ELMo vector, x_k denote an original embedding vector, and h_k denote the contextual vector generated by the bi-LSTM layer. We then use the ELMo-enhanced representation [x_k; ELMo_k] in place of x_k, and likewise replace h_k with [h_k; ELMo_k].
19 | We do the same when using BERT embeddings.
20 |
21 | ELMo ver: [options_file](https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json)
22 | and [weight_file](https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5)
23 | BERT ver: [bert-large-cased](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt)
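
Below is a minimal PyTorch sketch of the enhanced representation described above (tensor names and sizes are ours, for illustration only):

```python
import torch

# Toy batch: 2 dialogs, 50 tokens each.
# x:    original embedding (GloVe 300 + Char-CNN 100 + 2 * marker 10 = 420)
# elmo: contextualized embedding of the same tokens
x = torch.randn(2, 50, 420)
elmo = torch.randn(2, 50, 1024)

# ELMo-enhanced input representation [x_k; ELMo_k]; 420 + 1024 = 1444
# matches "input_size" in BiDAFF++_with_glove+elmo.jsonnet.
enhanced_input = torch.cat([x, elmo], dim=-1)        # (2, 50, 1444)

# The same trick replaces the bi-RNN output h_k with [h_k; ELMo_k].
h = torch.randn(2, 50, 200)                          # 2 directions x hidden 100
enhanced_contextual = torch.cat([h, elmo], dim=-1)   # (2, 50, 1224)
```
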
24 | ### Self-Attention layers from BiDAF++
25 |
26 | First, the input is passed through a linear layer with ReLU activation, and then through a bi-directional GRU or LSTM. After that, we apply an attention mechanism between the context and itself, as follows:
27 |
28 | Let h_i and h_j be the vectors for context words i and j, and n_c be the length of the context. We compute the attention between the two words as:
29 |
30 | a_ij = w1 · h_i + w2 · h_j + w3 · (h_i ⊙ h_j)
31 |
32 |
33 | where w1, w2, and w3 are learned vectors and ⊙ is element-wise multiplication. We set a_ij = −∞ when i = j. Then we compute an attended vector c_i for each context token as:
34 |
35 | c_i = Σ_{j=1..n_c} p_ij · h_j,   where p_ij = exp(a_ij) / Σ_{j'} exp(a_ij')
36 |
37 |
38 | We then compute a context-to-context vector t_c:
39 |
40 | t_c = Σ_{i=1..n_c} p_i · h_i,   where p_i = exp(max_j a_ij) / Σ_{i'} exp(max_j a_i'j)
41 |
42 |
43 | The output of this layer is built by concatenating h_i, c_i, h_i ⊙ c_i and t_c ⊙ c_i,
44 | and is additionally summed with the input from before the ReLU (a residual connection).
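
A minimal PyTorch sketch of this self-attention step (our own simplified version, not the AllenNLP implementation; `w1`, `w2`, `w3` are the learned vectors above):

```python
import torch

def context_self_attention(h, w1, w2, w3):
    # h: (n_c, d) context vectors; w1, w2, w3: learned (d,) vectors.
    # a_ij = w1·h_i + w2·h_j + w3·(h_i ⊙ h_j), computed for all pairs at once.
    a = (h @ w1).unsqueeze(1) + (h @ w2).unsqueeze(0) + (h * w3) @ h.t()
    diag = torch.eye(h.size(0), dtype=torch.bool)
    a = a.masked_fill(diag, float('-inf'))        # a_ij = -inf when i == j
    c = torch.softmax(a, dim=1) @ h               # attended vector c_i per token
    p = torch.softmax(a.max(dim=1).values, dim=0)
    t_c = p @ h                                   # context-to-context vector
    return c, t_c

h = torch.randn(30, 200)                          # 30 context tokens, d = 200
c, t_c = context_self_attention(h, *torch.randn(3, 200))
```

The returned `c` and `t_c` are then combined with `h` exactly as described above.
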
45 | ### Modification
46 |
47 | We modified the wordpiece indexer and the BERT token embedder so that the model can be trained with BERT embeddings, because the original two files cannot handle sequences with more than 500 tokens. The modified files do not affect the training of models with other embeddings.
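
The core of the modification is to split an over-long wordpiece sequence into `max_pieces`-sized windows, run BERT on the windows as one larger batch, and stitch the outputs back together (see `bert_token_embedder.py` below). A toy sketch of the splitting step, with made-up sizes:

```python
import torch
import torch.nn.functional as F

max_pieces = 512
input_ids = torch.randint(1, 30000, (2, 1300))  # toy batch: 1300 > 512 pieces

# Split into max_pieces windows, zero-pad the last one, and stack the windows
# along the batch dimension so BERT only ever sees length-512 inputs.
windows = list(input_ids.split(max_pieces, dim=-1))
windows[-1] = F.pad(windows[-1], pad=[0, max_pieces - windows[-1].size(-1)], value=0)
bert_input = torch.cat(windows, dim=0)          # (2 * 3, 512)
```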
48 |
49 | ## Platform and tools
50 | ### Launching a Deep Learning VM on GCP
51 | - 2 vCPUs with 13GB memory
52 | - 1 NVIDIA Tesla V100 GPU
53 | - PyTorch 1.0 + fast.ai 1.0 (CUDA 10.0)
54 | - Install NVIDIA GPU driver automatically on first startup
55 |
56 | ### Setting up an AllenNLP virtual environment
57 |
58 | 1. Create a Conda environment with Python 3.6
59 |
60 | ```bash
61 | conda create -n allennlp python=3.6
62 | ```
63 |
64 | 2. Activate the Conda environment. You will need to activate the Conda environment in each terminal in which you want to use AllenNLP.
65 |
66 | ```bash
67 | source activate allennlp
68 | ```
69 |
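3. Install AllenNLP inside the environment (the exact release to pin is our assumption; the configs here match the 0.8.x line from this period).

```bash
pip install allennlp
```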
70 |
71 | ## Usage
72 |
73 | ### Dataset
74 | [Train data](https://s3.amazonaws.com/my89public/quac/train_5000.json)
75 | [Validation data](https://s3.amazonaws.com/my89public/quac/val.json)
76 |
77 | ### Training
78 | Train an enhanced model on the QuAC dataset (the config file already points at the training and validation data); pass a config and an output directory, e.g.:
79 | ```
80 | nohup allennlp train BiDAFF++_with_glove+bert.jsonnet --serialization-dir <serialization_dir> > output.log &
81 | ```
82 | As the [QuAC](https://arxiv.org/pdf/1808.07036.pdf) paper mentions, questions in the training set have one reference answer, while validation and test questions have five references each, which is why the F1 score on the validation set is higher than on the training set.
83 |
84 | The best model's configuration is [here](https://github.com/deepnlp-cs599-usc/quac/blob/master/BiDAF/BiDAFF%2B%2B_with_glove%2Bbert.jsonnet).
85 | ## Results
86 | ### F1 score
87 | Enhanced model by BERT
88 |
89 | ![](Figures/enhenced.png)
90 |
91 | Enhanced model by ELMo
92 |
93 | ![](Figures/enhenced_elmo.png)
94 |
95 | Baseline model
96 |
97 | ![](Figures/baseline.png)
98 |
99 |
100 | ### Performance on baseline and enhanced models
101 |
102 | | | F1 score on training set | F1 score on validation set|
103 | | --- | --- | --- |
104 | | Baseline model | 49.40 | 55.59 |
105 | | Enhanced by ELMo | 49.40 | **58.40** |
106 | | Enhanced by BERT | **53.05** | **60.04**|
107 |
108 | ## Related works
109 |
110 | * [Bidirectional attention flow for machine comprehension](https://arxiv.org/abs/1611.01603) by Minjoon Seo et al.
111 | * [Simple and effective multi-paragraph reading comprehension](https://arxiv.org/abs/1710.10723) by Christopher Clark et al.
112 | * [Deep contextualized word representations](https://arxiv.org/abs/1802.05365) by Matthew E. Peters et al.
113 | * [BERT: Pre-training of deep bidirectional transformers for language understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin et al.
114 |
--------------------------------------------------------------------------------
/BiDAF/bert_token_embedder.py:
--------------------------------------------------------------------------------
1 | """
2 | A ``TokenEmbedder`` which uses one of the BERT models
3 | (https://github.com/google-research/bert)
4 | to produce embeddings.
5 | At its core it uses Hugging Face's PyTorch implementation
6 | (https://github.com/huggingface/pytorch-pretrained-BERT),
7 | so thanks to them!
8 | """
9 | import logging
10 |
11 | import torch
12 | import torch.nn.functional as F
13 |
14 | from pytorch_pretrained_bert.modeling import BertModel
15 |
16 | from allennlp.modules.scalar_mix import ScalarMix
17 | from allennlp.modules.token_embedders.token_embedder import TokenEmbedder
18 | from allennlp.nn import util
19 |
20 | logger = logging.getLogger(__name__)
21 |
22 |
23 | class BertEmbedder(TokenEmbedder):
24 | """
25 | A ``TokenEmbedder`` that produces BERT embeddings for your tokens.
26 | Should be paired with a ``BertIndexer``, which produces wordpiece ids.
27 | Most likely you probably want to use ``PretrainedBertEmbedder``
28 | for one of the named pretrained models, not this base class.
29 | Parameters
30 | ----------
31 | bert_model: ``BertModel``
32 | The BERT model being wrapped.
33 | top_layer_only: ``bool``, optional (default = ``False``)
34 | If ``True``, then only return the top layer instead of applying the scalar mix.
35 | max_pieces : int, optional (default: 512)
36 | The BERT embedder uses positional embeddings and so has a corresponding
37 | maximum length for its input ids. Assuming the inputs are windowed
38 | and padded appropriately by this length, the embedder will split them into a
39 | large batch, feed them into BERT, and recombine the output as if it was a
40 | longer sequence.
41 | start_tokens : int, optional (default: 1)
42 | The number of starting special tokens input to BERT (usually 1, i.e., [CLS])
43 | end_tokens : int, optional (default: 1)
44 | The number of ending tokens input to BERT (usually 1, i.e., [SEP])
45 | """
46 | def __init__(self,
47 | bert_model: BertModel,
48 | top_layer_only: bool = False,
49 | max_pieces: int = 512,
50 | start_tokens: int = 1,
51 | end_tokens: int = 1) -> None:
52 | super().__init__()
53 | self.bert_model = bert_model
54 | self.output_dim = bert_model.config.hidden_size
55 | self.max_pieces = max_pieces
56 | self.start_tokens = start_tokens
57 | self.end_tokens = end_tokens
58 |
59 | if not top_layer_only:
60 | self._scalar_mix = ScalarMix(bert_model.config.num_hidden_layers,
61 | do_layer_norm=False)
62 | else:
63 | self._scalar_mix = None
64 |
65 | def get_output_dim(self) -> int:
66 | return self.output_dim
67 |
68 | def forward(self,
69 | input_ids: torch.LongTensor,
70 | offsets: torch.LongTensor = None,
71 | token_type_ids: torch.LongTensor = None) -> torch.Tensor:
72 | """
73 | Parameters
74 | ----------
75 | input_ids : ``torch.LongTensor``
76 | The (batch_size, ..., max_sequence_length) tensor of wordpiece ids.
77 | offsets : ``torch.LongTensor``, optional
78 | The BERT embeddings are one per wordpiece. However it's possible/likely
79 | you might want one per original token. In that case, ``offsets``
80 | represents the indices of the desired wordpiece for each original token.
81 | Depending on how your token indexer is configured, this could be the
82 | position of the last wordpiece for each token, or it could be the position
83 | of the first wordpiece for each token.
84 | For example, if you had the sentence "Definitely not", and if the corresponding
85 | wordpieces were ["Def", "##in", "##ite", "##ly", "not"], then the input_ids
86 | would be 5 wordpiece ids, and the "last wordpiece" offsets would be [3, 4].
87 | If offsets are provided, the returned tensor will contain only the wordpiece
88 | embeddings at those positions, and (in particular) will contain one embedding
89 | per token. If offsets are not provided, the entire tensor of wordpiece embeddings
90 | will be returned.
91 | token_type_ids : ``torch.LongTensor``, optional
92 | If an input consists of two sentences (as in the BERT paper),
93 | tokens from the first sentence should have type 0 and tokens from
94 | the second sentence should have type 1. If you don't provide this
95 | (the default BertIndexer doesn't) then it's assumed to be all 0s.
96 | """
97 | # pylint: disable=arguments-differ
98 | batch_size, full_seq_len = input_ids.size(0), input_ids.size(-1)
99 | initial_dims = list(input_ids.shape[:-1])
100 |
101 | # The embedder may receive an input tensor that has a sequence length longer than can
102 | # be fit. In that case, we should expect the wordpiece indexer to create padded windows
103 | # of length `self.max_pieces` for us, and have them concatenated into one long sequence.
104 | # E.g., "[CLS] I went to the [SEP] [CLS] to the store to [SEP] ..."
105 | # We can then split the sequence into sub-sequences of that length, and concatenate them
106 | # along the batch dimension so we effectively have one huge batch of partial sentences.
107 | # This can then be fed into BERT without any sentence length issues. Keep in mind
108 | # that the memory consumption can dramatically increase for large batches with extremely
109 | # long sentences.
110 | needs_split = full_seq_len > self.max_pieces
111 | last_window_size = 0
112 | if needs_split:
113 | # Split the flattened list by the window size, `max_pieces`
114 | split_input_ids = list(input_ids.split(self.max_pieces, dim=-1))
115 |
116 | # We want all sequences to be the same length, so pad the last sequence
117 | last_window_size = split_input_ids[-1].size(-1)
118 | padding_amount = self.max_pieces - last_window_size
119 | split_input_ids[-1] = F.pad(split_input_ids[-1], pad=[0, padding_amount], value=0)
120 |
121 | # Now combine the sequences along the batch dimension
122 | input_ids = torch.cat(split_input_ids, dim=0)
123 |
124 | if token_type_ids is None:
125 | token_type_ids = torch.zeros_like(input_ids)
126 |
127 | input_mask = (input_ids != 0).long()
128 |
129 | # input_ids may have extra dimensions, so we reshape down to 2-d
130 | # before calling the BERT model and then reshape back at the end.
131 | all_encoder_layers, _ = self.bert_model(input_ids=util.combine_initial_dims(input_ids),
132 | token_type_ids=util.combine_initial_dims(token_type_ids),
133 | attention_mask=util.combine_initial_dims(input_mask))
134 | all_encoder_layers = torch.stack(all_encoder_layers)
135 |
136 | if needs_split:
137 | # First, unpack the output embeddings into one long sequence again
138 | unpacked_embeddings = torch.split(all_encoder_layers, batch_size, dim=1)
139 | unpacked_embeddings = torch.cat(unpacked_embeddings, dim=2)
140 |
141 | # Next, select indices of the sequence such that it will result in embeddings representing the original
142 | # sentence. To do this, the indices will be the left half of each embedded window sub-sequence. E.g.,
143 | # 0 1 2 3 4 5 6 7 8 9 10 11
144 | # "[CLS] I went to the [SEP] [CLS] to the store to [SEP]"
145 | # should now have indices [0, 1, 2, 7, 8]. The remaining code adds the final window indices (so the
146 | # final indices should be [0, 1, 2, 7, 8, 9, 10, 11])
147 |
148 | full_seq_len = unpacked_embeddings.size(-2) - self.max_pieces
149 | stride = self.max_pieces // 2
150 | select_indices = [0]
151 | select_indices.extend(i for i in range(full_seq_len)
152 | if self.start_tokens - 1 < i % self.max_pieces < stride)
153 |
154 | # Add the final window indices
155 | final_window_size = last_window_size - self.start_tokens + self.end_tokens
156 | select_indices.extend(full_seq_len + i for i in range(self.start_tokens, final_window_size))
157 |
158 | initial_dims.append(len(select_indices))
159 |
160 | recombined_embeddings = unpacked_embeddings[:, :, select_indices]
161 | else:
162 | recombined_embeddings = all_encoder_layers
163 |
164 | # Recombine the outputs of all layers
165 | # (layers, batch_size * d1 * ... * dn, sequence_length, embedding_dim)
166 | # recombined = torch.cat(combined, dim=2)
167 | input_mask = (recombined_embeddings != 0).long()
168 |
169 | if self._scalar_mix is not None:
170 | mix = self._scalar_mix(recombined_embeddings, input_mask)
171 | else:
172 | mix = recombined_embeddings[-1]
173 |
174 | # At this point, mix is (batch_size * d1 * ... * dn, sequence_length, embedding_dim)
175 |
176 | if offsets is None:
177 | # Resize to (batch_size, d1, ..., dn, sequence_length, embedding_dim)
178 | dims = initial_dims if needs_split else input_ids.size()
179 | return util.uncombine_initial_dims(mix, dims)
180 | else:
181 | # offsets is (batch_size, d1, ..., dn, orig_sequence_length)
182 | offsets2d = util.combine_initial_dims(offsets)
183 | # now offsets is (batch_size * d1 * ... * dn, orig_sequence_length)
184 | range_vector = util.get_range_vector(offsets2d.size(0),
185 | device=util.get_device_of(mix)).unsqueeze(1)
186 | # selected embeddings is also (batch_size * d1 * ... * dn, orig_sequence_length)
187 | selected_embeddings = mix[range_vector, offsets2d]
188 |
189 | return util.uncombine_initial_dims(selected_embeddings, offsets.size())
190 |
191 |
192 | @TokenEmbedder.register("bert-pretrained")
193 | class PretrainedBertEmbedder(BertEmbedder):
194 | # pylint: disable=line-too-long
195 | """
196 | Parameters
197 | ----------
198 | pretrained_model: ``str``
199 | Either the name of the pretrained model to use (e.g. 'bert-base-uncased'),
200 | or the path to the .tar.gz file with the model weights.
201 | If the name is a key in the list of pretrained models at
202 | https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L41
203 | the corresponding path will be used; otherwise it will be interpreted as a path or URL.
204 | requires_grad : ``bool``, optional (default = False)
205 | If True, compute gradient of BERT parameters for fine tuning.
206 | top_layer_only: ``bool``, optional (default = ``False``)
207 | If ``True``, then only return the top layer instead of applying the scalar mix.
208 | """
209 | def __init__(self, pretrained_model: str, requires_grad: bool = False, top_layer_only: bool = False) -> None:
210 | model = BertModel.from_pretrained(pretrained_model)
211 |
212 | for param in model.parameters():
213 | param.requires_grad = requires_grad
214 |
215 | super().__init__(bert_model=model, top_layer_only=top_layer_only)
--------------------------------------------------------------------------------
/FlowQA_Attention/README.md:
--------------------------------------------------------------------------------
1 | # FlowQA + Attention Over Flow
2 |
3 |
4 |
5 | ## How we improve FlowQA -- Adding Self-attention
6 |
7 | We noticed that the "flow" operation uses no attention mechanism. We believe attention deserves consideration here, because not every previous turn of the conversation is important: when a new question is asked, it is likely that only a few of the previous questions are involved, rather than all of them. By adding attention to the "flow" operation, we can align the new representations with the previous ones that are actually useful. We call this "attention-over-flow".
8 |
9 | 
10 |
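As a conceptual sketch only (toy tensors of our own; the repository's actual implementation lives in `code/QA_model/detail_model.py`), attention-over-flow can be pictured as each turn attending over the flow states of the current and earlier turns:

```python
import torch
import torch.nn.functional as F

# Toy shapes: t dialog turns, m context words, d-dimensional flow vectors.
# flow[k] holds the flow representation of every context word after turn k.
t, m, d = 5, 80, 125
flow = torch.randn(t, m, d)

# Score every pair of turns for each context word, then mask out future
# turns so that turn i only attends to turns j <= i.
scores = torch.einsum('imd,jmd->mij', flow, flow)       # (m, t, t)
past = torch.tril(torch.ones(t, t, dtype=torch.bool))   # current + earlier turns
scores = scores.masked_fill(~past, float('-inf'))

weights = F.softmax(scores, dim=-1)                     # attention over history
attended_flow = torch.einsum('mij,jmd->imd', weights, flow)  # (t, m, d)
```

With `--flow_attention=0` this step is skipped and the vanilla flow operation simply passes each turn's states on to the next turn.
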
11 |
12 | ## Experiments
13 |
14 | We conducted our experiments on Google Cloud, on a machine with 2 vCPUs, one Tesla K80 GPU with 11 GB of GPU memory, and 60 GB of disk. We used Python 3.6.8 and PyTorch 1.0.1, with the batch size set to 2. The code is based on [momohuang's FlowQA code on GitHub](https://github.com/momohuang/FlowQA).
15 |
16 | Running the code is the same as for FlowQA, except that "attention over flow" is enabled by default.
17 |
18 | We borrow the instructions from https://github.com/momohuang/FlowQA as follows.
19 |
20 | Step 1:
21 | perform the following:
22 |
23 | > pip install -r requirements.txt
24 |
25 | to install all required Python packages.
26 |
27 | Step 2:
28 | download necessary files using:
29 |
30 | > ./download.sh
31 |
32 | Step 3:
33 | preprocess the data files using:
34 |
35 | > python preprocess_QuAC.py
36 |
37 | Step 4:
38 | run the training code using:
39 |
40 | > python train_QuAC.py
41 |
42 | To disable "attention over flow", run:
43 |
44 | > python train_QuAC.py --flow_attention=0
45 |
46 |
47 | ## Results
48 |
49 | The results show that our attempt slightly improves FlowQA, by 0.1 F1 on the dev set. We would also like to mention that the model converges much faster: only 10 epochs are used instead of 20.
50 |
51 | 
52 |
53 |
54 | ## References
55 |
56 | [FlowQA: Grasping flow in history for conversational machine comprehension.](https://arxiv.org/abs/1810.06683) By Huang H.-Y., Choi E., and Yih W.
57 |
58 | [QuAC: Question answering in context.](https://arxiv.org/abs/1808.07036) By Choi E., He H., Iyyer M., et al.
59 |
60 |
--------------------------------------------------------------------------------
/FlowQA_Attention/code/QA_model/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/FlowQA_Attention/code/QA_model/model_QuAC.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.optim as optim
4 | import torch.nn.functional as F
5 | import numpy as np
6 | import logging
7 |
8 | from torch.nn import Parameter
9 | from torch.autograd import Variable
10 | from .utils import AverageMeter
11 | from .detail_model import FlowQA
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | class QAModel(object):
17 | """
18 | High level model that handles initializing the underlying network
19 | architecture, saving, updating examples, and predicting examples.
20 | """
21 |
22 | def __init__(self, opt, embedding=None, state_dict=None):
23 | # Book-keeping.
24 | self.opt = opt
25 | self.updates = state_dict['updates'] if state_dict else 0
26 | self.eval_embed_transfer = True
27 | self.train_loss = AverageMeter()
28 |
29 | # Building network.
30 | self.network = FlowQA(opt, embedding)
31 | if state_dict:
32 | new_state = set(self.network.state_dict().keys())
33 | for k in list(state_dict['network'].keys()):
34 | if k not in new_state:
35 | del state_dict['network'][k]
36 | self.network.load_state_dict(state_dict['network'])
37 |
38 | # Building optimizer.
39 | parameters = [p for p in self.network.parameters() if p.requires_grad]
40 | if opt['optimizer'] == 'sgd':
41 | self.optimizer = optim.SGD(parameters, opt['learning_rate'],
42 | momentum=opt['momentum'],
43 | weight_decay=opt['weight_decay'])
44 | elif opt['optimizer'] == 'adamax':
45 | self.optimizer = optim.Adamax(parameters,
46 | weight_decay=opt['weight_decay'])
47 | elif opt['optimizer'] == 'adadelta':
48 | self.optimizer = optim.Adadelta(parameters, rho=0.95, weight_decay=opt['weight_decay'])
49 | else:
50 | raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])
51 | if state_dict:
52 | self.optimizer.load_state_dict(state_dict['optimizer'])
53 |
54 | if opt['fix_embeddings']:
55 | wvec_size = 0
56 | else:
57 | wvec_size = (opt['vocab_size'] - opt['tune_partial']) * opt['embedding_dim']
58 | self.total_param = sum([p.nelement() for p in parameters]) - wvec_size
59 |
60 | def update(self, batch):
61 | # Train mode
62 | self.network.train()
63 | torch.set_grad_enabled(True)
64 |
65 | # Transfer to GPU
66 | if self.opt['cuda']:
67 | inputs = [e.cuda(non_blocking=True) for e in batch[:9] + batch[17:]]
68 | overall_mask = batch[9].cuda(non_blocking=True)
69 |
70 | answer_s = batch[10].cuda(non_blocking=True)
71 | answer_e = batch[11].cuda(non_blocking=True)
72 | answer_c = batch[12].cuda(non_blocking=True)
73 | else:
74 | inputs = [e for e in batch[:9] + batch[17:]]
75 | overall_mask = batch[9]
76 |
77 | answer_s = batch[10]
78 | answer_e = batch[11]
79 | answer_c = batch[12]
80 |
81 | # Run forward
82 | # output: [batch_size, question_num, context_len], [batch_size, question_num]
83 | score_s, score_e, score_no_answ = self.network(*inputs)
84 |
85 | # Compute loss and accuracies
86 | loss = self.opt['elmo_lambda'] * (self.network.elmo.scalar_mix_0.scalar_parameters[0] ** 2
87 | + self.network.elmo.scalar_mix_0.scalar_parameters[1] ** 2
88 | + self.network.elmo.scalar_mix_0.scalar_parameters[2] ** 2) # ELMo L2 regularization
89 | all_no_answ = (answer_c == 0)
90 | answer_s.masked_fill_(all_no_answ, -100) # ignore_index is -100 in F.cross_entropy
91 | answer_e.masked_fill_(all_no_answ, -100)
92 |
93 | for i in range(overall_mask.size(0)):
94 | q_num = sum(overall_mask[i]) # the true question number for this sampled context
95 |
96 | target_s = answer_s[i, :q_num] # Size: q_num
97 | target_e = answer_e[i, :q_num]
98 | target_c = answer_c[i, :q_num]
99 | target_no_answ = all_no_answ[i, :q_num]
100 |
101 | # single_loss is averaged across q_num
102 | if self.opt['question_normalize']:
103 | single_loss = F.binary_cross_entropy_with_logits(score_no_answ[i, :q_num], target_no_answ.float()) * q_num.item() / 8.0
104 | single_loss = single_loss + F.cross_entropy(score_s[i, :q_num], target_s) * (q_num - sum(target_no_answ)).item() / 7.0
105 | single_loss = single_loss + F.cross_entropy(score_e[i, :q_num], target_e) * (q_num - sum(target_no_answ)).item() / 7.0
106 | else:
107 | single_loss = F.binary_cross_entropy_with_logits(score_no_answ[i, :q_num], target_no_answ.float()) \
108 | + F.cross_entropy(score_s[i, :q_num], target_s) + F.cross_entropy(score_e[i, :q_num], target_e)
109 |
110 | loss = loss + (single_loss / overall_mask.size(0))
111 | self.train_loss.update(loss.item(), overall_mask.size(0))
112 |
113 | # Clear gradients and run backward
114 | self.optimizer.zero_grad()
115 | loss.backward()
116 |
117 | # Clip gradients
118 | torch.nn.utils.clip_grad_norm_(self.network.parameters(),
119 | self.opt['grad_clipping'])
120 |
121 | # Update parameters
122 | self.optimizer.step()
123 | self.updates += 1
124 |
125 | # Reset any partially fixed parameters (e.g. rare words)
126 | self.reset_embeddings()
127 | self.eval_embed_transfer = True
128 |
129 | def predict(self, batch, No_Ans_Threshold=None):
130 | # Eval mode
131 | self.network.eval()
132 | torch.set_grad_enabled(False)
133 |
134 | # Transfer trained embedding to evaluation embedding
135 | if self.eval_embed_transfer:
136 | self.update_eval_embed()
137 | self.eval_embed_transfer = False
138 |
139 | # Transfer to GPU
140 | if self.opt['cuda']:
141 | inputs = [e.cuda(non_blocking=True) for e in batch[:9] + batch[17:]]
142 | else:
143 | inputs = [e for e in batch[:9] + batch[17:]]
144 |
145 | # Run forward
146 | # output: [batch_size, question_num, context_len], [batch_size, question_num]
147 | score_s, score_e, score_no_answ = self.network(*inputs)
148 | score_s = F.softmax(score_s, dim=2)
149 | score_e = F.softmax(score_e, dim=2)
150 |
151 | # Transfer to CPU/normal tensors for numpy ops
152 | score_s = score_s.data.cpu()
153 | score_e = score_e.data.cpu()
154 | score_no_answ = score_no_answ.data.cpu()
155 |
156 | # Get argmax text spans
157 | text = batch[13]
158 | spans = batch[14]
159 | overall_mask = batch[9]
160 |
161 | predictions, no_ans_scores = [], []
162 | max_len = self.opt['max_len'] or score_s.size(2)
163 |
164 | for i in range(overall_mask.size(0)):
165 | dialog_pred, dialog_noans = [], []
166 |
167 | for j in range(overall_mask.size(1)):
168 | if overall_mask[i, j] == 0: # this dialog has ended
169 | break
170 |
171 | dialog_noans.append(score_no_answ[i, j].item())
172 | if No_Ans_Threshold is not None and score_no_answ[i, j] > No_Ans_Threshold:
173 | dialog_pred.append("CANNOTANSWER")
174 | else:
175 | scores = torch.ger(score_s[i, j], score_e[i, j])
176 | scores.triu_().tril_(max_len - 1)
177 | scores = scores.numpy()
178 | s_idx, e_idx = np.unravel_index(np.argmax(scores), scores.shape)
179 |
180 | s_offset, e_offset = spans[i][s_idx][0], spans[i][e_idx][1]
181 | dialog_pred.append(text[i][s_offset:e_offset])
182 |
183 | predictions.append(dialog_pred)
184 | no_ans_scores.append(dialog_noans)
185 |
186 | return predictions, no_ans_scores # list of (list of strings), list of (list of floats)
187 |
188 | # allow the evaluation embedding be larger than training embedding
189 | # this is helpful if we have pretrained word embeddings
190 | def setup_eval_embed(self, eval_embed, padding_idx = 0):
191 | # eval_embed should be a supermatrix of training embedding
192 | self.network.eval_embed = nn.Embedding(eval_embed.size(0),
193 | eval_embed.size(1),
194 | padding_idx = padding_idx)
195 | self.network.eval_embed.weight.data = eval_embed
196 | for p in self.network.eval_embed.parameters():
197 | p.requires_grad = False
198 | self.eval_embed_transfer = True
199 |
200 | if hasattr(self.network, 'CoVe'):
201 | self.network.CoVe.setup_eval_embed(eval_embed)
202 |
203 | def update_eval_embed(self):
204 | # update evaluation embedding to trained embedding
205 | if self.opt['tune_partial'] > 0:
206 | offset = self.opt['tune_partial']
207 | self.network.eval_embed.weight.data[0:offset] \
208 | = self.network.embedding.weight.data[0:offset]
209 | else:
210 | offset = 10
211 | self.network.eval_embed.weight.data[0:offset] \
212 | = self.network.embedding.weight.data[0:offset]
213 |
214 | def reset_embeddings(self):
215 | # Reset fixed embeddings to original value
216 | if self.opt['tune_partial'] > 0:
217 | offset = self.opt['tune_partial']
218 | if offset < self.network.embedding.weight.data.size(0):
219 | self.network.embedding.weight.data[offset:] \
220 | = self.network.fixed_embedding
221 |
222 | def get_pretrain(self, state_dict):
223 | own_state = self.network.state_dict()
224 | for name, param in state_dict.items():
225 | if name not in own_state:
226 | continue
227 | if isinstance(param, Parameter):
228 | param = param.data
229 | try:
230 | own_state[name].copy_(param)
231 | except:
232 | print("Skip", name)
233 | continue
234 |
235 | def save(self, filename, epoch):
236 | params = {
237 | 'state_dict': {
238 | 'network': self.network.state_dict(),
239 | 'optimizer': self.optimizer.state_dict(),
240 | 'updates': self.updates # how many updates
241 | },
242 | 'config': self.opt,
243 | 'epoch': epoch
244 | }
245 | try:
246 | torch.save(params, filename)
247 | logger.info('model saved to {}'.format(filename))
248 | except BaseException:
249 | logger.warning('[ WARN: Saving failed... continuing anyway. ]')
250 |
251 | def save_for_predict(self, filename, epoch):
252 | network_state = dict([(k, v) for k, v in self.network.state_dict().items() if k[0:4] != 'CoVe'])
253 | if 'eval_embed.weight' in network_state:
254 | del network_state['eval_embed.weight']
255 | if 'fixed_embedding' in network_state:
256 | del network_state['fixed_embedding']
257 | params = {
258 | 'state_dict': {'network': network_state},
259 | 'config': self.opt,
260 | }
261 | try:
262 | torch.save(params, filename)
263 | logger.info('model saved to {}'.format(filename))
264 | except BaseException:
265 | logger.warning('[ WARN: Saving failed... continuing anyway. ]')
266 |
267 | def cuda(self):
268 | self.network.cuda()
269 |
--------------------------------------------------------------------------------
/FlowQA_Attention/code/QA_model/utils.py:
--------------------------------------------------------------------------------
1 | class AverageMeter(object):
2 | """Computes and stores the average and current value."""
3 | def __init__(self):
4 | self.reset()
5 |
6 | def reset(self):
7 | self.val = 0
8 | self.avg = 0
9 | self.sum = 0
10 | self.count = 0
11 |
12 | def update(self, val, n=1):
13 | self.val = val
14 | self.sum += val * n
15 | self.count += n
16 | self.avg = self.sum / self.count
17 |
--------------------------------------------------------------------------------
/FlowQA_Attention/code/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/FlowQA_Attention/code/predict_QuAC.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import sys
4 | import random
5 | import string
6 | import logging
7 | import argparse
8 | from os.path import basename
9 | from shutil import copyfile
10 | from datetime import datetime
11 | from collections import Counter
12 | import torch
13 | import msgpack
14 | import pickle
15 | import pandas as pd
16 | import numpy as np
17 | from QA_model.model_QuAC import QAModel
18 | from general_utils import score, BatchGen_QuAC, find_best_score_and_thresh
19 |
20 | parser = argparse.ArgumentParser(
21 | description='Predict using a Dialog QA model.'
22 | )
23 | parser.add_argument('--dev_dir', default='QuAC_data/')
24 |
25 | parser.add_argument('-o', '--output_dir', default='pred_out/')
26 | parser.add_argument('--number', type=int, default=-1, help='id of the current prediction')
27 | parser.add_argument('-m', '--model', default='',
28 | help='testing model pathname, e.g. "models/checkpoint_epoch_11.pt"')
29 |
30 | parser.add_argument('-bs', '--batch_size', type=int, default=4)
31 | parser.add_argument('--no_ans', type=float, default=0)
32 | parser.add_argument('--min_f1', type=float, default=0.4)
33 |
34 | parser.add_argument('--show', type=int, default=3)
35 | parser.add_argument('--seed', type=int, default=1023,
36 | help='random seed for data shuffling, dropout, etc.')
37 | parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available(),
38 | help='whether to use GPU acceleration.')
39 |
40 | args = parser.parse_args()
41 | if args.model == '':
42 | print("model file is not provided")
43 | sys.exit(-1)
44 | if args.model[-3:] != '.pt':
45 | print("does not recognize the model file")
46 | sys.exit(-1)
47 |
48 | # create prediction output dir
49 | os.makedirs(args.output_dir, exist_ok=True)
50 | # count the number of prediction files
51 | if args.number == -1:
52 | args.number = len(os.listdir(args.output_dir))+1
53 | args.output = args.output_dir + 'pred' + str(args.number) + '.pckl'
54 |
55 | random.seed(args.seed)
56 | np.random.seed(args.seed)
57 | torch.manual_seed(args.seed)
58 | if args.cuda:
59 | torch.cuda.manual_seed_all(args.seed)
60 |
61 | log = logging.getLogger(__name__)
62 | log.setLevel(logging.DEBUG)
63 | ch = logging.StreamHandler(sys.stdout)
64 | ch.setLevel(logging.INFO)
65 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')
66 | ch.setFormatter(formatter)
67 | log.addHandler(ch)
68 |
69 | def main():
70 | log.info('[program starts.]')
71 | checkpoint = torch.load(args.model)
72 | opt = checkpoint['config']
73 | opt['task_name'] = 'QuAC'
74 | opt['cuda'] = args.cuda
75 | opt['seed'] = args.seed
76 | if opt.get('disperse_flow') is None:
77 | opt['disperse_flow'] = False
78 | if opt.get('rationale_lambda') is None:
79 | opt['rationale_lambda'] = 0.0
80 | if opt.get('no_dialog_flow') is None:
81 | opt['no_dialog_flow'] = False
82 | if opt.get('do_hierarchical_query') is None:
83 | opt['do_hierarchical_query'] = False
84 | state_dict = checkpoint['state_dict']
85 | log.info('[model loaded.]')
86 |
87 | test, test_embedding, test_answer = load_dev_data(opt)
88 | model = QAModel(opt, state_dict = state_dict)
89 | log.info('[Data loaded.]')
90 |
91 | model.setup_eval_embed(test_embedding)
92 |
93 | if args.cuda:
94 | model.cuda()
95 |
96 | batches = BatchGen_QuAC(test, batch_size=args.batch_size, evaluation=True, gpu=args.cuda, dialog_ctx=opt['explicit_dialog_ctx'], use_dialog_act=opt['use_dialog_act'], precompute_elmo=opt['elmo_batch_size'] // args.batch_size)
97 | sample_idx = random.sample(range(len(batches)), args.show)
98 |
99 | predictions = []
100 | no_ans_scores = []
101 | for i, batch in enumerate(batches):
102 | prediction, noans = model.predict(batch, No_Ans_Threshold=args.no_ans)
103 | predictions.extend(prediction)
104 | no_ans_scores.extend(noans)
105 |
106 | if not (i in sample_idx):
107 | continue
108 |
109 | print("Context: ", batch[-4][0])
110 | for j in range(len(batch[-2][0])):
111 | print("Q: ", batch[-2][0][j])
112 | print("A: ", prediction[0][j])
113 | print(" True A: ", batch[-1][0][j], "| Follow up" if batch[-6][0][j].item() // 10 else "| Don't follow up")
114 | print(" Val. A: ", test_answer[args.batch_size * i][j])
115 | print("")
116 |
117 |
118 | pred_out = {'predictions': predictions, 'no_ans_scores': no_ans_scores}
119 | with open(args.output, 'wb') as f:
120 | pickle.dump(pred_out, f)
121 |
122 | f1, h_f1, HEQ_Q, HEQ_D = score(predictions, test_answer, min_F1=args.min_f1)
123 | log.warning("Test F1: {:.2f}, HEQ_Q: {:.2f}, HEQ_D: {:.2f}".format(f1, HEQ_Q, HEQ_D))
124 |
125 | def load_dev_data(opt): # can be extended to true test set
126 | with open(os.path.join(args.dev_dir, 'dev_meta.msgpack'), 'rb') as f:
127 | meta = msgpack.load(f, encoding='utf8')
128 | embedding = torch.Tensor(meta['embedding'])
129 | assert opt['embedding_dim'] == embedding.size(1)
130 |
131 | with open(os.path.join(args.dev_dir, 'dev_data.msgpack'), 'rb') as f:
132 | data = msgpack.load(f, encoding='utf8')
133 |
134 | assert opt['num_features'] == len(data['context_features'][0][0]) + opt['explicit_dialog_ctx'] * (opt['use_dialog_act']*3 + 2)
135 |
136 | dev = {'context': list(zip(
137 | data['context_ids'],
138 | data['context_tags'],
139 | data['context_ents'],
140 | data['context'],
141 | data['context_span'],
142 | data['1st_question'],
143 | data['context_tokenized'])),
144 | 'qa': list(zip(
145 | data['question_CID'],
146 | data['question_ids'],
147 | data['context_features'],
148 | data['answer_start'],
149 | data['answer_end'],
150 | data['answer_choice'],
151 | data['question'],
152 | data['answer'],
153 | data['question_tokenized']))
154 | }
155 |
156 | dev_answer = []
157 | for i, CID in enumerate(data['question_CID']):
158 | if len(dev_answer) <= CID:
159 | dev_answer.append([])
160 | dev_answer[CID].append(data['all_answer'][i])
161 |
162 | return dev, embedding, dev_answer
163 |
164 | if __name__ == '__main__':
165 | main()
166 |
--------------------------------------------------------------------------------
/FlowQA_Attention/code/preprocess_QuAC.py:
--------------------------------------------------------------------------------
1 | import re
2 | import json
3 | import spacy
4 | import msgpack
5 | import unicodedata
6 | import numpy as np
7 | import pandas as pd
8 | import argparse
9 | import collections
10 | import multiprocessing
11 | import logging
12 | import random
13 | from allennlp.modules.elmo import batch_to_ids
14 | from general_utils import flatten_json, normalize_text, build_embedding, load_glove_vocab, pre_proc, get_context_span, find_answer_span, feature_gen, token2id
15 |
16 | parser = argparse.ArgumentParser(
17 | description='Preprocessing train + dev files, about 20 minutes to run on Servers.'
18 | )
19 | parser.add_argument('--wv_file', default='glove/glove.840B.300d.txt',
20 | help='path to word vector file.')
21 | parser.add_argument('--wv_dim', type=int, default=300,
22 | help='word vector dimension.')
23 | parser.add_argument('--sort_all', action='store_true',
24 | help='sort the vocabulary by frequencies of all words.'
25 | 'Otherwise consider question words first.')
26 | parser.add_argument('--threads', type=int, default=multiprocessing.cpu_count(),
27 | help='number of threads for preprocessing.')
28 | parser.add_argument('--no_match', action='store_true',
29 | help='do not extract the three exact matching features.')
30 | parser.add_argument('--seed', type=int, default=1023,
31 | help='random seed for data shuffling, embedding init, etc.')
32 |
33 |
34 | args = parser.parse_args()
35 | trn_file = 'QuAC_data/train.json'
36 | dev_file = 'QuAC_data/dev.json'
37 | wv_file = args.wv_file
38 | wv_dim = args.wv_dim
39 | nlp = spacy.load('en', disable=['parser'])
40 |
41 | random.seed(args.seed)
42 | np.random.seed(args.seed)
43 |
44 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG,
45 | datefmt='%m/%d/%Y %I:%M:%S')
46 | log = logging.getLogger(__name__)
47 |
48 | log.info('start data preparing... (using {} threads)'.format(args.threads))
49 |
50 | glove_vocab = load_glove_vocab(wv_file, wv_dim) # return a "set" of vocabulary
51 | log.info('glove loaded.')
52 |
53 | #===============================================================
54 | #=================== Work on training data =====================
55 | #===============================================================
56 |
57 | def proc_train(ith, article):
58 | rows = []
59 |
60 | for paragraph in article['paragraphs']:
61 | context = paragraph['context']
62 | for qa in paragraph['qas']:
63 | question = qa['question']
64 | answers = qa['orig_answer']
65 |
66 | answer = answers['text']
67 | answer_start = answers['answer_start']
68 | answer_end = answers['answer_start'] + len(answers['text'])
69 | answer_choice = 0 if answer == 'CANNOTANSWER' else\
70 | 1 if qa['yesno'] == 'y' else\
71 | 2 if qa['yesno'] == 'n' else\
72 | 3 # Not a yes/no question
73 | if answer_choice != 0:
74 | """
75 | 0: Do not ask a follow up question!
76 | 1: Definitely ask a follow up question!
77 | 2: Not too important, but you can ask a follow up.
78 | """
79 | answer_choice += 10 * (0 if qa['followup'] == "n" else\
80 | 1 if qa['followup'] == "y" else\
81 | 2)
82 | else:
83 | answer_start, answer_end = -1, -1
84 | rows.append((ith, question, answer, answer_start, answer_end, answer_choice))
85 | return rows, context
86 |
87 | train, train_context = flatten_json(trn_file, proc_train)
88 | train = pd.DataFrame(train, columns=['context_idx', 'question', 'answer',
89 | 'answer_start', 'answer_end', 'answer_choice'])
90 | log.info('train json data flattened.')
91 |
92 | print(train)
93 |
94 | trC_iter = (pre_proc(c) for c in train_context)
95 | trQ_iter = (pre_proc(q) for q in train.question)
96 | trC_docs = [doc for doc in nlp.pipe(trC_iter, batch_size=64, n_threads=args.threads)]
97 | trQ_docs = [doc for doc in nlp.pipe(trQ_iter, batch_size=64, n_threads=args.threads)]
98 |
99 | # tokens
100 | trC_tokens = [[normalize_text(w.text) for w in doc] for doc in trC_docs]
101 | trQ_tokens = [[normalize_text(w.text) for w in doc] for doc in trQ_docs]
102 | trC_unnorm_tokens = [[w.text for w in doc] for doc in trC_docs]
103 | log.info('All tokens for training are obtained.')
104 |
105 | train_context_span = [get_context_span(a, b) for a, b in zip(train_context, trC_unnorm_tokens)]
106 |
107 | ans_st_token_ls, ans_end_token_ls = [], []
108 | for ans_st, ans_end, idx in zip(train.answer_start, train.answer_end, train.context_idx):
109 | ans_st_token, ans_end_token = find_answer_span(train_context_span[idx], ans_st, ans_end)
110 | ans_st_token_ls.append(ans_st_token)
111 | ans_end_token_ls.append(ans_end_token)
112 |
113 | train['answer_start_token'], train['answer_end_token'] = ans_st_token_ls, ans_end_token_ls
114 | initial_len = len(train)
115 | train.dropna(inplace=True) # modify self DataFrame
116 | log.info('drop {0}/{1} inconsistent samples.'.format(initial_len - len(train), initial_len))
117 | log.info('answer span for training is generated.')
118 |
119 | # features
120 | trC_tags, trC_ents, trC_features = feature_gen(trC_docs, train.context_idx, trQ_docs, args.no_match)
121 | log.info('features for training is generated: {}, {}, {}'.format(len(trC_tags), len(trC_ents), len(trC_features)))
122 |
123 | def build_train_vocab(questions, contexts): # vocabulary will also be sorted accordingly
124 | if args.sort_all:
125 | counter = collections.Counter(w for doc in questions + contexts for w in doc)
126 | vocab = sorted([t for t in counter if t in glove_vocab], key=counter.get, reverse=True)
127 | else:
128 | counter_c = collections.Counter(w for doc in contexts for w in doc)
129 | counter_q = collections.Counter(w for doc in questions for w in doc)
130 | counter = counter_c + counter_q
131 | vocab = sorted([t for t in counter_q if t in glove_vocab], key=counter_q.get, reverse=True)
132 | vocab += sorted([t for t in counter_c.keys() - counter_q.keys() if t in glove_vocab],
133 | key=counter.get, reverse=True)
134 | total = sum(counter.values())
135 | matched = sum(counter[t] for t in vocab)
136 | log.info('vocab {1}/{0} OOV {2}/{3} ({4:.4f}%)'.format(
137 | len(counter), len(vocab), (total - matched), total, (total - matched) / total * 100))
138 | vocab.insert(0, "<PAD>")
139 | vocab.insert(1, "<UNK>")
140 | vocab.insert(2, "<S>")
141 | vocab.insert(3, "</S>")
142 | return vocab
143 |
144 | # vocab
145 | tr_vocab = build_train_vocab(trQ_tokens, trC_tokens)
146 | trC_ids = token2id(trC_tokens, tr_vocab, unk_id=1)
147 | trQ_ids = token2id(trQ_tokens, tr_vocab, unk_id=1)
148 | trQ_tokens = [["<S>"] + doc + ["</S>"] for doc in trQ_tokens]
149 | trQ_ids = [[2] + qsent + [3] for qsent in trQ_ids]
150 | print(trQ_ids[:10])
151 | # tags
152 | vocab_tag = [''] + list(nlp.tagger.labels)
153 | trC_tag_ids = token2id(trC_tags, vocab_tag)
154 | # entities
155 | vocab_ent = list(set([ent for sent in trC_ents for ent in sent]))
156 | trC_ent_ids = token2id(trC_ents, vocab_ent, unk_id=0)
157 |
158 | log.info('Found {} POS tags.'.format(len(vocab_tag)))
159 | log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent))
160 | log.info('vocabulary for training is built.')
161 |
162 | tr_embedding = build_embedding(wv_file, tr_vocab, wv_dim)
163 | log.info('got embedding matrix for training.')
164 |
165 | # don't store row name in csv
166 | #train.to_csv('QuAC_data/train.csv', index=False, encoding='utf8')
167 |
168 | meta = {
169 | 'vocab': tr_vocab,
170 | 'embedding': tr_embedding.tolist()
171 | }
172 | with open('QuAC_data/train_meta.msgpack', 'wb') as f:
173 | msgpack.dump(meta, f)
174 |
175 | prev_CID, first_question = -1, []
176 | for i, CID in enumerate(train.context_idx):
177 | if not (CID == prev_CID):
178 | first_question.append(i)
179 | prev_CID = CID
180 |
181 | result = {
182 | 'question_ids': trQ_ids,
183 | 'context_ids': trC_ids,
184 | 'context_features': trC_features, # exact match, tf
185 | 'context_tags': trC_tag_ids, # POS tagging
186 | 'context_ents': trC_ent_ids, # Entity recognition
187 | 'context': train_context,
188 | 'context_span': train_context_span,
189 | '1st_question': first_question,
190 | 'question_CID': train.context_idx.tolist(),
191 | 'question': train.question.tolist(),
192 | 'answer': train.answer.tolist(),
193 | 'answer_start': train.answer_start_token.tolist(),
194 | 'answer_end': train.answer_end_token.tolist(),
195 | 'answer_choice': train.answer_choice.tolist(),
196 | 'context_tokenized': trC_tokens,
197 | 'question_tokenized': trQ_tokens
198 | }
199 | with open('QuAC_data/train_data.msgpack', 'wb') as f:
200 | msgpack.dump(result, f)
201 |
202 | log.info('saved training to disk.')
203 |
204 | #==========================================================
205 | #=================== Work on dev data =====================
206 | #==========================================================
207 |
208 | def proc_dev(ith, article):
209 | rows = []
210 |
211 | for paragraph in article['paragraphs']:
212 | context = paragraph['context']
213 | for qa in paragraph['qas']:
214 | question = qa['question']
215 | answers = qa['orig_answer']
216 |
217 | answer = answers['text']
218 | answer_start = answers['answer_start']
219 | answer_end = answers['answer_start'] + len(answers['text'])
220 | answer_choice = 0 if answer == 'CANNOTANSWER' else\
221 | 1 if qa['yesno'] == 'y' else\
222 | 2 if qa['yesno'] == 'n' else\
223 | 3 # Not a yes/no question
224 | if answer_choice != 0:
225 | """
226 | 0: Do not ask a follow up question!
227 | 1: Definitely ask a follow up question!
228 | 2: Not too important, but you can ask a follow up.
229 | """
230 | answer_choice += 10 * (0 if qa['followup'] == "n" else\
231 | 1 if qa['followup'] == "y" else\
232 | 2)
233 | else:
234 | answer_start, answer_end = -1, -1
235 |
236 | ans_ls = []
237 | for ans in qa['answers']:
238 | ans_ls.append(ans['text'])
239 |
240 | rows.append((ith, question, answer, answer_start, answer_end, answer_choice, ans_ls))
241 | return rows, context
242 |
243 | dev, dev_context = flatten_json(dev_file, proc_dev)
244 | dev = pd.DataFrame(dev, columns=['context_idx', 'question', 'answer',
245 | 'answer_start', 'answer_end', 'answer_choice', 'all_answer'])
246 | log.info('dev json data flattened.')
247 |
248 | print(dev)
249 |
250 | devC_iter = (pre_proc(c) for c in dev_context)
251 | devQ_iter = (pre_proc(q) for q in dev.question)
252 | devC_docs = [doc for doc in nlp.pipe(
253 | devC_iter, batch_size=64, n_threads=args.threads)]
254 | devQ_docs = [doc for doc in nlp.pipe(
255 | devQ_iter, batch_size=64, n_threads=args.threads)]
256 |
257 | # tokens
258 | devC_tokens = [[normalize_text(w.text) for w in doc] for doc in devC_docs]
259 | devQ_tokens = [[normalize_text(w.text) for w in doc] for doc in devQ_docs]
260 | devC_unnorm_tokens = [[w.text for w in doc] for doc in devC_docs]
261 | log.info('All tokens for dev are obtained.')
262 |
263 | dev_context_span = [get_context_span(a, b) for a, b in zip(dev_context, devC_unnorm_tokens)]
264 | log.info('context span for dev is generated.')
265 |
266 | ans_st_token_ls, ans_end_token_ls = [], []
267 | for ans_st, ans_end, idx in zip(dev.answer_start, dev.answer_end, dev.context_idx):
268 | ans_st_token, ans_end_token = find_answer_span(dev_context_span[idx], ans_st, ans_end)
269 | ans_st_token_ls.append(ans_st_token)
270 | ans_end_token_ls.append(ans_end_token)
271 |
272 | dev['answer_start_token'], dev['answer_end_token'] = ans_st_token_ls, ans_end_token_ls
273 | initial_len = len(dev)
274 | dev.dropna(inplace=True) # modify self DataFrame
275 | log.info('drop {0}/{1} inconsistent samples.'.format(initial_len - len(dev), initial_len))
276 | log.info('answer span for dev is generated.')
277 |
278 | # features
279 | devC_tags, devC_ents, devC_features = feature_gen(devC_docs, dev.context_idx, devQ_docs, args.no_match)
280 | log.info('features for dev is generated: {}, {}, {}'.format(len(devC_tags), len(devC_ents), len(devC_features)))
281 |
282 | def build_dev_vocab(questions, contexts): # most vocabulary comes from tr_vocab
283 | existing_vocab = set(tr_vocab)
284 | new_vocab = list(set([w for doc in questions + contexts for w in doc if w not in existing_vocab and w in glove_vocab]))
285 | vocab = tr_vocab + new_vocab
286 | log.info('train vocab {0}, total vocab {1}'.format(len(tr_vocab), len(vocab)))
287 | return vocab
288 |
289 | # vocab
290 | dev_vocab = build_dev_vocab(devQ_tokens, devC_tokens) # tr_vocab is a subset of dev_vocab
291 | devC_ids = token2id(devC_tokens, dev_vocab, unk_id=1)
292 | devQ_ids = token2id(devQ_tokens, dev_vocab, unk_id=1)
293 | devQ_tokens = [["<S>"] + doc + ["</S>"] for doc in devQ_tokens]  # sentinel tokens (ids 2 and 3 below)
294 | devQ_ids = [[2] + qsent + [3] for qsent in devQ_ids]
295 | print(devQ_ids[:10])
296 | # tags
297 | devC_tag_ids = token2id(devC_tags, vocab_tag) # vocab_tag same as training
298 | # entities
299 | devC_ent_ids = token2id(devC_ents, vocab_ent, unk_id=0) # vocab_ent same as training
300 | log.info('vocabulary for dev is built.')
301 |
302 | dev_embedding = build_embedding(wv_file, dev_vocab, wv_dim)
303 | # tr_embedding is a submatrix of dev_embedding
304 | log.info('got embedding matrix for dev.')
305 |
306 | # don't store row name in csv
307 | #dev.to_csv('QuAC_data/dev.csv', index=False, encoding='utf8')
308 |
309 | meta = {
310 | 'vocab': dev_vocab,
311 | 'embedding': dev_embedding.tolist()
312 | }
313 | with open('QuAC_data/dev_meta.msgpack', 'wb') as f:
314 | msgpack.dump(meta, f)
315 |
316 | prev_CID, first_question = -1, []
317 | for i, CID in enumerate(dev.context_idx):
318 | if not (CID == prev_CID):
319 | first_question.append(i)
320 | prev_CID = CID
321 |
322 | result = {
323 | 'question_ids': devQ_ids,
324 | 'context_ids': devC_ids,
325 | 'context_features': devC_features, # exact match, tf
326 | 'context_tags': devC_tag_ids, # POS tagging
327 | 'context_ents': devC_ent_ids, # Entity recognition
328 | 'context': dev_context,
329 | 'context_span': dev_context_span,
330 | '1st_question': first_question,
331 | 'question_CID': dev.context_idx.tolist(),
332 | 'question': dev.question.tolist(),
333 | 'answer': dev.answer.tolist(),
334 | 'answer_start': dev.answer_start_token.tolist(),
335 | 'answer_end': dev.answer_end_token.tolist(),
336 | 'answer_choice': dev.answer_choice.tolist(),
337 | 'all_answer': dev.all_answer.tolist(),
338 | 'context_tokenized': devC_tokens,
339 | 'question_tokenized': devQ_tokens
340 | }
341 | with open('QuAC_data/dev_data.msgpack', 'wb') as f:
342 | msgpack.dump(result, f)
343 |
344 | log.info('saved dev to disk.')
345 |
--------------------------------------------------------------------------------
/FlowQA_Attention/code/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | msgpack-python
4 | spacy
5 | allennlp
6 | torch
7 |
--------------------------------------------------------------------------------
/FlowQA_Attention/code/train_QuAC.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import sys
4 | import random
5 | import string
6 | import logging
7 | import argparse
8 | from shutil import copyfile
9 | from datetime import datetime
10 | from collections import Counter
11 | import torch
12 | import msgpack
13 | import pandas as pd
14 | import numpy as np
15 | from QA_model.model_QuAC import QAModel
16 | from general_utils import find_best_score_and_thresh, BatchGen_QuAC
17 |
18 | parser = argparse.ArgumentParser(
19 | description='Train a Dialog QA model.'
20 | )
21 |
22 | # system
23 | parser.add_argument('--task_name', default='QuAC')
24 | parser.add_argument('--name', default='', help='additional name of the current run')
25 | parser.add_argument('--log_file', default='output.log',
26 | help='path for log file.')
27 | parser.add_argument('--log_per_updates', type=int, default=20,
28 | help='log model loss per x updates (mini-batches).')
29 |
30 | parser.add_argument('--train_dir', default='QuAC_data/')
31 | parser.add_argument('--dev_dir', default='QuAC_data/')
32 | parser.add_argument('--answer_type_num', type=int, default=1)
33 |
34 | parser.add_argument('--model_dir', default='models',
35 | help='path to store saved models.')
36 | parser.add_argument('--eval_per_epoch', type=int, default=1,
37 | help='perform evaluation per x epochs.')
38 | parser.add_argument('--MTLSTM_path', default='glove/MT-LSTM.pth')
39 | parser.add_argument('--save_all', dest='save_best_only', action='store_false', help='save all models.')
40 | parser.add_argument('--do_not_save', action='store_true', help='don\'t save any model')
41 | parser.add_argument('--save_for_predict', action='store_true')
42 | parser.add_argument('--seed', type=int, default=1023,
43 | help='random seed for data shuffling, dropout, etc.')
44 | parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available(),
45 | help='whether to use GPU acceleration.')
46 | # training
47 | parser.add_argument('-e', '--epoches', type=int, default=30)
48 | parser.add_argument('-bs', '--batch_size', type=int, default=3)
49 | parser.add_argument('-ebs', '--elmo_batch_size', type=int, default=12)
50 | parser.add_argument('-rs', '--resume', default='',
51 | help='previous model pathname. '
52 | 'e.g. "models/checkpoint_epoch_11.pt"')
53 | parser.add_argument('-ro', '--resume_options', action='store_true',
54 | help='use previous model options, ignore the cli and defaults.')
55 | parser.add_argument('-rlr', '--reduce_lr', type=float, default=0.,
56 | help='reduce initial (resumed) learning rate by this factor.')
57 | parser.add_argument('-op', '--optimizer', default='adamax',
58 | help='supported optimizer: adamax, sgd, adadelta, adam')
59 | parser.add_argument('-gc', '--grad_clipping', type=float, default=10)
60 | parser.add_argument('-wd', '--weight_decay', type=float, default=0)
61 | parser.add_argument('-lr', '--learning_rate', type=float, default=0.1,
62 | help='only applied to SGD.')
63 | parser.add_argument('-mm', '--momentum', type=float, default=0,
64 | help='only applied to SGD.')
65 | parser.add_argument('-tp', '--tune_partial', type=int, default=1000,
66 | help='finetune top-x embeddings (including <PAD>, <UNK>).')
67 | parser.add_argument('--fix_embeddings', action='store_true',
68 | help='if true, `tune_partial` will be ignored.')
69 | parser.add_argument('--elmo_lambda', type=float, default=0.0)
70 | parser.add_argument('--no_question_normalize', dest='question_normalize', action='store_false') # when set, do dialog normalize
71 | parser.add_argument('--pretrain', default='')
72 |
73 | # model
74 | parser.add_argument('--explicit_dialog_ctx', type=int, default=2)
75 | parser.add_argument('--use_dialog_act', action='store_true')
76 | parser.add_argument('--no_dialog_flow', action='store_true')
77 | parser.add_argument('--no_hierarchical_query', dest='do_hierarchical_query', action='store_false')
78 | parser.add_argument('--no_prealign', dest='do_prealign', action='store_false')
79 |
80 | parser.add_argument('--final_output_att_hidden', type=int, default=250)
81 | parser.add_argument('--question_merge', default='linear_self_attn')
82 | parser.add_argument('--no_ptr_update', dest='do_ptr_update', action='store_false')
83 | parser.add_argument('--no_ptr_net_indep_attn', dest='ptr_net_indep_attn', action='store_false')
84 | parser.add_argument('--ptr_net_attn_type', default='Bilinear', help="Attention for answer span output: Bilinear, MLP or Default")
85 |
86 | parser.add_argument('--do_residual_rnn', dest='do_residual_rnn', action='store_true')
87 | parser.add_argument('--do_residual_everything', dest='do_residual_everything', action='store_true')
88 | parser.add_argument('--do_residual', dest='do_residual', action='store_true')
89 | parser.add_argument('--rnn_layers', type=int, default=1, help="Default number of RNN layers")
90 | parser.add_argument('--rnn_type', default='lstm',
91 | help='supported types: rnn, gru, lstm')
92 | parser.add_argument('--concat_rnn', dest='concat_rnn', action='store_true')
93 |
94 | parser.add_argument('--hidden_size', type=int, default=125)
95 | parser.add_argument('--self_attention_opt', type=int, default=1) # 0: no self attention
96 |
97 | parser.add_argument('--deep_inter_att_do_similar', type=int, default=0)
98 | parser.add_argument('--deep_att_hidden_size_per_abstr', type=int, default=250)
99 |
100 | parser.add_argument('--no_elmo', dest='use_elmo', action='store_false')
101 | parser.add_argument('--no_em', action='store_true')
102 |
103 | parser.add_argument('--no_wemb', dest='use_wemb', action='store_false') # word embedding
104 | parser.add_argument('--CoVe_opt', type=int, default=1) # contextualized embedding option
105 | parser.add_argument('--no_pos', dest='use_pos', action='store_false') # pos tagging
106 | parser.add_argument('--pos_size', type=int, default=51, help='how many kinds of POS tags.')
107 | parser.add_argument('--pos_dim', type=int, default=12, help='the embedding dimension for POS tags.')
108 | parser.add_argument('--no_ner', dest='use_ner', action='store_false') # named entity
109 | parser.add_argument('--ner_size', type=int, default=19, help='how many kinds of named entity tags.')
110 | parser.add_argument('--ner_dim', type=int, default=8, help='the embedding dimension for named entity tags.')
111 |
112 | parser.add_argument('--prealign_hidden', type=int, default=300)
113 | parser.add_argument('--prealign_option', type=int, default=2, help='0: No prealign, 1, 2, ...: Different options')
114 |
115 | parser.add_argument('--no_seq_dropout', dest='do_seq_dropout', action='store_false')
116 | parser.add_argument('--my_dropout_p', type=float, default=0.4)
117 | parser.add_argument('--dropout_emb', type=float, default=0.4)
118 |
119 | parser.add_argument('--max_len', type=int, default=35)
120 |
121 | parser.add_argument('--flow_attention', type=int, default = 1)
122 | parser.add_argument('--use_bert', help="Whether to use BERT or not, and which version to use")
123 | parser.add_argument('--bert_path', default='bert_vocab_files/')
124 | args = parser.parse_args()
125 |
126 | if args.name != '':
127 | args.model_dir = args.model_dir + '_' + args.name
128 | args.log_file = os.path.join(os.path.dirname(args.log_file), 'output_' + args.name + '.log')
129 |
130 | # set model dir
131 | model_dir = args.model_dir
132 | os.makedirs(model_dir, exist_ok=True)
133 | model_dir = os.path.abspath(model_dir)
134 |
135 | # set random seed
136 | random.seed(args.seed)
137 | np.random.seed(args.seed)
138 | torch.manual_seed(args.seed)
139 | if args.cuda:
140 | torch.cuda.manual_seed_all(args.seed)
141 |
142 | # setup logger
143 | log = logging.getLogger(__name__)
144 | log.setLevel(logging.DEBUG)
145 | fh = logging.FileHandler(args.log_file)
146 | fh.setLevel(logging.DEBUG)
147 | ch = logging.StreamHandler(sys.stdout)
148 | ch.setLevel(logging.INFO)
149 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')
150 | fh.setFormatter(formatter)
151 | ch.setFormatter(formatter)
152 | log.addHandler(fh)
153 | log.addHandler(ch)
154 |
155 | def main():
156 | log.info('[program starts.]')
157 | opt = vars(args) # changing opt will change args
158 | train, train_embedding, opt = load_train_data(opt)
159 | dev, dev_embedding, dev_answer = load_dev_data(opt)
160 | opt['num_features'] += args.explicit_dialog_ctx * (args.use_dialog_act*3 + 2) # dialog_act + previous answer
161 | if opt['use_elmo'] == False:
162 | opt['elmo_batch_size'] = 0
163 | log.info('[Data loaded.]')
164 |
165 | if args.resume:
166 | log.info('[loading previous model...]')
167 | checkpoint = torch.load(args.resume)
168 | if args.resume_options:
169 | opt = checkpoint['config']
170 | state_dict = checkpoint['state_dict']
171 | model = QAModel(opt, train_embedding, state_dict)
172 | epoch_0 = checkpoint['epoch'] + 1
173 | for i in range(checkpoint['epoch']):
174 | random.shuffle(list(range(len(train)))) # synchronize random seed
175 | if args.reduce_lr:
176 | lr_decay(model.optimizer, lr_decay=args.reduce_lr)
177 | else:
178 | model = QAModel(opt, train_embedding)
179 | epoch_0 = 1
180 |
181 | if args.pretrain:
182 | pretrain_model = torch.load(args.pretrain)
183 | state_dict = pretrain_model['state_dict']['network']
184 |
185 | model.get_pretrain(state_dict)
186 |
187 | model.setup_eval_embed(dev_embedding)
188 | log.info("[dev] Total number of params: {}".format(model.total_param))
189 |
190 | if args.cuda:
191 | model.cuda()
192 |
193 | if args.resume:
194 | batches = BatchGen_QuAC(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda, dialog_ctx=args.explicit_dialog_ctx, use_dialog_act=args.use_dialog_act, use_bert=args.use_bert, bert_path=args.bert_path)
195 | predictions, no_ans_scores = [], []
196 | for batch in batches:
197 | phrases, noans = model.predict(batch)
198 | predictions.extend(phrases)
199 | no_ans_scores.extend(noans)
200 | f1, na, thresh = find_best_score_and_thresh(predictions, dev_answer, no_ans_scores)
201 | log.info("[dev F1: {} NA: {} TH: {}]".format(f1, na, thresh))
202 | best_val_score, best_na, best_thresh = f1, na, thresh
203 | else:
204 | best_val_score, best_na, best_thresh = 0.0, 0.0, 0.0
205 |
206 | for epoch in range(epoch_0, epoch_0 + args.epoches):
207 | log.warning('Epoch {}'.format(epoch))
208 | # train
209 | batches = BatchGen_QuAC(train, batch_size=args.batch_size, gpu=args.cuda, dialog_ctx=args.explicit_dialog_ctx, use_dialog_act=args.use_dialog_act, precompute_elmo=args.elmo_batch_size // args.batch_size, use_bert = args.use_bert, bert_path=args.bert_path)
210 | start = datetime.now()
211 | for i, batch in enumerate(batches):
212 | model.update(batch)
213 | if i % args.log_per_updates == 0:
214 | log.info('updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format(
215 | model.updates, model.train_loss.avg,
216 | str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0]))
217 |
218 | # eval
219 | if epoch % args.eval_per_epoch == 0:
220 | batches = BatchGen_QuAC(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda, dialog_ctx=args.explicit_dialog_ctx, use_dialog_act=args.use_dialog_act, precompute_elmo=args.elmo_batch_size // args.batch_size, use_bert = args.use_bert, bert_path=args.bert_path)
221 | predictions, no_ans_scores = [], []
222 | for batch in batches:
223 | phrases, noans = model.predict(batch)
224 | predictions.extend(phrases)
225 | no_ans_scores.extend(noans)
226 | f1, na, thresh = find_best_score_and_thresh(predictions, dev_answer, no_ans_scores)
227 |
228 | # save
229 | if args.save_best_only:
230 | if f1 > best_val_score:
231 | best_val_score, best_na, best_thresh = f1, na, thresh
232 | model_file = os.path.join(model_dir, 'best_model.pt')
233 | model.save(model_file, epoch)
234 | log.info('[new best model saved.]')
235 | else:
236 | model_file = os.path.join(model_dir, 'checkpoint_epoch_{}.pt'.format(epoch))
237 | model.save(model_file, epoch)
238 | if f1 > best_val_score:
239 | best_val_score, best_na, best_thresh = f1, na, thresh
240 | copyfile(os.path.join(model_dir, model_file),
241 | os.path.join(model_dir, 'best_model.pt'))
242 | log.info('[new best model saved.]')
243 |
244 | log.warning("Epoch {} - dev F1: {:.3f} NA: {:.3f} TH: {:.3f} (best F1: {:.3f} NA: {:.3f} TH: {:.3f})".format(epoch, f1, na, thresh, best_val_score, best_na, best_thresh))
245 |
246 | def lr_decay(optimizer, lr_decay):
247 | for param_group in optimizer.param_groups:
248 | param_group['lr'] *= lr_decay
249 | log.info('[learning rate reduced by {}]'.format(lr_decay))
250 | return optimizer
251 |
252 | def load_train_data(opt):
253 | with open(os.path.join(args.train_dir, 'train_meta.msgpack'), 'rb') as f:
254 | meta = msgpack.load(f, encoding='utf8')
255 | embedding = torch.Tensor(meta['embedding'])
256 | opt['vocab_size'] = embedding.size(0)
257 | opt['embedding_dim'] = embedding.size(1)
258 |
259 | with open(os.path.join(args.train_dir, 'train_data.msgpack'), 'rb') as f:
260 | data = msgpack.load(f, encoding='utf8')
261 | #data_orig = pd.read_csv(os.path.join(args.train_dir, 'train.csv'))
262 |
263 | opt['num_features'] = len(data['context_features'][0][0])
264 |
265 | train = {'context': list(zip(
266 | data['context_ids'],
267 | data['context_tags'],
268 | data['context_ents'],
269 | data['context'],
270 | data['context_span'],
271 | data['1st_question'],
272 | data['context_tokenized'])),
273 | 'qa': list(zip(
274 | data['question_CID'],
275 | data['question_ids'],
276 | data['context_features'],
277 | data['answer_start'],
278 | data['answer_end'],
279 | data['answer_choice'],
280 | data['question'],
281 | data['answer'],
282 | data['question_tokenized']))
283 | }
284 | return train, embedding, opt
285 |
286 | def load_dev_data(opt): # can be extended to true test set
287 | with open(os.path.join(args.dev_dir, 'dev_meta.msgpack'), 'rb') as f:
288 | meta = msgpack.load(f, encoding='utf8')
289 | embedding = torch.Tensor(meta['embedding'])
290 | assert opt['embedding_dim'] == embedding.size(1)
291 |
292 | with open(os.path.join(args.dev_dir, 'dev_data.msgpack'), 'rb') as f:
293 | data = msgpack.load(f, encoding='utf8')
294 | #data_orig = pd.read_csv(os.path.join(args.dev_dir, 'dev.csv'))
295 |
296 | assert opt['num_features'] == len(data['context_features'][0][0])
297 |
298 | dev = {'context': list(zip(
299 | data['context_ids'],
300 | data['context_tags'],
301 | data['context_ents'],
302 | data['context'],
303 | data['context_span'],
304 | data['1st_question'],
305 | data['context_tokenized'])),
306 | 'qa': list(zip(
307 | data['question_CID'],
308 | data['question_ids'],
309 | data['context_features'],
310 | data['answer_start'],
311 | data['answer_end'],
312 | data['answer_choice'],
313 | data['question'],
314 | data['answer'],
315 | data['question_tokenized']))
316 | }
317 |
318 | dev_answer = []
319 | for i, CID in enumerate(data['question_CID']):
320 | if len(dev_answer) <= CID:
321 | dev_answer.append([])
322 | dev_answer[CID].append(data['all_answer'][i])
323 |
324 | return dev, embedding, dev_answer
325 |
326 | if __name__ == '__main__':
327 | main()
328 |
--------------------------------------------------------------------------------
/FlowQA_Attention/figure/Attention Over Flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/FlowQA_Attention/figure/Attention Over Flow.png
--------------------------------------------------------------------------------
/FlowQA_Attention/figure/Readme.me:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/FlowQA_Attention/figure/Vanilla Flow Operation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/FlowQA_Attention/figure/Vanilla Flow Operation.png
--------------------------------------------------------------------------------
/FlowQA_Attention/figure/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/FlowQA_Attention/figure/result.png
--------------------------------------------------------------------------------
/FlowQA_Coreference/README.md:
--------------------------------------------------------------------------------
1 | # FlowQA + Coreference
2 |
3 | We used the [coreference model](https://github.com/kentonl/e2e-coref) proposed by Lee et al. to improve the [FlowQA](https://github.com/momohuang/FlowQA) model.
4 | In short, this model clusters spans of text that corefer with one another.
5 | For example, consider the conversation below:
6 |
7 | A: What is the story about?
8 |
9 | B: young girl and her dog
10 |
11 | A: What were they doing?
12 |
13 | B: set out a trip
14 |
15 | In this conversation, "they" said by A refers to the phrase "young girl and her dog" said by B.
16 | Also, the pronoun "her" said by B refers to "young girl", which appears earlier.
17 | Therefore, the model will cluster the following spans:
18 |
19 | [["young girl", "her"], ["young girl and her dog", "they"]]
20 |
21 |
22 | In order to feed this coreference information to the FlowQA model, we can
23 |
24 | * Feed the result from the coreference model to the FlowQA model.
25 | * Feed the internal representation of each token in the coreference model to the FlowQA model.
26 |
27 | We concatenate these two new representations with the existing input of FlowQA, which consists of word embeddings, part-of-speech tags and named-entity tags.
28 |
29 | ## Feed the result from the coreference model
30 |
31 | We use a one-hot encoding to represent the results from the coreference model.
32 | Each token corresponds to a vector whose length is the number of clusters.
33 | If a token belongs to the i-th cluster, the i-th element of its vector is 1 (a token may belong to several clusters).
34 | Here is the one-hot encoding for the above example; a small sketch of the construction follows it.
35 |
36 | "young" [1, 1]
37 | "girl" [1, 1]
38 | "and" [0 ,1]
39 | "her" [1, 1]
40 | "dog" [1, 1]
41 | "What" [0, 0]
42 | "were" [0, 0]
43 | "they" [0, 1]
44 | "doing" [0, 0]
45 |
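A minimal sketch of this encoding, assuming clusters are given as lists of (start, end) token spans; the function name and span format are illustrative, not the repo's actual preprocessing code:

```python
# Build the per-token cluster-membership vectors described above.
# clusters: list of clusters, each a list of (start, end) token spans (end exclusive).
def cluster_one_hot(num_tokens, clusters):
    vectors = [[0] * len(clusters) for _ in range(num_tokens)]
    for c, spans in enumerate(clusters):
        for start, end in spans:
            for t in range(start, end):
                vectors[t][c] = 1
    return vectors

# Tokens: 0 young, 1 girl, 2 and, 3 her, 4 dog, 5 What, 6 were, 7 they, 8 doing
clusters = [[(0, 2), (3, 4)],   # ["young girl", "her"]
            [(0, 5), (7, 8)]]   # ["young girl and her dog", "they"]
print(cluster_one_hot(9, clusters))  # "and" -> [0, 1], "they" -> [0, 1]
```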
46 | ## Feed the internal representation
47 |
48 | We use the latent representation from the coreference model as an additional input to the FlowQA model. To be precise, we use x* from Figure 3 of [this paper](https://arxiv.org/abs/1707.07045). Since the model is trained to predict coreference, we hypothesize that its latent representation encodes coreference information as well.
49 |
50 | # Experiment
51 |
52 | First, we set up the coreference model according to its [instructions](https://github.com/kentonl/e2e-coref).
53 | Then we use their pre-trained coreference model to predict the coreferences in the QuAC dataset.
54 | We extract the latent representation of each token and save it.
55 | After that, we convert the clusters output by the coreference model to the one-hot encoding.
56 | Finally, we concatenate the latent representations with the one-hot encodings to obtain the augmented input vectors that are fed to FlowQA, as in the sketch below.
57 | The augmented vector's size is 450: 400 dimensions for the x* latent representation and 50 for the one-hot encoding.
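A minimal sketch of the augmentation step, assuming the per-token x* matrix and the (padded) one-hot matrix are already computed; the names and the NumPy usage are illustrative:

```python
import numpy as np

# Concatenate the 400-d x* latent representation with the 50-d cluster
# encoding to get the 450-d augmented token feature described above.
def augment_tokens(x_star, coref_one_hot):
    # x_star: (seq_len, 400), coref_one_hot: (seq_len, 50)
    return np.concatenate([x_star, coref_one_hot], axis=1)  # (seq_len, 450)
```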
58 |
59 | Second, we set up the FlowQA model according to its [instructions](https://github.com/momohuang/FlowQA).
60 | We modify ```train_QuAC.py```, ```general_utils.py``` and ```detail_model.py``` to support the augmented representation.
61 | Then we train the model with this new input representation.
62 |
63 |
64 | We use the [Google Cloud Platform](https://cloud.google.com/) with 4 vCPUs, 26 GB of memory and 1 NVIDIA Tesla K80 to run both the FlowQA and coreference models.
65 | It takes around 50 hours to predict the coreferences for the entire QuAC dataset, and around 4 hours per epoch to train the FlowQA model.
66 |
67 |
68 | # Result
69 |
70 | Below is a comparison of F1 between the original FlowQA model and our modification. We can see that the improved model performs slightly better.
71 |
72 | 
73 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Question Answering In Context
2 | Question Answering (QA) has long been a promising yet challenging task, and a large body of work has been done in this area. However, simple QA tasks such as extracting answer spans from a given text have been criticized as "shallow pattern recognition", which does not equip machines with the capability of reading. Recently, Conversational Question Answering has been proposed to address this issue. In this task, a program needs not only to answer questions, but also to answer them in a conversational style.
3 |
4 | In this project, we use deep neural networks to address [QuAC](https://quac.ai/), one of these Conversational QA datasets. We train existing models on QuAC and compare their pros and cons. We also apply models that are effective on other related datasets, and invent new models involving different network architectures or mechanisms. We hope to obtain better, or even state-of-the-art, performance.
5 |
6 | This project is a part of CS 599 Deep Learning and Its Applications in Spring 2019 at Department of Computer Science, University of Southern California.
7 |
8 | ## QuAC
9 |
10 |
11 |
12 | Figure 1: Task setup of QuAC. (Pics from [QuAC](https://quac.ai))
13 |
14 |
15 | [Question Answering in Context (QuAC)](https://quac.ai/) is a recently proposed conversational QA dataset. The task is to answer a question in a conversation, given a context passage and the previous questions and answers in the conversation. Compared with earlier QA datasets, its questions are often highly context-dependent, elliptical, and even unanswerable. The architecture of a typical QA model for QuAC is shown below:
16 |
17 |
18 |
19 |
20 | Figure 2: Baseline QA Model for QuAC.
21 |
22 |
23 | ## Problem Formulation
24 | The input of the problem is a context passage as background and a sequence of free-form questions. Each question has two properties, "yes/no" and "followup": the first indicates whether the question can be answered with "yes" or "no", and the second indicates whether the question is a follow-up to previous questions.
25 |
26 | The output is the answer to each question. The answer must be a span of the context unless the question is a "yes/no" question, and it should be "cannot answer" if no answer can be found in the given context. A schematic QuAC record is sketched below.
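For concreteness, here is a schematic (abbreviated) QuAC question record; the field names follow the preprocessing code in this repository, and the values are illustrative:

```python
qa = {
    "question": "What were they doing?",
    "yesno": "x",       # 'y' / 'n' / 'x' (not a yes/no question)
    "followup": "y",    # whether a follow-up should be asked
    "orig_answer": {"text": "set out a trip", "answer_start": 42},
    "answers": [{"text": "set out a trip", "answer_start": 42}],
}
# Unanswerable questions carry the answer text 'CANNOTANSWER'.
```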
27 |
28 |
29 |
30 |
31 |
32 |
33 | ## Approaches
34 |
35 | We try to enhance two baseline models: FlowQA and BiDAF++. We also try to apply an existing model, SDNet, which works on another QA dataset.
36 |
37 | ### FlowQA
38 |
39 | [FlowQA](https://github.com/momohuang/FlowQA) has been shown to have strong performance on conversational question-answering tasks such as CoQA and QuAC. First, we re-run the FlowQA model and investigate why it performs so well and what makes it special.
40 |
41 |
42 | Why does FlowQA work? We believe it is because of its "flow" operation, which uses an LSTM (or GRU) to represent context words across question turns. Through the "flow" operation, FlowQA can capture and retain conversational information. Since
43 | conversations are in fact sequences, encoding conversational information this way is a natural, yet clever, idea. A minimal sketch of the operation follows.
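A minimal sketch of the flow operation as we understand it: an RNN runs along the dialog-turn axis for every context position, so each context word carries information across question turns. Shapes and names are illustrative, not FlowQA's exact code:

```python
import torch
import torch.nn as nn

def flow(h, gru):
    # h: (num_turns, context_len, d) per-turn context representations
    x = h.transpose(0, 1)        # (context_len, num_turns, d)
    out, _ = gru(x)              # RNN over turns, batched over context positions
    return out.transpose(0, 1)   # (num_turns, context_len, hidden)

gru = nn.GRU(input_size=128, hidden_size=128, batch_first=True)
h = torch.randn(4, 50, 128)      # 4 turns, 50 context tokens
flow_out = flow(h, gru)          # (4, 50, 128)
```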
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 | #### [FlowQA + Attention Over Flow](FlowQA_Attention)
52 |
53 | We believe that applying attention here is worth considering, because not every turn of conversation history is equally important.
54 | When a new question is posed, it is highly likely that only a few previous questions are relevant, rather than all of them. By adding attention to the "flow" operation, we can align new representations with the useful previous ones. We call this "attention over flow"; a sketch follows.
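A minimal sketch of the idea, assuming per-turn context representations of shape (num_turns, context_len, d); the dot-product scoring and causal mask are our illustrative choices, not necessarily the exact implementation:

```python
import torch
import torch.nn.functional as F

def attention_over_flow(h):
    # h: (num_turns, context_len, d); each turn attends over turns <= itself
    T = h.shape[0]
    scores = torch.einsum('tcd,scd->cts', h, h)          # (context_len, T, T)
    mask = torch.tril(torch.ones(T, T, dtype=torch.bool))
    scores = scores.masked_fill(~mask, float('-inf'))    # no peeking at future turns
    attn = F.softmax(scores, dim=-1)
    return torch.einsum('cts,scd->tcd', attn, h)         # (num_turns, context_len, d)
```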
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 | #### [FlowQA + Coreference](FlowQA_Coreference)
63 |
64 | Since the QuAC dataset contains many coreferences, both within the context and across the conversation, our intuition is that we can exploit a coreference resolution model to improve the FlowQA model.
65 | The figure below shows the improved model, where the gray blocks are the existing FlowQA model and the blue blocks are the coreference model we added.
66 | Our experiments show that, with the coreference model, the F1 is slightly better.
67 |
68 |
69 |
70 |
71 | ### [SD-Net](SDNet)
72 | A contextualized attention-based deep neural network developed by Microsoft. It was originally evaluated on another question answering dataset, CoQA, and was the first model to reach an in-domain F1 score above 80% (80.7%) on CoQA. Here we apply this model to the QuAC dataset. Since the formats of the two datasets differ, we need to preprocess the data carefully.
73 |
74 | ### [BiDAF++](BiDAF)
75 | The original [BiDAF++](https://arxiv.org/abs/1710.10723) model uses a Char-CNN for character embeddings and GloVe for word embeddings. It is also equipped with contextualized embeddings and self-attention. In this model, marker embeddings corresponding to previous answer words are used, while question turn numbers are encoded into the question embeddings. We append ELMo or BERT embeddings to the word and contextualized embeddings to get better performance, as in the sketch below.
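A minimal sketch of the embedding concatenation, with illustrative dimensions (100-d GloVe, 1024-d ELMo/BERT-large):

```python
import torch

glove = torch.randn(2, 30, 100)         # (batch, tokens, glove_dim)
contextual = torch.randn(2, 30, 1024)   # (batch, tokens, elmo/bert_dim)
token_repr = torch.cat([glove, contextual], dim=-1)  # (2, 30, 1124)
```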
76 |
77 |
78 |
79 |
80 |
81 | ## Results
82 | | Model | F1 | Remark |
83 | | ------------- | ------------- | ------------- |
84 | | **FlowQA Baseline** | 64.24| Baseline in one experiment |
85 | | FlowQA + Coreference | 62.50 | Baseline=62.26 in another experiment|
86 | | FlowQA + Attention on Flows | 64.35 | |
87 | | **BiDAF++ Baseline** | 55.59 | |
88 | | ELMo + BiDAF++ | 58.40 | |
89 | | BERT + BiDAF++ | 60.04 | |
90 | | **SDNet** | 33.13 | |
91 |
92 | ## Conclusion
93 | * **FlowQA + Attention Over Flow**: adding an attention layer over the flow operation slightly improves the FlowQA model. We believe this is because the resulting representations focus more on recent dialog turns and help resolve coreferences.
94 |
95 | * **FlowQA + Coreference**: adding a vector representation that encodes coreference over a long context can increase the model's performance on tasks involving complex dependencies (context and dialog).
96 |
97 | * **Enhancing BiDAF++**: appending ELMo or BERT embeddings to the word and contextualized embeddings helps. Because ELMo extracts contextual features from a language model, and BERT provides pre-trained token representations that can outperform GloVe on the QuAC dataset, these two embeddings enhance the model's performance.
98 |
99 | * **SDNet**: SDNet was not originally applied to the QuAC dataset, so we adjusted the dataset format to fit SDNet. However, the result is not as good as we expected. We leave identifying the issues and improving this model as future work.
100 |
101 | ## References
102 |
103 | * [FlowQA: Grasping Flow in History for Conversational Machine Comprehension](https://arxiv.org/abs/1810.06683) by Huang et al.
104 | * [Bidirectional Attention Flow for Machine Comprehension](https://arxiv.org/abs/1611.01603) by Minjoon Seo et al.
105 | * [Simple and Effective Multi-Paragraph Reading Comprehension](https://arxiv.org/abs/1710.10723) by Christopher Clark et al.
106 | * [Deep Contextualized Word Representations](https://arxiv.org/abs/1802.05365) by Matthew E. Peters et al.
107 | * [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin et al.
108 | * [SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering](https://arxiv.org/abs/1812.03593) by Chenguang Zhu et al.
109 | * [End-to-end Neural Coreference Resolution](https://arxiv.org/abs/1707.07045) by Lee et al.
110 |
--------------------------------------------------------------------------------
/SDNet/Figures/f1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/Figures/f1.jpg
--------------------------------------------------------------------------------
/SDNet/Figures/loss.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/Figures/loss.jpg
--------------------------------------------------------------------------------
/SDNet/README.md:
--------------------------------------------------------------------------------
1 |
2 | # SDNet
3 | [SDNet](https://arxiv.org/abs/1812.03593) is a contextualized attention-based deep neural network developed by Microsoft. It was originally evaluated on another question answering dataset, [CoQA](https://stanfordnlp.github.io/coqa/), and was the first model to reach an in-domain F1 score above 80% (80.7%) on CoQA.
4 |
5 | ## Difference between CoQA and QuAC
6 | Since the dataset formats of CoQA and QuAC differ, we first need to convert the format so that SDNet can work on QuAC. Unlike QuAC, the answers in CoQA are free-form rather than spans of the context. However, SDNet still needs to generate an answer span from the text, so we can take that span directly as the output on QuAC. Moreover, questions in CoQA do not have properties such as "follow up" and "yes/no"; instead, every question can potentially be related to previous questions and answers. The original SDNet model prepends 2 rounds of previous questions and answers to obtain its best F1 score, so we keep this behavior and ignore the "follow up" property. Also, SDNet already computes the probability of the answer being "yes", "no", or "no answer" ("unknown" in CoQA), which we can use directly on QuAC.
7 |
8 | ## Usage
9 | The preprocess folder contains a script that converts QuAC data to the CoQA format; the sketch below illustrates the history-prepending idea.
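A minimal sketch of the history-prepending idea (not the actual conversion script): the previous 2 rounds of questions and answers are prepended to the current question, mirroring SDNet's use of dialog history:

```python
def question_with_history(turns, i, rounds=2):
    # turns: list of (question, answer) pairs in conversation order
    history = [q + ' ' + a for q, a in turns[max(0, i - rounds):i]]
    return ' '.join(history + [turns[i][0]])

turns = [("What is the story about?", "young girl and her dog"),
         ("What were they doing?", "set out a trip")]
print(question_with_history(turns, 1))
# -> "What is the story about? young girl and her dog What were they doing?"
```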
10 |
11 | For detailed usage please refer to [SDNet GitHub page](https://github.com/Microsoft/SDNet)
12 |
13 | ## Result and conclusion
14 | Training loss and development set F1 score are shown below:
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 | Unfortunately, the result of SDNet is not as good as we expected: we only get a 33.13 F1 score on the development set. We tried to figure out why this happens (e.g., format and preprocessing issues) but could not make any significant improvement. We leave this as future work.
25 |
26 | ## References
27 |
28 | * [SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering](https://arxiv.org/abs/1812.03593) by Chenguang Zhu et al.
29 |
--------------------------------------------------------------------------------
/SDNet/code/SDNet/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | This project welcomes contributions and suggestions. Most contributions require you to
4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to,
5 | and actually do, grant us the rights to use your contribution. For details, visit
6 | https://cla.microsoft.com.
7 |
8 | When you submit a pull request, a CLA-bot will automatically determine whether you need
9 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
10 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
11 |
12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
14 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
--------------------------------------------------------------------------------
/SDNet/code/SDNet/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation. All rights reserved.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/BaseTrainer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import os
5 |
6 | class BaseTrainer():
7 | def __init__(self, opt):
8 | self.opt = opt
9 | self.isTrain = False
10 | if self.opt['cuda'] == True:
11 | self.use_cuda = True
12 | print('Using Cuda\n')
13 | else:
14 | self.use_cuda = False
15 | print('Using CPU\n')
16 |
17 | self.is_official = 'OFFICIAL' in self.opt
18 | # Make sure raw text feature files are ready
19 | self.use_spacy = 'SPACY_FEATURE' in self.opt
20 | self.opt['logFile'] = 'log.txt'
21 |
22 | opt['FEATURE_FOLDER'] = 'conf~/' + ('spacy_intermediate_feature~/' if self.use_spacy else 'intermediate_feature~/')
23 | opt['FEATURE_FOLDER'] = os.path.join(opt['datadir'], opt['FEATURE_FOLDER'])
24 |
25 | def log(self, s):
26 | # In official case, the program does not output logs
27 | if self.is_official:
28 | return
29 |
30 | with open(os.path.join(self.saveFolder, self.opt['logFile']), 'a') as f:
31 | f.write(s + '\n')
32 | print(s)
33 |
34 | def getSaveFolder(self):
35 | runid = 1
36 | while True:
37 | saveFolder = os.path.join(self.opt['datadir'], 'conf~', 'run_' + str(runid))
38 | if not os.path.exists(saveFolder):
39 | self.saveFolder = saveFolder
40 | os.makedirs(self.saveFolder)
41 | print('Saving logs, model and evaluation in ' + self.saveFolder)
42 | return
43 | runid = runid + 1
44 |
45 | # save copy of conf file
46 | def saveConf(self):
47 | with open(self.opt['confFile'], encoding='utf-8') as f:
48 | with open(os.path.join(self.saveFolder, 'conf_copy'), 'w', encoding='utf-8') as fw:
49 | for line in f:
50 | fw.write(line)  # lines read from the file already end with '\n'
51 |
52 | def train(self):
53 | pass
54 |
55 | def load(self):
56 | pass
57 |
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/Bert/Bert.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import os
5 | import torch
6 | import torch.nn as nn
7 | from torch.autograd import Variable
8 | from Models.Bert.modeling import BertModel
9 |
10 | '''
11 | BERT
12 | '''
13 | class Bert(nn.Module):
14 | def __init__(self, opt):
15 | super(Bert, self).__init__()
16 | print('Loading BERT model...')
17 | self.BERT_MAX_LEN = 512
18 | self.linear_combine = 'BERT_LINEAR_COMBINE' in opt
19 |
20 | if 'BERT_LARGE' in opt:
21 | print('Using BERT Large model')
22 | model_file = os.path.join(opt['datadir'], opt['BERT_large_model_file'])
23 | print('Loading BERT model from', model_file)
24 | self.bert_model = BertModel.from_pretrained(model_file)
25 | #self.bert_model = BertModel.from_pretrained('bert-large-uncased')
26 | self.bert_dim = 1024
27 | self.bert_layer = 24
28 | else:
29 | print('Using BERT base model')
30 | model_file = os.path.join(opt['datadir'], opt['BERT_model_file'])
31 | print('Loading BERT model from', model_file)
32 | self.bert_model = BertModel.from_pretrained(model_file)
33 | #self.bert_model = BertModel.from_pretrained('bert-base-cased')
34 | self.bert_dim = 768
35 | self.bert_layer = 12
36 | self.bert_model.cuda()
37 | self.bert_model.eval()
38 |
39 | print('Finished loading')
40 |
41 | '''
42 | Input:
43 | x_bert: batch * max_bert_sent_len (ids)
44 | x_bert_mask: batch * max_bert_sent_len (0/1)
45 | x_bert_offset: batch * max_real_word_num * 2
46 | x_mask: batch * max_real_word_num
47 | Output:
48 | embedding: batch * max_real_word_num * bert_dim
49 | '''
50 | def forward(self, x_bert, x_bert_mask, x_bert_offset, x_mask):
51 | if self.linear_combine:
52 | return self.combine_forward(x_bert, x_bert_mask, x_bert_offset, x_mask)
53 |
54 | last_layers = []
55 | bert_sent_len = x_bert.shape[1]
56 | p = 0
57 | while p < bert_sent_len:
58 | all_encoder_layers, _ = self.bert_model(x_bert[:, p:(p + self.BERT_MAX_LEN)], token_type_ids=None, attention_mask=x_bert_mask[:, p:(p + self.BERT_MAX_LEN)]) # bert_layer * batch * max_bert_sent_len * bert_dim
59 | last_layers.append(all_encoder_layers[-1]) # batch * up_to_512 * bert_dim
60 | p += self.BERT_MAX_LEN
61 |
62 | bert_embedding = torch.cat(last_layers, 1)
63 |
64 | batch_size = x_mask.shape[0]
65 | max_word_num = x_mask.shape[1]
66 | output = Variable(torch.zeros(batch_size, max_word_num, self.bert_dim))
67 | for i in range(batch_size):
68 | for j in range(max_word_num):
69 | if x_mask[i, j] == 0:
70 | continue
71 | st = x_bert_offset[i, j, 0]
72 | ed = x_bert_offset[i, j, 1]
73 | # we can also try using st only, ed only
74 | if st + 1 == ed: # the word maps to a single subword
75 | output[i, j, :] = bert_embedding[i, st, :]
76 | else:
77 | subword_ebd_sum = torch.sum(bert_embedding[i, st:ed, :], dim = 0)
78 | if st < ed:
79 | output[i, j, :] = subword_ebd_sum / float(ed - st) # dim 0 is st:ed
80 |
81 | output = output.cuda()
82 | return output
83 |
84 | def combine_forward(self, x_bert, x_bert_mask, x_bert_offset, x_mask):
85 | all_layers = []
86 |
87 | bert_sent_len = x_bert.shape[1]
88 | p = 0
89 | while p < bert_sent_len:
90 | all_encoder_layers, _ = self.bert_model(x_bert[:, p:(p + self.BERT_MAX_LEN)], token_type_ids=None, attention_mask=x_bert_mask[:, p:(p + self.BERT_MAX_LEN)]) # bert_layer * batch * max_bert_sent_len * bert_dim
91 | all_layers.append(torch.cat(all_encoder_layers, dim = 2)) # batch * up_to_512 * (bert_dim * layer)
92 | p += self.BERT_MAX_LEN
93 |
94 | bert_embedding = torch.cat(all_layers, dim = 1) # batch * up_to_512 * (bert_dim * layer)
95 | batch_size = x_mask.shape[0]
96 | max_word_num = x_mask.shape[1]
97 | tot_dim = bert_embedding.shape[2]
98 | output = Variable(torch.zeros(batch_size, max_word_num, tot_dim))
99 | for i in range(batch_size):
100 | for j in range(max_word_num):
101 | if x_mask[i, j] == 0:
102 | continue
103 | st = x_bert_offset[i, j, 0]
104 | ed = x_bert_offset[i, j, 1]
105 | # we can also try using st only, ed only
106 | if st + 1 == ed: # the word maps to a single subword
107 | output[i, j, :] = bert_embedding[i, st, :]
108 | else:
109 | subword_ebd_sum = torch.sum(bert_embedding[i, st:ed, :], dim = 0)
110 | if st < ed:
111 | output[i, j, :] = subword_ebd_sum / float(ed - st) # dim 0 is st:ed
112 |
113 | outputs = []
114 | for i in range(self.bert_layer):
115 | now = output[:, :, (i * self.bert_dim) : ((i + 1) * self.bert_dim)]
116 | now = now.cuda()
117 | outputs.append(now)
118 |
119 | return outputs
120 |
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/Bert/__pycache__/Bert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/Bert/__pycache__/Bert.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/Bert/__pycache__/modeling.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/Bert/__pycache__/modeling.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/Bert/__pycache__/tokenization.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/Bert/__pycache__/tokenization.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/Bert/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """PyTorch optimization for BERT model."""
16 |
17 | import math
18 | import torch
19 | from torch.optim import Optimizer
20 | from torch.nn.utils import clip_grad_norm_
21 |
22 | def warmup_cosine(x, warmup=0.002):
23 | if x < warmup:
24 | return x/warmup
25 | return 0.5 * (1.0 + torch.cos(math.pi * x))
26 |
27 | def warmup_constant(x, warmup=0.002):
28 | if x < warmup:
29 | return x/warmup
30 | return 1.0
31 |
32 | def warmup_linear(x, warmup=0.002):
33 | if x < warmup:
34 | return x/warmup
35 | return 1.0 - x
36 |
37 | SCHEDULES = {
38 | 'warmup_cosine':warmup_cosine,
39 | 'warmup_constant':warmup_constant,
40 | 'warmup_linear':warmup_linear,
41 | }
42 |
43 |
44 | class BertAdam(Optimizer):
45 | """Implements BERT version of Adam algorithm with weight decay fix.
46 | Params:
47 | lr: learning rate
48 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
49 | t_total: total number of training steps for the learning
50 | rate schedule, -1 means constant learning rate. Default: -1
51 | schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
52 | b1: Adams b1. Default: 0.9
53 | b2: Adams b2. Default: 0.999
54 | e: Adams epsilon. Default: 1e-6
55 | weight_decay_rate: Weight decay. Default: 0.01
56 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
57 | """
58 | def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear',
59 | b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01,
60 | max_grad_norm=1.0):
61 | if not lr >= 0.0:
62 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
63 | if schedule not in SCHEDULES:
64 | raise ValueError("Invalid schedule parameter: {}".format(schedule))
65 | if not 0.0 <= warmup < 1.0 and not warmup == -1:
66 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
67 | if not 0.0 <= b1 < 1.0:
68 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
69 | if not 0.0 <= b2 < 1.0:
70 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
71 | if not e >= 0.0:
72 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
73 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
74 | b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate,
75 | max_grad_norm=max_grad_norm)
76 | super(BertAdam, self).__init__(params, defaults)
77 |
78 | def get_lr(self):
79 | lr = []
80 | for group in self.param_groups:
81 | for p in group['params']:
82 | state = self.state[p]
83 | if len(state) == 0:
84 | return [0]
85 | if group['t_total'] != -1:
86 | schedule_fct = SCHEDULES[group['schedule']]
87 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
88 | else:
89 | lr_scheduled = group['lr']
90 | lr.append(lr_scheduled)
91 | return lr
92 |
93 | def step(self, closure=None):
94 | """Performs a single optimization step.
95 |
96 | Arguments:
97 | closure (callable, optional): A closure that reevaluates the model
98 | and returns the loss.
99 | """
100 | loss = None
101 | if closure is not None:
102 | loss = closure()
103 |
104 | for group in self.param_groups:
105 | for p in group['params']:
106 | if p.grad is None:
107 | continue
108 | grad = p.grad.data
109 | if grad.is_sparse:
110 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
111 |
112 | state = self.state[p]
113 |
114 | # State initialization
115 | if len(state) == 0:
116 | state['step'] = 0
117 | # Exponential moving average of gradient values
118 | state['next_m'] = torch.zeros_like(p.data)
119 | # Exponential moving average of squared gradient values
120 | state['next_v'] = torch.zeros_like(p.data)
121 |
122 | next_m, next_v = state['next_m'], state['next_v']
123 | beta1, beta2 = group['b1'], group['b2']
124 |
125 | # Add grad clipping
126 | if group['max_grad_norm'] > 0:
127 | clip_grad_norm_(p, group['max_grad_norm'])
128 |
129 | # Decay the first and second moment running average coefficient
130 | # In-place operations to update the averages at the same time
131 | next_m.mul_(beta1).add_(1 - beta1, grad)
132 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
133 | update = next_m / (next_v.sqrt() + group['e'])
134 |
135 | # Just adding the square of the weights to the loss function is *not*
136 | # the correct way of using L2 regularization/weight decay with Adam,
137 | # since that will interact with the m and v parameters in strange ways.
138 | #
139 | # Instead we want to decay the weights in a manner that doesn't interact
140 | # with the m/v parameters. This is equivalent to adding the square
141 | # of the weights to the loss with plain (non-momentum) SGD.
142 | if group['weight_decay_rate'] > 0.0:
143 | update += group['weight_decay_rate'] * p.data
144 |
145 | if group['t_total'] != -1:
146 | schedule_fct = SCHEDULES[group['schedule']]
147 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
148 | else:
149 | lr_scheduled = group['lr']
150 |
151 | update_with_lr = lr_scheduled * update
152 | p.data.add_(-update_with_lr)
153 |
154 | state['step'] += 1
155 |
156 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
157 | # No bias correction
158 | # bias_correction1 = 1 - beta1 ** state['step']
159 | # bias_correction2 = 1 - beta2 ** state['step']
160 |
161 | return loss
162 |
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/Bert/tokenization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 | import unicodedata
23 | import os
24 | import logging
25 |
26 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
27 | datefmt = '%m/%d/%Y %H:%M:%S',
28 | level = logging.INFO)
29 | logger = logging.getLogger(__name__)
30 |
31 | PRETRAINED_VOCAB_ARCHIVE_MAP = {
32 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
33 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
34 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
35 | 'bert-base-multilingual': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-vocab.txt",
36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
37 | }
38 |
39 | def convert_to_unicode(text):
40 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
41 | if isinstance(text, str):
42 | return text
43 | elif isinstance(text, bytes):
44 | return text.decode("utf-8", "ignore")
45 | else:
46 | raise ValueError("Unsupported string type: %s" % (type(text)))
47 |
48 |
49 | def printable_text(text):
50 | """Returns text encoded in a way suitable for print or `tf.logging`."""
51 |
52 | # These functions want `str` for both Python2 and Python3, but in one case
53 | # it's a Unicode string and in the other it's a byte string.
54 | if isinstance(text, str):
55 | return text
56 | elif isinstance(text, bytes):
57 | return text.decode("utf-8", "ignore")
58 | else:
59 | raise ValueError("Unsupported string type: %s" % (type(text)))
60 |
61 |
62 | def load_vocab(vocab_file):
63 | """Loads a vocabulary file into a dictionary."""
64 | vocab = collections.OrderedDict()
65 | index = 0
66 | with open(vocab_file, "r", encoding="utf8") as reader:
67 | while True:
68 | token = convert_to_unicode(reader.readline())
69 | if not token:
70 | break
71 | token = token.strip()
72 | vocab[token] = index
73 | index += 1
74 | return vocab
75 |
76 |
77 | def whitespace_tokenize(text):
78 | """Runs basic whitespace cleaning and splitting on a peice of text."""
79 | text = text.strip()
80 | if not text:
81 | return []
82 | tokens = text.split()
83 | return tokens
84 |
85 |
86 | class BertTokenizer(object):
87 | """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
88 | def __init__(self, vocab_file, do_lower_case=True):
89 | if not os.path.isfile(vocab_file):
90 | raise ValueError(
91 | "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
92 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
93 | self.vocab = load_vocab(vocab_file)
94 | self.ids_to_tokens = collections.OrderedDict(
95 | [(ids, tok) for tok, ids in self.vocab.items()])
96 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
97 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
98 |
99 | def tokenize(self, text):
100 | split_tokens = []
101 | for token in self.basic_tokenizer.tokenize(text):
102 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
103 | split_tokens.append(sub_token)
104 | return split_tokens
105 |
106 | def convert_tokens_to_ids(self, tokens):
107 | """Converts a sequence of tokens into ids using the vocab."""
108 | ids = []
109 | for token in tokens:
110 | ids.append(self.vocab[token])
111 | return ids
112 |
113 | def convert_ids_to_tokens(self, ids):
114 | """Converts a sequence of ids in wordpiece tokens using the vocab."""
115 | tokens = []
116 | for i in ids:
117 | tokens.append(self.ids_to_tokens[i])
118 | return tokens
119 |
120 | @classmethod
121 | def from_pretrained(cls, pretrained_model_name, do_lower_case=True):
122 | """
123 | Instantiate a PreTrainedBertModel from a pre-trained model file.
124 | Download and cache the pre-trained model file if needed.
125 | """
126 | if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
127 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
128 | else:
129 | vocab_file = pretrained_model_name
130 | # redirect to the cache, if necessary
131 | try:
132 | resolved_vocab_file = vocab_file
133 | if resolved_vocab_file == vocab_file:
134 | logger.info("loading vocabulary file {}".format(vocab_file))
135 | else:
136 | logger.info("loading vocabulary file {} from cache at {}".format(
137 | vocab_file, resolved_vocab_file))
138 | # Instantiate tokenizer.
139 | tokenizer = cls(resolved_vocab_file, do_lower_case)
140 | except FileNotFoundError:
141 | logger.error(
142 | "Model name '{}' was not found in model name list ({}). "
143 | "We assumed '{}' was a path or url but couldn't find any file "
144 | "associated to this path or url.".format(
145 | pretrained_model_name,
146 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
147 | pretrained_model_name))
148 | tokenizer = None
149 | return tokenizer
150 |
151 |
152 | class BasicTokenizer(object):
153 | """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
154 |
155 | def __init__(self, do_lower_case=True):
156 | """Constructs a BasicTokenizer.
157 |
158 | Args:
159 | do_lower_case: Whether to lower case the input.
160 | """
161 | self.do_lower_case = do_lower_case
162 |
163 | def tokenize(self, text):
164 | """Tokenizes a piece of text."""
165 | text = convert_to_unicode(text)
166 | text = self._clean_text(text)
167 | # This was added on November 1st, 2018 for the multilingual and Chinese
168 | # models. This is also applied to the English models now, but it doesn't
169 | # matter since the English models were not trained on any Chinese data
170 | # and generally don't have any Chinese data in them (there are Chinese
171 | # characters in the vocabulary because Wikipedia does have some Chinese
172 | # words in the English Wikipedia).
173 | text = self._tokenize_chinese_chars(text)
174 | orig_tokens = whitespace_tokenize(text)
175 | split_tokens = []
176 | for token in orig_tokens:
177 | if self.do_lower_case:
178 | token = token.lower()
179 | token = self._run_strip_accents(token)
180 | split_tokens.extend(self._run_split_on_punc(token))
181 |
182 | output_tokens = whitespace_tokenize(" ".join(split_tokens))
183 | return output_tokens
184 |
185 | def _run_strip_accents(self, text):
186 | """Strips accents from a piece of text."""
187 | text = unicodedata.normalize("NFD", text)
188 | output = []
189 | for char in text:
190 | cat = unicodedata.category(char)
191 | if cat == "Mn":
192 | continue
193 | output.append(char)
194 | return "".join(output)
195 |
196 | def _run_split_on_punc(self, text):
197 | """Splits punctuation on a piece of text."""
198 | chars = list(text)
199 | i = 0
200 | start_new_word = True
201 | output = []
202 | while i < len(chars):
203 | char = chars[i]
204 | if _is_punctuation(char):
205 | output.append([char])
206 | start_new_word = True
207 | else:
208 | if start_new_word:
209 | output.append([])
210 | start_new_word = False
211 | output[-1].append(char)
212 | i += 1
213 |
214 | return ["".join(x) for x in output]
215 |
216 | def _tokenize_chinese_chars(self, text):
217 | """Adds whitespace around any CJK character."""
218 | output = []
219 | for char in text:
220 | cp = ord(char)
221 | if self._is_chinese_char(cp):
222 | output.append(" ")
223 | output.append(char)
224 | output.append(" ")
225 | else:
226 | output.append(char)
227 | return "".join(output)
228 |
229 | def _is_chinese_char(self, cp):
230 | """Checks whether CP is the codepoint of a CJK character."""
231 | # This defines a "chinese character" as anything in the CJK Unicode block:
232 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
233 | #
234 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
235 | # despite its name. The modern Korean Hangul alphabet is a different block,
236 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write
237 | # space-separated words, so they are not treated specially and handled
238 | # like all of the other languages.
239 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
240 | (cp >= 0x3400 and cp <= 0x4DBF) or #
241 | (cp >= 0x20000 and cp <= 0x2A6DF) or #
242 | (cp >= 0x2A700 and cp <= 0x2B73F) or #
243 | (cp >= 0x2B740 and cp <= 0x2B81F) or #
244 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
245 | (cp >= 0xF900 and cp <= 0xFAFF) or #
246 | (cp >= 0x2F800 and cp <= 0x2FA1F)): #
247 | return True
248 |
249 | return False
250 |
251 | def _clean_text(self, text):
252 | """Performs invalid character removal and whitespace cleanup on text."""
253 | output = []
254 | for char in text:
255 | cp = ord(char)
256 | if cp == 0 or cp == 0xfffd or _is_control(char):
257 | continue
258 | if _is_whitespace(char):
259 | output.append(" ")
260 | else:
261 | output.append(char)
262 | return "".join(output)
263 |
264 |
265 | class WordpieceTokenizer(object):
266 | """Runs WordPiece tokenization."""
267 |
268 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
269 | self.vocab = vocab
270 | self.unk_token = unk_token
271 | self.max_input_chars_per_word = max_input_chars_per_word
272 |
273 | def tokenize(self, text):
274 | """Tokenizes a piece of text into its word pieces.
275 |
276 | This uses a greedy longest-match-first algorithm to perform tokenization
277 | using the given vocabulary.
278 |
279 | For example:
280 | input = "unaffable"
281 | output = ["un", "##aff", "##able"]
282 |
283 | Args:
284 | text: A single token or whitespace-separated tokens. This should have
285 | already been passed through `BasicTokenizer`.
286 |
287 | Returns:
288 | A list of wordpiece tokens.
289 | """
290 |
291 | text = convert_to_unicode(text)
292 |
293 | output_tokens = []
294 | for token in whitespace_tokenize(text):
295 | chars = list(token)
296 | if len(chars) > self.max_input_chars_per_word:
297 | output_tokens.append(self.unk_token)
298 | continue
299 |
300 | is_bad = False
301 | start = 0
302 | sub_tokens = []
303 | while start < len(chars):
304 | end = len(chars)
305 | cur_substr = None
306 | while start < end:
307 | substr = "".join(chars[start:end])
308 | if start > 0:
309 | substr = "##" + substr
310 | if substr in self.vocab:
311 | cur_substr = substr
312 | break
313 | end -= 1
314 | if cur_substr is None:
315 | is_bad = True
316 | break
317 | sub_tokens.append(cur_substr)
318 | start = end
319 |
320 | if is_bad:
321 | output_tokens.append(self.unk_token)
322 | else:
323 | output_tokens.extend(sub_tokens)
324 | return output_tokens
325 |
326 |
327 | def _is_whitespace(char):
328 | """Checks whether `chars` is a whitespace character."""
329 | # \t, \n, and \r are technically contorl characters but we treat them
330 | # as whitespace since they are generally considered as such.
331 | if char == " " or char == "\t" or char == "\n" or char == "\r":
332 | return True
333 | cat = unicodedata.category(char)
334 | if cat == "Zs":
335 | return True
336 | return False
337 |
338 |
339 | def _is_control(char):
340 | """Checks whether `chars` is a control character."""
341 | # These are technically control characters but we count them as whitespace
342 | # characters.
343 | if char == "\t" or char == "\n" or char == "\r":
344 | return False
345 | cat = unicodedata.category(char)
346 | if cat.startswith("C"):
347 | return True
348 | return False
349 |
350 |
351 | def _is_punctuation(char):
352 | """Checks whether `chars` is a punctuation character."""
353 | cp = ord(char)
354 | # We treat all non-letter/number ASCII as punctuation.
355 | # Characters such as "^", "$", and "`" are not in the Unicode
356 | # Punctuation class but we treat them as punctuation anyways, for
357 | # consistency.
358 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
359 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
360 | return True
361 | cat = unicodedata.category(char)
362 | if cat.startswith("P"):
363 | return True
364 | return False
365 |
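The `WordpieceTokenizer` above uses greedy longest-match-first matching. Below is a minimal, self-contained sketch of that per-token loop run against a toy vocabulary (the vocabulary entries are illustrative, not taken from a real BERT vocab file):

```python
# A sketch of the greedy longest-match-first WordPiece loop implemented in
# WordpieceTokenizer.tokenize above. The toy vocabulary is illustrative only;
# a real run would load a BERT vocab file via load_vocab instead.
toy_vocab = {"[UNK]", "un", "##aff", "##able", "runn", "##ing"}

def wordpiece(token, vocab, unk="[UNK]"):
    chars = list(token)
    start, pieces = 0, []
    while start < len(chars):
        end, cur = len(chars), None
        while start < end:                 # shrink the window until a vocab hit
            sub = "".join(chars[start:end])
            if start > 0:
                sub = "##" + sub           # continuation pieces carry the ## prefix
            if sub in vocab:
                cur = sub
                break
            end -= 1
        if cur is None:                    # no piece matched: whole token -> [UNK]
            return [unk]
        pieces.append(cur)
        start = end
    return pieces

print(wordpiece("unaffable", toy_vocab))   # ['un', '##aff', '##able']
print(wordpiece("running", toy_vocab))     # ['runn', '##ing']
print(wordpiece("xyz", toy_vocab))         # ['[UNK]']
```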
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/SDNet.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import math
5 | import random
6 | import numpy as np
7 | import torch
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | from torch.autograd import Variable
11 | import torch.nn.init as init
12 | from torch.nn.parameter import Parameter
13 | from Models.Bert.Bert import Bert
14 | from Models.Layers import MaxPooling, CNN, dropout, RNN_from_opt, set_dropout_prob, weighted_avg, set_seq_dropout, Attention, DeepAttention, LinearSelfAttn, GetFinalScores
15 | from Utils.CoQAUtils import POS, ENT
16 |
17 | '''
18 | SDNet
19 | '''
20 | class SDNet(nn.Module):
21 | def __init__(self, opt, word_embedding):
22 | super(SDNet, self).__init__()
23 | print('SDNet model\n')
24 |
25 | self.opt = opt
26 | self.use_cuda = (self.opt['cuda'] == True)
27 | set_dropout_prob(0.0 if 'DROPOUT' not in opt else float(opt['DROPOUT']))
28 | set_seq_dropout('VARIATIONAL_DROPOUT' in self.opt)
29 |
30 | x_input_size = 0
31 | ques_input_size = 0
32 |
33 | self.vocab_size = int(opt['vocab_size'])
34 | vocab_dim = int(opt['vocab_dim'])
35 | self.vocab_embed = nn.Embedding(self.vocab_size, vocab_dim, padding_idx = 1)
36 | self.vocab_embed.weight.data = word_embedding
37 |
38 | x_input_size += vocab_dim
39 | ques_input_size += vocab_dim
40 |
41 | if 'CHAR_CNN' in self.opt:
42 | print('CHAR_CNN')
43 | char_vocab_size = int(opt['char_vocab_size'])
44 | char_dim = int(opt['char_emb_size'])
45 | char_hidden_size = int(opt['char_hidden_size'])
46 | self.char_embed = nn.Embedding(char_vocab_size, char_dim, padding_idx = 1)
47 | self.char_cnn = CNN(char_dim, 3, char_hidden_size)
48 | self.maxpooling = MaxPooling()
49 | x_input_size += char_hidden_size
50 | ques_input_size += char_hidden_size
51 |
52 | if 'TUNE_PARTIAL' in self.opt:
53 | print('TUNE_PARTIAL')
54 | self.fixed_embedding = word_embedding[opt['tune_partial']:]
55 | else:
56 | self.vocab_embed.weight.requires_grad = False
57 |
58 | cdim = 0
59 | self.use_contextual = False
60 |
61 | if 'BERT' in self.opt:
62 | print('Using BERT')
63 | self.Bert = Bert(self.opt)
64 | if 'LOCK_BERT' in self.opt:
65 | print('Lock BERT\'s weights')
66 | for p in self.Bert.parameters():
67 | p.requires_grad = False
68 | if 'BERT_LARGE' in self.opt:
69 | print('BERT_LARGE')
70 | bert_dim = 1024
71 | bert_layers = 24
72 | else:
73 | bert_dim = 768
74 | bert_layers = 12
75 |
76 | print('BERT dim:', bert_dim, 'BERT_LAYERS:', bert_layers)
77 |
78 | if 'BERT_LINEAR_COMBINE' in self.opt:
79 | print('BERT_LINEAR_COMBINE')
80 | self.alphaBERT = nn.Parameter(torch.Tensor(bert_layers), requires_grad=True)
81 | self.gammaBERT = nn.Parameter(torch.Tensor(1, 1), requires_grad=True)
82 | torch.nn.init.constant_(self.alphaBERT, 1.0)  # in-place init; init.constant is deprecated
83 | torch.nn.init.constant_(self.gammaBERT, 1.0)
84 |
85 | cdim = bert_dim
86 | x_input_size += bert_dim
87 | ques_input_size += bert_dim
88 |
89 | self.pre_align = Attention(vocab_dim, opt['prealign_hidden'], correlation_func = 3, do_similarity = True)
90 | x_input_size += vocab_dim
91 |
92 | pos_dim = opt['pos_dim']
93 | ent_dim = opt['ent_dim']
94 | self.pos_embedding = nn.Embedding(len(POS), pos_dim)
95 | self.ent_embedding = nn.Embedding(len(ENT), ent_dim)
96 |
97 | x_feat_len = 4
98 | if 'ANSWER_SPAN_IN_CONTEXT_FEATURE' in self.opt:
99 | print('ANSWER_SPAN_IN_CONTEXT_FEATURE')
100 | x_feat_len += 1
101 |
102 | x_input_size += pos_dim + ent_dim + x_feat_len
103 |
104 | print('Initially, the vector_sizes [doc, query] are', x_input_size, ques_input_size)
105 |
106 | addtional_feat = cdim if self.use_contextual else 0
107 |
108 | # RNN context encoder
109 | self.context_rnn, context_rnn_output_size = RNN_from_opt(x_input_size, opt['hidden_size'],
110 | num_layers=opt['in_rnn_layers'], concat_rnn=opt['concat_rnn'], add_feat=addtional_feat)
111 | # RNN question encoder
112 | self.ques_rnn, ques_rnn_output_size = RNN_from_opt(ques_input_size, opt['hidden_size'],
113 | num_layers=opt['in_rnn_layers'], concat_rnn=opt['concat_rnn'], add_feat=addtional_feat)
114 |
115 | # Output sizes of rnn encoders
116 | print('After Input LSTM, the vector_sizes [doc, query] are [', context_rnn_output_size, ques_rnn_output_size, '] *', opt['in_rnn_layers'])
117 |
118 | # Deep inter-attention
119 | self.deep_attn = DeepAttention(opt, abstr_list_cnt=opt['in_rnn_layers'],
120 | deep_att_hidden_size_per_abstr=opt['deep_att_hidden_size_per_abstr'], correlation_func=3, word_hidden_size=vocab_dim + addtional_feat)
121 | self.deep_attn_input_size = self.deep_attn.rnn_input_size
122 | self.deep_attn_output_size = self.deep_attn.output_size
123 |
124 | # Question understanding and compression
125 | self.high_lvl_ques_rnn, high_lvl_ques_rnn_output_size = RNN_from_opt(ques_rnn_output_size * opt['in_rnn_layers'],
126 | opt['highlvl_hidden_size'], num_layers = opt['question_high_lvl_rnn_layers'], concat_rnn = True)
127 |
128 | self.after_deep_attn_size = self.deep_attn_output_size + self.deep_attn_input_size + addtional_feat + vocab_dim
129 | self.self_attn_input_size = self.after_deep_attn_size
130 | self_attn_output_size = self.deep_attn_output_size
131 |
132 | # Self attention on context
133 | self.highlvl_self_att = Attention(self.self_attn_input_size, opt['deep_att_hidden_size_per_abstr'], correlation_func=3)
134 | print('Self deep-attention input is {}-dim'.format(self.self_attn_input_size))
135 |
136 | self.high_lvl_context_rnn, high_lvl_context_rnn_output_size = RNN_from_opt(self.deep_attn_output_size + self_attn_output_size,
137 | opt['highlvl_hidden_size'], num_layers = 1, concat_rnn = False)
138 | context_final_size = high_lvl_context_rnn_output_size
139 |
140 | print('Do Question self attention')
141 | self.ques_self_attn = Attention(high_lvl_ques_rnn_output_size, opt['query_self_attn_hidden_size'], correlation_func=3)
142 |
143 | ques_final_size = high_lvl_ques_rnn_output_size
144 | print('Before answer span finding, hidden sizes are', context_final_size, ques_final_size)
145 |
146 | # Question merging
147 | self.ques_merger = LinearSelfAttn(ques_final_size)
148 | self.get_answer = GetFinalScores(context_final_size, ques_final_size)
149 |
150 | '''
151 | x: 1 x x_len (word_ids)
152 | x_single_mask: 1 x x_len
153 | x_char: 1 x x_len x char_len (char_ids)
154 | x_char_mask: 1 x x_len x char_len
155 | x_features: batch_size x x_len x feature_len (5, if answer_span_in_context_feature; 4 otherwise)
156 | x_pos: 1 x x_len (POS id)
157 | x_ent: 1 x x_len (entity id)
158 | x_bert: 1 x x_bert_token_len
159 | x_bert_mask: 1 x x_bert_token_len
160 | x_bert_offsets: 1 x x_len x 2
161 | q: batch x q_len (word_ids)
162 | q_mask: batch x q_len
163 | q_char: batch x q_len x char_len (char ids)
164 | q_char_mask: batch x q_len x char_len
165 | q_bert: 1 x q_bert_token_len
166 | q_bert_mask: 1 x q_bert_token_len
167 | q_bert_offsets: 1 x q_len x 2
168 | context_len: number of words in context (only one per batch)
169 | return:
170 | score_s: batch x context_len
171 | score_e: batch x context_len
172 | score_no: batch x 1
173 | score_yes: batch x 1
174 | score_noanswer: batch x 1
175 | '''
176 | def forward(self, x, x_single_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, q, q_mask, q_char, q_char_mask, q_bert, q_bert_mask, q_bert_offsets, context_len):
177 | batch_size = q.shape[0]
178 | x_mask = x_single_mask.expand(batch_size, -1)
179 | x_word_embed = self.vocab_embed(x).expand(batch_size, -1, -1) # batch x x_len x vocab_dim
180 | ques_word_embed = self.vocab_embed(q) # batch x q_len x vocab_dim
181 |
182 | x_input_list = [dropout(x_word_embed, p=self.opt['dropout_emb'], training=self.drop_emb)] # batch x x_len x vocab_dim
183 | ques_input_list = [dropout(ques_word_embed, p=self.opt['dropout_emb'], training=self.drop_emb)] # batch x q_len x vocab_dim
184 |
185 | # contextualized embedding
186 | x_cemb = ques_cemb = None
187 | if 'BERT' in self.opt:
188 | x_cemb = ques_cemb = None
189 |
190 | if 'BERT_LINEAR_COMBINE' in self.opt:
191 | x_bert_output = self.Bert(x_bert, x_bert_mask, x_bert_offsets, x_single_mask)
192 | x_cemb_mid = self.linear_sum(x_bert_output, self.alphaBERT, self.gammaBERT)
193 | ques_bert_output = self.Bert(q_bert, q_bert_mask, q_bert_offsets, q_mask)
194 | ques_cemb_mid = self.linear_sum(ques_bert_output, self.alphaBERT, self.gammaBERT)
195 | x_cemb_mid = x_cemb_mid.expand(batch_size, -1, -1)
196 | else:
197 | x_cemb_mid = self.Bert(x_bert, x_bert_mask, x_bert_offsets, x_single_mask)
198 | x_cemb_mid = x_cemb_mid.expand(batch_size, -1, -1)
199 | ques_cemb_mid = self.Bert(q_bert, q_bert_mask, q_bert_offsets, q_mask)
200 |
201 | x_input_list.append(x_cemb_mid)
202 | ques_input_list.append(ques_cemb_mid)
203 |
204 | if 'CHAR_CNN' in self.opt:
205 | x_char_final = self.character_cnn(x_char, x_char_mask)
206 | x_char_final = x_char_final.expand(batch_size, -1, -1)
207 | ques_char_final = self.character_cnn(q_char, q_char_mask)
208 | x_input_list.append(x_char_final)
209 | ques_input_list.append(ques_char_final)
210 |
211 | x_prealign = self.pre_align(x_word_embed, ques_word_embed, q_mask)
212 | x_input_list.append(x_prealign) # batch x x_len x (vocab_dim + cdim + vocab_dim)
213 |
214 | x_pos_emb = self.pos_embedding(x_pos).expand(batch_size, -1, -1) # batch x x_len x pos_dim
215 | x_ent_emb = self.ent_embedding(x_ent).expand(batch_size, -1, -1) # batch x x_len x ent_dim
216 | x_input_list.append(x_pos_emb)
217 | x_input_list.append(x_ent_emb)
218 | x_input_list.append(x_features) # batch x x_len x (vocab_dim + cdim + vocab_dim + pos_dim + ent_dim + feature_dim)
219 |
220 | x_input = torch.cat(x_input_list, 2) # batch x x_len x (vocab_dim + cdim + vocab_dim + pos_dim + ent_dim + feature_dim)
221 | ques_input = torch.cat(ques_input_list, 2) # batch x q_len x (vocab_dim + cdim)
222 |
223 | # Multi-layer RNN
224 | _, x_rnn_layers = self.context_rnn(x_input, x_mask, return_list=True, x_additional=x_cemb) # layer x batch x x_len x context_rnn_output_size
225 | _, ques_rnn_layers = self.ques_rnn(ques_input, q_mask, return_list=True, x_additional=ques_cemb) # layer x batch x q_len x ques_rnn_output_size
226 |
227 | # rnn with question only
228 | ques_highlvl = self.high_lvl_ques_rnn(torch.cat(ques_rnn_layers, 2), q_mask) # batch x q_len x high_lvl_ques_rnn_output_size
229 | ques_rnn_layers.append(ques_highlvl) # (layer + 1) layers
230 |
231 | # deep multilevel inter-attention
232 | if x_cemb is None:
233 | x_long = x_word_embed
234 | ques_long = ques_word_embed
235 | else:
236 | x_long = torch.cat([x_word_embed, x_cemb], 2) # batch x x_len x (vocab_dim + cdim)
237 | ques_long = torch.cat([ques_word_embed, ques_cemb], 2) # batch x q_len x (vocab_dim + cdim)
238 |
239 | x_rnn_after_inter_attn, x_inter_attn = self.deep_attn([x_long], x_rnn_layers, [ques_long], ques_rnn_layers, x_mask, q_mask, return_bef_rnn=True)
240 | # x_rnn_after_inter_attn: batch x x_len x deep_attn_output_size
241 | # x_inter_attn: batch x x_len x deep_attn_input_size
242 |
243 | # deep self attention
244 | if x_cemb is None:
245 | x_self_attn_input = torch.cat([x_rnn_after_inter_attn, x_inter_attn, x_word_embed], 2)
246 | else:
247 | x_self_attn_input = torch.cat([x_rnn_after_inter_attn, x_inter_attn, x_cemb, x_word_embed], 2)
248 | # batch x x_len x (deep_attn_output_size + deep_attn_input_size + cdim + vocab_dim)
249 |
250 | x_self_attn_output = self.highlvl_self_att(x_self_attn_input, x_self_attn_input, x_mask, x3=x_rnn_after_inter_attn, drop_diagonal=True)
251 | # batch x x_len x deep_attn_output_size
252 |
253 | x_highlvl_output = self.high_lvl_context_rnn(torch.cat([x_rnn_after_inter_attn, x_self_attn_output], 2), x_mask)
254 | # batch x x_len x high_lvl_context_rnn.output_size
255 | x_final = x_highlvl_output
256 |
257 | # question self attention
258 | ques_final = self.ques_self_attn(ques_highlvl, ques_highlvl, q_mask, x3=None, drop_diagonal=True) # batch x q_len x high_lvl_ques_rnn_output_size
259 |
260 | # merge questions
261 | q_merge_weights = self.ques_merger(ques_final, q_mask)
262 | ques_merged = weighted_avg(ques_final, q_merge_weights) # batch x ques_final_size
263 |
264 | # predict scores
265 | score_s, score_e, score_no, score_yes, score_noanswer = self.get_answer(x_final, ques_merged, x_mask)
266 | return score_s, score_e, score_no, score_yes, score_noanswer
267 |
268 | '''
269 | input:
270 | x_char: batch x word_num x char_num
271 | x_char_mask: batch x word_num x char_num
272 | output:
273 | x_char_cnn_final: batch x word_num x char_cnn_hidden_size
274 | '''
275 | def character_cnn(self, x_char, x_char_mask):
276 | x_char_embed = self.char_embed(x_char) # batch x word_num x char_num x char_dim
277 | batch_size = x_char_embed.shape[0]
278 | word_num = x_char_embed.shape[1]
279 | char_num = x_char_embed.shape[2]
280 | char_dim = x_char_embed.shape[3]
281 | x_char_cnn = self.char_cnn(x_char_embed.contiguous().view(-1, char_num, char_dim), x_char_mask) # (batch x word_num) x char_num x char_cnn_hidden_size
282 | x_char_cnn_final = self.maxpooling(x_char_cnn, x_char_mask.contiguous().view(-1, char_num)).contiguous().view(batch_size, word_num, -1) # batch x word_num x char_cnn_hidden_size
283 | return x_char_cnn_final
284 |
285 | def linear_sum(self, output, alpha, gamma):
286 | alpha_softmax = F.softmax(alpha, dim=0)  # alpha is 1-D (one weight per BERT layer); dim made explicit
287 | for i in range(len(output)):
288 | t = output[i] * alpha_softmax[i] * gamma
289 | if i == 0:
290 | res = t
291 | else:
292 | res += t
293 |
294 | res = dropout(res, p=self.opt['dropout_emb'], training=self.drop_emb)
295 | return res
296 |
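When `BERT_LINEAR_COMBINE` is set, `linear_sum` above mixes the per-layer BERT outputs with a softmax over one learned scalar per layer, scaled by a single `gamma`, in the style of ELMo's scalar mix. A self-contained sketch of that computation (the tensor sizes are made up for illustration; the real code additionally applies dropout to the result):

```python
import torch
import torch.nn.functional as F

# One output tensor per BERT layer, as returned by the Bert wrapper.
bert_layers, batch, seq_len, bert_dim = 12, 2, 7, 768
layer_outputs = [torch.randn(batch, seq_len, bert_dim) for _ in range(bert_layers)]

alpha = torch.ones(bert_layers)   # plays the role of self.alphaBERT (initialized to 1.0)
gamma = torch.ones(1)             # plays the role of self.gammaBERT

weights = F.softmax(alpha, dim=0)                    # per-layer mixing weights, sum to 1
mixed = sum(w * h for w, h in zip(weights, layer_outputs)) * gamma
print(mixed.shape)                                   # torch.Size([2, 7, 768])
```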
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/SDNetTrainer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | from datetime import datetime
5 | import json
6 | import numpy as np
7 | import os
8 | import random
9 | import sys
10 | import time
11 | import torch
12 | from torch.autograd import Variable
13 | import torch.nn as nn
14 | import torch.nn.functional as F
15 | import torch.optim as optim
16 | from Utils.CoQAPreprocess import CoQAPreprocess
17 | from Models.Layers import MaxPooling, set_dropout_prob
18 | from Models.SDNet import SDNet
19 | from Models.BaseTrainer import BaseTrainer
20 | from Utils.CoQAUtils import BatchGen, AverageMeter, gen_upper_triangle, score
21 |
22 | class SDNetTrainer(BaseTrainer):
23 | def __init__(self, opt):
24 | super(SDNetTrainer, self).__init__(opt)
25 | print('SDNet Model Trainer')
26 | set_dropout_prob(0.0 if 'DROPOUT' not in opt else float(opt['DROPOUT']))
27 | self.seed = int(opt['SEED'])
28 | self.data_prefix = 'coqa-'
29 | random.seed(self.seed)
30 | np.random.seed(self.seed)
31 | torch.manual_seed(self.seed)
32 | self.preproc = CoQAPreprocess(self.opt)
33 | if self.use_cuda:
34 | torch.cuda.manual_seed_all(self.seed)
35 |
36 | def official(self, model_path, test_data):
37 | print('-----------------------------------------------')
38 | print("Initializing model...")
39 | self.setup_model(self.preproc.train_embedding)
40 | self.load_model(model_path)
41 |
42 | print("Predicting in batches...")
43 | test_batches = BatchGen(self.opt, test_data['data'], self.use_cuda, self.preproc.train_vocab, self.preproc.train_char_vocab, evaluation=True)
44 | predictions = []
45 | confidence = []
46 | final_json = []
47 | cnt = 0
48 | for j, test_batch in enumerate(test_batches):
49 | cnt += 1
50 | if cnt % 50 == 0:
51 | print(cnt, '/', len(test_batches))
52 | phrase, phrase_score, pred_json = self.predict(test_batch)
53 | predictions.extend(phrase)
54 | confidence.extend(phrase_score)
55 | final_json.extend(pred_json)
56 |
57 | return predictions, confidence, final_json
58 |
59 | def train(self):
60 | self.isTrain = True
61 | self.getSaveFolder()
62 | self.saveConf()
63 | self.vocab, self.char_vocab, vocab_embedding = self.preproc.load_data()
64 | self.log('-----------------------------------------------')
65 | self.log("Initializing model...")
66 | self.setup_model(vocab_embedding)
67 |
68 | if 'RESUME' in self.opt:
69 | model_path = os.path.join(self.opt['datadir'], self.opt['MODEL_PATH'])
70 | self.load_model(model_path)
71 |
72 | print('Loading train json...')
73 | with open(os.path.join(self.opt['FEATURE_FOLDER'], self.data_prefix + 'train-preprocessed.json'), 'r') as f:
74 | train_data = json.load(f)
75 |
76 | print('Loading dev json...')
77 | with open(os.path.join(self.opt['FEATURE_FOLDER'], self.data_prefix + 'dev-preprocessed.json'), 'r') as f:
78 | dev_data = json.load(f)
79 |
80 | best_f1_score = 0.0
81 | numEpochs = self.opt['EPOCH']
82 | for epoch in range(self.epoch_start, numEpochs):
83 | self.log('Epoch {}'.format(epoch))
84 | self.network.train()
85 | startTime = datetime.now()
86 | train_batches = BatchGen(self.opt, train_data['data'], self.use_cuda, self.vocab, self.char_vocab)
87 | dev_batches = BatchGen(self.opt, dev_data['data'], self.use_cuda, self.vocab, self.char_vocab, evaluation=True)
88 | for i, batch in enumerate(train_batches):
89 | if i == len(train_batches) - 1 or (epoch == 0 and i == 0 and ('RESUME' in self.opt)) or (i > 0 and i % 1500 == 0):
90 | print('Saving folder is', self.saveFolder)
91 | print('Evaluating on dev set...')
92 | predictions = []
93 | confidence = []
94 | dev_answer = []
95 | final_json = []
96 | for j, dev_batch in enumerate(dev_batches):
97 | phrase, phrase_score, pred_json = self.predict(dev_batch)
98 | final_json.extend(pred_json)
99 | predictions.extend(phrase)
100 | confidence.extend(phrase_score)
101 | dev_answer.extend(dev_batch[-3]) # answer_str
102 | result, all_f1s = score(predictions, dev_answer, final_json)
103 | f1 = result['f1']
104 |
105 | if f1 > best_f1_score:
106 | model_file = os.path.join(self.saveFolder, 'best_model.pt')
107 | self.save_for_predict(model_file, epoch)
108 | best_f1_score = f1
109 | pred_json_file = os.path.join(self.saveFolder, 'prediction.json')
110 | with open(pred_json_file, 'w') as output_file:
111 | json.dump(final_json, output_file)
112 | score_per_instance = []
113 | for instance, s in zip(final_json, all_f1s):
114 | score_per_instance.append({
115 | 'id': instance['id'],
116 | 'turn_id': instance['turn_id'],
117 | 'f1': s
118 | })
119 | score_per_instance_json_file = os.path.join(self.saveFolder, 'score_per_instance.json')
120 | with open(score_per_instance_json_file, 'w') as output_file:
121 | json.dump(score_per_instance, output_file)
122 |
123 | self.log("Epoch {0} - dev: F1: {1:.3f} (best F1: {2:.3f})".format(epoch, f1, best_f1_score))
124 | self.log("Results breakdown\n{0}".format(result))
125 |
126 | self.update(batch)
127 | if i % 100 == 0:
128 | self.log('updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format(
129 | self.updates, self.train_loss.avg,
130 | str((datetime.now() - startTime) / (i + 1) * (len(train_batches) - i - 1)).split('.')[0]))
131 |
132 | print("PROGRESS: {0:.2f}%".format(100.0 * (epoch + 1) / numEpochs))
133 | print('Config file is at ' + self.opt['confFile'])
134 |
135 | def setup_model(self, vocab_embedding):
136 | self.train_loss = AverageMeter()
137 | self.network = SDNet(self.opt, vocab_embedding)
138 | if self.use_cuda:
139 | self.log('Putting model into GPU')
140 | self.network.cuda()
141 |
142 | parameters = [p for p in self.network.parameters() if p.requires_grad]
143 | self.optimizer = optim.Adamax(parameters)
144 | if 'ADAM2' in self.opt:
145 | print('ADAM2')
146 | self.optimizer = optim.Adam(parameters, lr = 0.0001)
147 |
148 | self.updates = 0
149 | self.epoch_start = 0
150 | self.loss_func = F.cross_entropy
151 |
152 | def update(self, batch):
153 | # Train mode
154 | self.network.train()
155 | self.network.drop_emb = True
156 |
157 | x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, query, query_mask, \
158 | query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, ground_truth, context_str, context_words, _, _, _, _ = batch
159 |
160 | # Run forward
161 | # score_s, score_e: batch x context_word_num
162 | # score_yes, score_no, score_no_answer: batch x 1
163 | score_s, score_e, score_yes, score_no, score_no_answer = self.network(x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets,
164 | query, query_mask, query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, len(context_words))
165 | max_len = self.opt['max_len'] or score_s.size(1)
166 | batch_size = score_s.shape[0]
167 | context_len = score_s.size(1)
168 | expand_score = gen_upper_triangle(score_s, score_e, max_len, self.use_cuda)
169 | scores = torch.cat((expand_score, score_no, score_yes, score_no_answer), dim=1) # batch x (context_len * context_len + 3)
170 | targets = []
171 | span_idx = int(context_len * context_len)
172 | for i in range(ground_truth.shape[0]):
173 | if ground_truth[i][0] == -1 and ground_truth[i][1] == -1: # no answer
174 | targets.append(span_idx + 2)
175 | if ground_truth[i][0] == 0 and ground_truth[i][1] == -1: # no
176 | targets.append(span_idx)
177 | if ground_truth[i][0] == -1 and ground_truth[i][1] == 0: # yes
178 | targets.append(span_idx + 1)
179 | if ground_truth[i][0] != -1 and ground_truth[i][1] != -1: # normal span
180 | targets.append(ground_truth[i][0] * context_len + ground_truth[i][1])
181 |
182 | targets = torch.LongTensor(np.array(targets))
183 | if self.use_cuda:
184 | targets = targets.cuda()
185 | loss = self.loss_func(scores, targets)
186 | self.train_loss.update(loss.item(), 1)  # loss is a 0-dim tensor; .item() extracts the Python float
187 | self.optimizer.zero_grad()
188 | loss.backward()
189 | torch.nn.utils.clip_grad_norm_(self.network.parameters(), self.opt['grad_clipping'])
190 | self.optimizer.step()
191 | self.updates += 1
192 | if 'TUNE_PARTIAL' in self.opt:
193 | self.network.vocab_embed.weight.data[self.opt['tune_partial']:] = self.network.fixed_embedding
194 |
195 | def predict(self, batch):
196 | self.network.eval()
197 | self.network.drop_emb = False
198 |
199 | # Run forward
200 | x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, query, query_mask, \
201 | query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, ground_truth, context_str, context_words, \
202 | context_word_offsets, answers, context_id, turn_ids = batch
203 |
204 | context_len = len(context_words)
205 | score_s, score_e, score_yes, score_no, score_no_answer = self.network(x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets,
206 | query, query_mask, query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, len(context_words))
207 | batch_size = score_s.shape[0]
208 | max_len = self.opt['max_len'] or score_s.size(1)
209 |
210 | expand_score = gen_upper_triangle(score_s, score_e, max_len, self.use_cuda)
211 | scores = torch.cat((expand_score, score_no, score_yes, score_no_answer), dim=1) # batch x (context_len * context_len + 3)
212 | prob = F.softmax(scores, dim = 1).data.cpu() # Transfer to CPU/normal tensors for numpy ops
213 |
214 | # Get argmax text spans
215 | predictions = []
216 | confidence = []
217 |
218 | pred_json = []
219 | for i in range(batch_size):
220 | _, ids = torch.sort(prob[i, :], descending=True)
221 | idx = 0
222 | best_id = int(ids[idx])  # convert the 0-dim tensor index to a Python int
223 |
224 | confidence.append(float(prob[i, best_id]))
225 | if best_id < context_len * context_len:
226 | st = best_id // context_len  # integer division; '/' would yield a float under Python 3
227 | ed = best_id % context_len
228 | st = context_word_offsets[st][0]
229 | ed = context_word_offsets[ed][1]
230 | predictions.append(context_str[st:ed])
231 |
232 | if best_id == context_len * context_len:
233 | predictions.append('no')
234 |
235 | if best_id == context_len * context_len + 1:
236 | predictions.append('yes')
237 |
238 | if best_id == context_len * context_len + 2:
239 | predictions.append('unknown')
240 |
241 | pred_json.append({
242 | 'id': context_id,
243 | 'turn_id': turn_ids[i],
244 | 'answer': predictions[-1]
245 | })
246 |
247 | return (predictions, confidence, pred_json) # list of strings, list of floats, list of jsons
248 |
249 | def load_model(self, model_path):
250 | print('Loading model from', model_path)
251 | checkpoint = torch.load(model_path)
252 | state_dict = checkpoint['state_dict']
253 | new_state = set(self.network.state_dict().keys())
254 | for k in list(state_dict['network'].keys()):
255 | if k not in new_state:
256 | del state_dict['network'][k]
257 | for k, v in list(self.network.state_dict().items()):
258 | if k not in state_dict['network']:
259 | state_dict['network'][k] = v
260 | self.network.load_state_dict(state_dict['network'])
261 |
262 | print('Loading finished', model_path)
263 |
264 | def save(self, filename, epoch, prev_filename):
265 | params = {
266 | 'state_dict': {
267 | 'network': self.network.state_dict(),
268 | 'optimizer': self.optimizer.state_dict(),
269 | 'updates': self.updates # how many updates
270 | },
271 | 'train_loss': {
272 | 'val': self.train_loss.val,
273 | 'avg': self.train_loss.avg,
274 | 'sum': self.train_loss.sum,
275 | 'count': self.train_loss.count
276 | },
277 | 'config': self.opt,
278 | 'epoch': epoch
279 | }
280 | try:
281 | torch.save(params, filename)
282 | self.log('model saved to {}'.format(filename))
283 | if os.path.exists(prev_filename):
284 | os.remove(prev_filename)
285 | except BaseException:
286 | self.log('[ WARN: Saving failed... continuing anyway. ]')
287 |
288 | def save_for_predict(self, filename, epoch):
289 | network_state = dict([(k, v) for k, v in self.network.state_dict().items() if k[0:4] != 'CoVe' and k[0:4] != 'ELMo' and k[0:9] != 'AllenELMo' and k[0:4] != 'Bert'])
290 |
291 | if 'eval_embed.weight' in network_state:
292 | del network_state['eval_embed.weight']
293 | if 'fixed_embedding' in network_state:
294 | del network_state['fixed_embedding']
295 | params = {
296 | 'state_dict': {'network': network_state},
297 | 'config': self.opt,
298 | }
299 | try:
300 | torch.save(params, filename)
301 | self.log('model saved to {}'.format(filename))
302 | except BaseException:
303 | self.log('[ WARN: Saving failed... continuing anyway. ]')
304 |
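Both `update` and `predict` above score answers in one flattened space: a span (start, end) maps to index `start * context_len + end`, and the three non-span classes occupy the slots right after the last span index, matching the ground-truth conventions (0, -1) = no, (-1, 0) = yes, (-1, -1) = no answer. A small sketch of this encoding and its inverse; `gen_upper_triangle` from Utils.CoQAUtils lays out the span scores in the same order:

```python
context_len = 5
span_idx = context_len * context_len      # 25: first non-span class index

def encode(start, end):
    if (start, end) == (0, -1):   return span_idx       # "no"
    if (start, end) == (-1, 0):   return span_idx + 1   # "yes"
    if (start, end) == (-1, -1):  return span_idx + 2   # no answer -> "unknown"
    return start * context_len + end                    # ordinary span

def decode(best_id):
    if best_id == span_idx:       return 'no'
    if best_id == span_idx + 1:   return 'yes'
    if best_id == span_idx + 2:   return 'unknown'
    return (best_id // context_len, best_id % context_len)   # (start, end)

assert decode(encode(2, 4)) == (2, 4)
assert decode(encode(-1, -1)) == 'unknown'
```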
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/__pycache__/BaseTrainer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/BaseTrainer.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/__pycache__/Layers.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/Layers.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/__pycache__/SDNet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/SDNet.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Models/__pycache__/SDNetTrainer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/SDNetTrainer.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/NOTICE.txt:
--------------------------------------------------------------------------------
1 | NOTICES AND INFORMATION
2 | Do Not Translate or Localize
3 |
4 | This software incorporates material from third parties. Microsoft makes certain
5 | open source code available at http://3rdpartysource.microsoft.com, or you may
6 | send a check or money order for US $5.00, including the product name, the open
7 | source component name, and version number, to:
8 |
9 | Source Code Compliance Team
10 | Microsoft Corporation
11 | One Microsoft Way
12 | Redmond, WA 98052
13 | USA
14 |
15 | Notwithstanding any other terms, you may reverse engineer this software to the
16 | extent required to debug changes to any libraries licensed under the GNU Lesser
17 | General Public License.
18 |
19 | ________________________
20 |
21 | huggingface/pytorch-pretrained-BERT v0.6.1 (Source)
22 |
23 | Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
24 | Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
25 |
26 |
27 | Apache License
28 | Version 2.0, January 2004
29 | http://www.apache.org/licenses/
30 |
31 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
32 |
33 | 1. Definitions.
34 |
35 | "License" shall mean the terms and conditions for use, reproduction,
36 | and distribution as defined by Sections 1 through 9 of this document.
37 |
38 | "Licensor" shall mean the copyright owner or entity authorized by
39 | the copyright owner that is granting the License.
40 |
41 | "Legal Entity" shall mean the union of the acting entity and all
42 | other entities that control, are controlled by, or are under common
43 | control with that entity. For the purposes of this definition,
44 | "control" means (i) the power, direct or indirect, to cause the
45 | direction or management of such entity, whether by contract or
46 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
47 | outstanding shares, or (iii) beneficial ownership of such entity.
48 |
49 | "You" (or "Your") shall mean an individual or Legal Entity
50 | exercising permissions granted by this License.
51 |
52 | "Source" form shall mean the preferred form for making modifications,
53 | including but not limited to software source code, documentation
54 | source, and configuration files.
55 |
56 | "Object" form shall mean any form resulting from mechanical
57 | transformation or translation of a Source form, including but
58 | not limited to compiled object code, generated documentation,
59 | and conversions to other media types.
60 |
61 | "Work" shall mean the work of authorship, whether in Source or
62 | Object form, made available under the License, as indicated by a
63 | copyright notice that is included in or attached to the work
64 | (an example is provided in the Appendix below).
65 |
66 | "Derivative Works" shall mean any work, whether in Source or Object
67 | form, that is based on (or derived from) the Work and for which the
68 | editorial revisions, annotations, elaborations, or other modifications
69 | represent, as a whole, an original work of authorship. For the purposes
70 | of this License, Derivative Works shall not include works that remain
71 | separable from, or merely link (or bind by name) to the interfaces of,
72 | the Work and Derivative Works thereof.
73 |
74 | "Contribution" shall mean any work of authorship, including
75 | the original version of the Work and any modifications or additions
76 | to that Work or Derivative Works thereof, that is intentionally
77 | submitted to Licensor for inclusion in the Work by the copyright owner
78 | or by an individual or Legal Entity authorized to submit on behalf of
79 | the copyright owner. For the purposes of this definition, "submitted"
80 | means any form of electronic, verbal, or written communication sent
81 | to the Licensor or its representatives, including but not limited to
82 | communication on electronic mailing lists, source code control systems,
83 | and issue tracking systems that are managed by, or on behalf of, the
84 | Licensor for the purpose of discussing and improving the Work, but
85 | excluding communication that is conspicuously marked or otherwise
86 | designated in writing by the copyright owner as "Not a Contribution."
87 |
88 | "Contributor" shall mean Licensor and any individual or Legal Entity
89 | on behalf of whom a Contribution has been received by Licensor and
90 | subsequently incorporated within the Work.
91 |
92 | 2. Grant of Copyright License. Subject to the terms and conditions of
93 | this License, each Contributor hereby grants to You a perpetual,
94 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
95 | copyright license to reproduce, prepare Derivative Works of,
96 | publicly display, publicly perform, sublicense, and distribute the
97 | Work and such Derivative Works in Source or Object form.
98 |
99 | 3. Grant of Patent License. Subject to the terms and conditions of
100 | this License, each Contributor hereby grants to You a perpetual,
101 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
102 | (except as stated in this section) patent license to make, have made,
103 | use, offer to sell, sell, import, and otherwise transfer the Work,
104 | where such license applies only to those patent claims licensable
105 | by such Contributor that are necessarily infringed by their
106 | Contribution(s) alone or by combination of their Contribution(s)
107 | with the Work to which such Contribution(s) was submitted. If You
108 | institute patent litigation against any entity (including a
109 | cross-claim or counterclaim in a lawsuit) alleging that the Work
110 | or a Contribution incorporated within the Work constitutes direct
111 | or contributory patent infringement, then any patent licenses
112 | granted to You under this License for that Work shall terminate
113 | as of the date such litigation is filed.
114 |
115 | 4. Redistribution. You may reproduce and distribute copies of the
116 | Work or Derivative Works thereof in any medium, with or without
117 | modifications, and in Source or Object form, provided that You
118 | meet the following conditions:
119 |
120 | (a) You must give any other recipients of the Work or
121 | Derivative Works a copy of this License; and
122 |
123 | (b) You must cause any modified files to carry prominent notices
124 | stating that You changed the files; and
125 |
126 | (c) You must retain, in the Source form of any Derivative Works
127 | that You distribute, all copyright, patent, trademark, and
128 | attribution notices from the Source form of the Work,
129 | excluding those notices that do not pertain to any part of
130 | the Derivative Works; and
131 |
132 | (d) If the Work includes a "NOTICE" text file as part of its
133 | distribution, then any Derivative Works that You distribute must
134 | include a readable copy of the attribution notices contained
135 | within such NOTICE file, excluding those notices that do not
136 | pertain to any part of the Derivative Works, in at least one
137 | of the following places: within a NOTICE text file distributed
138 | as part of the Derivative Works; within the Source form or
139 | documentation, if provided along with the Derivative Works; or,
140 | within a display generated by the Derivative Works, if and
141 | wherever such third-party notices normally appear. The contents
142 | of the NOTICE file are for informational purposes only and
143 | do not modify the License. You may add Your own attribution
144 | notices within Derivative Works that You distribute, alongside
145 | or as an addendum to the NOTICE text from the Work, provided
146 | that such additional attribution notices cannot be construed
147 | as modifying the License.
148 |
149 | You may add Your own copyright statement to Your modifications and
150 | may provide additional or different license terms and conditions
151 | for use, reproduction, or distribution of Your modifications, or
152 | for any such Derivative Works as a whole, provided Your use,
153 | reproduction, and distribution of the Work otherwise complies with
154 | the conditions stated in this License.
155 |
156 | 5. Submission of Contributions. Unless You explicitly state otherwise,
157 | any Contribution intentionally submitted for inclusion in the Work
158 | by You to the Licensor shall be under the terms and conditions of
159 | this License, without any additional terms or conditions.
160 | Notwithstanding the above, nothing herein shall supersede or modify
161 | the terms of any separate license agreement you may have executed
162 | with Licensor regarding such Contributions.
163 |
164 | 6. Trademarks. This License does not grant permission to use the trade
165 | names, trademarks, service marks, or product names of the Licensor,
166 | except as required for reasonable and customary use in describing the
167 | origin of the Work and reproducing the content of the NOTICE file.
168 |
169 | 7. Disclaimer of Warranty. Unless required by applicable law or
170 | agreed to in writing, Licensor provides the Work (and each
171 | Contributor provides its Contributions) on an "AS IS" BASIS,
172 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
173 | implied, including, without limitation, any warranties or conditions
174 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
175 | PARTICULAR PURPOSE. You are solely responsible for determining the
176 | appropriateness of using or redistributing the Work and assume any
177 | risks associated with Your exercise of permissions under this License.
178 |
179 | 8. Limitation of Liability. In no event and under no legal theory,
180 | whether in tort (including negligence), contract, or otherwise,
181 | unless required by applicable law (such as deliberate and grossly
182 | negligent acts) or agreed to in writing, shall any Contributor be
183 | liable to You for damages, including any direct, indirect, special,
184 | incidental, or consequential damages of any character arising as a
185 | result of this License or out of the use or inability to use the
186 | Work (including but not limited to damages for loss of goodwill,
187 | work stoppage, computer failure or malfunction, or any and all
188 | other commercial damages or losses), even if such Contributor
189 | has been advised of the possibility of such damages.
190 |
191 | 9. Accepting Warranty or Additional Liability. While redistributing
192 | the Work or Derivative Works thereof, You may choose to offer,
193 | and charge a fee for, acceptance of support, warranty, indemnity,
194 | or other liability obligations and/or rights consistent with this
195 | License. However, in accepting such obligations, You may act only
196 | on Your own behalf and on Your sole responsibility, not on behalf
197 | of any other Contributor, and only if You agree to indemnify,
198 | defend, and hold each Contributor harmless for any liability
199 | incurred by, or claims asserted against, such Contributor by reason
200 | of your accepting any such warranty or additional liability.
201 |
202 | END OF TERMS AND CONDITIONS
203 |
204 | APPENDIX: How to apply the Apache License to your work.
205 |
206 | To apply the Apache License to your work, attach the following
207 | boilerplate notice, with the fields enclosed by brackets "[]"
208 | replaced with your own identifying information. (Don't include
209 | the brackets!) The text should be enclosed in the appropriate
210 | comment syntax for the file format. We also recommend that a
211 | file or class name and description of purpose be included on the
212 | same "printed page" as the copyright notice for easier
213 | identification within third-party archives.
214 |
215 | Copyright [yyyy] [name of copyright owner]
216 |
217 | Licensed under the Apache License, Version 2.0 (the "License");
218 | you may not use this file except in compliance with the License.
219 | You may obtain a copy of the License at
220 |
221 | http://www.apache.org/licenses/LICENSE-2.0
222 |
223 | Unless required by applicable law or agreed to in writing, software
224 | distributed under the License is distributed on an "AS IS" BASIS,
225 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
226 | See the License for the specific language governing permissions and
227 | limitations under the License.
228 |
--------------------------------------------------------------------------------
/SDNet/code/SDNet/README.md:
--------------------------------------------------------------------------------
1 | # SDNet
2 |
3 | This is the official code for Microsoft's submission of the SDNet model to the [CoQA](https://stanfordnlp.github.io/coqa/) leaderboard. It is implemented in the PyTorch framework. The related paper to cite is:
4 |
5 | **SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering**, by Chenguang Zhu, Michael Zeng and Xuedong Huang, at https://arxiv.org/abs/1812.03593.
6 |
7 | For usage of this code, please follow [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct).
8 |
9 | # Directory structure:
10 | * main.py: the starter code
11 |
12 | * Models/
13 | * BaseTrainer.py: Base class for trainer
14 | * SDNetTrainer.py: Trainer for SDNet, including training and predicting procedures
15 | * SDNet.py: The SDNet network structure
16 | * Layers.py: Related network layer functions
17 | * Bert/
18 | * Bert.py: Customized class to compute BERT contextualized embedding
19 | * modeling.py, optimization.py, tokenization.py: From Huggingface's PyTorch implementation of BERT
20 | * Utils/
21 | * Arguments.py: Process argument configuration file
22 | * Constants.py: Define constants used
23 | * CoQAPreprocess.py: preprocesses CoQA raw data into intermediate binary/json files, including tokenization and history prepending
24 | * CoQAUtils.py, GeneralUtils.py: utility functions used in SDNet
25 | * Timing.py: Logging time
26 |
27 | # How to run
28 | Requirement: PyTorch 0.4.0, spaCy 2.0.
29 | The Docker image we used is available on Docker Hub: https://hub.docker.com/r/zcgzcgzcg/squadv2/tags. Please use v3.0 or v4.0.
30 | 1. Create a folder (e.g. **coqa**) to contain data and running logs;
31 | 2. Create folder **coqa/data** to store CoQA raw data: **coqa-train-v1.0.json** and **coqa-dev-v1.0.json**;
32 | 3. Copy the file **conf** from the repo into folder **coqa**;
33 | 4. If you want to use BERT-Large, download its model files into **coqa/bert-large-uncased**; if you want to use BERT-base, download its model files into **coqa/bert-base-cased**;
34 | * The models can be downloaded from Huggingface:
35 | * 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
36 | * 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz"
37 | * bert-large-uncased-vocab.txt can be downloaded from Google's BERT repository
38 | 5. Create a folder **glove** in the same directory as **coqa** and download the GloVe embedding **glove.840B.300d.txt** into that folder.
39 |
40 | Your directory should look like this:
41 | * coqa/
42 | * data/
43 | * coqa-train-v1.0.json
44 | * coqa-dev-v1.0.json
45 | * bert-large-uncased/
46 | * bert-large-uncased-vocab.txt
47 | * bert_config.json
48 | * pytorch_model.bin
49 | * conf
50 | * glove/
51 | * glove.840B.300d.txt
52 |
53 | Then, execute `python main.py train path_to_coqa/conf`.
54 |
55 | The first time you run the code, CoQAPreprocess.py will automatically create the folders **conf~/spacy_intermediate_features~** inside **coqa** to store intermediate tokenization results; this takes a few hours.
56 |
57 | Every time you run the code, a new run folder **run_idx** is created inside **coqa/conf~**, containing running logs, prediction results on the dev set, and the best model.
58 |
59 | # Contact
60 | If you have any questions, please contact Chenguang Zhu, chezhu@microsoft.com
61 |
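Before training, it can help to confirm the layout above is in place. A hypothetical sanity check, not part of the repo (adjust `coqa_root` to your setup):

```python
import os

coqa_root = 'coqa'   # path to the folder created in step 1
expected = [
    'data/coqa-train-v1.0.json',
    'data/coqa-dev-v1.0.json',
    'bert-large-uncased/pytorch_model.bin',
    'conf',
]
for rel in expected:
    path = os.path.join(coqa_root, rel)
    print(('OK      ' if os.path.exists(path) else 'MISSING ') + path)
```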
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Scratch/SQuAD_to_CoQA.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | # Scratch code to convert SQuAD data into CoQA format
5 |
6 | import json
7 |
8 | data = {'version': 0.2, 'data': []}
9 | with open('train-v2.0.json', 'r') as f:
10 | squad = json.load(f)
11 |
12 | cnt = 0
13 | for d in squad['data']:
14 | for p in d['paragraphs']:
15 | cnt += 1
16 | a = {'source': 'source', 'filename': 'filename', 'id': str(cnt), 'story': p['context']}
17 | ques = []
18 | ans = []
19 |
20 | additional = []
21 | turn_id = 1
22 | for qa in p['qas']:
23 | ques.append({
24 | 'input_text': qa['question'],
25 | 'turn_id': turn_id
26 | })
27 |
28 | if qa['is_impossible']:
29 | if len(ans) == 0:
30 | ans.append([])
31 | ans[0].append({
32 | 'input_text': 'unknown',
33 | 'span_text': 'unknown',
34 | 'span_start': -1,
35 | 'span_end': -1,
36 | 'turn_id': turn_id
37 | })
38 |
39 | for j in range(len(qa['answers'])):
40 | if j >= len(ans):
41 | ans.append([])
42 | ans[j].append({
43 | 'input_text': qa['answers'][j]['text'],
44 | 'span_text': qa['answers'][j]['text'],
45 | 'span_start': qa['answers'][j]['answer_start'],
46 | 'span_end': qa['answers'][j]['answer_start'] + len(qa['answers'][j]['text']),
47 | 'turn_id': turn_id
48 | })
49 |
50 | turn_id += 1
51 |
52 | a['questions'] = ques
53 | a['answers'] = ans[0]
54 | a['additional_answers'] = {}
55 | for j in range(1, len(ans)):
56 | a['additional_answers'][str(j-1)] = ans[j]
57 |
58 | data['data'].append(a)
59 |
60 |
61 | with open('squad_in_coqa_format.json', 'w') as output_file:
62 | json.dump(data, output_file, indent=4)
63 |
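For reference, a single record in the resulting `squad_in_coqa_format.json` looks roughly like this (the field values are illustrative):

```python
example_record = {
    'source': 'source',
    'filename': 'filename',
    'id': '1',
    'story': 'Some SQuAD paragraph text ...',
    'questions': [{'input_text': 'When was it built?', 'turn_id': 1}],
    'answers': [{'input_text': '1911', 'span_text': '1911',
                 'span_start': 42, 'span_end': 46, 'turn_id': 1}],
    'additional_answers': {},   # extra SQuAD answers, keyed '0', '1', ...
}
```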
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/Arguments.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import os
5 |
6 | class Arguments:
7 | def __init__(self, confFile):
8 | if not os.path.exists(confFile):
9 | raise Exception("The argument file does not exist: " + confFile)
10 | self.confFile = confFile
11 |
12 | def is_int(self, s):
13 | try:
14 | int(s)
15 | return True
16 | except ValueError:
17 | return False
18 |
19 | def is_float(self, s):
20 | try:
21 | float(s)
22 | return True
23 | except ValueError:
24 | return False
25 |
26 | def is_bool(self, s):
27 | return s.lower() == 'true' or s.lower() == 'false'
28 |
29 | def readHyperDriveArguments(self, arguments):
30 | hyperdrive_opts = {}
31 | for i in range(0, len(arguments), 2):
32 | hp_name, hp_value = arguments[i:i+2]
33 | hp_name = hp_name.replace("--", "")
34 | if self.is_int(hp_value):
35 | hp_value = int(hp_value)
36 | elif self.is_float(hp_value):
37 | hp_value = float(hp_value)
38 | hyperdrive_opts[hp_name] = hp_value
39 | return hyperdrive_opts
40 |
41 | def readArguments(self):
42 | opt = {}
43 | with open(self.confFile, encoding='utf-8') as f:
44 | for line in f:
45 | l = line.replace('\t', ' ').strip()
46 | if l.startswith("#"):
47 | continue
48 | parts = l.split()
49 | if len(parts) == 1:
50 | key = parts[0]
51 | if key not in opt:
52 | opt[key] = True
53 | if len(parts) == 2:
54 | key = parts[0]
55 | value = parts[1]
56 | if key not in opt:
57 | opt[key] = value
58 | if self.is_int(value):
59 | opt[key] = int(value)
60 | elif self.is_float(value):
61 | opt[key] = float(value)
62 | elif self.is_bool(value):
63 | opt[key] = value.lower() == 'true'
64 | else:
65 | print('Warning: key %s already exists' % key)
66 | return opt
67 |
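`readArguments` expects a plain-text conf file of whitespace-separated `key [value]` lines: a bare key becomes a boolean flag, and values are coerced to int or float when possible. A minimal sketch of the format and the parsing behavior (the keys below appear elsewhere in this repo, the values are illustrative, and the bool coercion and duplicate-key warning are omitted for brevity):

```python
conf_text = """
# lines starting with '#' are comments
BERT
BERT_LINEAR_COMBINE
DROPOUT 0.3
EPOCH 30
SEED 1033
"""

def parse_conf(text):
    opt = {}
    for line in text.splitlines():
        parts = line.replace('\t', ' ').strip().split()
        if not parts or parts[0].startswith('#'):
            continue
        if len(parts) == 1:
            opt.setdefault(parts[0], True)        # bare key -> boolean flag
        elif len(parts) == 2:
            key, value = parts
            try:
                opt.setdefault(key, int(value))   # try int first, like is_int
            except ValueError:
                try:
                    opt.setdefault(key, float(value))
                except ValueError:
                    opt.setdefault(key, value)    # fall back to the raw string
    return opt

print(parse_conf(conf_text))
# {'BERT': True, 'BERT_LINEAR_COMBINE': True, 'DROPOUT': 0.3, 'EPOCH': 30, 'SEED': 1033}
```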
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/CoQAPreprocess.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | """
5 | This file takes a CoQA data file as input and generates the input files for training the QA model.
6 | """
7 | import json
8 | import msgpack
9 | import multiprocessing
10 | import re
11 | import string
12 | import torch
13 | from tqdm import tqdm
14 | from collections import Counter
15 | from Utils.GeneralUtils import nlp, load_glove_vocab, pre_proc
16 | from Utils.CoQAUtils import token2id, token2id_sent, char2id_sent, build_embedding, feature_gen, POS, ENT
17 | import os
18 |
19 | class CoQAPreprocess():
20 | def __init__(self, opt):
21 | print('CoQA Preprocessing')
22 | self.opt = opt
23 | self.spacyDir = opt['FEATURE_FOLDER']
24 | self.train_file = os.path.join(opt['datadir'], opt['CoQA_TRAIN_FILE'])
25 | self.dev_file = os.path.join(opt['datadir'], opt['CoQA_DEV_FILE'])
26 | self.glove_file = os.path.join(opt['datadir'], opt['INIT_WORD_EMBEDDING_FILE'])
27 | self.glove_dim = 300
28 | self.official = 'OFFICIAL' in opt
29 | self.data_prefix = 'coqa-'
30 |
31 | if self.official:
32 | self.glove_vocab = load_glove_vocab(self.glove_file, self.glove_dim, to_lower = False)
33 |             print('Initializing official prediction...')
34 | print('Loading training vocab and vocab char...')
35 | self.train_vocab, self.train_char_vocab, self.train_embedding = self.load_data()
36 | self.test_file = self.opt['OFFICIAL_TEST_FILE']
37 | return
38 |
39 | dataset_labels = ['train', 'dev']
40 | allExist = True
41 | for dataset_label in dataset_labels:
42 | if not os.path.exists(os.path.join(self.spacyDir, self.data_prefix + dataset_label + '-preprocessed.json')):
43 | allExist = False
44 |
45 | if allExist:
46 | return
47 |
48 |         print('Previous results not found, creating preprocessed files now...')
49 | self.glove_vocab = load_glove_vocab(self.glove_file, self.glove_dim, to_lower = False)
50 | if not os.path.isdir(self.spacyDir):
51 | os.makedirs(self.spacyDir)
52 | print('Directory created: ' + self.spacyDir)
53 |
54 | for dataset_label in dataset_labels:
55 | self.preprocess(dataset_label)
56 |
57 | # dataset_label can be 'train' or 'dev' or 'test'
58 | def preprocess(self, dataset_label):
59 | file_name = self.train_file if dataset_label == 'train' else (self.dev_file if dataset_label == 'dev' else self.test_file)
60 | output_file_name = os.path.join(self.spacyDir, self.data_prefix + dataset_label + '-preprocessed.json')
61 |
62 | print('Preprocessing', dataset_label, 'file:', file_name)
63 | print('Loading json...')
64 | with open(file_name, 'r') as f:
65 | dataset = json.load(f)
66 |
67 | print('Processing json...')
68 |
69 | data = []
70 | tot = len(dataset['data'])
71 | for data_idx in tqdm(range(tot)):
72 | datum = dataset['data'][data_idx]
73 | context_str = datum['story']
74 | _datum = {'context': context_str,
75 | 'source': datum['source'],
76 | 'id': datum['id'],
77 | 'filename': datum['filename']}
78 |
79 | nlp_context = nlp(pre_proc(context_str))
80 | _datum['annotated_context'] = self.process(nlp_context)
81 | _datum['raw_context_offsets'] = self.get_raw_context_offsets(_datum['annotated_context']['word'], context_str)
82 | _datum['qas'] = []
83 | assert len(datum['questions']) == len(datum['answers'])
84 |
85 | additional_answers = {}
86 | if 'additional_answers' in datum:
87 | for k, answer in datum['additional_answers'].items():
88 | if len(answer) == len(datum['answers']):
89 | for ex in answer:
90 | idx = ex['turn_id']
91 | if idx not in additional_answers:
92 | additional_answers[idx] = []
93 |                         additional_answers[idx].append(ex['input_text']) # additional answers are only used for evaluation, so raw text is fine
94 |
95 | for i in range(len(datum['questions'])):
96 | question, answer = datum['questions'][i], datum['answers'][i]
97 | assert question['turn_id'] == answer['turn_id']
98 |
99 | idx = question['turn_id']
100 | _qas = {'turn_id': idx,
101 | 'question': question['input_text'],
102 | 'answer': answer['input_text']}
103 | if idx in additional_answers:
104 | _qas['additional_answers'] = additional_answers[idx]
105 |
106 | _qas['annotated_question'] = self.process(nlp(pre_proc(question['input_text'])))
107 | _qas['annotated_answer'] = self.process(nlp(pre_proc(answer['input_text'])))
108 | _qas['raw_answer'] = answer['input_text']
109 | _qas['answer_span_start'] = answer['span_start']
110 | _qas['answer_span_end'] = answer['span_end']
111 |
112 | start = answer['span_start']
113 | end = answer['span_end']
114 | chosen_text = _datum['context'][start: end].lower()
115 | while len(chosen_text) > 0 and chosen_text[0] in string.whitespace:
116 | chosen_text = chosen_text[1:]
117 | start += 1
118 | while len(chosen_text) > 0 and chosen_text[-1] in string.whitespace:
119 | chosen_text = chosen_text[:-1]
120 | end -= 1
121 | input_text = _qas['answer'].strip().lower()
122 | if input_text in chosen_text:
123 | p = chosen_text.find(input_text)
124 | _qas['answer_span'] = self.find_span(_datum['raw_context_offsets'],
125 | start + p, start + p + len(input_text))
126 | else:
127 | _qas['answer_span'] = self.find_span_with_gt(_datum['context'],
128 | _datum['raw_context_offsets'], input_text)
129 | long_question = ''
130 | for j in range(i - 2, i + 1):
131 | if j < 0:
132 | continue
133 | long_question += ' ' + datum['questions'][j]['input_text']
134 | if j < i:
135 | long_question += ' ' + datum['answers'][j]['input_text']
136 |
137 | long_question = long_question.strip()
138 | nlp_long_question = nlp(long_question)
139 | _qas['context_features'] = feature_gen(nlp_context, nlp_long_question)
140 |
141 | _datum['qas'].append(_qas)
142 | data.append(_datum)
143 |
144 | # build vocabulary
145 | if dataset_label == 'train':
146 | print('Build vocabulary from training data...')
147 | contexts = [_datum['annotated_context']['word'] for _datum in data]
148 |             qas = [qa['annotated_question']['word'] + qa['annotated_answer']['word'] for _datum in data for qa in _datum['qas']]
149 | self.train_vocab = self.build_vocab(contexts, qas)
150 | self.train_char_vocab = self.build_char_vocab(self.train_vocab)
151 |
152 | print('Getting word ids...')
153 | w2id = {w: i for i, w in enumerate(self.train_vocab)}
154 | c2id = {c: i for i, c in enumerate(self.train_char_vocab)}
155 | for _datum in data:
156 | _datum['annotated_context']['wordid'] = token2id_sent(_datum['annotated_context']['word'], w2id, unk_id = 1, to_lower = False)
157 | _datum['annotated_context']['charid'] = char2id_sent(_datum['annotated_context']['word'], c2id, unk_id = 1, to_lower = False)
158 | for qa in _datum['qas']:
159 | qa['annotated_question']['wordid'] = token2id_sent(qa['annotated_question']['word'], w2id, unk_id = 1, to_lower = False)
160 | qa['annotated_question']['charid'] = char2id_sent(qa['annotated_question']['word'], c2id, unk_id = 1, to_lower = False)
161 | qa['annotated_answer']['wordid'] = token2id_sent(qa['annotated_answer']['word'], w2id, unk_id = 1, to_lower = False)
162 | qa['annotated_answer']['charid'] = char2id_sent(qa['annotated_answer']['word'], c2id, unk_id = 1, to_lower = False)
163 |
164 | if dataset_label == 'train':
165 | # get the condensed dictionary embedding
166 | print('Getting embedding matrix for ' + dataset_label)
167 | embedding = build_embedding(self.glove_file, self.train_vocab, self.glove_dim)
168 | meta = {'vocab': self.train_vocab, 'char_vocab': self.train_char_vocab, 'embedding': embedding.tolist()}
169 | meta_file_name = os.path.join(self.spacyDir, dataset_label + '_meta.msgpack')
170 | print('Saving meta information to', meta_file_name)
171 | with open(meta_file_name, 'wb') as f:
172 | msgpack.dump(meta, f, encoding='utf8')
173 |
174 | dataset['data'] = data
175 |
176 | if dataset_label == 'test':
177 | return dataset
178 |
179 | with open(output_file_name, 'w') as output_file:
180 | json.dump(dataset, output_file, sort_keys=True, indent=4)
181 |
182 | '''
183 |     Load the training vocab, char vocab and embedding matrix saved in train_meta.msgpack
184 | '''
185 | def load_data(self):
186 | print('Load train_meta.msgpack...')
187 | meta_file_name = os.path.join(self.spacyDir, 'train_meta.msgpack')
188 | with open(meta_file_name, 'rb') as f:
189 | meta = msgpack.load(f, encoding='utf8')
190 | embedding = torch.Tensor(meta['embedding'])
191 | self.opt['vocab_size'] = embedding.size(0)
192 | self.opt['vocab_dim'] = embedding.size(1)
193 | self.opt['char_vocab_size'] = len(meta['char_vocab'])
194 | return meta['vocab'], meta['char_vocab'], embedding
195 |
196 | def build_vocab(self, contexts, qas): # vocabulary will also be sorted accordingly
197 | counter_c = Counter(w for doc in contexts for w in doc)
198 | counter_qa = Counter(w for doc in qas for w in doc)
199 | counter = counter_c + counter_qa
200 | vocab = sorted([t for t in counter_qa if t in self.glove_vocab], key=counter_qa.get, reverse=True)
201 | vocab += sorted([t for t in counter_c.keys() - counter_qa.keys() if t in self.glove_vocab],
202 | key=counter.get, reverse=True)
203 | total = sum(counter.values())
204 | matched = sum(counter[t] for t in vocab)
205 | print('vocab {1}/{0} OOV {2}/{3} ({4:.4f}%)'.format(
206 | len(counter), len(vocab), (total - matched), total, (total - matched) / total * 100))
207 |         vocab.insert(0, "<PAD>")
208 |         vocab.insert(1, "<UNK>")
209 |         vocab.insert(2, "<Q>")
210 |         vocab.insert(3, "<A>")
211 | return vocab
212 |
213 | def build_char_vocab(self, words):
214 | counter = Counter(c for w in words for c in w)
215 | print('All characters: {0}'.format(len(counter)))
216 | char_vocab = [c for c, cnt in counter.items() if cnt > 3]
217 | print('Occurrence > 3 characters: {0}'.format(len(char_vocab)))
218 |
219 |         char_vocab.insert(0, "<PAD>")
220 |         char_vocab.insert(1, "<UNK>")
221 |         char_vocab.insert(2, "<STA>")
222 |         char_vocab.insert(3, "<END>")
223 | return char_vocab
224 |
225 | def _str(self, s):
226 | """ Convert PTB tokens to normal tokens """
227 | if (s.lower() == '-lrb-'):
228 | s = '('
229 | elif (s.lower() == '-rrb-'):
230 | s = ')'
231 | elif (s.lower() == '-lsb-'):
232 | s = '['
233 | elif (s.lower() == '-rsb-'):
234 | s = ']'
235 | elif (s.lower() == '-lcb-'):
236 | s = '{'
237 | elif (s.lower() == '-rcb-'):
238 | s = '}'
239 | return s
240 |
241 | def process(self, parsed_text):
242 | output = {'word': [],
243 | 'lemma': [],
244 | 'pos': [],
245 | 'pos_id': [],
246 | 'ent': [],
247 | 'ent_id': [],
248 | 'offsets': [],
249 | 'sentences': []}
250 |
251 | for token in parsed_text:
252 | #[(token.text,token.idx) for token in parsed_sentence]
253 | output['word'].append(self._str(token.text))
254 | pos = token.tag_
255 | output['pos'].append(pos)
256 | output['pos_id'].append(token2id(pos, POS, 0))
257 |
258 | ent = 'O' if token.ent_iob_ == 'O' else (token.ent_iob_ + '-' + token.ent_type_)
259 | output['ent'].append(ent)
260 | output['ent_id'].append(token2id(ent, ENT, 0))
261 |
262 | output['lemma'].append(token.lemma_ if token.lemma_ != '-PRON-' else token.text.lower())
263 | output['offsets'].append((token.idx, token.idx + len(token.text)))
264 |
265 | word_idx = 0
266 | for sent in parsed_text.sents:
267 | output['sentences'].append((word_idx, word_idx + len(sent)))
268 | word_idx += len(sent)
269 |
270 | assert word_idx == len(output['word'])
271 | return output
272 |
273 | '''
274 | offsets based on raw_text
275 |     this solves the mismatch where raw_text contains "a-b" but the parsed text contains "a - b"
276 | '''
277 | def get_raw_context_offsets(self, words, raw_text):
278 | raw_context_offsets = []
279 | p = 0
280 | for token in words:
281 |             while p < len(raw_text) and re.match(r'\s', raw_text[p]):
282 | p += 1
283 | if raw_text[p:p + len(token)] != token:
284 | print('something is wrong! token', token, 'raw_text:', raw_text)
285 |
286 | raw_context_offsets.append((p, p + len(token)))
287 | p += len(token)
288 |
289 | return raw_context_offsets
290 |
291 | def normalize_answer(self, s):
292 |         """Lower text and remove punctuation, articles and extra whitespace."""
293 |
294 | def remove_articles(text):
295 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
296 | return re.sub(regex, ' ', text)
297 |
298 | def white_space_fix(text):
299 | return ' '.join(text.split())
300 |
301 | def remove_punc(text):
302 | exclude = set(string.punctuation)
303 | return ''.join(ch for ch in text if ch not in exclude)
304 |
305 | def lower(text):
306 | return text.lower()
307 |
308 | return white_space_fix(remove_articles(remove_punc(lower(s))))
309 |
310 |
311 | # find the word id start and stop
312 | def find_span_with_gt(self, context, offsets, ground_truth):
313 | best_f1 = 0.0
314 | best_span = (len(offsets) - 1, len(offsets) - 1)
315 | gt = self.normalize_answer(pre_proc(ground_truth)).split()
316 |
317 | ls = [i for i in range(len(offsets)) if context[offsets[i][0]:offsets[i][1]].lower() in gt]
318 |
319 | for i in range(len(ls)):
320 | for j in range(i, len(ls)):
321 | pred = self.normalize_answer(pre_proc(context[offsets[ls[i]][0]: offsets[ls[j]][1]])).split()
322 | common = Counter(pred) & Counter(gt)
323 | num_same = sum(common.values())
324 | if num_same > 0:
325 | precision = 1.0 * num_same / len(pred)
326 | recall = 1.0 * num_same / len(gt)
327 | f1 = (2 * precision * recall) / (precision + recall)
328 | if f1 > best_f1:
329 | best_f1 = f1
330 | best_span = (ls[i], ls[j])
331 | return best_span
332 |
333 |
334 | # find the word id start and stop
335 | def find_span(self, offsets, start, end):
336 | start_index = -1
337 | end_index = -1
338 | for i, offset in enumerate(offsets):
339 | if (start_index < 0) or (start >= offset[0]):
340 | start_index = i
341 | if (end_index < 0) and (end <= offset[1]):
342 | end_index = i
343 | return (start_index, end_index)
344 |
--------------------------------------------------------------------------------
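When the annotated answer text cannot be found verbatim inside the selected span, `find_span_with_gt` falls back to token-level F1: it collects candidate token positions whose surface form occurs in the normalized ground truth, then scores every candidate (start, end) pair by bag-of-words F1 and keeps the best. A self-contained sketch of that scoring idea on a toy context (simplified: no punctuation or article stripping):

```python
from collections import Counter

def best_f1_span(context_tokens, gt_tokens):
    """Return the (start, end) token span of the context maximizing F1 vs. gt_tokens."""
    best_f1, best_span = 0.0, (len(context_tokens) - 1, len(context_tokens) - 1)
    # candidate endpoints: positions whose token occurs in the ground truth
    ls = [i for i, tok in enumerate(context_tokens) if tok.lower() in gt_tokens]
    for a in range(len(ls)):
        for b in range(a, len(ls)):
            pred = [t.lower() for t in context_tokens[ls[a]:ls[b] + 1]]
            common = Counter(pred) & Counter(gt_tokens)
            num_same = sum(common.values())
            if num_same > 0:
                p = num_same / len(pred)
                r = num_same / len(gt_tokens)
                f1 = 2 * p * r / (p + r)
                if f1 > best_f1:
                    best_f1, best_span = f1, (ls[a], ls[b])
    return best_span

ctx = 'The cat sat on the mat near the door'.split()
print(best_f1_span(ctx, ['on', 'the', 'mat']))  # -> (3, 5), i.e. "on the mat"
```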
/SDNet/code/SDNet/Utils/Constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | PAD_WORD_ID = 0  # padding token id in the word vocabulary
5 | UNK_WORD_ID = 1  # unknown / out-of-vocabulary token id
6 | END_WORD_ID = 2  # end-of-sequence token id
7 |
8 | PAD_CHAR = 261   # character ids follow the ELMo character-encoder convention
9 | BOW_CHAR = 259   # begin-of-word character id
10 | EOW_CHAR = 260   # end-of-word character id
11 |
12 |
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/GeneralUtils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import math
5 | import re
6 | from Utils.Constants import *
7 | import spacy
8 | import torch
9 | import torch.nn.functional as F
10 | import unicodedata
11 | import sys
12 | from torch.autograd import Variable
13 | nlp = spacy.load('en', parser = False)
14 |
15 | # normalize sentence
16 | def normalize_text(text):
17 | return unicodedata.normalize('NFD', text)
18 |
19 | def space_extend( matchobj):
20 | return ' ' + matchobj.group(0) + ' '
21 |
22 | # get rid of punctuation stuff and stripping
23 | def pre_proc(text):
24 | text = re.sub(u'-|\u2010|\u2011|\u2012|\u2013|\u2014|\u2015|%|\[|\]|:|\(|\)|/|\t', space_extend, text)
25 | text = text.strip(' \n')
26 |     text = re.sub(r'\s+', ' ', text)
27 | return text
28 |
29 | # get a set of vocabulary
30 | def load_glove_vocab(file, wv_dim, to_lower = True):
31 | glove_vocab = set()
32 | print('Loading glove vocabulary from ' + file)
33 | lineCnt = 0
34 | with open(file, encoding = 'utf-8') as f:
35 | for line in f:
36 | # delete!!!
37 | #if lineCnt == 20000:
38 | # print('delete!')
39 | # break
40 |
41 | lineCnt = lineCnt + 1
42 | if lineCnt % 100000 == 0:
43 | print('.', end = '',flush=True)
44 | elems = line.split()
45 | token = normalize_text(''.join(elems[0:-wv_dim]))
46 | if to_lower:
47 | token = token.lower()
48 | glove_vocab.add(token)
49 |
50 | print('\n')
51 | print('%d words loaded from Glove\n' % len(glove_vocab))
52 | return glove_vocab
53 |
54 | def token2id(docs, vocab, unk_id=None):
55 | w2id = {w: i for i, w in enumerate(vocab)}
56 | ids = [[w2id[w] if w in w2id else unk_id for w in doc] for doc in docs]
57 | return ids
58 |
59 | def char2id(docs, char_vocab, unk_id=None):
60 | c2id = {c: i for i, c in enumerate(char_vocab)}
61 |     ids = [[[c2id["<STA>"]] + [c2id[c] if c in c2id else unk_id for c in w] + [c2id["<END>"]] for w in doc] for doc in docs]
62 | return ids
63 |
64 | def removeInvalidChar(sentence):
65 | ordId = list(sentence.encode('utf-8', errors='ignore'))
66 | ordId = [x for x in ordId if x >= 0 and x < 256]
67 | return ''.join([chr(x) for x in ordId])
68 |
69 | def makeVariable(x, use_cuda):
70 | if use_cuda:
71 | x = x.pin_memory()
72 | #return Variable(x.cuda(async = True), requires_grad = False)
73 | return Variable(x.cuda(non_blocking = True), requires_grad = False)
74 | else:
75 | return Variable(x, requires_grad = False)
76 |
77 | '''
78 | Input:
79 | nlp is an instance of spacy
80 | sentence is a string
81 |
82 | Output:
83 | A list of tokens, entity and POS tags
84 | '''
85 | def spacyTokenize(sentence, vocab_ent=None, vocab_tag=None):
86 | sentence = sentence.lower()
87 | sentence = pre_proc(sentence)
88 | raw_tokens = nlp(sentence)
89 | tokens = [normalize_text(token.text) for token in raw_tokens if not token.is_punct | token.is_space]
90 | ent = None
91 | if vocab_ent is not None:
92 | ent = [token2id(token.ent_type_, vocab_ent) + 1 for token in raw_tokens if not token.is_punct | token.is_space]
93 |
94 | tag = None
95 | if vocab_tag is not None:
96 | tag = [token2id(token.tag_, vocab_tag) + 1 for token in raw_tokens if not token.is_punct | token.is_space]
97 |
98 | return tokens, ent, tag
99 |
--------------------------------------------------------------------------------
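`pre_proc` pads hyphens, brackets, slashes and a few other separators with spaces so that spaCy's tokenization lines up with the raw text, then collapses repeated whitespace; `normalize_text` applies Unicode NFD normalization. A small sketch of the expected behavior, assuming the spaCy `en` model is installed so the module import succeeds:

```python
from Utils.GeneralUtils import pre_proc, normalize_text

print(pre_proc('state-of-the-art (QA) systems'))
# -> 'state - of - the - art ( QA ) systems'

print(len('café'), len(normalize_text('café')))
# -> 4 5  (NFD splits the accented character into base letter + combining mark)
```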
/SDNet/code/SDNet/Utils/Timing.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | from datetime import datetime
5 |
6 | timeelapsed = {}
7 | startTime = {}
8 | endTime = {}
9 |
10 | def timerstart(name):
11 | startTime[name] = datetime.now()
12 |
13 | def timerstop(name):
14 | endTime[name] = datetime.now()
15 |     if name not in timeelapsed:
16 | timeelapsed[name] = endTime[name] - startTime[name]
17 | else:
18 | timeelapsed[name] += endTime[name] - startTime[name]
19 |
20 | def timerreport():
21 | total = 0
22 | for name in timeelapsed:
23 | total += timeelapsed[name].total_seconds()
24 | print('')
25 | print('----------------Timer Report----------------------')
26 | for name, value in sorted(timeelapsed.items(), key = lambda item: -item[1].total_seconds()):
27 | print('%s: used time %s, %f%% ' % ('{:20}'.format(name), str(value).split('.')[0], value.total_seconds() / total * 100.0))
28 | print('--------------------------------------------------')
29 | print('')
30 |
--------------------------------------------------------------------------------
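`Timing.py` keeps a global registry of named stopwatches: repeated start/stop calls under the same name accumulate elapsed time, and `timerreport` prints per-timer totals sorted by their share of overall time. A minimal usage sketch (import path assumes it is run from the SDNet source root):

```python
import time
from Utils.Timing import timerstart, timerstop, timerreport

for _ in range(3):
    timerstart('forward')
    time.sleep(0.01)      # stand-in for a model forward pass
    timerstop('forward')  # accumulates under the same name

timerstart('backward')
time.sleep(0.02)
timerstop('backward')

timerreport()  # prints each timer's total and percentage, largest first
```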
/SDNet/code/SDNet/Utils/__pycache__/Arguments.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/Arguments.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/CoQAPreprocess.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/CoQAPreprocess.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/CoQAUtils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/CoQAUtils.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/Constants.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/Constants.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/GeneralUtils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/GeneralUtils.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/conf:
--------------------------------------------------------------------------------
1 | CoQA_TRAIN_FILE data/coqa-train-v1.0.json
2 | CoQA_DEV_FILE data/coqa-dev-v1.0.json
3 |
4 | PREV_ANS 2
5 | PREV_QUES 2
6 |
7 | DROPOUT 0.3
8 | my_dropout_p 0.3
9 | VARIATIONAL_DROPOUT
10 |
11 | BERT
12 | BERT_LARGE
13 | dropout_emb 0.4
14 |
15 | LOCK_BERT
16 | BERT_LINEAR_COMBINE
17 | BERT_tokenizer_file bert-base-cased/bert-base-cased-vocab.txt
18 | BERT_model_file bert-base-cased/
19 | BERT_large_tokenizer_file bert-large-uncased/bert-large-uncased-vocab.txt
20 | BERT_large_model_file bert-large-uncased/
21 |
22 | SEED 1033
23 | SPACY_FEATURE
24 | CONTEXT_RNN_HIDDEN_DIM 300
25 |
26 | MAX_WORD_PER_SENTENCE 30
27 | INIT_WORD_EMBEDDING_FILE ../glove/glove.840B.300d.txt
28 | MINI_BATCH 32
29 | EPOCH 30
30 |
31 | QUES_SELF_ATTN
32 | max_len 15
33 | concat_rnn False
34 | grad_clipping 10
35 | do_seq_dropout
36 | tune_partial 1000
37 | embedding_dim 300
38 | prealign_hidden 300
39 | flow_hidden_size 300
40 | query_self_attn_hidden_size 300
41 | pos_dim 12
42 | ent_dim 8
43 | hidden_size 125
44 | deep_att_hidden_size_per_abstr 250
45 | deep_inter_att_use_CoVe 1
46 | in_rnn_layers 2
47 | highlvl_hidden_size 125
48 | question_high_lvl_rnn_layers 1
49 | char_emb_size 8
50 | char_hidden_size 50
--------------------------------------------------------------------------------
/SDNet/code/SDNet/main.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import argparse
5 | import os
6 | import sys
7 | import torch
8 | from Models.SDNetTrainer import SDNetTrainer
9 | from Utils.Arguments import Arguments
10 |
11 | opt = None
12 |
13 | parser = argparse.ArgumentParser(description='SDNet')
14 | parser.add_argument('command', help='Command: train')
15 | parser.add_argument('conf_file', help='Path to conf file.')
16 |
17 | cmdline_args = parser.parse_args()
18 | command = cmdline_args.command
19 | conf_file = cmdline_args.conf_file
20 | conf_args = Arguments(conf_file)
21 | opt = conf_args.readArguments()
22 | opt['cuda'] = torch.cuda.is_available()
23 | opt['confFile'] = conf_file
24 | opt['datadir'] = os.path.dirname(conf_file) # conf_file specifies where the data folder is
25 |
26 | for key,val in cmdline_args.__dict__.items():
27 | if val is not None and key not in ['command', 'conf_file']:
28 | opt[key] = val
29 |
30 | model = SDNetTrainer(opt)
31 |
32 | print('Selected command: ' + command)
33 | model.train()
34 |
--------------------------------------------------------------------------------
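As the argument parser indicates, `main.py` is launched with a command and a conf path, e.g. something like `python main.py train conf` from the SDNet source root. Note that `opt['datadir']` is derived from the conf file's directory, so relative paths in the conf such as `data/coqa-train-v1.0.json` are resolved from wherever the conf file lives.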
/SDNet/code/preprocess/prepro.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | def convert(in_path, out_path):
4 |     """Convert a QuAC v0.2 file into CoQA format (first paragraph of each article)."""
5 |     with open(in_path, 'r') as f:
6 |         quacdata = json.load(f)
7 |     data = []
8 |     for article in quacdata['data']:
9 |         para = article['paragraphs'][0]
10 |         temp = {}
11 |         temp['source'] = 'wikipedia'
12 |         temp['filename'] = article['title'] + '.txt'
13 |         temp['name'] = temp['filename']
14 |         temp['id'] = para['id']
15 |         # QuAC appends the 13-character sentinel ' CANNOTANSWER' to every context; strip it
16 |         temp['story'] = para['context'][:-13]
17 |         temp['questions'] = []
18 |         temp['answers'] = []
19 |         for turn, qa in enumerate(para['qas'], start=1):
20 |             temp['questions'].append({'turn_id': turn, 'input_text': qa['question']})
21 |             tempa = {'turn_id': turn,
22 |                      'span_start': qa['orig_answer']['answer_start'],
23 |                      'span_end': qa['orig_answer']['answer_start'] + len(qa['orig_answer']['text'])}
24 |             answertext = qa['orig_answer']['text']
25 |             if answertext == 'CANNOTANSWER':  # unanswerable turns get no span
26 |                 answertext = 'unknown'
27 |                 tempa['span_start'] = -1
28 |                 tempa['span_end'] = -1
29 |             tempa['span_text'] = answertext
30 |             if qa['yesno'] == 'y':
31 |                 tempa['input_text'] = 'Yes'
32 |             elif qa['yesno'] == 'n':
33 |                 tempa['input_text'] = 'No'
34 |             else:
35 |                 tempa['input_text'] = answertext
36 |             temp['answers'].append(tempa)
37 |         data.append(temp)
38 |     output = {'data': data, 'version': '1.0'}
39 |     with open(out_path, 'w') as outf:
40 |         json.dump(output, outf, indent=2)
41 |
42 | convert('train_v0.2.json', 'quac_train_processed.json')
43 | convert('val_v0.2.json', 'quac_val_processed.json')
44 |
--------------------------------------------------------------------------------
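This conversion script expects the QuAC v0.2 files `train_v0.2.json` and `val_v0.2.json` in the working directory and writes `quac_train_processed.json` and `quac_val_processed.json` in CoQA format; pointing `CoQA_TRAIN_FILE` and `CoQA_DEV_FILE` in the conf at these outputs lets the SDNet pipeline train on QuAC.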
/figure/1:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/figure/QA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/QA.png
--------------------------------------------------------------------------------
/figure/coref-F1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/coref-F1.png
--------------------------------------------------------------------------------
/figure/flow-coref.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/flow-coref.png
--------------------------------------------------------------------------------
/figure/quac.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/quac.png
--------------------------------------------------------------------------------
/figure/task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/task.png
--------------------------------------------------------------------------------