├── BiDAF ├── BiDAFF++_with_glove+bert.jsonnet ├── BiDAFF++_with_glove+elmo.jsonnet ├── Figures │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── Arch.png │ ├── baseline.png │ ├── enhenced.png │ └── enhenced_elmo.png ├── README.md ├── bert_token_embedder.py └── wordpiece_indexer.py ├── FlowQA_Attention ├── README.md ├── code │ ├── QA_model │ │ ├── README.md │ │ ├── detail_model.py │ │ ├── layers.py │ │ ├── model_QuAC.py │ │ └── utils.py │ ├── README.md │ ├── general_utils.py │ ├── predict_QuAC.py │ ├── preprocess_QuAC.py │ ├── requirements.txt │ └── train_QuAC.py └── figure │ ├── Attention Over Flow.png │ ├── Readme.me │ ├── Vanilla Flow Operation.png │ └── result.png ├── FlowQA_Coreference └── README.md ├── README.md ├── SDNet ├── Figures │ ├── f1.jpg │ └── loss.jpg ├── README.md └── code │ ├── SDNet │ ├── CONTRIBUTING.md │ ├── LICENSE │ ├── Models │ │ ├── BaseTrainer.py │ │ ├── Bert │ │ │ ├── Bert.py │ │ │ ├── __pycache__ │ │ │ │ ├── Bert.cpython-37.pyc │ │ │ │ ├── modeling.cpython-37.pyc │ │ │ │ └── tokenization.cpython-37.pyc │ │ │ ├── modeling.py │ │ │ ├── optimization.py │ │ │ └── tokenization.py │ │ ├── Layers.py │ │ ├── SDNet.py │ │ ├── SDNetTrainer.py │ │ └── __pycache__ │ │ │ ├── BaseTrainer.cpython-37.pyc │ │ │ ├── Layers.cpython-37.pyc │ │ │ ├── SDNet.cpython-37.pyc │ │ │ └── SDNetTrainer.cpython-37.pyc │ ├── NOTICE.txt │ ├── README.md │ ├── Scratch │ │ └── SQuAD_to_CoQA.py │ ├── Utils │ │ ├── Arguments.py │ │ ├── CoQAPreprocess.py │ │ ├── CoQAUtils.py │ │ ├── Constants.py │ │ ├── GeneralUtils.py │ │ ├── Timing.py │ │ └── __pycache__ │ │ │ ├── Arguments.cpython-37.pyc │ │ │ ├── CoQAPreprocess.cpython-37.pyc │ │ │ ├── CoQAUtils.cpython-37.pyc │ │ │ ├── Constants.cpython-37.pyc │ │ │ └── GeneralUtils.cpython-37.pyc │ ├── bert_vocab_files │ │ ├── bert-base-uncased-vocab.txt │ │ └── bert-large-uncased-vocab.txt │ ├── conf │ └── main.py │ └── preprocess │ └── prepro.py └── figure ├── 1 ├── QA.png ├── coref-F1.png ├── flow-coref.png ├── quac.png └── task.png /BiDAF/BiDAFF++_with_glove+bert.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "type": "quac", 4 | "lazy": true, 5 | "num_context_answers": 2, 6 | 7 | "token_indexers": { 8 | "glove": { 9 | "type": "single_id", 10 | "lowercase_tokens": false 11 | }, 12 | "bert": { 13 | "type": "bert-pretrained", 14 | "pretrained_model": "bert-large-cased", 15 | "do_lowercase": false, 16 | "use_starting_offsets": true, 17 | "truncate_long_sequences": false 18 | }, 19 | "token_characters": { 20 | "type": "characters", 21 | "min_padding_length": 3, 22 | "character_tokenizer": { 23 | "byte_encoding": "utf-8", 24 | "end_tokens": [ 25 | 260 26 | ], 27 | "start_tokens": [ 28 | 259 29 | ] 30 | } 31 | } 32 | } 33 | }, 34 | "iterator": { 35 | "type": "bucket", 36 | "batch_size": 10, 37 | "max_instances_in_memory": 1000, 38 | "sorting_keys": [ 39 | [ 40 | "question", 41 | "num_fields" 42 | ], 43 | [ 44 | "passage", 45 | "num_tokens" 46 | ] 47 | ] 48 | }, 49 | "model": { 50 | "type": "dialog_qa", 51 | "dropout": 0.2, 52 | "initializer": [], 53 | "marker_embedding_dim": 10, 54 | "num_context_answers": 2, 55 | "phrase_layer": { 56 | "type": "gru", 57 | "bidirectional": true, 58 | "hidden_size": 100, 59 | "input_size": 1472, // glove (300) + bert (1024) + cnn (128) + num_context_answers (2) * marker_embedding_dim (10) 60 | "num_layers": 1 61 | }, 62 | "residual_encoder": { 63 | "type": "gru", 64 | "bidirectional": true, 65 | "hidden_size": 100, 66 | "input_size": 200, 67 | "num_layers": 1 68 | }, 69 | 
"span_end_encoder": { 70 | "type": "gru", 71 | "bidirectional": true, 72 | "hidden_size": 100, 73 | "input_size": 400, 74 | "num_layers": 1 75 | }, 76 | "span_start_encoder": { 77 | "type": "gru", 78 | "bidirectional": true, 79 | "hidden_size": 100, 80 | "input_size": 200, 81 | "num_layers": 1 82 | }, 83 | 84 | "text_field_embedder": { 85 | "allow_unmatched_keys": true, 86 | "embedder_to_indexer_map": { 87 | "glove": ["glove"], 88 | "bert": ["bert", "bert-offsets"], 89 | "token_characters": ["token_characters"] 90 | }, 91 | "token_embedders": { 92 | "glove": { 93 | "type": "embedding", 94 | "embedding_dim": 300, 95 | "pretrained_file": "http://nlp.stanford.edu/data/glove.840B.300d.zip", 96 | "trainable": true 97 | }, 98 | "bert": { 99 | "type": "bert-pretrained", 100 | "pretrained_model": "bert-large-cased" 101 | }, 102 | "token_characters": { 103 | "type": "character_encoding", 104 | "embedding": { 105 | "embedding_dim": 16, 106 | "num_embeddings": 262 107 | }, 108 | "encoder": { 109 | "type": "cnn", 110 | "embedding_dim": 16, 111 | "num_filters": 128, 112 | "ngram_filter_sizes": [3], 113 | "conv_layer_activation": "relu" 114 | } 115 | } 116 | } 117 | } 118 | }, 119 | "train_data_path": "https://s3.amazonaws.com/my89public/quac/train_5000.json", 120 | "validation_data_path": "https://s3.amazonaws.com/my89public/quac/val.json", 121 | "trainer": { 122 | "cuda_device": 0, 123 | "learning_rate_scheduler": { 124 | "type": "reduce_on_plateau", 125 | "factor": 0.5, 126 | "mode": "max", 127 | "patience": 3 128 | }, 129 | "num_epochs": 30, 130 | "optimizer": { 131 | "type": "sgd", 132 | "lr": 0.01, 133 | "momentum": 0.9 134 | }, 135 | "patience": 10, 136 | "validation_metric": "+f1" 137 | }, 138 | "validation_iterator": { 139 | "type": "bucket", 140 | "batch_size": 3, 141 | "max_instances_in_memory": 1000, 142 | "sorting_keys": [ 143 | [ 144 | "question", 145 | "num_fields" 146 | ], 147 | [ 148 | "passage", 149 | "num_tokens" 150 | ] 151 | ] 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /BiDAF/BiDAFF++_with_glove+elmo.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "type": "quac", 4 | "lazy": true, 5 | "num_context_answers": 2, 6 | "token_indexers": { 7 | "glove": { 8 | "type": "single_id", 9 | "lowercase_tokens": false 10 | }, 11 | "elmo": { 12 | "type": "elmo_characters" 13 | }, 14 | "token_characters": { 15 | "type": "characters", 16 | "character_tokenizer": { 17 | "byte_encoding": "utf-8", 18 | "end_tokens": [ 19 | 260 20 | ], 21 | "start_tokens": [ 22 | 259 23 | ] 24 | }, 25 | "min_padding_length": 5 26 | } 27 | } 28 | }, 29 | "iterator": { 30 | "type": "bucket", 31 | "batch_size": 10, 32 | "max_instances_in_memory": 1000, 33 | "sorting_keys": [ 34 | [ 35 | "question", 36 | "num_fields" 37 | ], 38 | [ 39 | "passage", 40 | "num_tokens" 41 | ] 42 | ] 43 | }, 44 | "model": { 45 | "type": "dialog_qa", 46 | "dropout": 0.2, 47 | "initializer": [], 48 | "marker_embedding_dim": 10, 49 | "num_context_answers": 2, 50 | "phrase_layer": { 51 | "type": "gru", 52 | "bidirectional": true, 53 | "hidden_size": 100, 54 | "input_size": 1444, // elmo (1024) + cnn (100) + num_context_answers (2) * marker_embedding_dim (10) 55 | "num_layers": 1 56 | }, 57 | "residual_encoder": { 58 | "type": "gru", 59 | "bidirectional": true, 60 | "hidden_size": 100, 61 | "input_size": 200, 62 | "num_layers": 1 63 | }, 64 | "span_end_encoder": { 65 | "type": "gru", 66 | "bidirectional": true, 67 | 
"hidden_size": 100, 68 | "input_size": 400, 69 | "num_layers": 1 70 | }, 71 | "span_start_encoder": { 72 | "type": "gru", 73 | "bidirectional": true, 74 | "hidden_size": 100, 75 | "input_size": 200, 76 | "num_layers": 1 77 | }, 78 | "text_field_embedder": { 79 | "glove": { 80 | "type": "embedding", 81 | "embedding_dim": 300, 82 | "pretrained_file": "http://nlp.stanford.edu/data/glove.840B.300d.zip", 83 | "trainable": true 84 | }, 85 | "elmo": { 86 | "type": "elmo_token_embedder", 87 | "do_layer_norm": false, 88 | "dropout": 0.2, 89 | "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json", 90 | "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" 91 | }, 92 | "token_characters": { 93 | "type": "character_encoding", 94 | "dropout": 0.2, 95 | "embedding": { 96 | "embedding_dim": 20, 97 | "num_embeddings": 262 98 | }, 99 | "encoder": { 100 | "type": "cnn", 101 | "embedding_dim": 20, 102 | "ngram_filter_sizes": [ 103 | 5 104 | ], 105 | "num_filters": 100 106 | } 107 | } 108 | } 109 | }, 110 | "train_data_path": "https://s3.amazonaws.com/my89public/quac/train_5000.json", 111 | "validation_data_path": "https://s3.amazonaws.com/my89public/quac/val.json", 112 | "trainer": { 113 | "cuda_device": 0, 114 | "learning_rate_scheduler": { 115 | "type": "reduce_on_plateau", 116 | "factor": 0.5, 117 | "mode": "max", 118 | "patience": 3 119 | }, 120 | "num_epochs": 30, 121 | "optimizer": { 122 | "type": "sgd", 123 | "lr": 0.01, 124 | "momentum": 0.9 125 | }, 126 | "patience": 10, 127 | "validation_metric": "+f1" 128 | }, 129 | "validation_iterator": { 130 | "type": "bucket", 131 | "batch_size": 3, 132 | "max_instances_in_memory": 1000, 133 | "sorting_keys": [ 134 | [ 135 | "question", 136 | "num_fields" 137 | ], 138 | [ 139 | "passage", 140 | "num_tokens" 141 | ] 142 | ] 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /BiDAF/Figures/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/1.png -------------------------------------------------------------------------------- /BiDAF/Figures/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/2.png -------------------------------------------------------------------------------- /BiDAF/Figures/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/3.png -------------------------------------------------------------------------------- /BiDAF/Figures/Arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/Arch.png -------------------------------------------------------------------------------- /BiDAF/Figures/baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/baseline.png 
-------------------------------------------------------------------------------- /BiDAF/Figures/enhenced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/enhenced.png -------------------------------------------------------------------------------- /BiDAF/Figures/enhenced_elmo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/BiDAF/Figures/enhenced_elmo.png -------------------------------------------------------------------------------- /BiDAF/README.md: -------------------------------------------------------------------------------- 1 | # Using enhanced BiDAF++ on QuAC 2 | 3 | ## Descriptions 4 | The original BiDAF++ model uses a Char-CNN for character embedding and GloVe for word embedding. It is also equipped with contextualized embeddings and self-attention. In this model, marker embeddings corresponding to previous answer words are used, while question turn numbers are encoded into question embeddings. 5 | 6 | We've used the [AllenNLP](https://github.com/allenai/allennlp) library to modify BiDAF++ and trained the enhanced models on the [QuAC](https://quac.ai/) dataset. 7 | 8 | ## Architecture of the enhanced BiDAF++ 9 | (architecture figure; see Figures/Arch.png) 10 | 11 |
12 | 13 | ### Embedding layers from BiDAF++ 14 | 15 | We aimed to apply ELMo or BERT embeddings on top of the original embeddings. 16 | 17 | 18 | Let ELMok be an ELMo vector, let xk denote an original embedding vector, and let hk denote a contextual vector generated by the bi-LSTM layer. Then we can use the ELMo-enhanced representation [xk; ELMok] instead of xk, and likewise replace hk with [hk; ELMok] (e.g., concatenating a 300-d GloVe vector with a 1024-d ELMo vector yields a 1324-d representation). 19 | We do the same thing when using BERT embeddings. 20 | 21 | ELMo ver: [options_file](https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json) 22 | and [weight_file](https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5) 23 | BERT ver: [bert-large-cased](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt) 24 | ### Self-Attention layers from BiDAF++ 25 | 26 | First, the input is passed through a linear layer with ReLU activations. Then, it passes through a bi-directional GRU or LSTM. After that, we apply an attention mechanism between the context and itself, as follows: 27 | 28 | Let hi and hj be the vectors for context words i and j, and let nc be the length of the context. Then compute the attention between the two words as: 29 | aij = w1 · hi + w2 · hj + w3 · (hi ⊙ hj) 30 | 31 |
32 | 33 | where w1, w2, and w3 are learned vectors and ⊙ is element-wise multiplication. In this case, we set aij = −inf if i = j. Then, compute an attended vector ci for each context token as: 34 | ci = Σj pij hj, where pij = exp(aij) / Σk exp(aik) 35 | 36 |
37 | 38 | Compute a context-to-context vector tc: 39 | tc = Σi pi hi, where pi = exp(maxj aij) / Σk exp(maxj akj) 40 | 41 |
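Putting the three equations above together (including the output concatenation described in the next paragraph), here is a minimal PyTorch sketch of this self-attention block. It is an illustration only: the tensor shapes and variable names are our assumptions, not the implementation actually used here, which is AllenNLP's `dialog_qa` model.

```python
import torch
import torch.nn.functional as F

def self_attention(h, w1, w2, w3):
    """Self-attention over the context, following the equations above.

    h          -- (n_c, d) matrix of context vectors, one row per token
    w1, w2, w3 -- (d,) learned vectors
    Returns the attended vectors c (n_c, d) and the context-to-context
    vector tc (d,).
    """
    # a_ij = w1·h_i + w2·h_j + w3·(h_i ⊙ h_j), with a_ij = -inf when i == j
    a = (h @ w1).unsqueeze(1) + (h @ w2).unsqueeze(0) + (h * w3) @ h.t()
    a = a.masked_fill(torch.eye(h.size(0), dtype=torch.bool), float("-inf"))
    c = F.softmax(a, dim=1) @ h                 # attended vector c_i per token
    tc = F.softmax(a.max(dim=1)[0], dim=0) @ h  # context-to-context vector
    return c, tc

# Toy usage: 12 context tokens with d = 200. The layer output concatenates
# [h_i; c_i; h_i ⊙ c_i; tc ⊙ c_i], as described in the next paragraph.
h = torch.randn(12, 200)
w1, w2, w3 = torch.randn(200), torch.randn(200), torch.randn(200)
c, tc = self_attention(h, w1, w2, w3)
out = torch.cat([h, c, h * c, tc * c], dim=1)   # shape (12, 800)
```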
42 | 43 | The output of this layer is built by concatenating hi, ci, hi ⊙ ci, and tc ⊙ ci, 44 | and is additionally summed with the input before the ReLU. 45 | ### Modification 46 | 47 | We modified the wordpiece indexer and the BERT token embedder in order to train the model with BERT embeddings, because the original two files cannot handle sequences with more than 500 tokens. The modified files do not affect the training process of models with other embeddings. 48 | 49 | ## Platform and tools 50 | ### Launching a Deep Learning VM on GCP 51 | - 2 vCPUs with 13GB memory 52 | - 1 NVIDIA Tesla V100 GPU 53 | - PyTorch 1.0 + fast.ai 1.0 (CUDA 10.0) 54 | - Install NVIDIA GPU driver automatically on first startup 55 | 56 | ### Setting up an AllenNLP virtual environment 57 | 58 | 1. Create a Conda environment with Python 3.6 59 | 60 | ```bash 61 | conda create -n allennlp python=3.6 62 | ``` 63 | 64 | 2. Activate the Conda environment. You will need to activate the Conda environment in each terminal in which you want to use AllenNLP. 65 | 66 | ```bash 67 | source activate allennlp 68 | ``` 69 | 70 | 71 | ## Usage 72 | 73 | ### Dataset 74 | [Train data](https://s3.amazonaws.com/my89public/quac/train_5000.json) 75 | [Validation data](https://s3.amazonaws.com/my89public/quac/val.json) 76 | 77 | ### Training 78 | Train an enhanced model on the QuAC dataset (training and validation sets): 79 | ``` 80 | nohup allennlp train <config_file> --serialization-dir <output_dir> > output.log & 81 | ``` 82 | As the [QuAC](https://arxiv.org/pdf/1808.07036.pdf) paper mentions, questions in the training set have one reference answer, while validation and test questions have five references each, which is why the F1 score on the validation set is higher than that on the training set. 83 | 84 | The best model's configuration is [here](https://github.com/deepnlp-cs599-usc/quac/blob/master/BiDAF/BiDAFF%2B%2B_with_glove%2Bbert.jsonnet). 85 | ## Results 86 | ### F1 score 87 | Enhanced model by BERT 88 | (F1 curves; see Figures/enhenced.png) 89 | 90 |
91 | Enhanced model by ELMo 92 | (F1 curves; see Figures/enhenced_elmo.png) 93 | 94 |
95 | Baseline model 96 | (F1 curves; see Figures/baseline.png) 97 | 98 |
99 | 100 | ### Performance on baseline and enhanced models 101 | 102 | | | F1 score on training set | F1 score on validation set | 103 | | --- | --- | --- | 104 | | Baseline model | 49.40 | 55.59 | 105 | | Enhanced by ELMo | 49.40 | **58.40** | 106 | | Enhanced by BERT | **53.05** | **60.04** | 107 | 108 | ## Related works 109 | 110 | * [Bidirectional attention flow for machine comprehension](https://arxiv.org/abs/1611.01603) by Minjoon Seo et al. 111 | * [Simple and effective multi-paragraph reading comprehension](https://arxiv.org/abs/1710.10723) by Christopher Clark et al. 112 | * [Deep contextualized word representations](https://arxiv.org/abs/1802.05365) by Matthew E. Peters et al. 113 | * [BERT: Pre-training of deep bidirectional transformers for language understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin et al. 114 | -------------------------------------------------------------------------------- /BiDAF/bert_token_embedder.py: -------------------------------------------------------------------------------- 1 | """ 2 | A ``TokenEmbedder`` which uses one of the BERT models 3 | (https://github.com/google-research/bert) 4 | to produce embeddings. 5 | At its core it uses Hugging Face's PyTorch implementation 6 | (https://github.com/huggingface/pytorch-pretrained-BERT), 7 | so thanks to them! 8 | """ 9 | import logging 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | from pytorch_pretrained_bert.modeling import BertModel 15 | 16 | from allennlp.modules.scalar_mix import ScalarMix 17 | from allennlp.modules.token_embedders.token_embedder import TokenEmbedder 18 | from allennlp.nn import util 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | class BertEmbedder(TokenEmbedder): 24 | """ 25 | A ``TokenEmbedder`` that produces BERT embeddings for your tokens. 26 | Should be paired with a ``BertIndexer``, which produces wordpiece ids. 27 | Most likely you want to use ``PretrainedBertEmbedder`` 28 | for one of the named pretrained models, not this base class. 29 | Parameters 30 | ---------- 31 | bert_model: ``BertModel`` 32 | The BERT model being wrapped. 33 | top_layer_only: ``bool``, optional (default = ``False``) 34 | If ``True``, then only return the top layer instead of applying the scalar mix. 35 | max_pieces : int, optional (default: 512) 36 | The BERT embedder uses positional embeddings and so has a corresponding 37 | maximum length for its input ids. Assuming the inputs are windowed 38 | and padded appropriately by this length, the embedder will split them into a 39 | large batch, feed them into BERT, and recombine the output as if it were a 40 | longer sequence. 
41 | start_tokens : int, optional (default: 1) 42 | The number of starting special tokens input to BERT (usually 1, i.e., [CLS]) 43 | end_tokens : int, optional (default: 1) 44 | The number of ending tokens input to BERT (usually 1, i.e., [SEP]) 45 | """ 46 | def __init__(self, 47 | bert_model: BertModel, 48 | top_layer_only: bool = False, 49 | max_pieces: int = 512, 50 | start_tokens: int = 1, 51 | end_tokens: int = 1) -> None: 52 | super().__init__() 53 | self.bert_model = bert_model 54 | self.output_dim = bert_model.config.hidden_size 55 | self.max_pieces = max_pieces 56 | self.start_tokens = start_tokens 57 | self.end_tokens = end_tokens 58 | 59 | if not top_layer_only: 60 | self._scalar_mix = ScalarMix(bert_model.config.num_hidden_layers, 61 | do_layer_norm=False) 62 | else: 63 | self._scalar_mix = None 64 | 65 | def get_output_dim(self) -> int: 66 | return self.output_dim 67 | 68 | def forward(self, 69 | input_ids: torch.LongTensor, 70 | offsets: torch.LongTensor = None, 71 | token_type_ids: torch.LongTensor = None) -> torch.Tensor: 72 | """ 73 | Parameters 74 | ---------- 75 | input_ids : ``torch.LongTensor`` 76 | The (batch_size, ..., max_sequence_length) tensor of wordpiece ids. 77 | offsets : ``torch.LongTensor``, optional 78 | The BERT embeddings are one per wordpiece. However it's possible/likely 79 | you might want one per original token. In that case, ``offsets`` 80 | represents the indices of the desired wordpiece for each original token. 81 | Depending on how your token indexer is configured, this could be the 82 | position of the last wordpiece for each token, or it could be the position 83 | of the first wordpiece for each token. 84 | For example, if you had the sentence "Definitely not", and if the corresponding 85 | wordpieces were ["Def", "##in", "##ite", "##ly", "not"], then the input_ids 86 | would be 5 wordpiece ids, and the "last wordpiece" offsets would be [3, 4]. 87 | If offsets are provided, the returned tensor will contain only the wordpiece 88 | embeddings at those positions, and (in particular) will contain one embedding 89 | per token. If offsets are not provided, the entire tensor of wordpiece embeddings 90 | will be returned. 91 | token_type_ids : ``torch.LongTensor``, optional 92 | If an input consists of two sentences (as in the BERT paper), 93 | tokens from the first sentence should have type 0 and tokens from 94 | the second sentence should have type 1. If you don't provide this 95 | (the default BertIndexer doesn't) then it's assumed to be all 0s. 96 | """ 97 | # pylint: disable=arguments-differ 98 | batch_size, full_seq_len = input_ids.size(0), input_ids.size(-1) 99 | initial_dims = list(input_ids.shape[:-1]) 100 | 101 | # The embedder may receive an input tensor that has a sequence length longer than can 102 | # be fit. In that case, we should expect the wordpiece indexer to create padded windows 103 | # of length `self.max_pieces` for us, and have them concatenated into one long sequence. 104 | # E.g., "[CLS] I went to the [SEP] [CLS] to the store to [SEP] ..." 105 | # We can then split the sequence into sub-sequences of that length, and concatenate them 106 | # along the batch dimension so we effectively have one huge batch of partial sentences. 107 | # This can then be fed into BERT without any sentence length issues. Keep in mind 108 | # that the memory consumption can dramatically increase for large batches with extremely 109 | # long sentences. 
110 | needs_split = full_seq_len > self.max_pieces 111 | last_window_size = 0 112 | if needs_split: 113 | # Split the flattened list by the window size, `max_pieces` 114 | split_input_ids = list(input_ids.split(self.max_pieces, dim=-1)) 115 | 116 | # We want all sequences to be the same length, so pad the last sequence 117 | last_window_size = split_input_ids[-1].size(-1) 118 | padding_amount = self.max_pieces - last_window_size 119 | split_input_ids[-1] = F.pad(split_input_ids[-1], pad=[0, padding_amount], value=0) 120 | 121 | # Now combine the sequences along the batch dimension 122 | input_ids = torch.cat(split_input_ids, dim=0) 123 | 124 | if token_type_ids is None: 125 | token_type_ids = torch.zeros_like(input_ids) 126 | 127 | input_mask = (input_ids != 0).long() 128 | 129 | # input_ids may have extra dimensions, so we reshape down to 2-d 130 | # before calling the BERT model and then reshape back at the end. 131 | all_encoder_layers, _ = self.bert_model(input_ids=util.combine_initial_dims(input_ids), 132 | token_type_ids=util.combine_initial_dims(token_type_ids), 133 | attention_mask=util.combine_initial_dims(input_mask)) 134 | all_encoder_layers = torch.stack(all_encoder_layers) 135 | 136 | if needs_split: 137 | # First, unpack the output embeddings into one long sequence again 138 | unpacked_embeddings = torch.split(all_encoder_layers, batch_size, dim=1) 139 | unpacked_embeddings = torch.cat(unpacked_embeddings, dim=2) 140 | 141 | # Next, select indices of the sequence such that it will result in embeddings representing the original 142 | # sentence. To do this, the indices will be the left half of each embedded window sub-sequence. E.g., 143 | # 0 1 2 3 4 5 6 7 8 9 10 11 144 | # "[CLS] I went to the [SEP] [CLS] to the store to [SEP]" 145 | # should now have indices [0, 1, 2, 7, 8]. The remaining code adds the final window indices (so the 146 | # final indices should be [0, 1, 2, 7, 8, 9, 10, 11]) 147 | 148 | full_seq_len = unpacked_embeddings.size(-2) - self.max_pieces 149 | stride = self.max_pieces // 2 150 | select_indices = [0] 151 | select_indices.extend(i for i in range(full_seq_len) 152 | if self.start_tokens - 1 < i % self.max_pieces < stride) 153 | 154 | # Add the final window indices 155 | final_window_size = last_window_size - self.start_tokens + self.end_tokens 156 | select_indices.extend(full_seq_len + i for i in range(self.start_tokens, final_window_size)) 157 | 158 | initial_dims.append(len(select_indices)) 159 | 160 | recombined_embeddings = unpacked_embeddings[:, :, select_indices] 161 | else: 162 | recombined_embeddings = all_encoder_layers 163 | 164 | # Recombine the outputs of all layers 165 | # (layers, batch_size * d1 * ... * dn, sequence_length, embedding_dim) 166 | # recombined = torch.cat(combined, dim=2) 167 | input_mask = (recombined_embeddings != 0).long() 168 | 169 | if self._scalar_mix is not None: 170 | mix = self._scalar_mix(recombined_embeddings, input_mask) 171 | else: 172 | mix = recombined_embeddings[-1] 173 | 174 | # At this point, mix is (batch_size * d1 * ... * dn, sequence_length, embedding_dim) 175 | 176 | if offsets is None: 177 | # Resize to (batch_size, d1, ..., dn, sequence_length, embedding_dim) 178 | dims = initial_dims if needs_split else input_ids.size() 179 | return util.uncombine_initial_dims(mix, dims) 180 | else: 181 | # offsets is (batch_size, d1, ..., dn, orig_sequence_length) 182 | offsets2d = util.combine_initial_dims(offsets) 183 | # now offsets is (batch_size * d1 * ... 
* dn, orig_sequence_length) 184 | range_vector = util.get_range_vector(offsets2d.size(0), 185 | device=util.get_device_of(mix)).unsqueeze(1) 186 | # selected embeddings is also (batch_size * d1 * ... * dn, orig_sequence_length) 187 | selected_embeddings = mix[range_vector, offsets2d] 188 | 189 | return util.uncombine_initial_dims(selected_embeddings, offsets.size()) 190 | 191 | 192 | @TokenEmbedder.register("bert-pretrained") 193 | class PretrainedBertEmbedder(BertEmbedder): 194 | # pylint: disable=line-too-long 195 | """ 196 | Parameters 197 | ---------- 198 | pretrained_model: ``str`` 199 | Either the name of the pretrained model to use (e.g. 'bert-base-uncased'), 200 | or the path to the .tar.gz file with the model weights. 201 | If the name is a key in the list of pretrained models at 202 | https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L41 203 | the corresponding path will be used; otherwise it will be interpreted as a path or URL. 204 | requires_grad : ``bool``, optional (default = False) 205 | If True, compute gradient of BERT parameters for fine tuning. 206 | top_layer_only: ``bool``, optional (default = ``False``) 207 | If ``True``, then only return the top layer instead of applying the scalar mix. 208 | """ 209 | def __init__(self, pretrained_model: str, requires_grad: bool = False, top_layer_only: bool = False) -> None: 210 | model = BertModel.from_pretrained(pretrained_model) 211 | 212 | for param in model.parameters(): 213 | param.requires_grad = requires_grad 214 | 215 | super().__init__(bert_model=model, top_layer_only=top_layer_only) -------------------------------------------------------------------------------- /FlowQA_Attention/README.md: -------------------------------------------------------------------------------- 1 | # FlowQA + Attention Over Flow 2 | 3 | 4 | 5 | ## How we improve FlowQA -- Adding Self-attention 6 | 7 | We noticed that the "flow" operation uses no attention mechanism. We believe attention is worth applying here, because not every turn in the conversation history is equally important: when a new question is posed, it is likely that only a few previous questions are relevant, rather than all of them. By adding attention to the "flow" operation, we can align new representations with the useful previous ones. We call this "attention-over-flow". 8 | 9 | ![image](https://github.com/deepnlp-cs599-usc/quac/blob/master/FlowQA_Attention/figure/Attention%20Over%20Flow.png) 10 | 11 | 12 | ## Experiments 13 | 14 | We conducted experiments on Google Cloud, using a VM with 2 vCPUs, one Tesla K80 GPU with 11 GB of GPU memory, and 60 GB of disk. We use Python 3.6.8 and PyTorch 1.0.1. The batch size is set to 2. The code is based on [momohuang's code on GitHub](https://github.com/momohuang/FlowQA). 15 | 16 | Running the code is the same as for FlowQA, except that "attention over flow" is enabled by default. 17 | 18 | From https://github.com/momohuang/FlowQA we borrow their instructions, as follows. 19 | 20 | Step 1: 21 | perform the following: 22 | 23 | > pip install -r requirements.txt 24 | 25 | to install all dependent Python packages. 
26 | 27 | Step 2: 28 | download necessary files using: 29 | 30 | > ./download.sh 31 | 32 | Step 3: 33 | preprocess the data files using: 34 | 35 | > python preprocess_QuAC.py 36 | 37 | Step 4: 38 | run the training code using: 39 | 40 | > python train_QuAC.py 41 | 42 | To specify not using "attention over flow", run: 43 | 44 | > python train_QuAC.py --flow_attention=0 45 | 46 | 47 | ## Results 48 | 49 | The results show that our attempt slightly improves FlowQA, by 0.1 F1 on the dev set. We would also like to mention that the model converges much faster: only 10 epochs are used instead of 20. 50 | 51 | ![image](https://github.com/deepnlp-cs599-usc/quac/blob/master/FlowQA_Attention/figure/result.png) 52 | 53 | 54 | ## References 55 | 56 | [FlowQA: Grasping flow in history for conversational machine comprehension.](https://arxiv.org/abs/1810.06683) By Huang H Y, Choi E, Yih W. 57 | 58 | [QuAC: Question answering in context.](https://arxiv.org/abs/1808.07036) By Choi E, He H, Iyyer M, et al. 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/QA_model/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/QA_model/model_QuAC.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | 8 | from torch.nn import Parameter 9 | from torch.autograd import Variable 10 | from .utils import AverageMeter 11 | from .detail_model import FlowQA 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class QAModel(object): 17 | """ 18 | High level model that handles initializing the underlying network 19 | architecture, saving, updating examples, and predicting examples. 20 | """ 21 | 22 | def __init__(self, opt, embedding=None, state_dict=None): 23 | # Book-keeping. 24 | self.opt = opt 25 | self.updates = state_dict['updates'] if state_dict else 0 26 | self.eval_embed_transfer = True 27 | self.train_loss = AverageMeter() 28 | 29 | # Building network. 30 | self.network = FlowQA(opt, embedding) 31 | if state_dict: 32 | new_state = set(self.network.state_dict().keys()) 33 | for k in list(state_dict['network'].keys()): 34 | if k not in new_state: 35 | del state_dict['network'][k] 36 | self.network.load_state_dict(state_dict['network']) 37 | 38 | # Building optimizer. 
39 | parameters = [p for p in self.network.parameters() if p.requires_grad] 40 | if opt['optimizer'] == 'sgd': 41 | self.optimizer = optim.SGD(parameters, opt['learning_rate'], 42 | momentum=opt['momentum'], 43 | weight_decay=opt['weight_decay']) 44 | elif opt['optimizer'] == 'adamax': 45 | self.optimizer = optim.Adamax(parameters, 46 | weight_decay=opt['weight_decay']) 47 | elif opt['optimizer'] == 'adadelta': 48 | self.optimizer = optim.Adadelta(parameters, rho=0.95, weight_decay=opt['weight_decay']) 49 | else: 50 | raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer']) 51 | if state_dict: 52 | self.optimizer.load_state_dict(state_dict['optimizer']) 53 | 54 | if opt['fix_embeddings']: 55 | wvec_size = 0 56 | else: 57 | wvec_size = (opt['vocab_size'] - opt['tune_partial']) * opt['embedding_dim'] 58 | self.total_param = sum([p.nelement() for p in parameters]) - wvec_size 59 | 60 | def update(self, batch): 61 | # Train mode 62 | self.network.train() 63 | torch.set_grad_enabled(True) 64 | 65 | # Transfer to GPU 66 | if self.opt['cuda']: 67 | inputs = [e.cuda(non_blocking=True) for e in batch[:9] + batch[17:]] 68 | overall_mask = batch[9].cuda(non_blocking=True) 69 | 70 | answer_s = batch[10].cuda(non_blocking=True) 71 | answer_e = batch[11].cuda(non_blocking=True) 72 | answer_c = batch[12].cuda(non_blocking=True) 73 | else: 74 | inputs = [e for e in batch[:9] + batch[17:]] 75 | overall_mask = batch[9] 76 | 77 | answer_s = batch[10] 78 | answer_e = batch[11] 79 | answer_c = batch[12] 80 | 81 | # Run forward 82 | # output: [batch_size, question_num, context_len], [batch_size, question_num] 83 | score_s, score_e, score_no_answ = self.network(*inputs) 84 | 85 | # Compute loss and accuracies 86 | loss = self.opt['elmo_lambda'] * (self.network.elmo.scalar_mix_0.scalar_parameters[0] ** 2 87 | + self.network.elmo.scalar_mix_0.scalar_parameters[1] ** 2 88 | + self.network.elmo.scalar_mix_0.scalar_parameters[2] ** 2) # ELMo L2 regularization 89 | all_no_answ = (answer_c == 0) 90 | answer_s.masked_fill_(all_no_answ, -100) # ignore_index is -100 in F.cross_entropy 91 | answer_e.masked_fill_(all_no_answ, -100) 92 | 93 | for i in range(overall_mask.size(0)): 94 | q_num = sum(overall_mask[i]) # the true question number for this sampled context 95 | 96 | target_s = answer_s[i, :q_num] # Size: q_num 97 | target_e = answer_e[i, :q_num] 98 | target_c = answer_c[i, :q_num] 99 | target_no_answ = all_no_answ[i, :q_num] 100 | 101 | # single_loss is averaged across q_num 102 | if self.opt['question_normalize']: 103 | single_loss = F.binary_cross_entropy_with_logits(score_no_answ[i, :q_num], target_no_answ.float()) * q_num.item() / 8.0 104 | single_loss = single_loss + F.cross_entropy(score_s[i, :q_num], target_s) * (q_num - sum(target_no_answ)).item() / 7.0 105 | single_loss = single_loss + F.cross_entropy(score_e[i, :q_num], target_e) * (q_num - sum(target_no_answ)).item() / 7.0 106 | else: 107 | single_loss = F.binary_cross_entropy_with_logits(score_no_answ[i, :q_num], target_no_answ.float()) \ 108 | + F.cross_entropy(score_s[i, :q_num], target_s) + F.cross_entropy(score_e[i, :q_num], target_e) 109 | 110 | loss = loss + (single_loss / overall_mask.size(0)) 111 | self.train_loss.update(loss.item(), overall_mask.size(0)) 112 | 113 | # Clear gradients and run backward 114 | self.optimizer.zero_grad() 115 | loss.backward() 116 | 117 | # Clip gradients 118 | torch.nn.utils.clip_grad_norm_(self.network.parameters(), 119 | self.opt['grad_clipping']) 120 | 121 | # Update parameters 122 | self.optimizer.step() 
123 | self.updates += 1 124 | 125 | # Reset any partially fixed parameters (e.g. rare words) 126 | self.reset_embeddings() 127 | self.eval_embed_transfer = True 128 | 129 | def predict(self, batch, No_Ans_Threshold=None): 130 | # Eval mode 131 | self.network.eval() 132 | torch.set_grad_enabled(False) 133 | 134 | # Transfer trained embedding to evaluation embedding 135 | if self.eval_embed_transfer: 136 | self.update_eval_embed() 137 | self.eval_embed_transfer = False 138 | 139 | # Transfer to GPU 140 | if self.opt['cuda']: 141 | inputs = [e.cuda(non_blocking=True) for e in batch[:9] + batch[17:]] 142 | else: 143 | inputs = [e for e in batch[:9] + batch[17:]] 144 | 145 | # Run forward 146 | # output: [batch_size, question_num, context_len], [batch_size, question_num] 147 | score_s, score_e, score_no_answ = self.network(*inputs) 148 | score_s = F.softmax(score_s, dim=2) 149 | score_e = F.softmax(score_e, dim=2) 150 | 151 | # Transfer to CPU/normal tensors for numpy ops 152 | score_s = score_s.data.cpu() 153 | score_e = score_e.data.cpu() 154 | score_no_answ = score_no_answ.data.cpu() 155 | 156 | # Get argmax text spans 157 | text = batch[13] 158 | spans = batch[14] 159 | overall_mask = batch[9] 160 | 161 | predictions, no_ans_scores = [], [] 162 | max_len = self.opt['max_len'] or score_s.size(2) 163 | 164 | for i in range(overall_mask.size(0)): 165 | dialog_pred, dialog_noans = [], [] 166 | 167 | for j in range(overall_mask.size(1)): 168 | if overall_mask[i, j] == 0: # this dialog has ended 169 | break 170 | 171 | dialog_noans.append(score_no_answ[i, j].item()) 172 | if No_Ans_Threshold is not None and score_no_answ[i, j] > No_Ans_Threshold: 173 | dialog_pred.append("CANNOTANSWER") 174 | else: 175 | scores = torch.ger(score_s[i, j], score_e[i, j]) 176 | scores.triu_().tril_(max_len - 1) 177 | scores = scores.numpy() 178 | s_idx, e_idx = np.unravel_index(np.argmax(scores), scores.shape) 179 | 180 | s_offset, e_offset = spans[i][s_idx][0], spans[i][e_idx][1] 181 | dialog_pred.append(text[i][s_offset:e_offset]) 182 | 183 | predictions.append(dialog_pred) 184 | no_ans_scores.append(dialog_noans) 185 | 186 | return predictions, no_ans_scores # list of (list of strings), list of (list of floats) 187 | 188 | # allow the evaluation embedding be larger than training embedding 189 | # this is helpful if we have pretrained word embeddings 190 | def setup_eval_embed(self, eval_embed, padding_idx = 0): 191 | # eval_embed should be a supermatrix of training embedding 192 | self.network.eval_embed = nn.Embedding(eval_embed.size(0), 193 | eval_embed.size(1), 194 | padding_idx = padding_idx) 195 | self.network.eval_embed.weight.data = eval_embed 196 | for p in self.network.eval_embed.parameters(): 197 | p.requires_grad = False 198 | self.eval_embed_transfer = True 199 | 200 | if hasattr(self.network, 'CoVe'): 201 | self.network.CoVe.setup_eval_embed(eval_embed) 202 | 203 | def update_eval_embed(self): 204 | # update evaluation embedding to trained embedding 205 | if self.opt['tune_partial'] > 0: 206 | offset = self.opt['tune_partial'] 207 | self.network.eval_embed.weight.data[0:offset] \ 208 | = self.network.embedding.weight.data[0:offset] 209 | else: 210 | offset = 10 211 | self.network.eval_embed.weight.data[0:offset] \ 212 | = self.network.embedding.weight.data[0:offset] 213 | 214 | def reset_embeddings(self): 215 | # Reset fixed embeddings to original value 216 | if self.opt['tune_partial'] > 0: 217 | offset = self.opt['tune_partial'] 218 | if offset < self.network.embedding.weight.data.size(0): 219 | 
self.network.embedding.weight.data[offset:] \ 220 | = self.network.fixed_embedding 221 | 222 | def get_pretrain(self, state_dict): 223 | own_state = self.network.state_dict() 224 | for name, param in state_dict.items(): 225 | if name not in own_state: 226 | continue 227 | if isinstance(param, Parameter): 228 | param = param.data 229 | try: 230 | own_state[name].copy_(param) 231 | except: 232 | print("Skip", name) 233 | continue 234 | 235 | def save(self, filename, epoch): 236 | params = { 237 | 'state_dict': { 238 | 'network': self.network.state_dict(), 239 | 'optimizer': self.optimizer.state_dict(), 240 | 'updates': self.updates # how many updates 241 | }, 242 | 'config': self.opt, 243 | 'epoch': epoch 244 | } 245 | try: 246 | torch.save(params, filename) 247 | logger.info('model saved to {}'.format(filename)) 248 | except BaseException: 249 | logger.warn('[ WARN: Saving failed... continuing anyway. ]') 250 | 251 | def save_for_predict(self, filename, epoch): 252 | network_state = dict([(k, v) for k, v in self.network.state_dict().items() if k[0:4] != 'CoVe']) 253 | if 'eval_embed.weight' in network_state: 254 | del network_state['eval_embed.weight'] 255 | if 'fixed_embedding' in network_state: 256 | del network_state['fixed_embedding'] 257 | params = { 258 | 'state_dict': {'network': network_state}, 259 | 'config': self.opt, 260 | } 261 | try: 262 | torch.save(params, filename) 263 | logger.info('model saved to {}'.format(filename)) 264 | except BaseException: 265 | logger.warn('[ WARN: Saving failed... continuing anyway. ]') 266 | 267 | def cuda(self): 268 | self.network.cuda() 269 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/QA_model/utils.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value.""" 3 | def __init__(self): 4 | self.reset() 5 | 6 | def reset(self): 7 | self.val = 0 8 | self.avg = 0 9 | self.sum = 0 10 | self.count = 0 11 | 12 | def update(self, val, n=1): 13 | self.val = val 14 | self.sum += val * n 15 | self.count += n 16 | self.avg = self.sum / self.count 17 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/predict_QuAC.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | import random 5 | import string 6 | import logging 7 | import argparse 8 | from os.path import basename 9 | from shutil import copyfile 10 | from datetime import datetime 11 | from collections import Counter 12 | import torch 13 | import msgpack 14 | import pickle 15 | import pandas as pd 16 | import numpy as np 17 | from QA_model.model_QuAC import QAModel 18 | from general_utils import score, BatchGen_QuAC, find_best_score_and_thresh 19 | 20 | parser = argparse.ArgumentParser( 21 | description='Predict using a Dialog QA model.' 22 | ) 23 | parser.add_argument('--dev_dir', default='QuAC_data/') 24 | 25 | parser.add_argument('-o', '--output_dir', default='pred_out/') 26 | parser.add_argument('--number', type=int, default=-1, help='id of the current prediction') 27 | parser.add_argument('-m', '--model', default='', 28 | help='testing model pathname, e.g. 
"models/checkpoint_epoch_11.pt"') 29 | 30 | parser.add_argument('-bs', '--batch_size', type=int, default=4) 31 | parser.add_argument('--no_ans', type=float, default=0) 32 | parser.add_argument('--min_f1', type=float, default=0.4) 33 | 34 | parser.add_argument('--show', type=int, default=3) 35 | parser.add_argument('--seed', type=int, default=1023, 36 | help='random seed for data shuffling, dropout, etc.') 37 | parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available(), 38 | help='whether to use GPU acceleration.') 39 | 40 | args = parser.parse_args() 41 | if args.model == '': 42 | print("model file is not provided") 43 | sys.exit(-1) 44 | if args.model[-3:] != '.pt': 45 | print("does not recognize the model file") 46 | sys.exit(-1) 47 | 48 | # create prediction output dir 49 | os.makedirs(args.output_dir, exist_ok=True) 50 | # count the number of prediction files 51 | if args.number == -1: 52 | args.number = len(os.listdir(args.output_dir))+1 53 | args.output = args.output_dir + 'pred' + str(args.number) + '.pckl' 54 | 55 | random.seed(args.seed) 56 | np.random.seed(args.seed) 57 | torch.manual_seed(args.seed) 58 | if args.cuda: 59 | torch.cuda.manual_seed_all(args.seed) 60 | 61 | log = logging.getLogger(__name__) 62 | log.setLevel(logging.DEBUG) 63 | ch = logging.StreamHandler(sys.stdout) 64 | ch.setLevel(logging.INFO) 65 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S') 66 | ch.setFormatter(formatter) 67 | log.addHandler(ch) 68 | 69 | def main(): 70 | log.info('[program starts.]') 71 | checkpoint = torch.load(args.model) 72 | opt = checkpoint['config'] 73 | opt['task_name'] = 'QuAC' 74 | opt['cuda'] = args.cuda 75 | opt['seed'] = args.seed 76 | if opt.get('disperse_flow') is None: 77 | opt['disperse_flow'] = False 78 | if opt.get('rationale_lambda') is None: 79 | opt['rationale_lambda'] = 0.0 80 | if opt.get('no_dialog_flow') is None: 81 | opt['no_dialog_flow'] = False 82 | if opt.get('do_hierarchical_query') is None: 83 | opt['do_hierarchical_query'] = False 84 | state_dict = checkpoint['state_dict'] 85 | log.info('[model loaded.]') 86 | 87 | test, test_embedding, test_answer = load_dev_data(opt) 88 | model = QAModel(opt, state_dict = state_dict) 89 | log.info('[Data loaded.]') 90 | 91 | model.setup_eval_embed(test_embedding) 92 | 93 | if args.cuda: 94 | model.cuda() 95 | 96 | batches = BatchGen_QuAC(test, batch_size=args.batch_size, evaluation=True, gpu=args.cuda, dialog_ctx=opt['explicit_dialog_ctx'], use_dialog_act=opt['use_dialog_act'], precompute_elmo=opt['elmo_batch_size'] // args.batch_size) 97 | sample_idx = random.sample(range(len(batches)), args.show) 98 | 99 | predictions = [] 100 | no_ans_scores = [] 101 | for i, batch in enumerate(batches): 102 | prediction, noans = model.predict(batch, No_Ans_Threshold=args.no_ans) 103 | predictions.extend(prediction) 104 | no_ans_scores.extend(noans) 105 | 106 | if not (i in sample_idx): 107 | continue 108 | 109 | print("Context: ", batch[-4][0]) 110 | for j in range(len(batch[-2][0])): 111 | print("Q: ", batch[-2][0][j]) 112 | print("A: ", prediction[0][j]) 113 | print(" True A: ", batch[-1][0][j], "| Follow up" if batch[-6][0][j].item() // 10 else "| Don't follow up") 114 | print(" Val. 
A: ", test_answer[args.batch_size * i][j]) 115 | print("") 116 | 117 | 118 | pred_out = {'predictions': predictions, 'no_ans_scores': no_ans_scores} 119 | with open(args.output, 'wb') as f: 120 | pickle.dump(pred_out, f) 121 | 122 | f1, h_f1, HEQ_Q, HEQ_D = score(predictions, test_answer, min_F1=args.min_f1) 123 | log.warning("Test F1: {:.2f}, HEQ_Q: {:.2f}, HEQ_D: {:.2f}".format(f1, HEQ_Q, HEQ_D)) 124 | 125 | def load_dev_data(opt): # can be extended to true test set 126 | with open(os.path.join(args.dev_dir, 'dev_meta.msgpack'), 'rb') as f: 127 | meta = msgpack.load(f, encoding='utf8') 128 | embedding = torch.Tensor(meta['embedding']) 129 | assert opt['embedding_dim'] == embedding.size(1) 130 | 131 | with open(os.path.join(args.dev_dir, 'dev_data.msgpack'), 'rb') as f: 132 | data = msgpack.load(f, encoding='utf8') 133 | 134 | assert opt['num_features'] == len(data['context_features'][0][0]) + opt['explicit_dialog_ctx'] * (opt['use_dialog_act']*3 + 2) 135 | 136 | dev = {'context': list(zip( 137 | data['context_ids'], 138 | data['context_tags'], 139 | data['context_ents'], 140 | data['context'], 141 | data['context_span'], 142 | data['1st_question'], 143 | data['context_tokenized'])), 144 | 'qa': list(zip( 145 | data['question_CID'], 146 | data['question_ids'], 147 | data['context_features'], 148 | data['answer_start'], 149 | data['answer_end'], 150 | data['answer_choice'], 151 | data['question'], 152 | data['answer'], 153 | data['question_tokenized'])) 154 | } 155 | 156 | dev_answer = [] 157 | for i, CID in enumerate(data['question_CID']): 158 | if len(dev_answer) <= CID: 159 | dev_answer.append([]) 160 | dev_answer[CID].append(data['all_answer'][i]) 161 | 162 | return dev, embedding, dev_answer 163 | 164 | if __name__ == '__main__': 165 | main() 166 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/preprocess_QuAC.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import spacy 4 | import msgpack 5 | import unicodedata 6 | import numpy as np 7 | import pandas as pd 8 | import argparse 9 | import collections 10 | import multiprocessing 11 | import logging 12 | import random 13 | from allennlp.modules.elmo import batch_to_ids 14 | from general_utils import flatten_json, normalize_text, build_embedding, load_glove_vocab, pre_proc, get_context_span, find_answer_span, feature_gen, token2id 15 | 16 | parser = argparse.ArgumentParser( 17 | description='Preprocessing train + dev files, about 20 minutes to run on Servers.' 18 | ) 19 | parser.add_argument('--wv_file', default='glove/glove.840B.300d.txt', 20 | help='path to word vector file.') 21 | parser.add_argument('--wv_dim', type=int, default=300, 22 | help='word vector dimension.') 23 | parser.add_argument('--sort_all', action='store_true', 24 | help='sort the vocabulary by frequencies of all words.' 
25 | 'Otherwise consider question words first.') 26 | parser.add_argument('--threads', type=int, default=multiprocessing.cpu_count(), 27 | help='number of threads for preprocessing.') 28 | parser.add_argument('--no_match', action='store_true', 29 | help='do not extract the three exact matching features.') 30 | parser.add_argument('--seed', type=int, default=1023, 31 | help='random seed for data shuffling, embedding init, etc.') 32 | 33 | 34 | args = parser.parse_args() 35 | trn_file = 'QuAC_data/train.json' 36 | dev_file = 'QuAC_data/dev.json' 37 | wv_file = args.wv_file 38 | wv_dim = args.wv_dim 39 | nlp = spacy.load('en', disable=['parser']) 40 | 41 | random.seed(args.seed) 42 | np.random.seed(args.seed) 43 | 44 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG, 45 | datefmt='%m/%d/%Y %I:%M:%S') 46 | log = logging.getLogger(__name__) 47 | 48 | log.info('start data preparing... (using {} threads)'.format(args.threads)) 49 | 50 | glove_vocab = load_glove_vocab(wv_file, wv_dim) # return a "set" of vocabulary 51 | log.info('glove loaded.') 52 | 53 | #=============================================================== 54 | #=================== Work on training data ===================== 55 | #=============================================================== 56 | 57 | def proc_train(ith, article): 58 | rows = [] 59 | 60 | for paragraph in article['paragraphs']: 61 | context = paragraph['context'] 62 | for qa in paragraph['qas']: 63 | question = qa['question'] 64 | answers = qa['orig_answer'] 65 | 66 | answer = answers['text'] 67 | answer_start = answers['answer_start'] 68 | answer_end = answers['answer_start'] + len(answers['text']) 69 | answer_choice = 0 if answer == 'CANNOTANSWER' else\ 70 | 1 if qa['yesno'] == 'y' else\ 71 | 2 if qa['yesno'] == 'n' else\ 72 | 3 # Not a yes/no question 73 | if answer_choice != 0: 74 | """ 75 | 0: Do not ask a follow up question! 76 | 1: Definitely ask a follow up question! 77 | 2: Not too important, but you can ask a follow up. 
78 | """ 79 | answer_choice += 10 * (0 if qa['followup'] == "n" else\ 80 | 1 if qa['followup'] == "y" else\ 81 | 2) 82 | else: 83 | answer_start, answer_end = -1, -1 84 | rows.append((ith, question, answer, answer_start, answer_end, answer_choice)) 85 | return rows, context 86 | 87 | train, train_context = flatten_json(trn_file, proc_train) 88 | train = pd.DataFrame(train, columns=['context_idx', 'question', 'answer', 89 | 'answer_start', 'answer_end', 'answer_choice']) 90 | log.info('train json data flattened.') 91 | 92 | print(train) 93 | 94 | trC_iter = (pre_proc(c) for c in train_context) 95 | trQ_iter = (pre_proc(q) for q in train.question) 96 | trC_docs = [doc for doc in nlp.pipe(trC_iter, batch_size=64, n_threads=args.threads)] 97 | trQ_docs = [doc for doc in nlp.pipe(trQ_iter, batch_size=64, n_threads=args.threads)] 98 | 99 | # tokens 100 | trC_tokens = [[normalize_text(w.text) for w in doc] for doc in trC_docs] 101 | trQ_tokens = [[normalize_text(w.text) for w in doc] for doc in trQ_docs] 102 | trC_unnorm_tokens = [[w.text for w in doc] for doc in trC_docs] 103 | log.info('All tokens for training are obtained.') 104 | 105 | train_context_span = [get_context_span(a, b) for a, b in zip(train_context, trC_unnorm_tokens)] 106 | 107 | ans_st_token_ls, ans_end_token_ls = [], [] 108 | for ans_st, ans_end, idx in zip(train.answer_start, train.answer_end, train.context_idx): 109 | ans_st_token, ans_end_token = find_answer_span(train_context_span[idx], ans_st, ans_end) 110 | ans_st_token_ls.append(ans_st_token) 111 | ans_end_token_ls.append(ans_end_token) 112 | 113 | train['answer_start_token'], train['answer_end_token'] = ans_st_token_ls, ans_end_token_ls 114 | initial_len = len(train) 115 | train.dropna(inplace=True) # modify self DataFrame 116 | log.info('drop {0}/{1} inconsistent samples.'.format(initial_len - len(train), initial_len)) 117 | log.info('answer span for training is generated.') 118 | 119 | # features 120 | trC_tags, trC_ents, trC_features = feature_gen(trC_docs, train.context_idx, trQ_docs, args.no_match) 121 | log.info('features for training is generated: {}, {}, {}'.format(len(trC_tags), len(trC_ents), len(trC_features))) 122 | 123 | def build_train_vocab(questions, contexts): # vocabulary will also be sorted accordingly 124 | if args.sort_all: 125 | counter = collections.Counter(w for doc in questions + contexts for w in doc) 126 | vocab = sorted([t for t in counter if t in glove_vocab], key=counter.get, reverse=True) 127 | else: 128 | counter_c = collections.Counter(w for doc in contexts for w in doc) 129 | counter_q = collections.Counter(w for doc in questions for w in doc) 130 | counter = counter_c + counter_q 131 | vocab = sorted([t for t in counter_q if t in glove_vocab], key=counter_q.get, reverse=True) 132 | vocab += sorted([t for t in counter_c.keys() - counter_q.keys() if t in glove_vocab], 133 | key=counter.get, reverse=True) 134 | total = sum(counter.values()) 135 | matched = sum(counter[t] for t in vocab) 136 | log.info('vocab {1}/{0} OOV {2}/{3} ({4:.4f}%)'.format( 137 | len(counter), len(vocab), (total - matched), total, (total - matched) / total * 100)) 138 | vocab.insert(0, "") 139 | vocab.insert(1, "") 140 | vocab.insert(2, "") 141 | vocab.insert(3, "") 142 | return vocab 143 | 144 | # vocab 145 | tr_vocab = build_train_vocab(trQ_tokens, trC_tokens) 146 | trC_ids = token2id(trC_tokens, tr_vocab, unk_id=1) 147 | trQ_ids = token2id(trQ_tokens, tr_vocab, unk_id=1) 148 | trQ_tokens = [[""] + doc + [""] for doc in trQ_tokens] 149 | trQ_ids = [[2] + qsent + 
[3] for qsent in trQ_ids] 150 | print(trQ_ids[:10]) 151 | # tags 152 | vocab_tag = [''] + list(nlp.tagger.labels) 153 | trC_tag_ids = token2id(trC_tags, vocab_tag) 154 | # entities 155 | vocab_ent = list(set([ent for sent in trC_ents for ent in sent])) 156 | trC_ent_ids = token2id(trC_ents, vocab_ent, unk_id=0) 157 | 158 | log.info('Found {} POS tags.'.format(len(vocab_tag))) 159 | log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent)) 160 | log.info('vocabulary for training is built.') 161 | 162 | tr_embedding = build_embedding(wv_file, tr_vocab, wv_dim) 163 | log.info('got embedding matrix for training.') 164 | 165 | # don't store row name in csv 166 | #train.to_csv('QuAC_data/train.csv', index=False, encoding='utf8') 167 | 168 | meta = { 169 | 'vocab': tr_vocab, 170 | 'embedding': tr_embedding.tolist() 171 | } 172 | with open('QuAC_data/train_meta.msgpack', 'wb') as f: 173 | msgpack.dump(meta, f) 174 | 175 | prev_CID, first_question = -1, [] 176 | for i, CID in enumerate(train.context_idx): 177 | if not (CID == prev_CID): 178 | first_question.append(i) 179 | prev_CID = CID 180 | 181 | result = { 182 | 'question_ids': trQ_ids, 183 | 'context_ids': trC_ids, 184 | 'context_features': trC_features, # exact match, tf 185 | 'context_tags': trC_tag_ids, # POS tagging 186 | 'context_ents': trC_ent_ids, # Entity recognition 187 | 'context': train_context, 188 | 'context_span': train_context_span, 189 | '1st_question': first_question, 190 | 'question_CID': train.context_idx.tolist(), 191 | 'question': train.question.tolist(), 192 | 'answer': train.answer.tolist(), 193 | 'answer_start': train.answer_start_token.tolist(), 194 | 'answer_end': train.answer_end_token.tolist(), 195 | 'answer_choice': train.answer_choice.tolist(), 196 | 'context_tokenized': trC_tokens, 197 | 'question_tokenized': trQ_tokens 198 | } 199 | with open('QuAC_data/train_data.msgpack', 'wb') as f: 200 | msgpack.dump(result, f) 201 | 202 | log.info('saved training to disk.') 203 | 204 | #========================================================== 205 | #=================== Work on dev data ===================== 206 | #========================================================== 207 | 208 | def proc_dev(ith, article): 209 | rows = [] 210 | 211 | for paragraph in article['paragraphs']: 212 | context = paragraph['context'] 213 | for qa in paragraph['qas']: 214 | question = qa['question'] 215 | answers = qa['orig_answer'] 216 | 217 | answer = answers['text'] 218 | answer_start = answers['answer_start'] 219 | answer_end = answers['answer_start'] + len(answers['text']) 220 | answer_choice = 0 if answer == 'CANNOTANSWER' else\ 221 | 1 if qa['yesno'] == 'y' else\ 222 | 2 if qa['yesno'] == 'n' else\ 223 | 3 # Not a yes/no question 224 | if answer_choice != 0: 225 | """ 226 | 0: Do not ask a follow up question! 227 | 1: Definitely ask a follow up question! 228 | 2: Not too important, but you can ask a follow up. 
229 | """ 230 | answer_choice += 10 * (0 if qa['followup'] == "n" else\ 231 | 1 if qa['followup'] == "y" else\ 232 | 2) 233 | else: 234 | answer_start, answer_end = -1, -1 235 | 236 | ans_ls = [] 237 | for ans in qa['answers']: 238 | ans_ls.append(ans['text']) 239 | 240 | rows.append((ith, question, answer, answer_start, answer_end, answer_choice, ans_ls)) 241 | return rows, context 242 | 243 | dev, dev_context = flatten_json(dev_file, proc_dev) 244 | dev = pd.DataFrame(dev, columns=['context_idx', 'question', 'answer', 245 | 'answer_start', 'answer_end', 'answer_choice', 'all_answer']) 246 | log.info('dev json data flattened.') 247 | 248 | print(dev) 249 | 250 | devC_iter = (pre_proc(c) for c in dev_context) 251 | devQ_iter = (pre_proc(q) for q in dev.question) 252 | devC_docs = [doc for doc in nlp.pipe( 253 | devC_iter, batch_size=64, n_threads=args.threads)] 254 | devQ_docs = [doc for doc in nlp.pipe( 255 | devQ_iter, batch_size=64, n_threads=args.threads)] 256 | 257 | # tokens 258 | devC_tokens = [[normalize_text(w.text) for w in doc] for doc in devC_docs] 259 | devQ_tokens = [[normalize_text(w.text) for w in doc] for doc in devQ_docs] 260 | devC_unnorm_tokens = [[w.text for w in doc] for doc in devC_docs] 261 | log.info('All tokens for dev are obtained.') 262 | 263 | dev_context_span = [get_context_span(a, b) for a, b in zip(dev_context, devC_unnorm_tokens)] 264 | log.info('context span for dev is generated.') 265 | 266 | ans_st_token_ls, ans_end_token_ls = [], [] 267 | for ans_st, ans_end, idx in zip(dev.answer_start, dev.answer_end, dev.context_idx): 268 | ans_st_token, ans_end_token = find_answer_span(dev_context_span[idx], ans_st, ans_end) 269 | ans_st_token_ls.append(ans_st_token) 270 | ans_end_token_ls.append(ans_end_token) 271 | 272 | dev['answer_start_token'], dev['answer_end_token'] = ans_st_token_ls, ans_end_token_ls 273 | initial_len = len(dev) 274 | dev.dropna(inplace=True) # modify self DataFrame 275 | log.info('drop {0}/{1} inconsistent samples.'.format(initial_len - len(dev), initial_len)) 276 | log.info('answer span for dev is generated.') 277 | 278 | # features 279 | devC_tags, devC_ents, devC_features = feature_gen(devC_docs, dev.context_idx, devQ_docs, args.no_match) 280 | log.info('features for dev is generated: {}, {}, {}'.format(len(devC_tags), len(devC_ents), len(devC_features))) 281 | 282 | def build_dev_vocab(questions, contexts): # most vocabulary comes from tr_vocab 283 | existing_vocab = set(tr_vocab) 284 | new_vocab = list(set([w for doc in questions + contexts for w in doc if w not in existing_vocab and w in glove_vocab])) 285 | vocab = tr_vocab + new_vocab 286 | log.info('train vocab {0}, total vocab {1}'.format(len(tr_vocab), len(vocab))) 287 | return vocab 288 | 289 | # vocab 290 | dev_vocab = build_dev_vocab(devQ_tokens, devC_tokens) # tr_vocab is a subset of dev_vocab 291 | devC_ids = token2id(devC_tokens, dev_vocab, unk_id=1) 292 | devQ_ids = token2id(devQ_tokens, dev_vocab, unk_id=1) 293 | devQ_tokens = [[""] + doc + [""] for doc in devQ_tokens] 294 | devQ_ids = [[2] + qsent + [3] for qsent in devQ_ids] 295 | print(devQ_ids[:10]) 296 | # tags 297 | devC_tag_ids = token2id(devC_tags, vocab_tag) # vocab_tag same as training 298 | # entities 299 | devC_ent_ids = token2id(devC_ents, vocab_ent, unk_id=0) # vocab_ent same as training 300 | log.info('vocabulary for dev is built.') 301 | 302 | dev_embedding = build_embedding(wv_file, dev_vocab, wv_dim) 303 | # tr_embedding is a submatrix of dev_embedding 304 | log.info('got embedding matrix for dev.') 305 
| 306 | # don't store row name in csv 307 | #dev.to_csv('QuAC_data/dev.csv', index=False, encoding='utf8') 308 | 309 | meta = { 310 | 'vocab': dev_vocab, 311 | 'embedding': dev_embedding.tolist() 312 | } 313 | with open('QuAC_data/dev_meta.msgpack', 'wb') as f: 314 | msgpack.dump(meta, f) 315 | 316 | prev_CID, first_question = -1, [] 317 | for i, CID in enumerate(dev.context_idx): 318 | if not (CID == prev_CID): 319 | first_question.append(i) 320 | prev_CID = CID 321 | 322 | result = { 323 | 'question_ids': devQ_ids, 324 | 'context_ids': devC_ids, 325 | 'context_features': devC_features, # exact match, tf 326 | 'context_tags': devC_tag_ids, # POS tagging 327 | 'context_ents': devC_ent_ids, # Entity recognition 328 | 'context': dev_context, 329 | 'context_span': dev_context_span, 330 | '1st_question': first_question, 331 | 'question_CID': dev.context_idx.tolist(), 332 | 'question': dev.question.tolist(), 333 | 'answer': dev.answer.tolist(), 334 | 'answer_start': dev.answer_start_token.tolist(), 335 | 'answer_end': dev.answer_end_token.tolist(), 336 | 'answer_choice': dev.answer_choice.tolist(), 337 | 'all_answer': dev.all_answer.tolist(), 338 | 'context_tokenized': devC_tokens, 339 | 'question_tokenized': devQ_tokens 340 | } 341 | with open('QuAC_data/dev_data.msgpack', 'wb') as f: 342 | msgpack.dump(result, f) 343 | 344 | log.info('saved dev to disk.') 345 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | msgpack-python 4 | spacy 5 | allennlp 6 | torch 7 | -------------------------------------------------------------------------------- /FlowQA_Attention/code/train_QuAC.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import random 5 | import string 6 | import logging 7 | import argparse 8 | from shutil import copyfile 9 | from datetime import datetime 10 | from collections import Counter 11 | import torch 12 | import msgpack 13 | import pandas as pd 14 | import numpy as np 15 | from QA_model.model_QuAC import QAModel 16 | from general_utils import find_best_score_and_thresh, BatchGen_QuAC 17 | 18 | parser = argparse.ArgumentParser( 19 | description='Train a Dialog QA model.' 
20 | ) 21 | 22 | # system 23 | parser.add_argument('--task_name', default='QuAC') 24 | parser.add_argument('--name', default='', help='additional name of the current run') 25 | parser.add_argument('--log_file', default='output.log', 26 | help='path for log file.') 27 | parser.add_argument('--log_per_updates', type=int, default=20, 28 | help='log model loss per x updates (mini-batches).') 29 | 30 | parser.add_argument('--train_dir', default='QuAC_data/') 31 | parser.add_argument('--dev_dir', default='QuAC_data/') 32 | parser.add_argument('--answer_type_num', type=int, default=1) 33 | 34 | parser.add_argument('--model_dir', default='models', 35 | help='path to store saved models.') 36 | parser.add_argument('--eval_per_epoch', type=int, default=1, 37 | help='perform evaluation per x epoches.') 38 | parser.add_argument('--MTLSTM_path', default='glove/MT-LSTM.pth') 39 | parser.add_argument('--save_all', dest='save_best_only', action='store_false', help='save all models.') 40 | parser.add_argument('--do_not_save', action='store_true', help='don\'t save any model') 41 | parser.add_argument('--save_for_predict', action='store_true') 42 | parser.add_argument('--seed', type=int, default=1023, 43 | help='random seed for data shuffling, dropout, etc.') 44 | parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available(), 45 | help='whether to use GPU acceleration.') 46 | # training 47 | parser.add_argument('-e', '--epoches', type=int, default=30) 48 | parser.add_argument('-bs', '--batch_size', type=int, default=3) 49 | parser.add_argument('-ebs', '--elmo_batch_size', type=int, default=12) 50 | parser.add_argument('-rs', '--resume', default='', 51 | help='previous model pathname. ' 52 | 'e.g. "models/checkpoint_epoch_11.pt"') 53 | parser.add_argument('-ro', '--resume_options', action='store_true', 54 | help='use previous model options, ignore the cli and defaults.') 55 | parser.add_argument('-rlr', '--reduce_lr', type=float, default=0., 56 | help='reduce initial (resumed) learning rate by this factor.') 57 | parser.add_argument('-op', '--optimizer', default='adamax', 58 | help='supported optimizer: adamax, sgd, adadelta, adam') 59 | parser.add_argument('-gc', '--grad_clipping', type=float, default=10) 60 | parser.add_argument('-wd', '--weight_decay', type=float, default=0) 61 | parser.add_argument('-lr', '--learning_rate', type=float, default=0.1, 62 | help='only applied to SGD.') 63 | parser.add_argument('-mm', '--momentum', type=float, default=0, 64 | help='only applied to SGD.') 65 | parser.add_argument('-tp', '--tune_partial', type=int, default=1000, 66 | help='finetune top-x embeddings (including , ).') 67 | parser.add_argument('--fix_embeddings', action='store_true', 68 | help='if true, `tune_partial` will be ignored.') 69 | parser.add_argument('--elmo_lambda', type=float, default=0.0) 70 | parser.add_argument('--no_question_normalize', dest='question_normalize', action='store_false') # when set, do dialog normalize 71 | parser.add_argument('--pretrain', default='') 72 | 73 | # model 74 | parser.add_argument('--explicit_dialog_ctx', type=int, default=2) 75 | parser.add_argument('--use_dialog_act', action='store_true') 76 | parser.add_argument('--no_dialog_flow', action='store_true') 77 | parser.add_argument('--no_hierarchical_query', dest='do_hierarchical_query', action='store_false') 78 | parser.add_argument('--no_prealign', dest='do_prealign', action='store_false') 79 | 80 | parser.add_argument('--final_output_att_hidden', type=int, default=250) 81 | 
parser.add_argument('--question_merge', default='linear_self_attn') 82 | parser.add_argument('--no_ptr_update', dest='do_ptr_update', action='store_false') 83 | parser.add_argument('--no_ptr_net_indep_attn', dest='ptr_net_indep_attn', action='store_false') 84 | parser.add_argument('--ptr_net_attn_type', default='Bilinear', help="Attention for answer span output: Bilinear, MLP or Default") 85 | 86 | parser.add_argument('--do_residual_rnn', dest='do_residual_rnn', action='store_true') 87 | parser.add_argument('--do_residual_everything', dest='do_residual_everything', action='store_true') 88 | parser.add_argument('--do_residual', dest='do_residual', action='store_true') 89 | parser.add_argument('--rnn_layers', type=int, default=1, help="Default number of RNN layers") 90 | parser.add_argument('--rnn_type', default='lstm', 91 | help='supported types: rnn, gru, lstm') 92 | parser.add_argument('--concat_rnn', dest='concat_rnn', action='store_true') 93 | 94 | parser.add_argument('--hidden_size', type=int, default=125) 95 | parser.add_argument('--self_attention_opt', type=int, default=1) # 0: no self attention 96 | 97 | parser.add_argument('--deep_inter_att_do_similar', type=int, default=0) 98 | parser.add_argument('--deep_att_hidden_size_per_abstr', type=int, default=250) 99 | 100 | parser.add_argument('--no_elmo', dest='use_elmo', action='store_false') 101 | parser.add_argument('--no_em', action='store_true') 102 | 103 | parser.add_argument('--no_wemb', dest='use_wemb', action='store_false') # word embedding 104 | parser.add_argument('--CoVe_opt', type=int, default=1) # contexualized embedding option 105 | parser.add_argument('--no_pos', dest='use_pos', action='store_false') # pos tagging 106 | parser.add_argument('--pos_size', type=int, default=51, help='how many kinds of POS tags.') 107 | parser.add_argument('--pos_dim', type=int, default=12, help='the embedding dimension for POS tags.') 108 | parser.add_argument('--no_ner', dest='use_ner', action='store_false') # named entity 109 | parser.add_argument('--ner_size', type=int, default=19, help='how many kinds of named entity tags.') 110 | parser.add_argument('--ner_dim', type=int, default=8, help='the embedding dimension for named entity tags.') 111 | 112 | parser.add_argument('--prealign_hidden', type=int, default=300) 113 | parser.add_argument('--prealign_option', type=int, default=2, help='0: No prealign, 1, 2, ...: Different options') 114 | 115 | parser.add_argument('--no_seq_dropout', dest='do_seq_dropout', action='store_false') 116 | parser.add_argument('--my_dropout_p', type=float, default=0.4) 117 | parser.add_argument('--dropout_emb', type=float, default=0.4) 118 | 119 | parser.add_argument('--max_len', type=int, default=35) 120 | 121 | parser.add_argument('--flow_attention', type=int, default = 1) 122 | parser.add_argument('--use_bert', help="Whether to use BERT or not, and which version to use") 123 | parser.add_argument('--bert_path', default='bert_vocab_files/') 124 | args = parser.parse_args() 125 | 126 | if args.name != '': 127 | args.model_dir = args.model_dir + '_' + args.name 128 | args.log_file = os.path.dirname(args.log_file) + 'output_' + args.name + '.log' 129 | 130 | # set model dir 131 | model_dir = args.model_dir 132 | os.makedirs(model_dir, exist_ok=True) 133 | model_dir = os.path.abspath(model_dir) 134 | 135 | # set random seed 136 | random.seed(args.seed) 137 | np.random.seed(args.seed) 138 | torch.manual_seed(args.seed) 139 | if args.cuda: 140 | torch.cuda.manual_seed_all(args.seed) 141 | 142 | # setup logger 143 | 
log = logging.getLogger(__name__) 144 | log.setLevel(logging.DEBUG) 145 | fh = logging.FileHandler(args.log_file) 146 | fh.setLevel(logging.DEBUG) 147 | ch = logging.StreamHandler(sys.stdout) 148 | ch.setLevel(logging.INFO) 149 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S') 150 | fh.setFormatter(formatter) 151 | ch.setFormatter(formatter) 152 | log.addHandler(fh) 153 | log.addHandler(ch) 154 | 155 | def main(): 156 | log.info('[program starts.]') 157 | opt = vars(args) # changing opt will change args 158 | train, train_embedding, opt = load_train_data(opt) 159 | dev, dev_embedding, dev_answer = load_dev_data(opt) 160 | opt['num_features'] += args.explicit_dialog_ctx * (args.use_dialog_act*3 + 2) # dialog_act + previous answer 161 | if opt['use_elmo'] == False: 162 | opt['elmo_batch_size'] = 0 163 | log.info('[Data loaded.]') 164 | 165 | if args.resume: 166 | log.info('[loading previous model...]') 167 | checkpoint = torch.load(args.resume) 168 | if args.resume_options: 169 | opt = checkpoint['config'] 170 | state_dict = checkpoint['state_dict'] 171 | model = QAModel(opt, train_embedding, state_dict) 172 | epoch_0 = checkpoint['epoch'] + 1 173 | for i in range(checkpoint['epoch']): 174 | random.shuffle(list(range(len(train)))) # synchronize random seed 175 | if args.reduce_lr: 176 | lr_decay(model.optimizer, lr_decay=args.reduce_lr) 177 | else: 178 | model = QAModel(opt, train_embedding) 179 | epoch_0 = 1 180 | 181 | if args.pretrain: 182 | pretrain_model = torch.load(args.pretrain) 183 | state_dict = pretrain_model['state_dict']['network'] 184 | 185 | model.get_pretrain(state_dict) 186 | 187 | model.setup_eval_embed(dev_embedding) 188 | log.info("[dev] Total number of params: {}".format(model.total_param)) 189 | 190 | if args.cuda: 191 | model.cuda() 192 | 193 | if args.resume: 194 | batches = BatchGen_QuAC(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda, dialog_ctx=args.explicit_dialog_ctx, use_dialog_act=args.use_dialog_act, use_bert=args.use_bert, bert_path=args.bert_path) 195 | predictions, no_ans_scores = [], [] 196 | for batch in batches: 197 | phrases, noans = model.predict(batch) 198 | predictions.extend(phrases) 199 | no_ans_scores.extend(noans) 200 | f1, na, thresh = find_best_score_and_thresh(predictions, dev_answer, no_ans_scores) 201 | log.info("[dev F1: {} NA: {} TH: {}]".format(f1, na, thresh)) 202 | best_val_score, best_na, best_thresh = f1, na, thresh 203 | else: 204 | best_val_score, best_na, best_thresh = 0.0, 0.0, 0.0 205 | 206 | for epoch in range(epoch_0, epoch_0 + args.epoches): 207 | log.warning('Epoch {}'.format(epoch)) 208 | # train 209 | batches = BatchGen_QuAC(train, batch_size=args.batch_size, gpu=args.cuda, dialog_ctx=args.explicit_dialog_ctx, use_dialog_act=args.use_dialog_act, precompute_elmo=args.elmo_batch_size // args.batch_size, use_bert = args.use_bert, bert_path=args.bert_path) 210 | start = datetime.now() 211 | for i, batch in enumerate(batches): 212 | model.update(batch) 213 | if i % args.log_per_updates == 0: 214 | log.info('updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format( 215 | model.updates, model.train_loss.avg, 216 | str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0])) 217 | 218 | # eval 219 | if epoch % args.eval_per_epoch == 0: 220 | batches = BatchGen_QuAC(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda, dialog_ctx=args.explicit_dialog_ctx, use_dialog_act=args.use_dialog_act, precompute_elmo=args.elmo_batch_size // 
args.batch_size, use_bert = args.use_bert, bert_path=args.bert_path) 221 | predictions, no_ans_scores = [], [] 222 | for batch in batches: 223 | phrases, noans = model.predict(batch) 224 | predictions.extend(phrases) 225 | no_ans_scores.extend(noans) 226 | f1, na, thresh = find_best_score_and_thresh(predictions, dev_answer, no_ans_scores) 227 | 228 | # save 229 | if args.save_best_only: 230 | if f1 > best_val_score: 231 | best_val_score, best_na, best_thresh = f1, na, thresh 232 | model_file = os.path.join(model_dir, 'best_model.pt') 233 | model.save(model_file, epoch) 234 | log.info('[new best model saved.]') 235 | else: 236 | model_file = os.path.join(model_dir, 'checkpoint_epoch_{}.pt'.format(epoch)) 237 | model.save(model_file, epoch) 238 | if f1 > best_val_score: 239 | best_val_score, best_na, best_thresh = f1, na, thresh 240 | copyfile(os.path.join(model_dir, model_file), 241 | os.path.join(model_dir, 'best_model.pt')) 242 | log.info('[new best model saved.]') 243 | 244 | log.warning("Epoch {} - dev F1: {:.3f} NA: {:.3f} TH: {:.3f} (best F1: {:.3f} NA: {:.3f} TH: {:.3f})".format(epoch, f1, na, thresh, best_val_score, best_na, best_thresh)) 245 | 246 | def lr_decay(optimizer, lr_decay): 247 | for param_group in optimizer.param_groups: 248 | param_group['lr'] *= lr_decay 249 | log.info('[learning rate reduced by {}]'.format(lr_decay)) 250 | return optimizer 251 | 252 | def load_train_data(opt): 253 | with open(os.path.join(args.train_dir, 'train_meta.msgpack'), 'rb') as f: 254 | meta = msgpack.load(f, encoding='utf8') 255 | embedding = torch.Tensor(meta['embedding']) 256 | opt['vocab_size'] = embedding.size(0) 257 | opt['embedding_dim'] = embedding.size(1) 258 | 259 | with open(os.path.join(args.train_dir, 'train_data.msgpack'), 'rb') as f: 260 | data = msgpack.load(f, encoding='utf8') 261 | #data_orig = pd.read_csv(os.path.join(args.train_dir, 'train.csv')) 262 | 263 | opt['num_features'] = len(data['context_features'][0][0]) 264 | 265 | train = {'context': list(zip( 266 | data['context_ids'], 267 | data['context_tags'], 268 | data['context_ents'], 269 | data['context'], 270 | data['context_span'], 271 | data['1st_question'], 272 | data['context_tokenized'])), 273 | 'qa': list(zip( 274 | data['question_CID'], 275 | data['question_ids'], 276 | data['context_features'], 277 | data['answer_start'], 278 | data['answer_end'], 279 | data['answer_choice'], 280 | data['question'], 281 | data['answer'], 282 | data['question_tokenized'])) 283 | } 284 | return train, embedding, opt 285 | 286 | def load_dev_data(opt): # can be extended to true test set 287 | with open(os.path.join(args.dev_dir, 'dev_meta.msgpack'), 'rb') as f: 288 | meta = msgpack.load(f, encoding='utf8') 289 | embedding = torch.Tensor(meta['embedding']) 290 | assert opt['embedding_dim'] == embedding.size(1) 291 | 292 | with open(os.path.join(args.dev_dir, 'dev_data.msgpack'), 'rb') as f: 293 | data = msgpack.load(f, encoding='utf8') 294 | #data_orig = pd.read_csv(os.path.join(args.dev_dir, 'dev.csv')) 295 | 296 | assert opt['num_features'] == len(data['context_features'][0][0]) 297 | 298 | dev = {'context': list(zip( 299 | data['context_ids'], 300 | data['context_tags'], 301 | data['context_ents'], 302 | data['context'], 303 | data['context_span'], 304 | data['1st_question'], 305 | data['context_tokenized'])), 306 | 'qa': list(zip( 307 | data['question_CID'], 308 | data['question_ids'], 309 | data['context_features'], 310 | data['answer_start'], 311 | data['answer_end'], 312 | data['answer_choice'], 313 | data['question'], 314 | 
data['answer'], 315 | data['question_tokenized'])) 316 | } 317 | 318 | dev_answer = [] 319 | for i, CID in enumerate(data['question_CID']): 320 | if len(dev_answer) <= CID: 321 | dev_answer.append([]) 322 | dev_answer[CID].append(data['all_answer'][i]) 323 | 324 | return dev, embedding, dev_answer 325 | 326 | if __name__ == '__main__': 327 | main() 328 | -------------------------------------------------------------------------------- /FlowQA_Attention/figure/Attention Over Flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/FlowQA_Attention/figure/Attention Over Flow.png -------------------------------------------------------------------------------- /FlowQA_Attention/figure/Readme.me: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /FlowQA_Attention/figure/Vanilla Flow Operation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/FlowQA_Attention/figure/Vanilla Flow Operation.png -------------------------------------------------------------------------------- /FlowQA_Attention/figure/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/FlowQA_Attention/figure/result.png -------------------------------------------------------------------------------- /FlowQA_Coreference/README.md: -------------------------------------------------------------------------------- 1 | # FlowQA + Coreference 2 | 3 | We used the [coreference model](https://github.com/kentonl/e2e-coref) proposed by Lee et al. to improve the [FlowQA](https://github.com/momohuang/FlowQA) model. 4 | In short, this model clusters spans of text that corefer with each other. 5 | For example, consider the conversation below: 6 | 7 | A: What is the story about? 8 | 9 | B: young girl and her dog 10 | 11 | A: What were they doing? 12 | 13 | B: set out a trip 14 | 15 | In this conversation, the phrase "young girl and her dog" said by B is referred to by "they" said by A. 16 | Also, the pronoun "her" in B's answer refers to "young girl", which appears earlier. 17 | Therefore, the model will cluster the following spans: 18 | 19 | [["young girl", "her"], ["young girl and her dog", "they"]] 20 | 21 | 22 | In order to feed this coreference information to the FlowQA model, we can 23 | 24 | * Feed the result from the coreference model to the FlowQA model 25 | * Feed the internal representation of each token in the coreference model to the FlowQA model. 26 | 27 | We concatenate these two new representations with the existing input of FlowQA, which consists of word embeddings, part-of-speech embeddings and named-entity embeddings. 28 | 29 | ## Feed the result from the coreference model 30 | 31 | We use one-hot encoding to represent the results from the coreference model. 32 | Each token corresponds to a vector whose length is the number of clusters. 33 | If a token belongs to the i-th cluster, the i-th element of its one-hot vector is set to 1 (see the sketch below).
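As a minimal sketch (our own illustration, not code from this repository; `cluster_onehot` is a hypothetical helper), the per-token one-hot features can be built from the predicted clusters like this:

```python
# Minimal sketch (not from this repo): build per-token one-hot cluster
# features from predicted coreference clusters.
import numpy as np

def cluster_onehot(tokens, clusters):
    # tokens: list of words
    # clusters: list of clusters, each a list of (start, end) token spans
    feats = np.zeros((len(tokens), len(clusters)), dtype=np.float32)
    for c, spans in enumerate(clusters):
        for start, end in spans:       # end is exclusive
            feats[start:end, c] = 1.0  # every token in the span joins cluster c
    return feats

tokens = ["young", "girl", "and", "her", "dog", "What", "were", "they", "doing"]
clusters = [[(0, 2), (3, 4)],   # ["young girl", "her"]
            [(0, 5), (7, 8)]]   # ["young girl and her dog", "they"]
onehot = cluster_onehot(tokens, clusters)  # shape (9, 2)
```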
34 | Here is the one-hot encoding for the example above: 35 | 36 | "young" [1, 1] 37 | "girl" [1, 1] 38 | "and" [0, 1] 39 | "her" [1, 1] 40 | "dog" [0, 1] 41 | "What" [0, 0] 42 | "were" [0, 0] 43 | "they" [0, 1] 44 | "doing" [0, 0] 45 | 46 | ## Feed the internal representation 47 | 48 | We use the latent representation from the coreference model as an input to the FlowQA model. To be precise, we use X* from Figure 3 of [this paper](https://arxiv.org/abs/1707.07045). Since the model is trained to predict coreference, we hypothesize that its latent representation encodes coreference information as well. 49 | 50 | # Experiment 51 | 52 | First, we set up the coreference model according to its [instructions](https://github.com/kentonl/e2e-coref). 53 | Then we use the pre-trained coreference model to predict the coreferences in the QuAC dataset. 54 | We extract the latent representation of each token and save it. 55 | After that, we convert the clustering results of the coreference model to one-hot encodings. 56 | Finally, we concatenate the latent representations with the one-hot encodings to obtain the augmented input vectors that are fed to FlowQA. 57 | The augmented vector's size is 450: 400 dimensions for the X* latent representation and 50 for the one-hot encoding. 58 | 59 | Second, we set up the FlowQA model according to its [instructions](https://github.com/momohuang/FlowQA). 60 | We have to modify ```train_QuAC.py```, ```general_utils.py``` and ```detail_model.py``` to support the augmented representation. 61 | Then we train the model with this new input representation. 62 | 63 | 64 | We use the [Google Cloud Platform](https://cloud.google.com/) with 4 vCPUs, 26 GB of memory and 1 NVIDIA Tesla K80 to run both the FlowQA and coreference models. 65 | It takes around 50 hours to predict the coreferences for the entire QuAC dataset, and around 4 hours per epoch to train the FlowQA model. 66 | 67 | 68 | # Result 69 | 70 | Below is the comparison of F1 between the original FlowQA model and our modification. We can see that the improved model performs slightly better. 71 | 72 | ![ ](/figure/coref-F1.png) 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Question Answering In Context 2 | Question Answering (QA) has long been a promising yet challenging task, and a large body of work has been done in this area. However, simple QA tasks such as extracting answer spans from a given text have been criticized as "shallow pattern recognition", which does not equip machines with the capability of reading. Recently, Conversational Question Answering has been proposed to address this issue. In this task, a program needs to not only answer questions, but also answer them in a conversational style. 3 | 4 | In this project, we use deep neural networks to address [QuAC](https://quac.ai/), one of these Conversational QA datasets. We train existing models on QuAC and compare their pros and cons. We also apply models that are effective on other related datasets and devise new models involving different network architectures or mechanisms. We hope to obtain better, and possibly state-of-the-art, performance. 5 | 6 | This project is a part of CS 599 Deep Learning and Its Applications in Spring 2019 at the Department of Computer Science, University of Southern California. 7 | 8 | ## QuAC 9 |

10 | [figure] 11 | 12 | Figure 1: Task setup of QuAC. (Pics from [QuAC](https://quac.ai)) 13 |

14 | 15 | [Question Answering in Context (QuAC)](https://quac.ai/) is a newly proposed conversational QA dataset. The task is to answer a question in a conversation, given a context passage and the historical questions and answers in the conversation. Compared with earlier QA datasets, its questions are often highly context-dependent, elliptical, and even unanswerable. The architecture of a typical QA model for QuAC is shown below: 16 | 17 |

18 | [figure] 19 | 20 | Figure 2: Baseline QA Model for QuAC. 21 |

22 | 23 | ## Problem Formulation 24 | The input of the problem is a context passage as background and a sequence of free-text questions; each question has two binary properties, “yes/no” and “followup”. The first property indicates whether the question can be answered with an affirmation (“yes” or “no”). The second property indicates whether the question is a follow-up to previous questions. 25 | 26 | The output is the answer to each question. The answer must be a span of the context unless the question is a “yes/no” question, and the answer should be “cannot answer” if it cannot be found in the given context. 27 | 28 |
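For concreteness, a single QuAC example, as read by the preprocessing scripts in this repository, has roughly the following shape (a hand-made illustration; the field values are invented, but the field names follow the QuAC JSON format):

```python
# Illustrative QuAC-style example (hand-made values; the schema follows the
# QuAC JSON format: answers are character spans into the context).
example = {
    "context": "The story is about a young girl and her dog. They set out a trip.",
    "qas": [
        {
            "question": "What is the story about?",
            "answers": [{"text": "young girl and her dog", "answer_start": 21}],
            "yesno": "x",     # "y" / "n" / "x" (not a yes/no question)
            "followup": "n",  # "y" / "n" / "m", as used in preprocess_QuAC.py
        },
        {
            "question": "What were they doing?",
            "answers": [{"text": "set out a trip", "answer_start": 50}],
            "yesno": "x",
            "followup": "y",  # follows up on the previous question
        },
    ],
}
```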


32 | 33 | ## Approaches 34 | 35 | We try to enhance two baseline models: FlowQA and BiDAF++. We also try to apply an existing model, SDNet, which works on another QA dataset. 36 | 37 | ### FlowQA 38 | 39 | [FlowQA](https://github.com/momohuang/FlowQA) has been shown to have considerably good performance on conversational question-answering tasks such as CoQA and QuAC. First, we re-run the FlowQA model and investigate why it performs so well and what makes it special. 40 | 41 | 42 | Why does FlowQA work? We believe it is because of its "flow" operation, which uses an LSTM (or GRU) to represent context words across question turns. By using the "flow" operation, FlowQA can capture and retain conversational information. Since 43 | conversations are in fact sequences, it is a natural yet clever idea to encode conversational information in this way (see the sketch below). 44 | 45 | 46 |
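As a minimal sketch of this operation (our own illustration, not the repository's actual implementation; the 125-dim hidden size mirrors the `--hidden_size` default in `train_QuAC.py`):

```python
# Minimal sketch of FlowQA's flow operation (illustrative, not the repo's code).
# For every context position, an RNN is run across dialog turns, so information
# from earlier turns can "flow" into the representations of later turns.
import torch
import torch.nn as nn

batch, turns, ctx_len, dim = 2, 4, 50, 125
hidden = torch.randn(batch, turns, ctx_len, dim)  # per-turn context representations

flow_rnn = nn.GRU(dim, dim, batch_first=True)

# Fold the context-position axis into the batch so the GRU's sequence axis is turns.
x = hidden.permute(0, 2, 1, 3).reshape(batch * ctx_len, turns, dim)
flow, _ = flow_rnn(x)                              # (batch * ctx_len, turns, dim)
flow = flow.reshape(batch, ctx_len, turns, dim).permute(0, 2, 1, 3)
print(flow.shape)                                  # torch.Size([2, 4, 50, 125])
```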


50 | 51 | #### [FlowQA + Attention Over Flow](FlowQA_Attention) 52 | 53 | We believe attention is worth applying here, because not every turn of the conversation history is equally important. 54 | When a new question is posed, it is highly likely that only a few previous questions are involved, rather than all of them. By adding attention to the "flow" operation, we can align new representations with the useful previous ones. We call this "attention-over-flow". 55 | 56 | 57 |
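A rough sketch of what this could look like (a hypothetical rendering of the idea, not the repository's exact code; `attention_over_flow` is an invented name, and `flow` is the tensor from the previous sketch):

```python
# Sketch of attention over flow states (illustrative). Each turn attends over
# the flow states of its previous turns; the attended summary is mixed back in.
import torch
import torch.nn.functional as F

def attention_over_flow(flow):
    batch, turns, ctx_len, dim = flow.shape
    # Similarity between every pair of turns, per context position.
    scores = torch.einsum('btcd,bscd->bcts', flow, flow) / dim ** 0.5
    # Mask out current and future turns so each turn only sees its history.
    mask = torch.tril(torch.ones(turns, turns, dtype=torch.bool), diagonal=-1)
    scores = scores.masked_fill(~mask, float('-inf'))
    attn = F.softmax(scores, dim=-1).nan_to_num()   # turn 0 has no history
    attended = torch.einsum('bcts,bscd->btcd', attn, flow)
    return flow + attended  # residual mix of attended history into each turn
```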


61 | 62 | #### [FlowQA + Coreference](FlowQA_Coreference) 63 | 64 | Since the QuAC dataset may contain many coreferences, both within the context and across the conversation, our intuition is that we can exploit a coreference resolution model to improve the FlowQA model. 65 | The figure below shows the improved model, where the gray blocks are the existing FlowQA model and the blue blocks are the coreference components we added (a sketch of the augmented input follows the figure). 66 | Our experiments show that, with the coreference model, F1 is slightly better. 67 |

68 | [figure: FlowQA (gray blocks) with added coreference components (blue blocks)] 69 |
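To make the feature concatenation concrete, here is a minimal sketch (illustrative shapes only; the 400/50 split follows the experiment description in `FlowQA_Coreference/README.md`, and the POS/NER dimensions follow the defaults in `train_QuAC.py`):

```python
# Sketch of augmenting FlowQA's per-token input with coreference features
# (illustrative; dimensions follow the experiment description above).
import torch

seq_len  = 30
word_emb = torch.randn(seq_len, 300)   # e.g. GloVe word vectors
pos_emb  = torch.randn(seq_len, 12)    # POS tag embedding
ner_emb  = torch.randn(seq_len, 8)     # named-entity embedding
coref_x  = torch.randn(seq_len, 400)   # X* latent representation per token
coref_1h = torch.zeros(seq_len, 50)    # one-hot cluster membership (padded to 50)

augmented = torch.cat([word_emb, pos_emb, ner_emb, coref_x, coref_1h], dim=-1)
print(augmented.shape)  # torch.Size([30, 770])
```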

70 | 71 | ### [SD-Net](SDNet) 72 | A contextualized attention-based deep neural network developed by Microsoft. It was originally evaluated on another question answering dataset, CoQA, and was the first model to reach an in-domain F1 score above 80% (80.7%) on CoQA. Here we apply this model to the QuAC dataset. Since the formats of the two datasets differ, we need to preprocess the data carefully. 73 | 74 | ### [BiDAF++](BiDAF) 75 | The original [BiDAF++](https://arxiv.org/abs/1710.10723) model uses a character-level CNN for character embeddings and GloVe for word embeddings. It is also equipped with contextualized embeddings and self-attention. In this model, marker embeddings corresponding to previous answer words are used, while question turn numbers are encoded into the question embeddings. We intend to append ELMo or BERT embeddings to the word and contextualized embeddings to get better performance (see the sketch below). 76 |
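A minimal sketch of this embedding augmentation (illustrative dimensions only; the actual configurations live in the jsonnet files under `BiDAF/`):

```python
# Sketch of appending a contextual embedding (ELMo or BERT) to BiDAF++'s
# GloVe + char-CNN token representation (illustrative dimensions).
import torch

seq_len = 40
glove   = torch.randn(seq_len, 300)   # pretrained GloVe word vectors
charcnn = torch.randn(seq_len, 128)   # char-CNN output per token
bert    = torch.randn(seq_len, 1024)  # e.g. BERT-large, aligned to tokens

token_repr = torch.cat([glove, charcnn, bert], dim=-1)
print(token_repr.shape)  # torch.Size([40, 1452])
```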


81 | ## Results 82 | | Model | F1 | Remark | 83 | | ------------- | ------------- | ------------- | 84 | | **FlowQA Baseline** | 64.24 | Baseline in one experiment | 85 | | FlowQA + Coreference | 62.50 | Baseline = 62.26 in another experiment | 86 | | FlowQA + Attention Over Flow | 64.35 | | 87 | | **BiDAF++ Baseline** | 55.59 | | 88 | | ELMo + BiDAF++ | 58.40 | | 89 | | BERT + BiDAF++ | 60.04 | | 90 | | **SDNet** | 33.13 | | 91 | 92 | ## Conclusion 93 | * **FlowQA + Attention Over Flow**: Adding attention layers over the flow operation slightly improves the FlowQA model. We believe this is because the representations generated in this way focus more on recent dialog turns and help resolve coreferences. 94 | 95 | * **FlowQA + Coreference**: Adding a vector representation that encodes coreference in a long context can increase the performance of the model on tasks involving complex dependencies (context and dialogue). 96 | 97 | * **Enhancing BiDAF++**: Appending ELMo or BERT embeddings to the word and contextualized embeddings helps. Because ELMo extracts context features from a language model, and BERT is a pre-trained token embedding model that can perform better than GloVe on the QuAC dataset, these two embeddings enhance the model's performance. 98 | 99 | * **SDNet**: SDNet was not originally applied to the QuAC dataset, so we adjusted the dataset format to fit SDNet. However, the result is not as good as we expected. We will try to figure out the issues and improve this model in the future. 100 | 101 | ## References 102 | 103 | * [FlowQA: Grasping Flow in History for Conversational Machine Comprehension](https://arxiv.org/abs/1810.06683) by Huang et al. 104 | * [Bidirectional Attention Flow for Machine Comprehension](https://arxiv.org/abs/1611.01603) by Minjoon Seo et al. 105 | * [Simple and Effective Multi-Paragraph Reading Comprehension](https://arxiv.org/abs/1710.10723) by Christopher Clark et al. 106 | * [Deep Contextualized Word Representations](https://arxiv.org/abs/1802.05365) by Matthew E. Peters et al. 107 | * [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin et al. 108 | * [SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering](https://arxiv.org/abs/1812.03593) by Chenguang Zhu et al. 109 | * [End-to-end Neural Coreference Resolution](https://arxiv.org/abs/1707.07045) by Lee et al. 110 | -------------------------------------------------------------------------------- /SDNet/Figures/f1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/Figures/f1.jpg -------------------------------------------------------------------------------- /SDNet/Figures/loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/Figures/loss.jpg -------------------------------------------------------------------------------- /SDNet/README.md: -------------------------------------------------------------------------------- 1 | 2 | # SDNet 3 | [SDNet](https://arxiv.org/abs/1812.03593) is a contextualized attention-based deep neural network developed by Microsoft.
It was originally evaluated on another question answering dataset, [CoQA](https://stanfordnlp.github.io/coqa/), and was the first model to reach an in-domain F1 score higher than 80% (80.7%) on CoQA. 4 | 5 | ## Difference between CoQA and QuAC 6 | Since CoQA and QuAC use different data formats, we first need to convert the format to let SDNet work on QuAC. Unlike QuAC's, the answers in CoQA are free-form rather than spans of the context. However, SDNet still needs to generate an answer span from the text, so we can take this span as the output on QuAC. Moreover, questions in CoQA don't have properties such as "followup" and "yes/no", but every question may be related to previous questions and answers. The original SDNet model prepends 2 rounds of previous questions and answers to obtain its best F1 score, so we keep this behavior and ignore the "followup" property. Also, SDNet already computes the probabilities of the answer being the affirmation "yes" or "no", or "no answer" ("unknown" in CoQA), which we can use directly on QuAC. 7 | 8 | ## Usage 9 | The preprocess folder contains a script to convert QuAC data to the CoQA format. 10 | 11 | For detailed usage, please refer to the [SDNet GitHub page](https://github.com/Microsoft/SDNet). 12 | 13 | ## Result and conclusion 14 | Training loss and development set F1 score are shown below: 15 | 16 |

17 | [figures: training loss and development set F1 curves] 18 | 19 | 20 | 21 | 22 |

23 | 24 | Unfortunately, the result of SDNet is not as good as we expected. We only get a 33.13 F1 score on the development set. We tried to figure out the reason why this happens (such as format and preprocessing issues) but couldn't make any significant improvement. We leave this as future work and will continue working on it. 25 | 26 | ## References 27 | 28 | * [SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering](https://arxiv.org/abs/1812.03593) by Chenguang Zhu et al. 29 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to 4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 5 | and actually do, grant us the rights to use your contribution. For details, visit 6 | https://cla.microsoft.com. 7 | 8 | When you submit a pull request, a CLA-bot will automatically determine whether you need 9 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 10 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 14 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -------------------------------------------------------------------------------- /SDNet/code/SDNet/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/BaseTrainer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license.
3 | 4 | import os 5 | 6 | class BaseTrainer(): 7 | def __init__(self, opt): 8 | self.opt = opt 9 | self.isTrain = False; 10 | if self.opt['cuda'] == True: 11 | self.use_cuda = True 12 | print('Using Cuda\n') 13 | else: 14 | self.use_cuda = False 15 | print('Using CPU\n') 16 | 17 | self.is_official = 'OFFICIAL' in self.opt 18 | # Make sure raw text feature files are ready 19 | self.use_spacy = 'SPACY_FEATURE' in self.opt 20 | self.opt['logFile'] = 'log.txt' 21 | 22 | opt['FEATURE_FOLDER'] = 'conf~/' + ('spacy_intermediate_feature~/' if self.use_spacy else 'intermediate_feature~/') 23 | opt['FEATURE_FOLDER'] = os.path.join(opt['datadir'], opt['FEATURE_FOLDER']) 24 | 25 | def log(self, s): 26 | # In official case, the program does not output logs 27 | if self.is_official: 28 | return 29 | 30 | with open(os.path.join(self.saveFolder, self.opt['logFile']), 'a') as f: 31 | f.write(s + '\n') 32 | print(s) 33 | 34 | def getSaveFolder(self): 35 | runid = 1 36 | while True: 37 | saveFolder = os.path.join(self.opt['datadir'], 'conf~', 'run_' + str(runid)) 38 | if not os.path.exists(saveFolder): 39 | self.saveFolder = saveFolder 40 | os.makedirs(self.saveFolder) 41 | print('Saving logs, model and evaluation in ' + self.saveFolder) 42 | return 43 | runid = runid + 1 44 | 45 | # save copy of conf file 46 | def saveConf(self): 47 | with open(self.opt['confFile'], encoding='utf-8') as f: 48 | with open(os.path.join(self.saveFolder, 'conf_copy'), 'w', encoding='utf-8') as fw: 49 | for line in f: 50 | fw.write(line + '\n') 51 | 52 | def train(self): 53 | pass 54 | 55 | def load(self): 56 | pass 57 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/Bert.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | import os 5 | import torch 6 | import torch.nn as nn 7 | from torch.autograd import Variable 8 | from Models.Bert.modeling import BertModel 9 | 10 | ''' 11 | BERT 12 | ''' 13 | class Bert(nn.Module): 14 | def __init__(self, opt): 15 | super(Bert, self).__init__() 16 | print('Loading BERT model...') 17 | self.BERT_MAX_LEN = 512 18 | self.linear_combine = 'BERT_LINEAR_COMBINE' in opt 19 | 20 | if 'BERT_LARGE' in opt: 21 | print('Using BERT Large model') 22 | model_file = os.path.join(opt['datadir'], opt['BERT_large_model_file']) 23 | print('Loading BERT model from', model_file) 24 | self.bert_model = BertModel.from_pretrained(model_file) 25 | #self.bert_model = BertModel.from_pretrained('bert-large-uncased') 26 | self.bert_dim = 1024 27 | self.bert_layer = 24 28 | else: 29 | print('Using BERT base model') 30 | model_file = os.path.join(opt['datadir'], opt['BERT_model_file']) 31 | print('Loading BERT model from', model_file) 32 | self.bert_model = BertModel.from_pretrained(model_file) 33 | #self.bert_model = BertModel.from_pretrained('bert-base-cased') 34 | self.bert_dim = 768 35 | self.bert_layer = 12 36 | self.bert_model.cuda() 37 | self.bert_model.eval() 38 | 39 | print('Finished loading') 40 | 41 | ''' 42 | Input: 43 | x_bert: batch * max_bert_sent_len (ids) 44 | x_bert_mask: batch * max_bert_sent_len (0/1) 45 | x_bert_offset: batch * max_real_word_num * 2 46 | x_mask: batch * max_real_word_num 47 | Output: 48 | embedding: batch * max_real_word_num * bert_dim 49 | ''' 50 | def forward(self, x_bert, x_bert_mask, x_bert_offset, x_mask): 51 | if self.linear_combine: 52 | return self.combine_forward(x_bert, x_bert_mask, x_bert_offset, x_mask) 53 | 54 | last_layers = [] 55 | bert_sent_len = x_bert.shape[1] 56 | p = 0 57 | while p < bert_sent_len: 58 | all_encoder_layers, _ = self.bert_model(x_bert[:, p:(p + self.BERT_MAX_LEN)], token_type_ids=None, attention_mask=x_bert_mask[:, p:(p + self.BERT_MAX_LEN)]) # bert_layer * batch * max_bert_sent_len * bert_dim 59 | last_layers.append(all_encoder_layers[-1]) # batch * up_to_512 * bert_dim 60 | p += self.BERT_MAX_LEN 61 | 62 | bert_embedding = torch.cat(last_layers, 1) 63 | 64 | batch_size = x_mask.shape[0] 65 | max_word_num = x_mask.shape[1] 66 | output = Variable(torch.zeros(batch_size, max_word_num, self.bert_dim)) 67 | for i in range(batch_size): 68 | for j in range(max_word_num): 69 | if x_mask[i, j] == 0: 70 | continue 71 | st = x_bert_offset[i, j, 0] 72 | ed = x_bert_offset[i, j, 1] 73 | # we can also try using st only, ed only 74 | if st + 1 == ed: # including st==ed 75 | output[i, j, :] = bert_embedding[i, st, :] 76 | else: 77 | subword_ebd_sum = torch.sum(bert_embedding[i, st:ed, :], dim = 0) 78 | if st < ed: 79 | output[i, j, :] = subword_ebd_sum / float(ed - st) # dim 0 is st:ed 80 | 81 | output = output.cuda() 82 | return output 83 | 84 | def combine_forward(self, x_bert, x_bert_mask, x_bert_offset, x_mask): 85 | all_layers = [] 86 | 87 | bert_sent_len = x_bert.shape[1] 88 | p = 0 89 | while p < bert_sent_len: 90 | all_encoder_layers, _ = self.bert_model(x_bert[:, p:(p + self.BERT_MAX_LEN)], token_type_ids=None, attention_mask=x_bert_mask[:, p:(p + self.BERT_MAX_LEN)]) # bert_layer * batch * max_bert_sent_len * bert_dim 91 | all_layers.append(torch.cat(all_encoder_layers, dim = 2)) # batch * up_to_512 * (bert_dim * layer) 92 | p += self.BERT_MAX_LEN 93 | 94 | bert_embedding = torch.cat(all_layers, dim = 1) # batch * up_to_512 * (bert_dim * layer) 95 | batch_size = x_mask.shape[0] 96 | max_word_num = x_mask.shape[1] 97 | tot_dim 
= bert_embedding.shape[2] 98 | output = Variable(torch.zeros(batch_size, max_word_num, tot_dim)) 99 | for i in range(batch_size): 100 | for j in range(max_word_num): 101 | if x_mask[i, j] == 0: 102 | continue 103 | st = x_bert_offset[i, j, 0] 104 | ed = x_bert_offset[i, j, 1] 105 | # we can also try using st only, ed only 106 | if st + 1 == ed: # including st==ed 107 | output[i, j, :] = bert_embedding[i, st, :] 108 | else: 109 | subword_ebd_sum = torch.sum(bert_embedding[i, st:ed, :], dim = 0) 110 | if st < ed: 111 | output[i, j, :] = subword_ebd_sum / float(ed - st) # dim 0 is st:ed 112 | 113 | outputs = [] 114 | for i in range(self.bert_layer): 115 | now = output[:, :, (i * self.bert_dim) : ((i + 1) * self.bert_dim)] 116 | now = now.cuda() 117 | outputs.append(now) 118 | 119 | return outputs 120 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/__pycache__/Bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/Bert/__pycache__/Bert.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/__pycache__/modeling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/Bert/__pycache__/modeling.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/__pycache__/tokenization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/Bert/__pycache__/tokenization.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """PyTorch optimization for BERT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.nn.utils import clip_grad_norm_ 21 | 22 | def warmup_cosine(x, warmup=0.002): 23 | if x < warmup: 24 | return x/warmup 25 | return 0.5 * (1.0 + torch.cos(math.pi * x)) 26 | 27 | def warmup_constant(x, warmup=0.002): 28 | if x < warmup: 29 | return x/warmup 30 | return 1.0 31 | 32 | def warmup_linear(x, warmup=0.002): 33 | if x < warmup: 34 | return x/warmup 35 | return 1.0 - x 36 | 37 | SCHEDULES = { 38 | 'warmup_cosine':warmup_cosine, 39 | 'warmup_constant':warmup_constant, 40 | 'warmup_linear':warmup_linear, 41 | } 42 | 43 | 44 | class BertAdam(Optimizer): 45 | """Implements BERT version of Adam algorithm with weight decay fix. 46 | Params: 47 | lr: learning rate 48 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 49 | t_total: total number of training steps for the learning 50 | rate schedule, -1 means constant learning rate. Default: -1 51 | schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' 52 | b1: Adams b1. Default: 0.9 53 | b2: Adams b2. Default: 0.999 54 | e: Adams epsilon. Default: 1e-6 55 | weight_decay_rate: Weight decay. Default: 0.01 56 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 57 | """ 58 | def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear', 59 | b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01, 60 | max_grad_norm=1.0): 61 | if not lr >= 0.0: 62 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 63 | if schedule not in SCHEDULES: 64 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 65 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 66 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 67 | if not 0.0 <= b1 < 1.0: 68 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 69 | if not 0.0 <= b2 < 1.0: 70 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 71 | if not e >= 0.0: 72 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 73 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 74 | b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate, 75 | max_grad_norm=max_grad_norm) 76 | super(BertAdam, self).__init__(params, defaults) 77 | 78 | def get_lr(self): 79 | lr = [] 80 | for group in self.param_groups: 81 | for p in group['params']: 82 | state = self.state[p] 83 | if len(state) == 0: 84 | return [0] 85 | if group['t_total'] != -1: 86 | schedule_fct = SCHEDULES[group['schedule']] 87 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 88 | else: 89 | lr_scheduled = group['lr'] 90 | lr.append(lr_scheduled) 91 | return lr 92 | 93 | def step(self, closure=None): 94 | """Performs a single optimization step. 95 | 96 | Arguments: 97 | closure (callable, optional): A closure that reevaluates the model 98 | and returns the loss. 
99 | """ 100 | loss = None 101 | if closure is not None: 102 | loss = closure() 103 | 104 | for group in self.param_groups: 105 | for p in group['params']: 106 | if p.grad is None: 107 | continue 108 | grad = p.grad.data 109 | if grad.is_sparse: 110 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 111 | 112 | state = self.state[p] 113 | 114 | # State initialization 115 | if len(state) == 0: 116 | state['step'] = 0 117 | # Exponential moving average of gradient values 118 | state['next_m'] = torch.zeros_like(p.data) 119 | # Exponential moving average of squared gradient values 120 | state['next_v'] = torch.zeros_like(p.data) 121 | 122 | next_m, next_v = state['next_m'], state['next_v'] 123 | beta1, beta2 = group['b1'], group['b2'] 124 | 125 | # Add grad clipping 126 | if group['max_grad_norm'] > 0: 127 | clip_grad_norm_(p, group['max_grad_norm']) 128 | 129 | # Decay the first and second moment running average coefficient 130 | # In-place operations to update the averages at the same time 131 | next_m.mul_(beta1).add_(1 - beta1, grad) 132 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 133 | update = next_m / (next_v.sqrt() + group['e']) 134 | 135 | # Just adding the square of the weights to the loss function is *not* 136 | # the correct way of using L2 regularization/weight decay with Adam, 137 | # since that will interact with the m and v parameters in strange ways. 138 | # 139 | # Instead we want to decay the weights in a manner that doesn't interact 140 | # with the m/v parameters. This is equivalent to adding the square 141 | # of the weights to the loss with plain (non-momentum) SGD. 142 | if group['weight_decay_rate'] > 0.0: 143 | update += group['weight_decay_rate'] * p.data 144 | 145 | if group['t_total'] != -1: 146 | schedule_fct = SCHEDULES[group['schedule']] 147 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 148 | else: 149 | lr_scheduled = group['lr'] 150 | 151 | update_with_lr = lr_scheduled * update 152 | p.data.add_(-update_with_lr) 153 | 154 | state['step'] += 1 155 | 156 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 157 | # No bias correction 158 | # bias_correction1 = 1 - beta1 ** state['step'] 159 | # bias_correction2 = 1 - beta2 ** state['step'] 160 | 161 | return loss 162 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/Bert/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import os 24 | import logging 25 | 26 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 27 | datefmt = '%m/%d/%Y %H:%M:%S', 28 | level = logging.INFO) 29 | logger = logging.getLogger(__name__) 30 | 31 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 32 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 33 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 34 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 35 | 'bert-base-multilingual': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-vocab.txt", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", 37 | } 38 | 39 | def convert_to_unicode(text): 40 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 41 | if isinstance(text, str): 42 | return text 43 | elif isinstance(text, bytes): 44 | return text.decode("utf-8", "ignore") 45 | else: 46 | raise ValueError("Unsupported string type: %s" % (type(text))) 47 | 48 | 49 | def printable_text(text): 50 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 51 | 52 | # These functions want `str` for both Python2 and Python3, but in one case 53 | # it's a Unicode string and in the other it's a byte string. 54 | if isinstance(text, str): 55 | return text 56 | elif isinstance(text, bytes): 57 | return text.decode("utf-8", "ignore") 58 | else: 59 | raise ValueError("Unsupported string type: %s" % (type(text))) 60 | 61 | 62 | def load_vocab(vocab_file): 63 | """Loads a vocabulary file into a dictionary.""" 64 | vocab = collections.OrderedDict() 65 | index = 0 66 | with open(vocab_file, "r", encoding="utf8") as reader: 67 | while True: 68 | token = convert_to_unicode(reader.readline()) 69 | if not token: 70 | break 71 | token = token.strip() 72 | vocab[token] = index 73 | index += 1 74 | return vocab 75 | 76 | 77 | def whitespace_tokenize(text): 78 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 79 | text = text.strip() 80 | if not text: 81 | return [] 82 | tokens = text.split() 83 | return tokens 84 | 85 | 86 | class BertTokenizer(object): 87 | """Runs end-to-end tokenization: punctuation splitting + wordpiece""" 88 | def __init__(self, vocab_file, do_lower_case=True): 89 | if not os.path.isfile(vocab_file): 90 | raise ValueError( 91 | "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " 92 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 93 | self.vocab = load_vocab(vocab_file) 94 | self.ids_to_tokens = collections.OrderedDict( 95 | [(ids, tok) for tok, ids in self.vocab.items()]) 96 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 97 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 98 | 99 | def tokenize(self, text): 100 | split_tokens = [] 101 | for token in self.basic_tokenizer.tokenize(text): 102 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 103 | split_tokens.append(sub_token) 104 | return split_tokens 105 | 106 | def convert_tokens_to_ids(self, tokens): 107 | """Converts a sequence of tokens into ids using the vocab.""" 108 | ids = [] 109 | for token in tokens: 110 | ids.append(self.vocab[token]) 111 | return ids 112 | 113 | def convert_ids_to_tokens(self, ids): 114 | """Converts a sequence of ids in wordpiece tokens using the vocab.""" 115 | tokens = [] 116 | for i in ids: 117 | tokens.append(self.ids_to_tokens[i]) 118 | return tokens 119 | 120 | @classmethod 121 | def from_pretrained(cls, pretrained_model_name, do_lower_case=True): 122 | """ 123 | Instantiate a PreTrainedBertModel from a pre-trained model file. 124 | Download and cache the pre-trained model file if needed. 125 | """ 126 | if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP: 127 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name] 128 | else: 129 | vocab_file = pretrained_model_name 130 | # redirect to the cache, if necessary 131 | try: 132 | resolved_vocab_file = vocab_file 133 | if resolved_vocab_file == vocab_file: 134 | logger.info("loading vocabulary file {}".format(vocab_file)) 135 | else: 136 | logger.info("loading vocabulary file {} from cache at {}".format( 137 | vocab_file, resolved_vocab_file)) 138 | # Instantiate tokenizer. 139 | tokenizer = cls(resolved_vocab_file, do_lower_case) 140 | except FileNotFoundError: 141 | logger.error( 142 | "Model name '{}' was not found in model name list ({}). " 143 | "We assumed '{}' was a path or url but couldn't find any file " 144 | "associated to this path or url.".format( 145 | pretrained_model_name, 146 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 147 | pretrained_model_name)) 148 | tokenizer = None 149 | return tokenizer 150 | 151 | 152 | class BasicTokenizer(object): 153 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 154 | 155 | def __init__(self, do_lower_case=True): 156 | """Constructs a BasicTokenizer. 157 | 158 | Args: 159 | do_lower_case: Whether to lower case the input. 160 | """ 161 | self.do_lower_case = do_lower_case 162 | 163 | def tokenize(self, text): 164 | """Tokenizes a piece of text.""" 165 | text = convert_to_unicode(text) 166 | text = self._clean_text(text) 167 | # This was added on November 1st, 2018 for the multilingual and Chinese 168 | # models. This is also applied to the English models now, but it doesn't 169 | # matter since the English models were not trained on any Chinese data 170 | # and generally don't have any Chinese data in them (there are Chinese 171 | # characters in the vocabulary because Wikipedia does have some Chinese 172 | # words in the English Wikipedia.). 
173 | text = self._tokenize_chinese_chars(text) 174 | orig_tokens = whitespace_tokenize(text) 175 | split_tokens = [] 176 | for token in orig_tokens: 177 | if self.do_lower_case: 178 | token = token.lower() 179 | token = self._run_strip_accents(token) 180 | split_tokens.extend(self._run_split_on_punc(token)) 181 | 182 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 183 | return output_tokens 184 | 185 | def _run_strip_accents(self, text): 186 | """Strips accents from a piece of text.""" 187 | text = unicodedata.normalize("NFD", text) 188 | output = [] 189 | for char in text: 190 | cat = unicodedata.category(char) 191 | if cat == "Mn": 192 | continue 193 | output.append(char) 194 | return "".join(output) 195 | 196 | def _run_split_on_punc(self, text): 197 | """Splits punctuation on a piece of text.""" 198 | chars = list(text) 199 | i = 0 200 | start_new_word = True 201 | output = [] 202 | while i < len(chars): 203 | char = chars[i] 204 | if _is_punctuation(char): 205 | output.append([char]) 206 | start_new_word = True 207 | else: 208 | if start_new_word: 209 | output.append([]) 210 | start_new_word = False 211 | output[-1].append(char) 212 | i += 1 213 | 214 | return ["".join(x) for x in output] 215 | 216 | def _tokenize_chinese_chars(self, text): 217 | """Adds whitespace around any CJK character.""" 218 | output = [] 219 | for char in text: 220 | cp = ord(char) 221 | if self._is_chinese_char(cp): 222 | output.append(" ") 223 | output.append(char) 224 | output.append(" ") 225 | else: 226 | output.append(char) 227 | return "".join(output) 228 | 229 | def _is_chinese_char(self, cp): 230 | """Checks whether CP is the codepoint of a CJK character.""" 231 | # This defines a "chinese character" as anything in the CJK Unicode block: 232 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 233 | # 234 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 235 | # despite its name. The modern Korean Hangul alphabet is a different block, 236 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 237 | # space-separated words, so they are not treated specially and handled 238 | # like the all of the other languages. 239 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 240 | (cp >= 0x3400 and cp <= 0x4DBF) or # 241 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 242 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 243 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 244 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 245 | (cp >= 0xF900 and cp <= 0xFAFF) or # 246 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 247 | return True 248 | 249 | return False 250 | 251 | def _clean_text(self, text): 252 | """Performs invalid character removal and whitespace cleanup on text.""" 253 | output = [] 254 | for char in text: 255 | cp = ord(char) 256 | if cp == 0 or cp == 0xfffd or _is_control(char): 257 | continue 258 | if _is_whitespace(char): 259 | output.append(" ") 260 | else: 261 | output.append(char) 262 | return "".join(output) 263 | 264 | 265 | class WordpieceTokenizer(object): 266 | """Runs WordPiece tokenization.""" 267 | 268 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 269 | self.vocab = vocab 270 | self.unk_token = unk_token 271 | self.max_input_chars_per_word = max_input_chars_per_word 272 | 273 | def tokenize(self, text): 274 | """Tokenizes a piece of text into its word pieces. 275 | 276 | This uses a greedy longest-match-first algorithm to perform tokenization 277 | using the given vocabulary. 
278 | 279 | For example: 280 | input = "unaffable" 281 | output = ["un", "##aff", "##able"] 282 | 283 | Args: 284 | text: A single token or whitespace-separated tokens. This should have 285 | already been passed through `BasicTokenizer`. 286 | 287 | Returns: 288 | A list of wordpiece tokens. 289 | """ 290 | 291 | text = convert_to_unicode(text) 292 | 293 | output_tokens = [] 294 | for token in whitespace_tokenize(text): 295 | chars = list(token) 296 | if len(chars) > self.max_input_chars_per_word: 297 | output_tokens.append(self.unk_token) 298 | continue 299 | 300 | is_bad = False 301 | start = 0 302 | sub_tokens = [] 303 | while start < len(chars): 304 | end = len(chars) 305 | cur_substr = None 306 | while start < end: 307 | substr = "".join(chars[start:end]) 308 | if start > 0: 309 | substr = "##" + substr 310 | if substr in self.vocab: 311 | cur_substr = substr 312 | break 313 | end -= 1 314 | if cur_substr is None: 315 | is_bad = True 316 | break 317 | sub_tokens.append(cur_substr) 318 | start = end 319 | 320 | if is_bad: 321 | output_tokens.append(self.unk_token) 322 | else: 323 | output_tokens.extend(sub_tokens) 324 | return output_tokens 325 | 326 | 327 | def _is_whitespace(char): 328 | """Checks whether `char` is a whitespace character.""" 329 | # \t, \n, and \r are technically control characters but we treat them 330 | # as whitespace since they are generally considered as such. 331 | if char == " " or char == "\t" or char == "\n" or char == "\r": 332 | return True 333 | cat = unicodedata.category(char) 334 | if cat == "Zs": 335 | return True 336 | return False 337 | 338 | 339 | def _is_control(char): 340 | """Checks whether `char` is a control character.""" 341 | # These are technically control characters but we count them as whitespace 342 | # characters. 343 | if char == "\t" or char == "\n" or char == "\r": 344 | return False 345 | cat = unicodedata.category(char) 346 | if cat.startswith("C"): 347 | return True 348 | return False 349 | 350 | 351 | def _is_punctuation(char): 352 | """Checks whether `char` is a punctuation character.""" 353 | cp = ord(char) 354 | # We treat all non-letter/number ASCII as punctuation. 355 | # Characters such as "^", "$", and "`" are not in the Unicode 356 | # Punctuation class but we treat them as punctuation anyway, for 357 | # consistency. 358 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 359 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 360 | return True 361 | cat = unicodedata.category(char) 362 | if cat.startswith("P"): 363 | return True 364 | return False 365 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/SDNet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
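# ---------------------------------------------------------------------------
# Annotation (summary added for readability, not in the original file): the
# data flow of SDNet below, as reconstructed from the code, is roughly:
#   1. Embed both context and question with GloVe plus optional char-CNN and
#      BERT (optionally a learned softmax-weighted mix of all BERT layers);
#      the context additionally gets POS/NER embeddings, hand-crafted
#      features, and a word-level pre-alignment attention over the question.
#   2. Encode both with multi-layer RNNs, fuse them with deep inter-attention
#      over all layers, then apply context self-attention and a final RNN.
#   3. Compress the question into a single vector with linear self-attention
#      and score span start/end plus yes/no/no-answer via GetFinalScores.
# ---------------------------------------------------------------------------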
3 | 4 | import math 5 | import random 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.autograd import Variable 11 | import torch.nn.init as init 12 | from torch.nn.parameter import Parameter 13 | from Models.Bert.Bert import Bert 14 | from Models.Layers import MaxPooling, CNN, dropout, RNN_from_opt, set_dropout_prob, weighted_avg, set_seq_dropout, Attention, DeepAttention, LinearSelfAttn, GetFinalScores 15 | from Utils.CoQAUtils import POS, ENT 16 | 17 | ''' 18 | SDNet 19 | ''' 20 | class SDNet(nn.Module): 21 | def __init__(self, opt, word_embedding): 22 | super(SDNet, self).__init__() 23 | print('SDNet model\n') 24 | 25 | self.opt = opt 26 | self.use_cuda = (self.opt['cuda'] == True) 27 | set_dropout_prob(0.0 if not 'DROPOUT' in opt else float(opt['DROPOUT'])) 28 | set_seq_dropout('VARIATIONAL_DROPOUT' in self.opt) 29 | 30 | x_input_size = 0 31 | ques_input_size = 0 32 | 33 | self.vocab_size = int(opt['vocab_size']) 34 | vocab_dim = int(opt['vocab_dim']) 35 | self.vocab_embed = nn.Embedding(self.vocab_size, vocab_dim, padding_idx = 1) 36 | self.vocab_embed.weight.data = word_embedding 37 | 38 | x_input_size += vocab_dim 39 | ques_input_size += vocab_dim 40 | 41 | if 'CHAR_CNN' in self.opt: 42 | print('CHAR_CNN') 43 | char_vocab_size = int(opt['char_vocab_size']) 44 | char_dim = int(opt['char_emb_size']) 45 | char_hidden_size = int(opt['char_hidden_size']) 46 | self.char_embed = nn.Embedding(char_vocab_size, char_dim, padding_idx = 1) 47 | self.char_cnn = CNN(char_dim, 3, char_hidden_size) 48 | self.maxpooling = MaxPooling() 49 | x_input_size += char_hidden_size 50 | ques_input_size += char_hidden_size 51 | 52 | if 'TUNE_PARTIAL' in self.opt: 53 | print('TUNE_PARTIAL') 54 | self.fixed_embedding = word_embedding[opt['tune_partial']:] 55 | else: 56 | self.vocab_embed.weight.requires_grad = False 57 | 58 | cdim = 0 59 | self.use_contextual = False 60 | 61 | if 'BERT' in self.opt: 62 | print('Using BERT') 63 | self.Bert = Bert(self.opt) 64 | if 'LOCK_BERT' in self.opt: 65 | print('Lock BERT\'s weights') 66 | for p in self.Bert.parameters(): 67 | p.requires_grad = False 68 | if 'BERT_LARGE' in self.opt: 69 | print('BERT_LARGE') 70 | bert_dim = 1024 71 | bert_layers = 24 72 | else: 73 | bert_dim = 768 74 | bert_layers = 12 75 | 76 | print('BERT dim:', bert_dim, 'BERT_LAYERS:', bert_layers) 77 | 78 | if 'BERT_LINEAR_COMBINE' in self.opt: 79 | print('BERT_LINEAR_COMBINE') 80 | self.alphaBERT = nn.Parameter(torch.Tensor(bert_layers), requires_grad=True) 81 | self.gammaBERT = nn.Parameter(torch.Tensor(1, 1), requires_grad=True) 82 | torch.nn.init.constant(self.alphaBERT, 1.0) 83 | torch.nn.init.constant(self.gammaBERT, 1.0) 84 | 85 | cdim = bert_dim 86 | x_input_size += bert_dim 87 | ques_input_size += bert_dim 88 | 89 | self.pre_align = Attention(vocab_dim, opt['prealign_hidden'], correlation_func = 3, do_similarity = True) 90 | x_input_size += vocab_dim 91 | 92 | pos_dim = opt['pos_dim'] 93 | ent_dim = opt['ent_dim'] 94 | self.pos_embedding = nn.Embedding(len(POS), pos_dim) 95 | self.ent_embedding = nn.Embedding(len(ENT), ent_dim) 96 | 97 | x_feat_len = 4 98 | if 'ANSWER_SPAN_IN_CONTEXT_FEATURE' in self.opt: 99 | print('ANSWER_SPAN_IN_CONTEXT_FEATURE') 100 | x_feat_len += 1 101 | 102 | x_input_size += pos_dim + ent_dim + x_feat_len 103 | 104 | print('Initially, the vector_sizes [doc, query] are', x_input_size, ques_input_size) 105 | 106 | addtional_feat = cdim if self.use_contextual else 0 107 | 108 | # RNN context encoder 109 | 
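        # Annotation (assumes a typical conf with CHAR_CNN and BERT-large both
        # enabled; other configurations change the numbers): at this point
        #   x_input_size    = 300 (GloVe) + char_hidden_size + 1024 (BERT)
        #                     + 300 (pre-align) + pos_dim + ent_dim + x_feat_len
        #   ques_input_size = 300 (GloVe) + char_hidden_size + 1024 (BERT)
        # The print statement above reports the exact values for the active conf.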
self.context_rnn, context_rnn_output_size = RNN_from_opt(x_input_size, opt['hidden_size'], 110 | num_layers=opt['in_rnn_layers'], concat_rnn=opt['concat_rnn'], add_feat=addtional_feat) 111 | # RNN question encoder 112 | self.ques_rnn, ques_rnn_output_size = RNN_from_opt(ques_input_size, opt['hidden_size'], 113 | num_layers=opt['in_rnn_layers'], concat_rnn=opt['concat_rnn'], add_feat=addtional_feat) 114 | 115 | # Output sizes of rnn encoders 116 | print('After Input LSTM, the vector_sizes [doc, query] are [', context_rnn_output_size, ques_rnn_output_size, '] *', opt['in_rnn_layers']) 117 | 118 | # Deep inter-attention 119 | self.deep_attn = DeepAttention(opt, abstr_list_cnt=opt['in_rnn_layers'], 120 | deep_att_hidden_size_per_abstr=opt['deep_att_hidden_size_per_abstr'], correlation_func=3, word_hidden_size=vocab_dim + addtional_feat) 121 | self.deep_attn_input_size = self.deep_attn.rnn_input_size 122 | self.deep_attn_output_size = self.deep_attn.output_size 123 | 124 | # Question understanding and compression 125 | self.high_lvl_ques_rnn , high_lvl_ques_rnn_output_size = RNN_from_opt(ques_rnn_output_size * opt['in_rnn_layers'], 126 | opt['highlvl_hidden_size'], num_layers = opt['question_high_lvl_rnn_layers'], concat_rnn = True) 127 | 128 | self.after_deep_attn_size = self.deep_attn_output_size + self.deep_attn_input_size + addtional_feat + vocab_dim 129 | self.self_attn_input_size = self.after_deep_attn_size 130 | self_attn_output_size = self.deep_attn_output_size 131 | 132 | # Self attention on context 133 | self.highlvl_self_att = Attention(self.self_attn_input_size, opt['deep_att_hidden_size_per_abstr'], correlation_func=3) 134 | print('Self deep-attention input is {}-dim'.format(self.self_attn_input_size)) 135 | 136 | self.high_lvl_context_rnn, high_lvl_context_rnn_output_size = RNN_from_opt(self.deep_attn_output_size + self_attn_output_size, 137 | opt['highlvl_hidden_size'], num_layers = 1, concat_rnn = False) 138 | context_final_size = high_lvl_context_rnn_output_size 139 | 140 | print('Do Question self attention') 141 | self.ques_self_attn = Attention(high_lvl_ques_rnn_output_size, opt['query_self_attn_hidden_size'], correlation_func=3) 142 | 143 | ques_final_size = high_lvl_ques_rnn_output_size 144 | print('Before answer span finding, hidden size are', context_final_size, ques_final_size) 145 | 146 | # Question merging 147 | self.ques_merger = LinearSelfAttn(ques_final_size) 148 | self.get_answer = GetFinalScores(context_final_size, ques_final_size) 149 | 150 | ''' 151 | x: 1 x x_len (word_ids) 152 | x_single_mask: 1 x x_len 153 | x_char: 1 x x_len x char_len (char_ids) 154 | x_char_mask: 1 x x_len x char_len 155 | x_features: batch_size x x_len x feature_len (5, if answer_span_in_context_feature; 4 otherwise) 156 | x_pos: 1 x x_len (POS id) 157 | x_ent: 1 x x_len (entity id) 158 | x_bert: 1 x x_bert_token_len 159 | x_bert_mask: 1 x x_bert_token_len 160 | x_bert_offsets: 1 x x_len x 2 161 | q: batch x q_len (word_ids) 162 | q_mask: batch x q_len 163 | q_char: batch x q_len x char_len (char ids) 164 | q_char_mask: batch x q_len x char_len 165 | q_bert: 1 x q_bert_token_len 166 | q_bert_mask: 1 x q_bert_token_len 167 | q_bert_offsets: 1 x q_len x 2 168 | context_len: number of words in context (only one per batch) 169 | return: 170 | score_s: batch x context_len 171 | score_e: batch x context_len 172 | score_no: batch x 1 173 | score_yes: batch x 1 174 | score_noanswer: batch x 1 175 | ''' 176 | def forward(self, x, x_single_mask, x_char, x_char_mask, x_features, x_pos, x_ent, 
x_bert, x_bert_mask, x_bert_offsets, q, q_mask, q_char, q_char_mask, q_bert, q_bert_mask, q_bert_offsets, context_len): 177 | batch_size = q.shape[0] 178 | x_mask = x_single_mask.expand(batch_size, -1) 179 | x_word_embed = self.vocab_embed(x).expand(batch_size, -1, -1) # batch x x_len x vocab_dim 180 | ques_word_embed = self.vocab_embed(q) # batch x q_len x vocab_dim 181 | 182 | x_input_list = [dropout(x_word_embed, p=self.opt['dropout_emb'], training=self.drop_emb)] # batch x x_len x vocab_dim 183 | ques_input_list = [dropout(ques_word_embed, p=self.opt['dropout_emb'], training=self.drop_emb)] # batch x q_len x vocab_dim 184 | 185 | # contextualized embedding 186 | x_cemb = ques_cemb = None 187 | if 'BERT' in self.opt: 188 | x_cemb = ques_cemb = None 189 | 190 | if 'BERT_LINEAR_COMBINE' in self.opt: 191 | x_bert_output = self.Bert(x_bert, x_bert_mask, x_bert_offsets, x_single_mask) 192 | x_cemb_mid = self.linear_sum(x_bert_output, self.alphaBERT, self.gammaBERT) 193 | ques_bert_output = self.Bert(q_bert, q_bert_mask, q_bert_offsets, q_mask) 194 | ques_cemb_mid = self.linear_sum(ques_bert_output, self.alphaBERT, self.gammaBERT) 195 | x_cemb_mid = x_cemb_mid.expand(batch_size, -1, -1) 196 | else: 197 | x_cemb_mid = self.Bert(x_bert, x_bert_mask, x_bert_offsets, x_single_mask) 198 | x_cemb_mid = x_cemb_mid.expand(batch_size, -1, -1) 199 | ques_cemb_mid = self.Bert(q_bert, q_bert_mask, q_bert_offsets, q_mask) 200 | 201 | x_input_list.append(x_cemb_mid) 202 | ques_input_list.append(ques_cemb_mid) 203 | 204 | if 'CHAR_CNN' in self.opt: 205 | x_char_final = self.character_cnn(x_char, x_char_mask) 206 | x_char_final = x_char_final.expand(batch_size, -1, -1) 207 | ques_char_final = self.character_cnn(q_char, q_char_mask) 208 | x_input_list.append(x_char_final) 209 | ques_input_list.append(ques_char_final) 210 | 211 | x_prealign = self.pre_align(x_word_embed, ques_word_embed, q_mask) 212 | x_input_list.append(x_prealign) # batch x x_len x (vocab_dim + cdim + vocab_dim) 213 | 214 | x_pos_emb = self.pos_embedding(x_pos).expand(batch_size, -1, -1) # batch x x_len x pos_dim 215 | x_ent_emb = self.ent_embedding(x_ent).expand(batch_size, -1, -1) # batch x x_len x ent_dim 216 | x_input_list.append(x_pos_emb) 217 | x_input_list.append(x_ent_emb) 218 | x_input_list.append(x_features) # batch x x_len x (vocab_dim + cdim + vocab_dim + pos_dim + ent_dim + feature_dim) 219 | 220 | x_input = torch.cat(x_input_list, 2) # batch x x_len x (vocab_dim + cdim + vocab_dim + pos_dim + ent_dim + feature_dim) 221 | ques_input = torch.cat(ques_input_list, 2) # batch x q_len x (vocab_dim + cdim) 222 | 223 | # Multi-layer RNN 224 | _, x_rnn_layers = self.context_rnn(x_input, x_mask, return_list=True, x_additional=x_cemb) # layer x batch x x_len x context_rnn_output_size 225 | _, ques_rnn_layers = self.ques_rnn(ques_input, q_mask, return_list=True, x_additional=ques_cemb) # layer x batch x q_len x ques_rnn_output_size 226 | 227 | # rnn with question only 228 | ques_highlvl = self.high_lvl_ques_rnn(torch.cat(ques_rnn_layers, 2), q_mask) # batch x q_len x high_lvl_ques_rnn_output_size 229 | ques_rnn_layers.append(ques_highlvl) # (layer + 1) layers 230 | 231 | # deep multilevel inter-attention 232 | if x_cemb is None: 233 | x_long = x_word_embed 234 | ques_long = ques_word_embed 235 | else: 236 | x_long = torch.cat([x_word_embed, x_cemb], 2) # batch x x_len x (vocab_dim + cdim) 237 | ques_long = torch.cat([ques_word_embed, ques_cemb], 2) # batch x q_len x (vocab_dim + cdim) 238 | 239 | x_rnn_after_inter_attn, x_inter_attn = 
self.deep_attn([x_long], x_rnn_layers, [ques_long], ques_rnn_layers, x_mask, q_mask, return_bef_rnn=True) 240 | # x_rnn_after_inter_attn: batch x x_len x deep_attn_output_size 241 | # x_inter_attn: batch x x_len x deep_attn_input_size 242 | 243 | # deep self attention 244 | if x_cemb is None: 245 | x_self_attn_input = torch.cat([x_rnn_after_inter_attn, x_inter_attn, x_word_embed], 2) 246 | else: 247 | x_self_attn_input = torch.cat([x_rnn_after_inter_attn, x_inter_attn, x_cemb, x_word_embed], 2) 248 | # batch x x_len x (deep_attn_output_size + deep_attn_input_size + cdim + vocab_dim) 249 | 250 | x_self_attn_output = self.highlvl_self_att(x_self_attn_input, x_self_attn_input, x_mask, x3=x_rnn_after_inter_attn, drop_diagonal=True) 251 | # batch x x_len x deep_attn_output_size 252 | 253 | x_highlvl_output = self.high_lvl_context_rnn(torch.cat([x_rnn_after_inter_attn, x_self_attn_output], 2), x_mask) 254 | # bach x x_len x high_lvl_context_rnn.output_size 255 | x_final = x_highlvl_output 256 | 257 | # question self attention 258 | ques_final = self.ques_self_attn(ques_highlvl, ques_highlvl, q_mask, x3=None, drop_diagonal=True) # batch x q_len x high_lvl_ques_rnn_output_size 259 | 260 | # merge questions 261 | q_merge_weights = self.ques_merger(ques_final, q_mask) 262 | ques_merged = weighted_avg(ques_final, q_merge_weights) # batch x ques_final_size 263 | 264 | # predict scores 265 | score_s, score_e, score_no, score_yes, score_noanswer = self.get_answer(x_final, ques_merged, x_mask) 266 | return score_s, score_e, score_no, score_yes, score_noanswer 267 | 268 | ''' 269 | input: 270 | x_char: batch x word_num x char_num 271 | x_char_mask: batch x word_num x char_num 272 | output: 273 | x_char_cnn_final: batch x word_num x char_cnn_hidden_size 274 | ''' 275 | def character_cnn(self, x_char, x_char_mask): 276 | x_char_embed = self.char_embed(x_char) # batch x word_num x char_num x char_dim 277 | batch_size = x_char_embed.shape[0] 278 | word_num = x_char_embed.shape[1] 279 | char_num = x_char_embed.shape[2] 280 | char_dim = x_char_embed.shape[3] 281 | x_char_cnn = self.char_cnn(x_char_embed.contiguous().view(-1, char_num, char_dim), x_char_mask) # (batch x word_num) x char_num x char_cnn_hidden_size 282 | x_char_cnn_final = self.maxpooling(x_char_cnn, x_char_mask.contiguous().view(-1, char_num)).contiguous().view(batch_size, word_num, -1) # batch x word_num x char_cnn_hidden_size 283 | return x_char_cnn_final 284 | 285 | def linear_sum(self, output, alpha, gamma): 286 | alpha_softmax = F.softmax(alpha) 287 | for i in range(len(output)): 288 | t = output[i] * alpha_softmax[i] * gamma 289 | if i == 0: 290 | res = t 291 | else: 292 | res += t 293 | 294 | res = dropout(res, p=self.opt['dropout_emb'], training=self.drop_emb) 295 | return res 296 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/SDNetTrainer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
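# Annotation (not in the original file): update() and predict() below flatten
# answer selection into a single classification over L*L + 3 classes for a
# context of L words. gen_upper_triangle lays out index i*L + j as the score of
# the span (start=i, end=j), and the three trailing indices score "no" (L*L),
# "yes" (L*L + 1) and no-answer/"unknown" (L*L + 2), matching the
# torch.cat((expand_score, score_no, score_yes, score_no_answer)) order.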
3 | 4 | from datetime import datetime 5 | import json 6 | import numpy as np 7 | import os 8 | import random 9 | import sys 10 | import time 11 | import torch 12 | from torch.autograd import Variable 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | import torch.optim as optim 16 | from Utils.CoQAPreprocess import CoQAPreprocess 17 | from Models.Layers import MaxPooling, set_dropout_prob 18 | from Models.SDNet import SDNet 19 | from Models.BaseTrainer import BaseTrainer 20 | from Utils.CoQAUtils import BatchGen, AverageMeter, gen_upper_triangle, score 21 | 22 | class SDNetTrainer(BaseTrainer): 23 | def __init__(self, opt): 24 | super(SDNetTrainer, self).__init__(opt) 25 | print('SDNet Model Trainer') 26 | set_dropout_prob(0.0 if not 'DROPOUT' in opt else float(opt['DROPOUT'])) 27 | self.seed = int(opt['SEED']) 28 | self.data_prefix = 'coqa-' 29 | random.seed(self.seed) 30 | np.random.seed(self.seed) 31 | torch.manual_seed(self.seed) 32 | self.preproc = CoQAPreprocess(self.opt) 33 | if self.use_cuda: 34 | torch.cuda.manual_seed_all(self.seed) 35 | 36 | def official(self, model_path, test_data): 37 | print('-----------------------------------------------') 38 | print("Initializing model...") 39 | self.setup_model(self.preproc.train_embedding) 40 | self.load_model(model_path) 41 | 42 | print("Predicting in batches...") 43 | test_batches = BatchGen(self.opt, test_data['data'], self.use_cuda, self.preproc.train_vocab, self.preproc.train_char_vocab, evaluation=True) 44 | predictions = [] 45 | confidence = [] 46 | final_json = [] 47 | cnt = 0 48 | for j, test_batch in enumerate(test_batches): 49 | cnt += 1 50 | if cnt % 50 == 0: 51 | print(cnt, '/', len(test_batches)) 52 | phrase, phrase_score, pred_json = self.predict(test_batch) 53 | predictions.extend(phrase) 54 | confidence.extend(phrase_score) 55 | final_json.extend(pred_json) 56 | 57 | return predictions, confidence, final_json 58 | 59 | def train(self): 60 | self.isTrain = True 61 | self.getSaveFolder() 62 | self.saveConf() 63 | self.vocab, self.char_vocab, vocab_embedding = self.preproc.load_data() 64 | self.log('-----------------------------------------------') 65 | self.log("Initializing model...") 66 | self.setup_model(vocab_embedding) 67 | 68 | if 'RESUME' in self.opt: 69 | model_path = os.path.join(self.opt['datadir'], self.opt['MODEL_PATH']) 70 | self.load_model(model_path) 71 | 72 | print('Loading train json...') 73 | with open(os.path.join(self.opt['FEATURE_FOLDER'], self.data_prefix + 'train-preprocessed.json'), 'r') as f: 74 | train_data = json.load(f) 75 | 76 | print('Loading dev json...') 77 | with open(os.path.join(self.opt['FEATURE_FOLDER'], self.data_prefix + 'dev-preprocessed.json'), 'r') as f: 78 | dev_data = json.load(f) 79 | 80 | best_f1_score = 0.0 81 | numEpochs = self.opt['EPOCH'] 82 | for epoch in range(self.epoch_start, numEpochs): 83 | self.log('Epoch {}'.format(epoch)) 84 | self.network.train() 85 | startTime = datetime.now() 86 | train_batches = BatchGen(self.opt, train_data['data'], self.use_cuda, self.vocab, self.char_vocab) 87 | dev_batches = BatchGen(self.opt, dev_data['data'], self.use_cuda, self.vocab, self.char_vocab, evaluation=True) 88 | for i, batch in enumerate(train_batches): 89 | if i == len(train_batches) - 1 or (epoch == 0 and i == 0 and ('RESUME' in self.opt)) or (i > 0 and i % 1500 == 0): 90 | print('Saving folder is', self.saveFolder) 91 | print('Evaluating on dev set...') 92 | predictions = [] 93 | confidence = [] 94 | dev_answer = [] 95 | final_json = [] 96 | for j, 
dev_batch in enumerate(dev_batches): 97 | phrase, phrase_score, pred_json = self.predict(dev_batch) 98 | final_json.extend(pred_json) 99 | predictions.extend(phrase) 100 | confidence.extend(phrase_score) 101 | dev_answer.extend(dev_batch[-3]) # answer_str 102 | result, all_f1s = score(predictions, dev_answer, final_json) 103 | f1 = result['f1'] 104 | 105 | if f1 > best_f1_score: 106 | model_file = os.path.join(self.saveFolder, 'best_model.pt') 107 | self.save_for_predict(model_file, epoch) 108 | best_f1_score = f1 109 | pred_json_file = os.path.join(self.saveFolder, 'prediction.json') 110 | with open(pred_json_file, 'w') as output_file: 111 | json.dump(final_json, output_file) 112 | score_per_instance = [] 113 | for instance, s in zip(final_json, all_f1s): 114 | score_per_instance.append({ 115 | 'id': instance['id'], 116 | 'turn_id': instance['turn_id'], 117 | 'f1': s 118 | }) 119 | score_per_instance_json_file = os.path.join(self.saveFolder, 'score_per_instance.json') 120 | with open(score_per_instance_json_file, 'w') as output_file: 121 | json.dump(score_per_instance, output_file) 122 | 123 | self.log("Epoch {0} - dev: F1: {1:.3f} (best F1: {2:.3f})".format(epoch, f1, best_f1_score)) 124 | self.log("Results breakdown\n{0}".format(result)) 125 | 126 | self.update(batch) 127 | if i % 100 == 0: 128 | self.log('updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format( 129 | self.updates, self.train_loss.avg, 130 | str((datetime.now() - startTime) / (i + 1) * (len(train_batches) - i - 1)).split('.')[0])) 131 | 132 | print("PROGRESS: {0:.2f}%".format(100.0 * (epoch + 1) / numEpochs)) 133 | print('Config file is at ' + self.opt['confFile']) 134 | 135 | def setup_model(self, vocab_embedding): 136 | self.train_loss = AverageMeter() 137 | self.network = SDNet(self.opt, vocab_embedding) 138 | if self.use_cuda: 139 | self.log('Putting model into GPU') 140 | self.network.cuda() 141 | 142 | parameters = [p for p in self.network.parameters() if p.requires_grad] 143 | self.optimizer = optim.Adamax(parameters) 144 | if 'ADAM2' in self.opt: 145 | print('ADAM2') 146 | self.optimizer = optim.Adam(parameters, lr = 0.0001) 147 | 148 | self.updates = 0 149 | self.epoch_start = 0 150 | self.loss_func = F.cross_entropy 151 | 152 | def update(self, batch): 153 | # Train mode 154 | self.network.train() 155 | self.network.drop_emb = True 156 | 157 | x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, query, query_mask, \ 158 | query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, ground_truth, context_str, context_words, _, _, _, _ = batch 159 | 160 | # Run forward 161 | # score_s, score_e: batch x context_word_num 162 | # score_yes, score_no, score_no_answer: batch x 1 163 | score_s, score_e, score_yes, score_no, score_no_answer = self.network(x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, 164 | query, query_mask, query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, len(context_words)) 165 | max_len = self.opt['max_len'] or score_s.size(1) 166 | batch_size = score_s.shape[0] 167 | context_len = score_s.size(1) 168 | expand_score = gen_upper_triangle(score_s, score_e, max_len, self.use_cuda) 169 | scores = torch.cat((expand_score, score_no, score_yes, score_no_answer), dim=1) # batch x (context_len * context_len + 3) 170 | targets = [] 171 | span_idx = int(context_len * context_len) 172 | for i in range(ground_truth.shape[0]): 173 | if ground_truth[i][0] == -1 and 
ground_truth[i][1] == -1: # no answer 174 | targets.append(span_idx + 2) 175 | if ground_truth[i][0] == 0 and ground_truth[i][1] == -1: # no 176 | targets.append(span_idx) 177 | if ground_truth[i][0] == -1 and ground_truth[i][1] == 0: # yes 178 | targets.append(span_idx + 1) 179 | if ground_truth[i][0] != -1 and ground_truth[i][1] != -1: # normal span 180 | targets.append(ground_truth[i][0] * context_len + ground_truth[i][1]) 181 | 182 | targets = torch.LongTensor(np.array(targets)) 183 | if self.use_cuda: 184 | targets = targets.cuda() 185 | loss = self.loss_func(scores, targets) 186 | self.train_loss.update(loss.data[0], 1) 187 | self.optimizer.zero_grad() 188 | loss.backward() 189 | torch.nn.utils.clip_grad_norm(self.network.parameters(), self.opt['grad_clipping']) 190 | self.optimizer.step() 191 | self.updates += 1 192 | if 'TUNE_PARTIAL' in self.opt: 193 | self.network.vocab_embed.weight.data[self.opt['tune_partial']:] = self.network.fixed_embedding 194 | 195 | def predict(self, batch): 196 | self.network.eval() 197 | self.network.drop_emb = False 198 | 199 | # Run forward 200 | x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, query, query_mask, \ 201 | query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, ground_truth, context_str, context_words, \ 202 | context_word_offsets, answers, context_id, turn_ids = batch 203 | 204 | context_len = len(context_words) 205 | score_s, score_e, score_yes, score_no, score_no_answer = self.network(x, x_mask, x_char, x_char_mask, x_features, x_pos, x_ent, x_bert, x_bert_mask, x_bert_offsets, 206 | query, query_mask, query_char, query_char_mask, query_bert, query_bert_mask, query_bert_offsets, len(context_words)) 207 | batch_size = score_s.shape[0] 208 | max_len = self.opt['max_len'] or score_s.size(1) 209 | 210 | expand_score = gen_upper_triangle(score_s, score_e, max_len, self.use_cuda) 211 | scores = torch.cat((expand_score, score_no, score_yes, score_no_answer), dim=1) # batch x (context_len * context_len + 3) 212 | prob = F.softmax(scores, dim = 1).data.cpu() # Transfer to CPU/normal tensors for numpy ops 213 | 214 | # Get argmax text spans 215 | predictions = [] 216 | confidence = [] 217 | 218 | pred_json = [] 219 | for i in range(batch_size): 220 | _, ids = torch.sort(prob[i, :], descending=True) 221 | idx = 0 222 | best_id = ids[idx] 223 | 224 | confidence.append(float(prob[i, best_id])) 225 | if best_id < context_len * context_len: 226 | st = best_id / context_len 227 | ed = best_id % context_len 228 | st = context_word_offsets[st][0] 229 | ed = context_word_offsets[ed][1] 230 | predictions.append(context_str[st:ed]) 231 | 232 | if best_id == context_len * context_len: 233 | predictions.append('no') 234 | 235 | if best_id == context_len * context_len + 1: 236 | predictions.append('yes') 237 | 238 | if best_id == context_len * context_len + 2: 239 | predictions.append('unknown') 240 | 241 | pred_json.append({ 242 | 'id': context_id, 243 | 'turn_id': turn_ids[i], 244 | 'answer': predictions[-1] 245 | }) 246 | 247 | return (predictions, confidence, pred_json) # list of strings, list of floats, list of jsons 248 | 249 | def load_model(self, model_path): 250 | print('Loading model from', model_path) 251 | checkpoint = torch.load(model_path) 252 | state_dict = checkpoint['state_dict'] 253 | new_state = set(self.network.state_dict().keys()) 254 | for k in list(state_dict['network'].keys()): 255 | if k not in new_state: 256 | del state_dict['network'][k] 257 | for k, v in 
list(self.network.state_dict().items()): 258 | if k not in state_dict['network']: 259 | state_dict['network'][k] = v 260 | self.network.load_state_dict(state_dict['network']) 261 | 262 | print('Loading finished', model_path) 263 | 264 | def save(self, filename, epoch, prev_filename): 265 | params = { 266 | 'state_dict': { 267 | 'network': self.network.state_dict(), 268 | 'optimizer': self.optimizer.state_dict(), 269 | 'updates': self.updates # how many updates 270 | }, 271 | 'train_loss': { 272 | 'val': self.train_loss.val, 273 | 'avg': self.train_loss.avg, 274 | 'sum': self.train_loss.sum, 275 | 'count': self.train_loss.count 276 | }, 277 | 'config': self.opt, 278 | 'epoch': epoch 279 | } 280 | try: 281 | torch.save(params, filename) 282 | self.log('model saved to {}'.format(filename)) 283 | if os.path.exists(prev_filename): 284 | os.remove(prev_filename) 285 | except BaseException: 286 | self.log('[ WARN: Saving failed... continuing anyway. ]') 287 | 288 | def save_for_predict(self, filename, epoch): 289 | network_state = dict([(k, v) for k, v in self.network.state_dict().items() if k[0:4] != 'CoVe' and k[0:4] != 'ELMo' and k[0:9] != 'AllenELMo' and k[0:4] != 'Bert']) 290 | 291 | if 'eval_embed.weight' in network_state: 292 | del network_state['eval_embed.weight'] 293 | if 'fixed_embedding' in network_state: 294 | del network_state['fixed_embedding'] 295 | params = { 296 | 'state_dict': {'network': network_state}, 297 | 'config': self.opt, 298 | } 299 | try: 300 | torch.save(params, filename) 301 | self.log('model saved to {}'.format(filename)) 302 | except BaseException: 303 | self.log('[ WARN: Saving failed... continuing anyway. ]') 304 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/__pycache__/BaseTrainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/BaseTrainer.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/__pycache__/Layers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/Layers.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/__pycache__/SDNet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/SDNet.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/Models/__pycache__/SDNetTrainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Models/__pycache__/SDNetTrainer.cpython-37.pyc -------------------------------------------------------------------------------- /SDNet/code/SDNet/NOTICE.txt: -------------------------------------------------------------------------------- 1 | NOTICES AND INFORMATION 2 | Do Not Translate or Localize 3 | 4 | This software incorporates material from third parties. 
Microsoft makes certain 5 | open source code available at http://3rdpartysource.microsoft.com, or you may 6 | send a check or money order for US $5.00, including the product name, the open 7 | source component name, and version number, to: 8 | 9 | Source Code Compliance Team 10 | Microsoft Corporation 11 | One Microsoft Way 12 | Redmond, WA 98052 13 | USA 14 | 15 | Notwithstanding any other terms, you may reverse engineer this software to the 16 | extent required to debug changes to any libraries licensed under the GNU Lesser 17 | General Public License. 18 | 19 | ________________________ 20 | 21 | huggingface/pytorch-pretrained-BERT v0.6.1 (Source) 22 | 23 | Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 24 | Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 25 | 26 | 27 | Apache License 28 | Version 2.0, January 2004 29 | http://www.apache.org/licenses/ 30 | 31 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 32 | 33 | 1. Definitions. 34 | 35 | "License" shall mean the terms and conditions for use, reproduction, 36 | and distribution as defined by Sections 1 through 9 of this document. 37 | 38 | "Licensor" shall mean the copyright owner or entity authorized by 39 | the copyright owner that is granting the License. 40 | 41 | "Legal Entity" shall mean the union of the acting entity and all 42 | other entities that control, are controlled by, or are under common 43 | control with that entity. For the purposes of this definition, 44 | "control" means (i) the power, direct or indirect, to cause the 45 | direction or management of such entity, whether by contract or 46 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 47 | outstanding shares, or (iii) beneficial ownership of such entity. 48 | 49 | "You" (or "Your") shall mean an individual or Legal Entity 50 | exercising permissions granted by this License. 51 | 52 | "Source" form shall mean the preferred form for making modifications, 53 | including but not limited to software source code, documentation 54 | source, and configuration files. 55 | 56 | "Object" form shall mean any form resulting from mechanical 57 | transformation or translation of a Source form, including but 58 | not limited to compiled object code, generated documentation, 59 | and conversions to other media types. 60 | 61 | "Work" shall mean the work of authorship, whether in Source or 62 | Object form, made available under the License, as indicated by a 63 | copyright notice that is included in or attached to the work 64 | (an example is provided in the Appendix below). 65 | 66 | "Derivative Works" shall mean any work, whether in Source or Object 67 | form, that is based on (or derived from) the Work and for which the 68 | editorial revisions, annotations, elaborations, or other modifications 69 | represent, as a whole, an original work of authorship. For the purposes 70 | of this License, Derivative Works shall not include works that remain 71 | separable from, or merely link (or bind by name) to the interfaces of, 72 | the Work and Derivative Works thereof. 73 | 74 | "Contribution" shall mean any work of authorship, including 75 | the original version of the Work and any modifications or additions 76 | to that Work or Derivative Works thereof, that is intentionally 77 | submitted to Licensor for inclusion in the Work by the copyright owner 78 | or by an individual or Legal Entity authorized to submit on behalf of 79 | the copyright owner. 
For the purposes of this definition, "submitted" 80 | means any form of electronic, verbal, or written communication sent 81 | to the Licensor or its representatives, including but not limited to 82 | communication on electronic mailing lists, source code control systems, 83 | and issue tracking systems that are managed by, or on behalf of, the 84 | Licensor for the purpose of discussing and improving the Work, but 85 | excluding communication that is conspicuously marked or otherwise 86 | designated in writing by the copyright owner as "Not a Contribution." 87 | 88 | "Contributor" shall mean Licensor and any individual or Legal Entity 89 | on behalf of whom a Contribution has been received by Licensor and 90 | subsequently incorporated within the Work. 91 | 92 | 2. Grant of Copyright License. Subject to the terms and conditions of 93 | this License, each Contributor hereby grants to You a perpetual, 94 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 95 | copyright license to reproduce, prepare Derivative Works of, 96 | publicly display, publicly perform, sublicense, and distribute the 97 | Work and such Derivative Works in Source or Object form. 98 | 99 | 3. Grant of Patent License. Subject to the terms and conditions of 100 | this License, each Contributor hereby grants to You a perpetual, 101 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 102 | (except as stated in this section) patent license to make, have made, 103 | use, offer to sell, sell, import, and otherwise transfer the Work, 104 | where such license applies only to those patent claims licensable 105 | by such Contributor that are necessarily infringed by their 106 | Contribution(s) alone or by combination of their Contribution(s) 107 | with the Work to which such Contribution(s) was submitted. If You 108 | institute patent litigation against any entity (including a 109 | cross-claim or counterclaim in a lawsuit) alleging that the Work 110 | or a Contribution incorporated within the Work constitutes direct 111 | or contributory patent infringement, then any patent licenses 112 | granted to You under this License for that Work shall terminate 113 | as of the date such litigation is filed. 114 | 115 | 4. Redistribution. 
You may reproduce and distribute copies of the 116 | Work or Derivative Works thereof in any medium, with or without 117 | modifications, and in Source or Object form, provided that You 118 | meet the following conditions: 119 | 120 | (a) You must give any other recipients of the Work or 121 | Derivative Works a copy of this License; and 122 | 123 | (b) You must cause any modified files to carry prominent notices 124 | stating that You changed the files; and 125 | 126 | (c) You must retain, in the Source form of any Derivative Works 127 | that You distribute, all copyright, patent, trademark, and 128 | attribution notices from the Source form of the Work, 129 | excluding those notices that do not pertain to any part of 130 | the Derivative Works; and 131 | 132 | (d) If the Work includes a "NOTICE" text file as part of its 133 | distribution, then any Derivative Works that You distribute must 134 | include a readable copy of the attribution notices contained 135 | within such NOTICE file, excluding those notices that do not 136 | pertain to any part of the Derivative Works, in at least one 137 | of the following places: within a NOTICE text file distributed 138 | as part of the Derivative Works; within the Source form or 139 | documentation, if provided along with the Derivative Works; or, 140 | within a display generated by the Derivative Works, if and 141 | wherever such third-party notices normally appear. The contents 142 | of the NOTICE file are for informational purposes only and 143 | do not modify the License. You may add Your own attribution 144 | notices within Derivative Works that You distribute, alongside 145 | or as an addendum to the NOTICE text from the Work, provided 146 | that such additional attribution notices cannot be construed 147 | as modifying the License. 148 | 149 | You may add Your own copyright statement to Your modifications and 150 | may provide additional or different license terms and conditions 151 | for use, reproduction, or distribution of Your modifications, or 152 | for any such Derivative Works as a whole, provided Your use, 153 | reproduction, and distribution of the Work otherwise complies with 154 | the conditions stated in this License. 155 | 156 | 5. Submission of Contributions. Unless You explicitly state otherwise, 157 | any Contribution intentionally submitted for inclusion in the Work 158 | by You to the Licensor shall be under the terms and conditions of 159 | this License, without any additional terms or conditions. 160 | Notwithstanding the above, nothing herein shall supersede or modify 161 | the terms of any separate license agreement you may have executed 162 | with Licensor regarding such Contributions. 163 | 164 | 6. Trademarks. This License does not grant permission to use the trade 165 | names, trademarks, service marks, or product names of the Licensor, 166 | except as required for reasonable and customary use in describing the 167 | origin of the Work and reproducing the content of the NOTICE file. 168 | 169 | 7. Disclaimer of Warranty. Unless required by applicable law or 170 | agreed to in writing, Licensor provides the Work (and each 171 | Contributor provides its Contributions) on an "AS IS" BASIS, 172 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 173 | implied, including, without limitation, any warranties or conditions 174 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 175 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 176 | appropriateness of using or redistributing the Work and assume any 177 | risks associated with Your exercise of permissions under this License. 178 | 179 | 8. Limitation of Liability. In no event and under no legal theory, 180 | whether in tort (including negligence), contract, or otherwise, 181 | unless required by applicable law (such as deliberate and grossly 182 | negligent acts) or agreed to in writing, shall any Contributor be 183 | liable to You for damages, including any direct, indirect, special, 184 | incidental, or consequential damages of any character arising as a 185 | result of this License or out of the use or inability to use the 186 | Work (including but not limited to damages for loss of goodwill, 187 | work stoppage, computer failure or malfunction, or any and all 188 | other commercial damages or losses), even if such Contributor 189 | has been advised of the possibility of such damages. 190 | 191 | 9. Accepting Warranty or Additional Liability. While redistributing 192 | the Work or Derivative Works thereof, You may choose to offer, 193 | and charge a fee for, acceptance of support, warranty, indemnity, 194 | or other liability obligations and/or rights consistent with this 195 | License. However, in accepting such obligations, You may act only 196 | on Your own behalf and on Your sole responsibility, not on behalf 197 | of any other Contributor, and only if You agree to indemnify, 198 | defend, and hold each Contributor harmless for any liability 199 | incurred by, or claims asserted against, such Contributor by reason 200 | of your accepting any such warranty or additional liability. 201 | 202 | END OF TERMS AND CONDITIONS 203 | 204 | APPENDIX: How to apply the Apache License to your work. 205 | 206 | To apply the Apache License to your work, attach the following 207 | boilerplate notice, with the fields enclosed by brackets "[]" 208 | replaced with your own identifying information. (Don't include 209 | the brackets!) The text should be enclosed in the appropriate 210 | comment syntax for the file format. We also recommend that a 211 | file or class name and description of purpose be included on the 212 | same "printed page" as the copyright notice for easier 213 | identification within third-party archives. 214 | 215 | Copyright [yyyy] [name of copyright owner] 216 | 217 | Licensed under the Apache License, Version 2.0 (the "License"); 218 | you may not use this file except in compliance with the License. 219 | You may obtain a copy of the License at 220 | 221 | http://www.apache.org/licenses/LICENSE-2.0 222 | 223 | Unless required by applicable law or agreed to in writing, software 224 | distributed under the License is distributed on an "AS IS" BASIS, 225 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 226 | See the License for the specific language governing permissions and 227 | limitations under the License. 228 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/README.md: -------------------------------------------------------------------------------- 1 | # SDNet 2 | 3 | This is the official code for the Microsoft's submission of SDNet model to [CoQA](https://stanfordnlp.github.io/coqa/) leaderboard. It is implemented under PyTorch framework. 
The related paper to cite is: 4 | 5 | **SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering**, by Chenguang Zhu, Michael Zeng and Xuedong Huang, at https://arxiv.org/abs/1812.03593. 6 | 7 | For usage of this code, please follow the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct). 8 | 9 | # Directory structure: 10 | * main.py: the starter code 11 | 12 | * Models/ 13 | * BaseTrainer.py: Base class for trainer 14 | * SDNetTrainer.py: Trainer for SDNet, including training and predicting procedures 15 | * SDNet.py: The SDNet network structure 16 | * Layers.py: Related network layer functions 17 | * Bert/ 18 | * Bert.py: Customized class to compute BERT contextualized embeddings 19 | * modeling.py, optimization.py, tokenization.py: From Huggingface's PyTorch implementation of BERT 20 | * Utils/ 21 | * Arguments.py: Process the argument configuration file 22 | * Constants.py: Define constants used 23 | * CoQAPreprocess.py: Preprocess CoQA raw data into intermediate binary/JSON files, including tokenization and history prepending 24 | * CoQAUtils.py, GeneralUtils.py: Utility functions used in SDNet 25 | * Timing.py: Logging time 26 | 27 | # How to run 28 | Requirements: PyTorch 0.4.0, spaCy 2.0. 29 | The Docker image we used is available on Docker Hub: https://hub.docker.com/r/zcgzcgzcg/squadv2/tags. Please use v3.0 or v4.0. 30 | 1. Create a folder (e.g. **coqa**) to contain data and running logs; 31 | 2. Create a folder **coqa/data** to store the CoQA raw data: **coqa-train-v1.0.json** and **coqa-dev-v1.0.json**; 32 | 3. Copy the file **conf** from the repo into the folder **coqa**; 33 | 4. If you want to use BERT-Large, download the model into **coqa/bert-large-uncased**; if you want to use BERT-base, download the model into **coqa/bert-base-cased**; 34 | * The models can be downloaded from Huggingface: 35 | * 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", 36 | * 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz" 37 | * bert-large-uncased-vocab.txt can be downloaded from Google's BERT repository 38 | 5. Create a folder **glove** in the same directory as **coqa** and download the GloVe embedding **glove.840B.300d.txt** into the folder. 39 | 40 | Your directory should look like this: 41 | * coqa/ 42 | * data/ 43 | * coqa-train-v1.0.json 44 | * coqa-dev-v1.0.json 45 | * bert-large-uncased/ 46 | * bert-large-uncased-vocab.txt 47 | * bert_config.json 48 | * pytorch_model.bin 49 | * conf 50 | * glove/ 51 | * glove.840B.300d.txt 52 | 53 | Then, execute `python main.py train path_to_coqa/conf`. 54 | 55 | The first time you run the code, CoQAPreprocess.py will automatically create the folders **conf~/spacy_intermediate_features~** inside **coqa** to store intermediate tokenization results, which takes a few hours. 56 | 57 | Every time you run the code, a new running folder **run_idx** will be created inside **coqa/conf~**, which contains the running logs, the prediction results on the dev set, and the best model. 58 | 59 | # Contact 60 | If you have any questions, please contact Chenguang Zhu at chezhu@microsoft.com. 61 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Scratch/SQuAD_to_CoQA.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
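# Annotation (not in the original file): the conversion below turns each SQuAD
# paragraph into one CoQA "story" whose questions are independent turns
# numbered from 1. Unanswerable questions become the answer 'unknown' with
# span_start = span_end = -1, and reference answers beyond the first are kept
# under additional_answers with keys '0', '1', ...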
3 | 4 | # Scratch code to convert SQuAD data into CoQA format 5 | 6 | import json 7 | 8 | data = {'version': 0.2, 'data': []} 9 | with open('train-v2.0.json', 'r') as f: 10 | squad = json.load(f) 11 | 12 | cnt = 0 13 | for d in squad['data']: 14 | for p in d['paragraphs']: 15 | cnt += 1 16 | a = {'source': 'source', 'filename': 'filename', 'id': str(cnt), 'story': p['context']} 17 | ques = [] 18 | ans = [] 19 | 20 | additional = [] 21 | turn_id = 1 22 | for qa in p['qas']: 23 | ques.append({ 24 | 'input_text': qa['question'], 25 | 'turn_id': turn_id 26 | }) 27 | 28 | if qa['is_impossible']: 29 | if len(ans) == 0: 30 | ans.append([]) 31 | ans[0].append({ 32 | 'input_text': 'unknown', 33 | 'span_text': 'unknown', 34 | 'span_start': -1, 35 | 'span_end': -1, 36 | 'turn_id': turn_id 37 | }) 38 | 39 | for j in range(len(qa['answers'])): 40 | if j >= len(ans): 41 | ans.append([]) 42 | ans[j].append({ 43 | 'input_text': qa['answers'][j]['text'], 44 | 'span_text': qa['answers'][j]['text'], 45 | 'span_start': qa['answers'][j]['answer_start'], 46 | 'span_end': qa['answers'][j]['answer_start'] + len(qa['answers'][j]['text']), 47 | 'turn_id': turn_id 48 | }) 49 | 50 | turn_id += 1 51 | 52 | a['questions'] = ques 53 | a['answers'] = ans[0] 54 | a['additional_answers'] = {} 55 | for j in range(1, len(ans)): 56 | a['additional_answers'][str(j-1)] = ans[j] 57 | 58 | data['data'].append(a) 59 | 60 | 61 | with open('squad_in_coqa_format.json', 'w') as output_file: 62 | json.dump(data, output_file, indent=4) 63 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Utils/Arguments.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
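# Annotation (not in the original file): readArguments() below parses a
# line-oriented conf file. A hypothetical fragment and the resulting opt dict:
#   # lines starting with '#' are comments
#   BERT                      ->  opt['BERT'] = True        (bare key = flag)
#   DROPOUT 0.3               ->  opt['DROPOUT'] = 0.3      (coerced to float)
#   EPOCH 30                  ->  opt['EPOCH'] = 30         (coerced to int)
#   MODEL_PATH best_model.pt  ->  opt['MODEL_PATH'] = 'best_model.pt'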
3 | 4 | import os 5 | 6 | class Arguments: 7 | def __init__(self, confFile): 8 | if not os.path.exists(confFile): 9 | raise Exception("The argument file does not exist: " + confFile) 10 | self.confFile = confFile 11 | 12 | def is_int(self, s): 13 | try: 14 | int(s) 15 | return True 16 | except ValueError: 17 | return False 18 | 19 | def is_float(self, s): 20 | try: 21 | float(s) 22 | return True 23 | except ValueError: 24 | return False 25 | 26 | def is_bool(self, s): 27 | return s.lower() == 'true' or s.lower() == 'false' 28 | 29 | def readHyperDriveArguments(self, arguments): 30 | hyperdrive_opts = {} 31 | for i in range(0, len(arguments), 2): 32 | hp_name, hp_value = arguments[i:i+2] 33 | hp_name = hp_name.replace("--", "") 34 | if self.is_int(hp_value): 35 | hp_value = int(hp_value) 36 | elif self.is_float(hp_value): 37 | hp_value = float(hp_value) 38 | hyperdrive_opts[hp_name] = hp_value 39 | return hyperdrive_opts 40 | 41 | def readArguments(self): 42 | opt = {} 43 | with open(self.confFile, encoding='utf-8') as f: 44 | for line in f: 45 | l = line.replace('\t', ' ').strip() 46 | if l.startswith("#"): 47 | continue 48 | parts = l.split() 49 | if len(parts) == 1: 50 | key = parts[0] 51 | if not key in opt: 52 | opt[key] = True 53 | if len(parts) == 2: 54 | key = parts[0] 55 | value = parts[1] 56 | if not key in opt: 57 | opt[key] = value 58 | if self.is_int(value): 59 | opt[key] = int(value) 60 | elif self.is_float(value): 61 | opt[key] = float(value) 62 | elif self.is_bool(value): 63 | opt[key] = value.lower() == 'true' 64 | else: 65 | print('Warning: key %s already exists' % key) 66 | return opt 67 | -------------------------------------------------------------------------------- /SDNet/code/SDNet/Utils/CoQAPreprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | """ 5 | This file takes a CoQA data file as input and generates the input files for training the QA model. 
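Annotation (added, not part of the original docstring): preprocessing tokenizes
each story, question and answer with spaCy, aligns character-level answer spans
to word offsets (falling back to a best-F1 span search when the annotated span
does not literally contain the answer text), builds a GloVe-filtered vocabulary
from the training split, and caches coqa-{train,dev}-preprocessed.json plus
train_meta.msgpack under FEATURE_FOLDER.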
6 | """ 7 | import json 8 | import msgpack 9 | import multiprocessing 10 | import re 11 | import string 12 | import torch 13 | from tqdm import tqdm 14 | from collections import Counter 15 | from Utils.GeneralUtils import nlp, load_glove_vocab, pre_proc 16 | from Utils.CoQAUtils import token2id, token2id_sent, char2id_sent, build_embedding, feature_gen, POS, ENT 17 | import os 18 | 19 | class CoQAPreprocess(): 20 | def __init__(self, opt): 21 | print('CoQA Preprocessing') 22 | self.opt = opt 23 | self.spacyDir = opt['FEATURE_FOLDER'] 24 | self.train_file = os.path.join(opt['datadir'], opt['CoQA_TRAIN_FILE']) 25 | self.dev_file = os.path.join(opt['datadir'], opt['CoQA_DEV_FILE']) 26 | self.glove_file = os.path.join(opt['datadir'], opt['INIT_WORD_EMBEDDING_FILE']) 27 | self.glove_dim = 300 28 | self.official = 'OFFICIAL' in opt 29 | self.data_prefix = 'coqa-' 30 | 31 | if self.official: 32 | self.glove_vocab = load_glove_vocab(self.glove_file, self.glove_dim, to_lower = False) 33 | print('Official prediction initializes...') 34 | print('Loading training vocab and vocab char...') 35 | self.train_vocab, self.train_char_vocab, self.train_embedding = self.load_data() 36 | self.test_file = self.opt['OFFICIAL_TEST_FILE'] 37 | return 38 | 39 | dataset_labels = ['train', 'dev'] 40 | allExist = True 41 | for dataset_label in dataset_labels: 42 | if not os.path.exists(os.path.join(self.spacyDir, self.data_prefix + dataset_label + '-preprocessed.json')): 43 | allExist = False 44 | 45 | if allExist: 46 | return 47 | 48 | print('Previously result not found, creating preprocessed files now...') 49 | self.glove_vocab = load_glove_vocab(self.glove_file, self.glove_dim, to_lower = False) 50 | if not os.path.isdir(self.spacyDir): 51 | os.makedirs(self.spacyDir) 52 | print('Directory created: ' + self.spacyDir) 53 | 54 | for dataset_label in dataset_labels: 55 | self.preprocess(dataset_label) 56 | 57 | # dataset_label can be 'train' or 'dev' or 'test' 58 | def preprocess(self, dataset_label): 59 | file_name = self.train_file if dataset_label == 'train' else (self.dev_file if dataset_label == 'dev' else self.test_file) 60 | output_file_name = os.path.join(self.spacyDir, self.data_prefix + dataset_label + '-preprocessed.json') 61 | 62 | print('Preprocessing', dataset_label, 'file:', file_name) 63 | print('Loading json...') 64 | with open(file_name, 'r') as f: 65 | dataset = json.load(f) 66 | 67 | print('Processing json...') 68 | 69 | data = [] 70 | tot = len(dataset['data']) 71 | for data_idx in tqdm(range(tot)): 72 | datum = dataset['data'][data_idx] 73 | context_str = datum['story'] 74 | _datum = {'context': context_str, 75 | 'source': datum['source'], 76 | 'id': datum['id'], 77 | 'filename': datum['filename']} 78 | 79 | nlp_context = nlp(pre_proc(context_str)) 80 | _datum['annotated_context'] = self.process(nlp_context) 81 | _datum['raw_context_offsets'] = self.get_raw_context_offsets(_datum['annotated_context']['word'], context_str) 82 | _datum['qas'] = [] 83 | assert len(datum['questions']) == len(datum['answers']) 84 | 85 | additional_answers = {} 86 | if 'additional_answers' in datum: 87 | for k, answer in datum['additional_answers'].items(): 88 | if len(answer) == len(datum['answers']): 89 | for ex in answer: 90 | idx = ex['turn_id'] 91 | if idx not in additional_answers: 92 | additional_answers[idx] = [] 93 | additional_answers[idx].append(ex['input_text']) # additional_answer is only used to eval, so raw_text is fine 94 | 95 | for i in range(len(datum['questions'])): 96 | question, answer = 
datum['questions'][i], datum['answers'][i] 97 | assert question['turn_id'] == answer['turn_id'] 98 | 99 | idx = question['turn_id'] 100 | _qas = {'turn_id': idx, 101 | 'question': question['input_text'], 102 | 'answer': answer['input_text']} 103 | if idx in additional_answers: 104 | _qas['additional_answers'] = additional_answers[idx] 105 | 106 | _qas['annotated_question'] = self.process(nlp(pre_proc(question['input_text']))) 107 | _qas['annotated_answer'] = self.process(nlp(pre_proc(answer['input_text']))) 108 | _qas['raw_answer'] = answer['input_text'] 109 | _qas['answer_span_start'] = answer['span_start'] 110 | _qas['answer_span_end'] = answer['span_end'] 111 | 112 | start = answer['span_start'] 113 | end = answer['span_end'] 114 | chosen_text = _datum['context'][start: end].lower() 115 | while len(chosen_text) > 0 and chosen_text[0] in string.whitespace: 116 | chosen_text = chosen_text[1:] 117 | start += 1 118 | while len(chosen_text) > 0 and chosen_text[-1] in string.whitespace: 119 | chosen_text = chosen_text[:-1] 120 | end -= 1 121 | input_text = _qas['answer'].strip().lower() 122 | if input_text in chosen_text: 123 | p = chosen_text.find(input_text) 124 | _qas['answer_span'] = self.find_span(_datum['raw_context_offsets'], 125 | start + p, start + p + len(input_text)) 126 | else: 127 | _qas['answer_span'] = self.find_span_with_gt(_datum['context'], 128 | _datum['raw_context_offsets'], input_text) 129 | long_question = '' 130 | for j in range(i - 2, i + 1): 131 | if j < 0: 132 | continue 133 | long_question += ' ' + datum['questions'][j]['input_text'] 134 | if j < i: 135 | long_question += ' ' + datum['answers'][j]['input_text'] 136 | 137 | long_question = long_question.strip() 138 | nlp_long_question = nlp(long_question) 139 | _qas['context_features'] = feature_gen(nlp_context, nlp_long_question) 140 | 141 | _datum['qas'].append(_qas) 142 | data.append(_datum) 143 | 144 | # build vocabulary 145 | if dataset_label == 'train': 146 | print('Build vocabulary from training data...') 147 | contexts = [_datum['annotated_context']['word'] for _datum in data] 148 | qas = [qa['annotated_question']['word'] + qa['annotated_answer']['word'] for qa in _datum['qas'] for _datum in data] 149 | self.train_vocab = self.build_vocab(contexts, qas) 150 | self.train_char_vocab = self.build_char_vocab(self.train_vocab) 151 | 152 | print('Getting word ids...') 153 | w2id = {w: i for i, w in enumerate(self.train_vocab)} 154 | c2id = {c: i for i, c in enumerate(self.train_char_vocab)} 155 | for _datum in data: 156 | _datum['annotated_context']['wordid'] = token2id_sent(_datum['annotated_context']['word'], w2id, unk_id = 1, to_lower = False) 157 | _datum['annotated_context']['charid'] = char2id_sent(_datum['annotated_context']['word'], c2id, unk_id = 1, to_lower = False) 158 | for qa in _datum['qas']: 159 | qa['annotated_question']['wordid'] = token2id_sent(qa['annotated_question']['word'], w2id, unk_id = 1, to_lower = False) 160 | qa['annotated_question']['charid'] = char2id_sent(qa['annotated_question']['word'], c2id, unk_id = 1, to_lower = False) 161 | qa['annotated_answer']['wordid'] = token2id_sent(qa['annotated_answer']['word'], w2id, unk_id = 1, to_lower = False) 162 | qa['annotated_answer']['charid'] = char2id_sent(qa['annotated_answer']['word'], c2id, unk_id = 1, to_lower = False) 163 | 164 | if dataset_label == 'train': 165 | # get the condensed dictionary embedding 166 | print('Getting embedding matrix for ' + dataset_label) 167 | embedding = build_embedding(self.glove_file, self.train_vocab, 
        print('Getting word ids...')
        w2id = {w: i for i, w in enumerate(self.train_vocab)}
        c2id = {c: i for i, c in enumerate(self.train_char_vocab)}
        for _datum in data:
            _datum['annotated_context']['wordid'] = token2id_sent(_datum['annotated_context']['word'], w2id, unk_id=1, to_lower=False)
            _datum['annotated_context']['charid'] = char2id_sent(_datum['annotated_context']['word'], c2id, unk_id=1, to_lower=False)
            for qa in _datum['qas']:
                qa['annotated_question']['wordid'] = token2id_sent(qa['annotated_question']['word'], w2id, unk_id=1, to_lower=False)
                qa['annotated_question']['charid'] = char2id_sent(qa['annotated_question']['word'], c2id, unk_id=1, to_lower=False)
                qa['annotated_answer']['wordid'] = token2id_sent(qa['annotated_answer']['word'], w2id, unk_id=1, to_lower=False)
                qa['annotated_answer']['charid'] = char2id_sent(qa['annotated_answer']['word'], c2id, unk_id=1, to_lower=False)

        if dataset_label == 'train':
            # get the condensed dictionary embedding
            print('Getting embedding matrix for ' + dataset_label)
            embedding = build_embedding(self.glove_file, self.train_vocab, self.glove_dim)
            meta = {'vocab': self.train_vocab, 'char_vocab': self.train_char_vocab, 'embedding': embedding.tolist()}
            meta_file_name = os.path.join(self.spacyDir, dataset_label + '_meta.msgpack')
            print('Saving meta information to', meta_file_name)
            with open(meta_file_name, 'wb') as f:
                msgpack.dump(meta, f, encoding='utf8')

        dataset['data'] = data

        if dataset_label == 'test':
            return dataset

        with open(output_file_name, 'w') as output_file:
            json.dump(dataset, output_file, sort_keys=True, indent=4)

    '''
    Return train vocab, train char vocab and embedding
    '''
    def load_data(self):
        print('Loading train_meta.msgpack...')
        meta_file_name = os.path.join(self.spacyDir, 'train_meta.msgpack')
        with open(meta_file_name, 'rb') as f:
            meta = msgpack.load(f, encoding='utf8')
        embedding = torch.Tensor(meta['embedding'])
        self.opt['vocab_size'] = embedding.size(0)
        self.opt['vocab_dim'] = embedding.size(1)
        self.opt['char_vocab_size'] = len(meta['char_vocab'])
        return meta['vocab'], meta['char_vocab'], embedding

    def build_vocab(self, contexts, qas):  # vocabulary will also be sorted accordingly
        counter_c = Counter(w for doc in contexts for w in doc)
        counter_qa = Counter(w for doc in qas for w in doc)
        counter = counter_c + counter_qa
        vocab = sorted([t for t in counter_qa if t in self.glove_vocab], key=counter_qa.get, reverse=True)
        vocab += sorted([t for t in counter_c.keys() - counter_qa.keys() if t in self.glove_vocab],
                        key=counter.get, reverse=True)
        total = sum(counter.values())
        matched = sum(counter[t] for t in vocab)
        print('vocab {1}/{0} OOV {2}/{3} ({4:.4f}%)'.format(
            len(counter), len(vocab), (total - matched), total, (total - matched) / total * 100))
        # special entries; ids 0/1 match PAD_WORD_ID/UNK_WORD_ID in Utils/Constants.py
        # (the "<Q>"/"<A>" names are assumed)
        vocab.insert(0, "<PAD>")
        vocab.insert(1, "<UNK>")
        vocab.insert(2, "<Q>")
        vocab.insert(3, "<A>")
        return vocab

    def build_char_vocab(self, words):
        counter = Counter(c for w in words for c in w)
        print('All characters: {0}'.format(len(counter)))
        char_vocab = [c for c, cnt in counter.items() if cnt > 3]
        print('Occurrence > 3 characters: {0}'.format(len(char_vocab)))

        # special characters ("<STA>"/"<END>" names assumed; cf. BOW_CHAR/EOW_CHAR in Utils/Constants.py)
        char_vocab.insert(0, "<PAD>")
        char_vocab.insert(1, "<UNK>")
        char_vocab.insert(2, "<STA>")
        char_vocab.insert(3, "<END>")
        return char_vocab

    def _str(self, s):
        """ Convert PTB tokens to normal tokens """
        if s.lower() == '-lrb-':
            s = '('
        elif s.lower() == '-rrb-':
            s = ')'
        elif s.lower() == '-lsb-':
            s = '['
        elif s.lower() == '-rsb-':
            s = ']'
        elif s.lower() == '-lcb-':
            s = '{'
        elif s.lower() == '-rcb-':
            s = '}'
        return s
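    # process() below returns one annotation dict per spaCy doc. Illustrative
    # shape (values abbreviated; tag inventories depend on the spaCy model):
    #   {'word': ['Bob', 'went', ...], 'lemma': ['bob', 'go', ...],
    #    'pos': ['NNP', 'VBD', ...], 'pos_id': [...],
    #    'ent': ['B-PERSON', 'O', ...], 'ent_id': [...],
    #    'offsets': [(0, 3), (4, 8), ...], 'sentences': [(0, 9), ...]}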
    def process(self, parsed_text):
        output = {'word': [],
                  'lemma': [],
                  'pos': [],
                  'pos_id': [],
                  'ent': [],
                  'ent_id': [],
                  'offsets': [],
                  'sentences': []}

        for token in parsed_text:
            # [(token.text, token.idx) for token in parsed_sentence]
            output['word'].append(self._str(token.text))
            pos = token.tag_
            output['pos'].append(pos)
            output['pos_id'].append(token2id(pos, POS, 0))

            ent = 'O' if token.ent_iob_ == 'O' else (token.ent_iob_ + '-' + token.ent_type_)
            output['ent'].append(ent)
            output['ent_id'].append(token2id(ent, ENT, 0))

            output['lemma'].append(token.lemma_ if token.lemma_ != '-PRON-' else token.text.lower())
            output['offsets'].append((token.idx, token.idx + len(token.text)))

        word_idx = 0
        for sent in parsed_text.sents:
            output['sentences'].append((word_idx, word_idx + len(sent)))
            word_idx += len(sent)

        assert word_idx == len(output['word'])
        return output

    '''
    Offsets based on raw_text.
    This solves the mismatch where raw_text contains "a-b" but the parsed text contains "a - b".
    '''
    def get_raw_context_offsets(self, words, raw_text):
        raw_context_offsets = []
        p = 0
        for token in words:
            while p < len(raw_text) and re.match(r'\s', raw_text[p]):
                p += 1
            if raw_text[p:p + len(token)] != token:
                print('something is wrong! token:', token, 'raw_text:', raw_text)

            raw_context_offsets.append((p, p + len(token)))
            p += len(token)

        return raw_context_offsets

    def normalize_answer(self, s):
        """Lower text and remove punctuation, articles and extra whitespace."""

        def remove_articles(text):
            regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
            return re.sub(regex, ' ', text)

        def white_space_fix(text):
            return ' '.join(text.split())

        def remove_punc(text):
            exclude = set(string.punctuation)
            return ''.join(ch for ch in text if ch not in exclude)

        def lower(text):
            return text.lower()

        return white_space_fix(remove_articles(remove_punc(lower(s))))

    # find the start and end word ids
    def find_span_with_gt(self, context, offsets, ground_truth):
        best_f1 = 0.0
        best_span = (len(offsets) - 1, len(offsets) - 1)
        gt = self.normalize_answer(pre_proc(ground_truth)).split()

        ls = [i for i in range(len(offsets)) if context[offsets[i][0]:offsets[i][1]].lower() in gt]

        for i in range(len(ls)):
            for j in range(i, len(ls)):
                pred = self.normalize_answer(pre_proc(context[offsets[ls[i]][0]:offsets[ls[j]][1]])).split()
                common = Counter(pred) & Counter(gt)
                num_same = sum(common.values())
                if num_same > 0:
                    precision = 1.0 * num_same / len(pred)
                    recall = 1.0 * num_same / len(gt)
                    f1 = (2 * precision * recall) / (precision + recall)
                    if f1 > best_f1:
                        best_f1 = f1
                        best_span = (ls[i], ls[j])
        return best_span

    # find the start and end word ids
    def find_span(self, offsets, start, end):
        start_index = -1
        end_index = -1
        for i, offset in enumerate(offsets):
            if (start_index < 0) or (start >= offset[0]):
                start_index = i
            if (end_index < 0) and (end <= offset[1]):
                end_index = i
        return (start_index, end_index)
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/Constants.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

PAD_WORD_ID = 0
UNK_WORD_ID = 1
END_WORD_ID = 2

PAD_CHAR = 261
BOW_CHAR = 259
EOW_CHAR = 260
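Constants.py pins the special ids used throughout the pipeline: word ids 0/1 for padding and unknowns, and character ids 259-261 sitting just above the 0-255 UTF-8 byte range. That layout suggests byte-level character encodings with begin/end-of-word sentinels; below is a minimal sketch of that convention (the helper name is illustrative, not part of the repo):

    BOW_CHAR, EOW_CHAR = 259, 260  # as in Utils/Constants.py

    def encode_word_chars(word):
        # UTF-8 bytes occupy ids 0-255; the begin/end-of-word sentinels sit above that range
        return [BOW_CHAR] + list(word.encode('utf-8')) + [EOW_CHAR]

    # encode_word_chars('hi') -> [259, 104, 105, 260]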
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/GeneralUtils.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import math
import re
from Utils.Constants import *
import spacy
import torch
import torch.nn.functional as F
import unicodedata
import sys
from torch.autograd import Variable

nlp = spacy.load('en', parser=False)

# normalize sentence
def normalize_text(text):
    return unicodedata.normalize('NFD', text)

def space_extend(matchobj):
    return ' ' + matchobj.group(0) + ' '

# pad punctuation with spaces and collapse extra whitespace
def pre_proc(text):
    text = re.sub(u'-|\u2010|\u2011|\u2012|\u2013|\u2014|\u2015|%|\[|\]|:|\(|\)|/|\t', space_extend, text)
    text = text.strip(' \n')
    text = re.sub(r'\s+', ' ', text)
    return text

# get the set of GloVe vocabulary words
def load_glove_vocab(file, wv_dim, to_lower=True):
    glove_vocab = set()
    print('Loading glove vocabulary from ' + file)
    lineCnt = 0
    with open(file, encoding='utf-8') as f:
        for line in f:
            lineCnt = lineCnt + 1
            if lineCnt % 100000 == 0:
                print('.', end='', flush=True)
            elems = line.split()
            token = normalize_text(''.join(elems[0:-wv_dim]))
            if to_lower:
                token = token.lower()
            glove_vocab.add(token)

    print('\n')
    print('%d words loaded from GloVe\n' % len(glove_vocab))
    return glove_vocab

def token2id(docs, vocab, unk_id=None):
    w2id = {w: i for i, w in enumerate(vocab)}
    ids = [[w2id[w] if w in w2id else unk_id for w in doc] for doc in docs]
    return ids

def char2id(docs, char_vocab, unk_id=None):
    c2id = {c: i for i, c in enumerate(char_vocab)}
    # wrap every word with the start/end-of-word sentinels; the "<STA>"/"<END>"
    # names are assumed to match the special entries in the char vocabulary
    ids = [[[c2id["<STA>"]] + [c2id[c] if c in c2id else unk_id for c in w] + [c2id["<END>"]] for w in doc] for doc in docs]
    return ids

def removeInvalidChar(sentence):
    ordId = list(sentence.encode('utf-8', errors='ignore'))
    ordId = [x for x in ordId if x >= 0 and x < 256]
    return ''.join([chr(x) for x in ordId])

def makeVariable(x, use_cuda):
    if use_cuda:
        x = x.pin_memory()
        return Variable(x.cuda(non_blocking=True), requires_grad=False)
    else:
        return Variable(x, requires_grad=False)

'''
Input:
    nlp is an instance of spacy
    sentence is a string

Output:
    A list of tokens, plus entity and POS tags
'''
def spacyTokenize(sentence, vocab_ent=None, vocab_tag=None):
    sentence = sentence.lower()
    sentence = pre_proc(sentence)
    raw_tokens = nlp(sentence)
    tokens = [normalize_text(token.text) for token in raw_tokens if not (token.is_punct or token.is_space)]
    ent = None
    if vocab_ent is not None:
        ent = [token2id(token.ent_type_, vocab_ent) + 1 for token in raw_tokens if not (token.is_punct or token.is_space)]

    tag = None
    if vocab_tag is not None:
        tag = [token2id(token.tag_, vocab_tag) + 1 for token in raw_tokens if not (token.is_punct or token.is_space)]

    return tokens, ent, tag
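pre_proc above is what keeps token/raw-text alignment feasible downstream: hyphens, brackets and similar characters are padded with spaces before spaCy runs, and repeated whitespace is collapsed. A self-contained check of that behaviour (expected outputs shown as comments):

    # assuming: from Utils.GeneralUtils import pre_proc
    print(pre_proc('a-b (c)'))     # 'a - b ( c )'
    print(pre_proc('50% [sure]'))  # '50 % [ sure ]'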
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/Timing.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from datetime import datetime

timeelapsed = {}
startTime = {}
endTime = {}

def timerstart(name):
    startTime[name] = datetime.now()

def timerstop(name):
    endTime[name] = datetime.now()
    if name not in timeelapsed:
        timeelapsed[name] = endTime[name] - startTime[name]
    else:
        timeelapsed[name] += endTime[name] - startTime[name]

def timerreport():
    total = 0
    for name in timeelapsed:
        total += timeelapsed[name].total_seconds()
    print('')
    print('----------------Timer Report----------------------')
    for name, value in sorted(timeelapsed.items(), key=lambda item: -item[1].total_seconds()):
        print('%s: used time %s, %f%% ' % ('{:20}'.format(name), str(value).split('.')[0], value.total_seconds() / total * 100.0))
    print('--------------------------------------------------')
    print('')
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/Arguments.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/Arguments.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/CoQAPreprocess.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/CoQAPreprocess.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/CoQAUtils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/CoQAUtils.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/Constants.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/Constants.cpython-37.pyc
--------------------------------------------------------------------------------
/SDNet/code/SDNet/Utils/__pycache__/GeneralUtils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/SDNet/code/SDNet/Utils/__pycache__/GeneralUtils.cpython-37.pyc
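The conf file that follows mixes bare switches (BERT, LOCK_BERT, ...) with whitespace-separated key-value pairs. Utils/Arguments.py is not reproduced in this dump, so the sketch below is an assumption about the format rather than the repo's actual parser: switches become True, everything else keeps its raw string value.

    def read_conf(path):
        # minimal reader for 'KEY value' / bare-switch lines; hypothetical, not Utils/Arguments.py
        opt = {}
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split(None, 1)
                opt[parts[0]] = parts[1] if len(parts) > 1 else True
        return opt

    # read_conf('conf')['MINI_BATCH'] -> '32' (a string; type conversion is up to the caller)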
--------------------------------------------------------------------------------
/SDNet/code/SDNet/conf:
--------------------------------------------------------------------------------
CoQA_TRAIN_FILE data/coqa-train-v1.0.json
CoQA_DEV_FILE data/coqa-dev-v1.0.json

PREV_ANS 2
PREV_QUES 2

DROPOUT 0.3
my_dropout_p 0.3
VARIATIONAL_DROPOUT

BERT
BERT_LARGE
dropout_emb 0.4

LOCK_BERT
BERT_LINEAR_COMBINE
BERT_tokenizer_file bert-base-cased/bert-base-cased-vocab.txt
BERT_model_file bert-base-cased/
BERT_large_tokenizer_file bert-large-uncased/bert-large-uncased-vocab.txt
BERT_large_model_file bert-large-uncased/

SEED 1033
SPACY_FEATURE
CONTEXT_RNN_HIDDEN_DIM 300

MAX_WORD_PER_SENTENCE 30
INIT_WORD_EMBEDDING_FILE ../glove/glove.840B.300d.txt
MINI_BATCH 32
EPOCH 30

QUES_SELF_ATTN
max_len 15
concat_rnn False
grad_clipping 10
do_seq_dropout
tune_partial 1000
embedding_dim 300
prealign_hidden 300
flow_hidden_size 300
query_self_attn_hidden_size 300
pos_dim 12
ent_dim 8
hidden_size 125
deep_att_hidden_size_per_abstr 250
deep_inter_att_use_CoVe 1
in_rnn_layers 2
highlvl_hidden_size 125
question_high_lvl_rnn_layers 1
char_emb_size 8
char_hidden_size 50
--------------------------------------------------------------------------------
/SDNet/code/SDNet/main.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import argparse
import os
import sys
import torch
from Models.SDNetTrainer import SDNetTrainer
from Utils.Arguments import Arguments

opt = None

parser = argparse.ArgumentParser(description='SDNet')
parser.add_argument('command', help='Command: train')
parser.add_argument('conf_file', help='Path to conf file.')

cmdline_args = parser.parse_args()
command = cmdline_args.command
conf_file = cmdline_args.conf_file
conf_args = Arguments(conf_file)
opt = conf_args.readArguments()
opt['cuda'] = torch.cuda.is_available()
opt['confFile'] = conf_file
opt['datadir'] = os.path.dirname(conf_file)  # conf_file specifies where the data folder is

for key, val in cmdline_args.__dict__.items():
    if val is not None and key not in ['command', 'conf_file']:
        opt[key] = val

model = SDNetTrainer(opt)

print('Selected command: ' + command)
model.train()
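Given main.py's argparse signature, training is presumably launched as "python main.py train conf" from SDNet/code/SDNet. The prepro.py file below converts QuAC's SQuAD-style json into the CoQA layout this pipeline expects; one converted record looks roughly like this (field names taken from the code below, values illustrative):

    example = {
        "source": "wikipedia",
        "filename": "Some_Title.txt",
        "name": "Some_Title.txt",
        "id": "C_abc123_0",
        "story": "...passage text with the trailing CANNOTANSWER marker removed...",
        "questions": [{"turn_id": 1, "input_text": "What was his first album?"}],
        "answers": [{"turn_id": 1, "span_start": 10, "span_end": 25,
                     "span_text": "his first album", "input_text": "his first album"}],
    }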
--------------------------------------------------------------------------------
/SDNet/code/preprocess/prepro.py:
--------------------------------------------------------------------------------
import json

def convert(in_file, out_file):
    """Convert a QuAC json file into the CoQA format expected by SDNet."""
    with open(in_file, 'r') as f:
        quacdata = json.load(f)
    data = []
    for i in range(len(quacdata['data'])):
        temp = {}
        temp["source"] = "wikipedia"
        temp["filename"] = quacdata['data'][i]["title"] + ".txt"
        temp["name"] = temp["filename"]
        temp["id"] = quacdata['data'][i]["paragraphs"][0]["id"]
        # every QuAC context ends with " CANNOTANSWER" (13 characters), which is dropped here
        temp["story"] = quacdata['data'][i]["paragraphs"][0]["context"][:-13]
        temp["questions"] = []
        temp["answers"] = []
        turn = 0
        for qa in quacdata['data'][i]["paragraphs"][0]["qas"]:
            turn += 1
            tempq = {}
            tempa = {}
            tempq["turn_id"] = turn
            tempq["input_text"] = qa["question"]
            temp["questions"].append(tempq)
            tempa["span_start"] = qa["orig_answer"]["answer_start"]
            tempa["span_end"] = qa["orig_answer"]["answer_start"] + len(qa["orig_answer"]["text"])
            answertext = qa["orig_answer"]["text"]
            if answertext == "CANNOTANSWER":
                # unanswerable questions become CoQA-style "unknown" answers without a span
                answertext = "unknown"
                tempa["span_start"] = -1
                tempa["span_end"] = -1
            tempa["span_text"] = answertext
            if qa["yesno"] == "y":
                tempa["input_text"] = "Yes"
            elif qa["yesno"] == "n":
                tempa["input_text"] = "No"
            else:
                tempa["input_text"] = answertext
            tempa["turn_id"] = turn
            temp["answers"].append(tempa)
        data.append(temp)
    output = {"data": data, "version": "1.0"}
    with open(out_file, 'w') as outf:
        json.dump(output, outf, indent=2)

convert('train_v0.2.json', 'quac_train_processed.json')
convert('val_v0.2.json', 'quac_val_processed.json')
--------------------------------------------------------------------------------
/figure/1:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/figure/QA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/QA.png
--------------------------------------------------------------------------------
/figure/coref-F1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/coref-F1.png
--------------------------------------------------------------------------------
/figure/flow-coref.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/flow-coref.png
--------------------------------------------------------------------------------
/figure/quac.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/quac.png
--------------------------------------------------------------------------------
/figure/task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepnlp-cs599-usc/quac/c1d5599e3a83b99b0dad563b11fd047380c60416/figure/task.png
--------------------------------------------------------------------------------
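Since the conversion above only truncates the trailing CANNOTANSWER marker from each context, every non-negative span should still index the story correctly. A quick sanity check over the generated file (file name as written by prepro.py above):

    import json

    with open('quac_train_processed.json') as f:
        converted = json.load(f)

    for article in converted['data']:
        for ans in article['answers']:
            s, e = ans['span_start'], ans['span_end']
            if s >= 0:
                # the recorded span text must equal the corresponding story slice
                assert article['story'][s:e] == ans['span_text'], (article['id'], ans['turn_id'])
    print('all spans verified')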