├── .project ├── .pydevproject ├── .settings └── org.eclipse.core.resources.prefs ├── LICENSE.txt ├── README.md ├── configs ├── quora.sample.config └── snli.sample.config └── src ├── SentenceMatchDataStream.py ├── SentenceMatchDataStream.pyc ├── SentenceMatchDecoder.py ├── SentenceMatchModelGraph.py ├── SentenceMatchModelGraph.pyc ├── SentenceMatchTrainer.py ├── __init__.py ├── layer_utils.py ├── match_utils.py ├── match_utils.pyc ├── my_rnn.py ├── my_rnn.pyc ├── namespace_utils.py ├── namespace_utils.pyc ├── vocab_utils.py └── vocab_utils.pyc /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | BiMPM 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | 15 | org.python.pydev.pythonNature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /${PROJECT_DIR_NAME} 5 | /${PROJECT_DIR_NAME}/src 6 | 7 | python 2.7 8 | Default 9 | 10 | /u/zhigwang/.local/lib/python2.7/site-packages 11 | 12 | 13 | -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/SentenceMatchDecoder.py=utf-8 3 | encoding//src/SentenceMatchTrainer.py=utf-8 4 | encoding//src/vocab_utils.py=utf-8 5 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright {yyyy} {name of copyright owner}
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BiMPM: Bilateral Multi-Perspective Matching for Natural Language Sentences
2 | 
3 | ## Updates (Jan 28, 2018)
4 | * This repository has been updated to tensorflow 1.5.
5 | * The training process is now 15+ times faster without losing accuracy.
6 | * All code has been restructured for better readability and adaptability.
7 | 
8 | ## Description
9 | This repository includes the source code for natural language sentence matching.
10 | Basically, the program takes two sentences as input and predicts a label for them.
11 | You can use this program for tasks like [paraphrase identification](https://aclweb.org/aclwiki/index.php?title=Paraphrase_Identification_%28State_of_the_art%29), [natural language inference](http://nlp.stanford.edu/projects/snli/), [duplicate question identification](https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs), and so on. More details about the underlying model can be found in our [paper](https://arxiv.org/pdf/1702.03814.pdf) published at IJCAI 2017. Please cite our paper when you use this program! :heart_eyes:
12 | 
13 | ## Requirements
14 | * python 2.7
15 | * tensorflow 1.5
16 | 
17 | ## Data format
18 | Both the train and test sets must be in a tab-separated format.
19 | Each line in the train (or test) file corresponds to an instance, and it should be arranged as
20 | > label	sentence#1	sentence#2	instanceID
21 | 
22 | For more details about the data format, you can download the [SNLI](https://drive.google.com/file/d/1CxjKsaM6YgZPRKmJhNn7WcIC3gISehcS/view?usp=sharing) and the [Quora Question Pair](https://drive.google.com/file/d/0B0PlTAo--BnaQWlsZl9FZ3l1c28/view?usp=sharing) datasets used in our [paper](https://arxiv.org/pdf/1702.03814.pdf).
23 | 
24 | 
25 | ## Training
26 | You can find the training script at BiMPM/src/SentenceMatchTrainer.py
27 | 
28 | First, edit the configuration file at ${workspace}/BiMPM/configs/snli.sample.config (or ${workspace}/BiMPM/configs/quora.sample.config).
29 | You need to change "train\_path", "dev\_path", "word\_vec\_path", "model\_dir", and "suffix" to your own settings.
30 | 
31 | Second, launch the job with the following command line
32 | > python ${workspace}/BiMPM/src/SentenceMatchTrainer.py --config\_path ${workspace}/BiMPM/configs/snli.sample.config
33 | 
34 | 
35 | ## Testing
36 | You can find the testing script at BiMPM/src/SentenceMatchDecoder.py
37 | > python ${workspace}/BiMPM/src/SentenceMatchDecoder.py --in\_path ${your\_path\_to}/dev.tsv --word\_vec\_path ${your\_path\_to}/wordvec.txt --out\_path ${your\_path\_to}/result.json --model\_prefix ${model\_dir}/SentenceMatch.${suffix}
38 | 
39 | where "model\_dir" and "suffix" are the values set in your configuration file.
40 | 
41 | The output is a JSON file with the following format, keyed by instance ID.
42 | 
43 | ```javascript
44 | {
45 |     "instanceID1": {
46 |         "ID": "instanceID1",
47 |         "truth": label,
48 |         "sent1": sentence1,
49 |         "sent2": sentence2,
50 |         "prediction": prediction,
51 |         "probs": probs_for_all_possible_labels
52 |     },
53 |     "instanceID2": {
54 |         "ID": "instanceID2",
55 |         "truth": label,
56 |         "sent1": sentence1,
57 |         "sent2": sentence2,
58 |         "prediction": prediction,
59 |         "probs": probs_for_all_possible_labels
60 |     }
61 | }
62 | ```
63 | 
64 | 
65 | ## Reporting issues
66 | Please let [me](https://zhiguowang.github.io/) know if you encounter any problems.
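For reference, the output file can be consumed with a few lines of Python. This is a minimal sketch (here `result.json` stands for whatever you passed as --out\_path; per the trainer source, the "probs" field is a space-separated string of `label:probability` pairs):

```python
import json

# the decoder output maps each instanceID to its prediction record
with open('result.json') as f:
    results = json.load(f)

for instance_id, record in results.items():
    # turn "label1:0.9 label2:0.1" into a {label: probability} dict
    probs = dict(pair.rsplit(':', 1) for pair in record['probs'].split())
    print('{}: truth={} prediction={} probs={}'.format(
        instance_id, record['truth'], record['prediction'], probs))
```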
67 | 
--------------------------------------------------------------------------------
/configs/quora.sample.config:
--------------------------------------------------------------------------------
1 | {
2 |   "train_path": "/u/zhigwang/zhigwang1/sentence_match/quora/data/train.tsv",
3 |   "dev_path": "/u/zhigwang/zhigwang1/sentence_match/quora/data/dev.tsv",
4 |   "word_vec_path": "/u/zhigwang/zhigwang1/sentence_match/quora/wordvec.txt",
5 |   "model_dir": "/u/zhigwang/zhigwang1/sentence_match/quora/logs",
6 |   "suffix": "quora",
7 |   "fix_word_vec": true,
8 |   "isLower": true,
9 |   "max_sent_length": 50,
10 |   "max_char_per_word": 10,
11 | 
12 |   "with_char": true,
13 |   "char_emb_dim": 20,
14 |   "char_lstm_dim": 40,
15 | 
16 | 
17 |   "batch_size": 60,
18 |   "max_epochs": 20,
19 |   "dropout_rate": 0.1,
20 |   "learning_rate": 0.0005,
21 |   "optimize_type": "adam",
22 |   "lambda_l2": 0.0,
23 |   "grad_clipper": 10.0,
24 | 
25 |   "context_layer_num": 1,
26 |   "context_lstm_dim": 100,
27 |   "aggregation_layer_num": 1,
28 |   "aggregation_lstm_dim": 100,
29 | 
30 |   "with_full_match": true,
31 |   "with_maxpool_match": false,
32 |   "with_max_attentive_match": false,
33 |   "with_attentive_match": true,
34 | 
35 |   "with_cosine": true,
36 |   "with_mp_cosine": true,
37 |   "cosine_MP_dim": 5,
38 | 
39 |   "att_dim": 50,
40 |   "att_type": "symmetric",
41 | 
42 |   "highway_layer_num": 1,
43 |   "with_highway": true,
44 |   "with_match_highway": true,
45 |   "with_aggregation_highway": true,
46 | 
47 |   "use_cudnn": true,
48 | 
49 |   "with_moving_average": false
50 | }
51 | 
--------------------------------------------------------------------------------
/configs/snli.sample.config:
--------------------------------------------------------------------------------
1 | {
2 |   "train_path": "/u/zhigwang/zhigwang1/sentence_match/snli/train.tsv",
3 |   "dev_path": "/u/zhigwang/zhigwang1/sentence_match/snli/dev.tsv",
4 |   "word_vec_path": "/u/zhigwang/zhigwang1/sentence_match/snli/wordvec.txt",
5 |   "model_dir": "/u/zhigwang/zhigwang1/sentence_match/snli/logs",
6 |   "suffix": "snli",
7 |   "fix_word_vec": true,
8 |   "isLower": true,
9 |   "max_sent_length": 100,
10 |   "max_char_per_word": 10,
11 | 
12 |   "with_char": true,
13 |   "char_emb_dim": 20,
14 |   "char_lstm_dim": 40,
15 | 
16 |   "batch_size": 100,
17 |   "max_epochs": 10,
18 |   "dropout_rate": 0.2,
19 |   "learning_rate": 0.001,
20 |   "optimize_type": "adam",
21 |   "lambda_l2": 0.0,
22 |   "grad_clipper": 10.0,
23 | 
24 |   "context_layer_num": 1,
25 |   "context_lstm_dim": 100,
26 |   "aggregation_layer_num": 1,
27 |   "aggregation_lstm_dim": 100,
28 | 
29 |   "with_full_match": true,
30 |   "with_maxpool_match": false,
31 |   "with_max_attentive_match": false,
32 |   "with_attentive_match": true,
33 | 
34 |   "with_cosine": true,
35 |   "with_mp_cosine": true,
36 |   "cosine_MP_dim": 5,
37 | 
38 |   "att_dim": 50,
39 |   "att_type": "symmetric",
40 | 
41 |   "highway_layer_num": 1,
42 |   "with_highway": true,
43 |   "with_match_highway": true,
44 |   "with_aggregation_highway": true,
45 | 
46 |   "use_cudnn": true,
47 | 
48 |   "with_moving_average": false
49 | }
50 | 
--------------------------------------------------------------------------------
/src/SentenceMatchDataStream.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import re
3 | 
4 | def make_batches(size, batch_size):
5 |     nb_batch = int(np.ceil(size/float(batch_size)))
6 |     return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)]  # zgwang: starting point of each batch
7 | 
8 | def pad_2d_vals(in_vals, dim1_size, dim2_size, 
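# pad_2d_vals (and pad_3d_vals below) right-pads a ragged list of index lists
# into a dense zero-initialized (dim1_size, dim2_size) matrix, truncating
# anything longer than the target shape; e.g. pad_2d_vals([[1, 2, 3], [4]], 2, 2)
# gives [[1, 2], [4, 0]].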
dtype=np.int32): 9 | out_val = np.zeros((dim1_size, dim2_size), dtype=dtype) 10 | if dim1_size > len(in_vals): dim1_size = len(in_vals) 11 | for i in xrange(dim1_size): 12 | cur_in_vals = in_vals[i] 13 | cur_dim2_size = dim2_size 14 | if cur_dim2_size > len(cur_in_vals): cur_dim2_size = len(cur_in_vals) 15 | out_val[i,:cur_dim2_size] = cur_in_vals[:cur_dim2_size] 16 | return out_val 17 | 18 | def pad_3d_vals(in_vals, dim1_size, dim2_size, dim3_size, dtype=np.int32): 19 | out_val = np.zeros((dim1_size, dim2_size, dim3_size), dtype=dtype) 20 | if dim1_size > len(in_vals): dim1_size = len(in_vals) 21 | for i in xrange(dim1_size): 22 | in_vals_i = in_vals[i] 23 | cur_dim2_size = dim2_size 24 | if cur_dim2_size > len(in_vals_i): cur_dim2_size = len(in_vals_i) 25 | for j in xrange(cur_dim2_size): 26 | in_vals_ij = in_vals_i[j] 27 | cur_dim3_size = dim3_size 28 | if cur_dim3_size > len(in_vals_ij): cur_dim3_size = len(in_vals_ij) 29 | out_val[i, j, :cur_dim3_size] = in_vals_ij[:cur_dim3_size] 30 | return out_val 31 | 32 | 33 | def read_all_instances(inpath, word_vocab=None, label_vocab=None, char_vocab=None, max_sent_length=100, 34 | max_char_per_word=10, isLower=True): 35 | instances = [] 36 | infile = open(inpath, 'rt') 37 | idx = -1 38 | for line in infile: 39 | idx += 1 40 | line = line.decode('utf-8').strip() 41 | if line.startswith('-'): continue 42 | items = re.split("\t", line) 43 | label = items[0] 44 | sentence1 = items[1].strip() 45 | sentence2 = items[2].strip() 46 | cur_ID = "{}".format(idx) 47 | if len(items)>=4: cur_ID = items[3] 48 | if isLower: 49 | sentence1 = sentence1.lower() 50 | sentence2 = sentence2.lower() 51 | if label_vocab is not None: 52 | label_id = label_vocab.getIndex(label) 53 | if label_id >= label_vocab.vocab_size: label_id = 0 54 | else: 55 | label_id = int(label) 56 | word_idx_1 = word_vocab.to_index_sequence(sentence1) 57 | word_idx_2 = word_vocab.to_index_sequence(sentence2) 58 | if char_vocab is not None: 59 | char_matrix_idx_1 = char_vocab.to_character_matrix(sentence1, max_char_per_word=max_char_per_word) 60 | char_matrix_idx_2 = char_vocab.to_character_matrix(sentence2, max_char_per_word=max_char_per_word) 61 | else: 62 | char_matrix_idx_1 = None 63 | char_matrix_idx_2 = None 64 | if len(word_idx_1) > max_sent_length: 65 | word_idx_1 = word_idx_1[:max_sent_length] 66 | if char_vocab is not None: char_matrix_idx_1 = char_matrix_idx_1[:max_sent_length] 67 | if len(word_idx_2) > max_sent_length: 68 | word_idx_2 = word_idx_2[:max_sent_length] 69 | if char_vocab is not None: char_matrix_idx_2 = char_matrix_idx_2[:max_sent_length] 70 | instances.append((label, sentence1, sentence2, label_id, word_idx_1, word_idx_2, char_matrix_idx_1, char_matrix_idx_2, cur_ID)) 71 | infile.close() 72 | return instances 73 | 74 | class SentenceMatchDataStream(object): 75 | def __init__(self, inpath, word_vocab=None, char_vocab=None, label_vocab=None, 76 | isShuffle=False, isLoop=False, isSort=True, options=None): 77 | instances = read_all_instances(inpath, word_vocab=word_vocab, label_vocab=label_vocab, 78 | char_vocab=char_vocab, max_sent_length=options.max_sent_length, max_char_per_word=options.max_char_per_word, 79 | isLower=options.isLower) 80 | 81 | # sort instances based on sentence length 82 | if isSort: instances = sorted(instances, key=lambda instance: (len(instance[4]), len(instance[5]))) # sort instances based on length 83 | self.num_instances = len(instances) 84 | 85 | # distribute into different buckets 86 | batch_spans = make_batches(self.num_instances, 
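# make_batches(size, batch_size) returns the [start, end) span of every batch,
# e.g. make_batches(10, 4) -> [(0, 4), (4, 8), (8, 10)]; because instances were
# sorted by length above, each span groups sentences of similar length.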
options.batch_size) 87 | self.batches = [] 88 | for batch_index, (batch_start, batch_end) in enumerate(batch_spans): 89 | cur_instances = [] 90 | for i in xrange(batch_start, batch_end): 91 | cur_instances.append(instances[i]) 92 | cur_batch = InstanceBatch(cur_instances, with_char=options.with_char) 93 | self.batches.append(cur_batch) 94 | 95 | instances = None 96 | self.num_batch = len(self.batches) 97 | self.index_array = np.arange(self.num_batch) 98 | self.isShuffle = isShuffle 99 | if self.isShuffle: np.random.shuffle(self.index_array) 100 | self.isLoop = isLoop 101 | self.cur_pointer = 0 102 | 103 | def nextBatch(self): 104 | if self.cur_pointer>=self.num_batch: 105 | if not self.isLoop: return None 106 | self.cur_pointer = 0 107 | if self.isShuffle: np.random.shuffle(self.index_array) 108 | # print('{} '.format(self.index_array[self.cur_pointer])) 109 | cur_batch = self.batches[self.index_array[self.cur_pointer]] 110 | self.cur_pointer += 1 111 | return cur_batch 112 | 113 | def shuffle(self): 114 | if self.isShuffle: np.random.shuffle(self.index_array) 115 | 116 | def reset(self): 117 | self.cur_pointer = 0 118 | 119 | def get_num_batch(self): 120 | return self.num_batch 121 | 122 | def get_num_instance(self): 123 | return self.num_instances 124 | 125 | def get_batch(self, i): 126 | if i >= self.num_batch: return None 127 | return self.batches[self.index_array[i]] 128 | 129 | 130 | class InstanceBatch(object): 131 | def __init__(self, instances, with_char=False): 132 | self.instances = instances 133 | self.batch_size = len(instances) 134 | self.question_len = 0 135 | self.passage_len = 0 136 | 137 | self.question_lengths = [] # tf.placeholder(tf.int32, [None]) 138 | self.in_question_words = [] # tf.placeholder(tf.int32, [None, None]) # [batch_size, question_len] 139 | self.passage_lengths = [] # tf.placeholder(tf.int32, [None]) 140 | self.in_passage_words = [] # tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len] 141 | self.label_truth = [] # [batch_size] 142 | 143 | if with_char: 144 | self.in_question_chars = [] # tf.placeholder(tf.int32, [None, None, None]) # [batch_size, question_len, q_char_len] 145 | self.question_char_lengths = [] # tf.placeholder(tf.int32, [None, None]) # [batch_size, question_len] 146 | self.in_passage_chars = [] # tf.placeholder(tf.int32, [None, None, None]) # [batch_size, passage_len, p_char_len] 147 | self.passage_char_lengths = [] # tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len] 148 | 149 | for (label, sentence1, sentence2, label_id, word_idx_1, word_idx_2, char_matrix_idx_1, char_matrix_idx_2, cur_ID) in instances: 150 | cur_question_length = len(word_idx_1) 151 | cur_passage_length = len(word_idx_2) 152 | if self.question_len < cur_question_length: self.question_len = cur_question_length 153 | if self.passage_len < cur_passage_length: self.passage_len = cur_passage_length 154 | self.question_lengths.append(cur_question_length) 155 | self.in_question_words.append(word_idx_1) 156 | self.passage_lengths.append(cur_passage_length) 157 | self.in_passage_words.append(word_idx_2) 158 | self.label_truth.append(label_id) 159 | if with_char: 160 | self.in_question_chars.append(char_matrix_idx_1) 161 | self.in_passage_chars.append(char_matrix_idx_2) 162 | self.question_char_lengths.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_1]) 163 | self.passage_char_lengths.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_2]) 164 | 165 | # padding all value into np arrays 166 | self.question_lengths = 
np.array(self.question_lengths, dtype=np.int32)
167 |         self.in_question_words = pad_2d_vals(self.in_question_words, self.batch_size, self.question_len, dtype=np.int32)
168 |         self.passage_lengths = np.array(self.passage_lengths, dtype=np.int32)
169 |         self.in_passage_words = pad_2d_vals(self.in_passage_words, self.batch_size, self.passage_len, dtype=np.int32)
170 |         self.label_truth = np.array(self.label_truth, dtype=np.int32)
171 |         if with_char:
172 |             max_char_length1 = np.max([np.max(aa) for aa in self.question_char_lengths])
173 |             self.in_question_chars = pad_3d_vals(self.in_question_chars, self.batch_size, self.question_len,
174 |                                                  max_char_length1, dtype=np.int32)
175 |             max_char_length2 = np.max([np.max(aa) for aa in self.passage_char_lengths])
176 |             self.in_passage_chars = pad_3d_vals(self.in_passage_chars, self.batch_size, self.passage_len,
177 |                                                 max_char_length2, dtype=np.int32)
178 | 
179 |             self.question_char_lengths = pad_2d_vals(self.question_char_lengths, self.batch_size, self.question_len)
180 |             self.passage_char_lengths = pad_2d_vals(self.passage_char_lengths, self.batch_size, self.passage_len)
181 | 
--------------------------------------------------------------------------------
/src/SentenceMatchDataStream.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/SentenceMatchDataStream.pyc
--------------------------------------------------------------------------------
/src/SentenceMatchDecoder.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import print_function
3 | import argparse
4 | import sys
5 | from vocab_utils import Vocab
6 | import namespace_utils
7 | 
8 | import tensorflow as tf
9 | import SentenceMatchTrainer
10 | from SentenceMatchModelGraph import SentenceMatchModelGraph
11 | from SentenceMatchDataStream import SentenceMatchDataStream
12 | 
13 | 
14 | if __name__ == '__main__':
15 |     parser = argparse.ArgumentParser()
16 |     parser.add_argument('--model_prefix', type=str, required=True, help='Prefix to the models.')
17 |     parser.add_argument('--in_path', type=str, required=True, help='The path to the test file.')
18 |     parser.add_argument('--out_path', type=str, required=True, help='The path to the output file.')
19 |     parser.add_argument('--word_vec_path', type=str, help='Word embedding file for the input file.')
20 | 
21 |     args, unparsed = parser.parse_known_args()
22 | 
23 |     # load the configuration file
24 |     print('Loading configurations.')
25 |     options = namespace_utils.load_namespace(args.model_prefix + ".config.json")
26 | 
27 |     if args.word_vec_path is None: args.word_vec_path = options.word_vec_path
28 | 
29 | 
30 |     # load vocabs
31 |     print('Loading vocabs.')
32 |     word_vocab = Vocab(args.word_vec_path, fileformat='txt3')
33 |     label_vocab = Vocab(args.model_prefix + ".label_vocab", fileformat='txt2')
34 |     print('word_vocab: {}'.format(word_vocab.word_vecs.shape))
35 |     print('label_vocab: {}'.format(label_vocab.word_vecs.shape))
36 |     num_classes = label_vocab.size()
37 |     char_vocab = None  # stays None when the model was trained without character embeddings
38 |     if options.with_char:
39 |         char_vocab = Vocab(args.model_prefix + ".char_vocab", fileformat='txt2')
40 |         print('char_vocab: {}'.format(char_vocab.word_vecs.shape))
41 | 
42 |     print('Build SentenceMatchDataStream ... 
') 43 | testDataStream = SentenceMatchDataStream(args.in_path, word_vocab=word_vocab, char_vocab=char_vocab, 44 | label_vocab=label_vocab, 45 | isShuffle=False, isLoop=True, isSort=True, options=options) 46 | print('Number of instances in devDataStream: {}'.format(testDataStream.get_num_instance())) 47 | print('Number of batches in devDataStream: {}'.format(testDataStream.get_num_batch())) 48 | sys.stdout.flush() 49 | 50 | best_path = args.model_prefix + ".best.model" 51 | init_scale = 0.01 52 | with tf.Graph().as_default(): 53 | initializer = tf.random_uniform_initializer(-init_scale, init_scale) 54 | global_step = tf.train.get_or_create_global_step() 55 | with tf.variable_scope("Model", reuse=False, initializer=initializer): 56 | valid_graph = SentenceMatchModelGraph(num_classes, word_vocab=word_vocab, char_vocab=char_vocab, 57 | is_training=False, options=options) 58 | 59 | initializer = tf.global_variables_initializer() 60 | vars_ = {} 61 | for var in tf.global_variables(): 62 | if "word_embedding" in var.name: continue 63 | if not var.name.startswith("Model"): continue 64 | vars_[var.name.split(":")[0]] = var 65 | saver = tf.train.Saver(vars_) 66 | 67 | sess = tf.Session() 68 | sess.run(initializer) 69 | print("Restoring model from " + best_path) 70 | saver.restore(sess, best_path) 71 | print("DONE!") 72 | acc = SentenceMatchTrainer.evaluation(sess, valid_graph, testDataStream, outpath=args.out_path, 73 | label_vocab=label_vocab) 74 | print("Accuracy for test set is %.2f" % acc) 75 | 76 | 77 | -------------------------------------------------------------------------------- /src/SentenceMatchModelGraph.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import layer_utils 3 | import match_utils 4 | 5 | 6 | class SentenceMatchModelGraph(object): 7 | def __init__(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, options=None, global_step=None): 8 | self.options = options 9 | self.create_placeholders() 10 | self.create_model_graph(num_classes, word_vocab, char_vocab, is_training, global_step=global_step) 11 | 12 | def create_placeholders(self): 13 | self.question_lengths = tf.placeholder(tf.int32, [None]) 14 | self.passage_lengths = tf.placeholder(tf.int32, [None]) 15 | self.truth = tf.placeholder(tf.int32, [None]) # [batch_size] 16 | self.in_question_words = tf.placeholder(tf.int32, [None, None]) # [batch_size, question_len] 17 | self.in_passage_words = tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len] 18 | 19 | if self.options.with_char: 20 | self.question_char_lengths = tf.placeholder(tf.int32, [None,None]) # [batch_size, question_len] 21 | self.passage_char_lengths = tf.placeholder(tf.int32, [None,None]) # [batch_size, passage_len] 22 | self.in_question_chars = tf.placeholder(tf.int32, [None, None, None]) # [batch_size, question_len, q_char_len] 23 | self.in_passage_chars = tf.placeholder(tf.int32, [None, None, None]) # [batch_size, passage_len, p_char_len] 24 | 25 | def create_feed_dict(self, cur_batch, is_training=False): 26 | feed_dict = { 27 | self.question_lengths: cur_batch.question_lengths, 28 | self.passage_lengths: cur_batch.passage_lengths, 29 | self.in_question_words: cur_batch.in_question_words, 30 | self.in_passage_words: cur_batch.in_passage_words, 31 | self.truth : cur_batch.label_truth, 32 | } 33 | 34 | if self.options.with_char: 35 | feed_dict[self.question_char_lengths] = cur_batch.question_char_lengths 36 | feed_dict[self.passage_char_lengths] = 
cur_batch.passage_char_lengths 37 | feed_dict[self.in_question_chars] = cur_batch.in_question_chars 38 | feed_dict[self.in_passage_chars] = cur_batch.in_passage_chars 39 | 40 | return feed_dict 41 | 42 | 43 | def create_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None): 44 | options = self.options 45 | # ======word representation layer====== 46 | in_question_repres = [] 47 | in_passage_repres = [] 48 | input_dim = 0 49 | if word_vocab is not None: 50 | word_vec_trainable = True 51 | cur_device = '/gpu:0' 52 | if options.fix_word_vec: 53 | word_vec_trainable = False 54 | cur_device = '/cpu:0' 55 | with tf.device(cur_device): 56 | self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable, 57 | initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32) 58 | 59 | in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim] 60 | in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim] 61 | in_question_repres.append(in_question_word_repres) 62 | in_passage_repres.append(in_passage_word_repres) 63 | 64 | input_shape = tf.shape(self.in_question_words) 65 | batch_size = input_shape[0] 66 | question_len = input_shape[1] 67 | input_shape = tf.shape(self.in_passage_words) 68 | passage_len = input_shape[1] 69 | input_dim += word_vocab.word_dim 70 | 71 | if options.with_char and char_vocab is not None: 72 | input_shape = tf.shape(self.in_question_chars) 73 | batch_size = input_shape[0] 74 | question_len = input_shape[1] 75 | q_char_len = input_shape[2] 76 | input_shape = tf.shape(self.in_passage_chars) 77 | passage_len = input_shape[1] 78 | p_char_len = input_shape[2] 79 | char_dim = char_vocab.word_dim 80 | self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32) 81 | 82 | in_question_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_question_chars) # [batch_size, question_len, q_char_len, char_dim] 83 | in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim]) 84 | question_char_lengths = tf.reshape(self.question_char_lengths, [-1]) 85 | quesiton_char_mask = tf.sequence_mask(question_char_lengths, q_char_len, dtype=tf.float32) # [batch_size*question_len, q_char_len] 86 | in_question_char_repres = tf.multiply(in_question_char_repres, tf.expand_dims(quesiton_char_mask, axis=-1)) 87 | 88 | 89 | in_passage_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_passage_chars) # [batch_size, passage_len, p_char_len, char_dim] 90 | in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim]) 91 | passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1]) 92 | passage_char_mask = tf.sequence_mask(passage_char_lengths, p_char_len, dtype=tf.float32) # [batch_size*passage_len, p_char_len] 93 | in_passage_char_repres = tf.multiply(in_passage_char_repres, tf.expand_dims(passage_char_mask, axis=-1)) 94 | 95 | (question_char_outputs_fw, question_char_outputs_bw, _) = layer_utils.my_lstm_layer(in_question_char_repres, options.char_lstm_dim, 96 | input_lengths=question_char_lengths,scope_name="char_lstm", reuse=False, 97 | is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) 98 | question_char_outputs_fw = layer_utils.collect_final_step_of_lstm(question_char_outputs_fw, question_char_lengths 
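# the forward LSTM's last valid output lives at index (length - 1), which is the
# "- 1" continuing below; for the backward direction the final state is at step 0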
- 1) 99 | question_char_outputs_bw = question_char_outputs_bw[:, 0, :] 100 | question_char_outputs = tf.concat(axis=1, values=[question_char_outputs_fw, question_char_outputs_bw]) 101 | question_char_outputs = tf.reshape(question_char_outputs, [batch_size, question_len, 2*options.char_lstm_dim]) 102 | 103 | (passage_char_outputs_fw, passage_char_outputs_bw, _) = layer_utils.my_lstm_layer(in_passage_char_repres, options.char_lstm_dim, 104 | input_lengths=passage_char_lengths, scope_name="char_lstm", reuse=True, 105 | is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) 106 | passage_char_outputs_fw = layer_utils.collect_final_step_of_lstm(passage_char_outputs_fw, passage_char_lengths - 1) 107 | passage_char_outputs_bw = passage_char_outputs_bw[:, 0, :] 108 | passage_char_outputs = tf.concat(axis=1, values=[passage_char_outputs_fw, passage_char_outputs_bw]) 109 | passage_char_outputs = tf.reshape(passage_char_outputs, [batch_size, passage_len, 2*options.char_lstm_dim]) 110 | 111 | in_question_repres.append(question_char_outputs) 112 | in_passage_repres.append(passage_char_outputs) 113 | 114 | input_dim += 2*options.char_lstm_dim 115 | 116 | in_question_repres = tf.concat(axis=2, values=in_question_repres) # [batch_size, question_len, dim] 117 | in_passage_repres = tf.concat(axis=2, values=in_passage_repres) # [batch_size, passage_len, dim] 118 | 119 | if is_training: 120 | in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate)) 121 | in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate)) 122 | 123 | mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len] 124 | question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len] 125 | 126 | # ======Highway layer====== 127 | if options.with_highway: 128 | with tf.variable_scope("input_highway"): 129 | in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, options.highway_layer_num) 130 | tf.get_variable_scope().reuse_variables() 131 | in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, options.highway_layer_num) 132 | 133 | # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1)) 134 | # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1)) 135 | 136 | # ========Bilateral Matching===== 137 | (match_representation, match_dim) = match_utils.bilateral_match_func(in_question_repres, in_passage_repres, 138 | self.question_lengths, self.passage_lengths, question_mask, mask, input_dim, is_training, options=options) 139 | 140 | #========Prediction Layer========= 141 | # match_dim = 4 * self.options.aggregation_lstm_dim 142 | w_0 = tf.get_variable("w_0", [match_dim, match_dim/2], dtype=tf.float32) 143 | b_0 = tf.get_variable("b_0", [match_dim/2], dtype=tf.float32) 144 | w_1 = tf.get_variable("w_1", [match_dim/2, num_classes],dtype=tf.float32) 145 | b_1 = tf.get_variable("b_1", [num_classes],dtype=tf.float32) 146 | 147 | # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate)) 148 | logits = tf.matmul(match_representation, w_0) + b_0 149 | logits = tf.tanh(logits) 150 | if is_training: logits = tf.nn.dropout(logits, (1 - options.dropout_rate)) 151 | logits = tf.matmul(logits, w_1) + b_1 152 | 153 | self.prob = tf.nn.softmax(logits) 154 | 155 | gold_matrix = tf.one_hot(self.truth, num_classes, 
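# self.truth holds integer class ids of shape [batch_size]; one-hot expands them
# to [batch_size, num_classes] so softmax_cross_entropy_with_logits below can
# compare the gold distribution against the unnormalized logits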
dtype=tf.float32) 156 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix)) 157 | 158 | correct = tf.nn.in_top_k(logits, self.truth, 1) 159 | self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32)) 160 | self.predictions = tf.argmax(self.prob, 1) 161 | 162 | if not is_training: return 163 | 164 | tvars = tf.trainable_variables() 165 | if self.options.lambda_l2>0.0: 166 | l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) 167 | self.loss = self.loss + self.options.lambda_l2 * l2_loss 168 | 169 | if self.options.optimize_type == 'adadelta': 170 | optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate) 171 | elif self.options.optimize_type == 'adam': 172 | optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate) 173 | 174 | grads = layer_utils.compute_gradients(self.loss, tvars) 175 | grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper) 176 | self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step) 177 | # self.train_op = optimizer.apply_gradients(zip(grads, tvars)) 178 | 179 | if self.options.with_moving_average: 180 | # Track the moving averages of all trainable variables. 181 | MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average. 182 | variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step) 183 | variables_averages_op = variable_averages.apply(tf.trainable_variables()) 184 | train_ops = [self.train_op, variables_averages_op] 185 | self.train_op = tf.group(*train_ops) 186 | 187 | -------------------------------------------------------------------------------- /src/SentenceMatchModelGraph.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/SentenceMatchModelGraph.pyc -------------------------------------------------------------------------------- /src/SentenceMatchTrainer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function 3 | import argparse 4 | import os 5 | import sys 6 | import time 7 | import re 8 | import tensorflow as tf 9 | import json 10 | 11 | from vocab_utils import Vocab 12 | from SentenceMatchDataStream import SentenceMatchDataStream 13 | from SentenceMatchModelGraph import SentenceMatchModelGraph 14 | import namespace_utils 15 | 16 | def collect_vocabs(train_path, with_POS=False, with_NER=False): 17 | all_labels = set() 18 | all_words = set() 19 | all_POSs = None 20 | all_NERs = None 21 | if with_POS: all_POSs = set() 22 | if with_NER: all_NERs = set() 23 | infile = open(train_path, 'rt') 24 | for line in infile: 25 | line = line.decode('utf-8').strip() 26 | if line.startswith('-'): continue 27 | items = re.split("\t", line) 28 | label = items[0] 29 | sentence1 = re.split("\\s+",items[1].lower()) 30 | sentence2 = re.split("\\s+",items[2].lower()) 31 | all_labels.add(label) 32 | all_words.update(sentence1) 33 | all_words.update(sentence2) 34 | if with_POS: 35 | all_POSs.update(re.split("\\s+",items[3])) 36 | all_POSs.update(re.split("\\s+",items[4])) 37 | if with_NER: 38 | all_NERs.update(re.split("\\s+",items[5])) 39 | all_NERs.update(re.split("\\s+",items[6])) 40 | infile.close() 41 | 42 | all_chars = set() 43 | for word in all_words: 44 | for char in word: 45 | all_chars.add(char) 46 | return (all_words, all_chars, 
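# note: all_POSs / all_NERs are still None here unless with_POS / with_NER was set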
all_labels, all_POSs, all_NERs)
47 | 
48 | def output_probs(probs, label_vocab):
49 |     out_string = ""
50 |     for i in xrange(probs.size):
51 |         out_string += " {}:{}".format(label_vocab.getWord(i), probs[i])
52 |     return out_string.strip()
53 | 
54 | def evaluation(sess, valid_graph, devDataStream, outpath=None, label_vocab=None):
55 |     if outpath is not None:
56 |         result_json = {}
57 |     total = 0
58 |     correct = 0
59 |     for batch_index in xrange(devDataStream.get_num_batch()):  # for each batch
60 |         cur_batch = devDataStream.get_batch(batch_index)
61 |         total += cur_batch.batch_size
62 |         feed_dict = valid_graph.create_feed_dict(cur_batch, is_training=False)  # no dropout during evaluation
63 |         [cur_correct, probs, predictions] = sess.run([valid_graph.eval_correct, valid_graph.prob, valid_graph.predictions], feed_dict=feed_dict)
64 |         correct += cur_correct
65 |         if outpath is not None:
66 |             for i in xrange(cur_batch.batch_size):
67 |                 (label, sentence1, sentence2, _, _, _, _, _, cur_ID) = cur_batch.instances[i]
68 |                 result_json[cur_ID] = {
69 |                     "ID": cur_ID,
70 |                     "truth": label,
71 |                     "sent1": sentence1,
72 |                     "sent2": sentence2,
73 |                     "prediction": label_vocab.getWord(predictions[i]),
74 |                     "probs": output_probs(probs[i], label_vocab),
75 |                 }
76 |     accuracy = correct / float(total) * 100
77 |     if outpath is not None:
78 |         with open(outpath, 'w') as outfile:
79 |             json.dump(result_json, outfile)
80 |     return accuracy
81 | 
82 | def train(sess, saver, train_graph, valid_graph, trainDataStream, devDataStream, options, best_path):
83 |     best_accuracy = -1
84 |     for epoch in range(options.max_epochs):
85 |         print('Train in epoch %d' % epoch)
86 |         # training
87 |         trainDataStream.shuffle()
88 |         num_batch = trainDataStream.get_num_batch()
89 |         start_time = time.time()
90 |         total_loss = 0
91 |         for batch_index in xrange(num_batch):  # for each batch
92 |             cur_batch = trainDataStream.get_batch(batch_index)
93 |             feed_dict = train_graph.create_feed_dict(cur_batch, is_training=True)
94 |             _, loss_value = sess.run([train_graph.train_op, train_graph.loss], feed_dict=feed_dict)
95 |             total_loss += loss_value
96 |             if batch_index % 100 == 0:
97 |                 print('{} '.format(batch_index), end="")
98 |                 sys.stdout.flush()
99 | 
100 |         print()
101 |         duration = time.time() - start_time
102 |         print('Epoch %d: loss = %.4f (%.3f sec)' % (epoch, total_loss / num_batch, duration))
103 |         # evaluation
104 |         start_time = time.time()
105 |         acc = evaluation(sess, valid_graph, devDataStream)
106 |         duration = time.time() - start_time
107 |         print("Accuracy: %.2f" % acc)
108 |         print('Evaluation time: %.3f sec' % (duration))
109 |         if acc >= best_accuracy:
110 |             best_accuracy = acc
111 |             saver.save(sess, best_path)
112 | 
113 | 
114 | def main(FLAGS):
115 |     train_path = FLAGS.train_path
116 |     dev_path = FLAGS.dev_path
117 |     word_vec_path = FLAGS.word_vec_path
118 |     log_dir = FLAGS.model_dir
119 |     if not os.path.exists(log_dir):
120 |         os.makedirs(log_dir)
121 | 
122 |     path_prefix = log_dir + "/SentenceMatch.{}".format(FLAGS.suffix)
123 | 
124 |     namespace_utils.save_namespace(FLAGS, path_prefix + ".config.json")
125 | 
126 |     # build vocabs
127 |     word_vocab = Vocab(word_vec_path, fileformat='txt3')
128 | 
129 |     best_path = path_prefix + '.best.model'
130 |     char_path = path_prefix + ".char_vocab"
131 |     label_path = path_prefix + ".label_vocab"
132 |     has_pre_trained_model = False
133 |     char_vocab = None
134 |     if os.path.exists(best_path + ".index"):
135 |         has_pre_trained_model = True
136 |         print('Loading vocabs from a pre-trained model ...')
137 |         label_vocab = Vocab(label_path, fileformat='txt2')
138 |         if FLAGS.with_char: char_vocab = Vocab(char_path, fileformat='txt2')
139 |     else:
140 |         print('Collecting words, chars and labels ...')
141 |         (all_words, all_chars, all_labels, all_POSs, all_NERs) = collect_vocabs(train_path)
142 |         print('Number of words: {}'.format(len(all_words)))
143 |         label_vocab = Vocab(fileformat='voc', voc=all_labels, dim=2)
144 |         label_vocab.dump_to_txt2(label_path)
145 | 
146 |         if FLAGS.with_char:
147 |             print('Number of chars: {}'.format(len(all_chars)))
148 |             char_vocab = Vocab(fileformat='voc', voc=all_chars, dim=FLAGS.char_emb_dim)
149 |             char_vocab.dump_to_txt2(char_path)
150 | 
151 |     print('word_vocab shape is {}'.format(word_vocab.word_vecs.shape))
152 |     num_classes = label_vocab.size()
153 |     print("Number of labels: {}".format(num_classes))
154 |     sys.stdout.flush()
155 | 
156 |     print('Build SentenceMatchDataStream ... ')
157 |     trainDataStream = SentenceMatchDataStream(train_path, word_vocab=word_vocab, char_vocab=char_vocab, label_vocab=label_vocab,
158 |                                               isShuffle=True, isLoop=True, isSort=True, options=FLAGS)
159 |     print('Number of instances in trainDataStream: {}'.format(trainDataStream.get_num_instance()))
160 |     print('Number of batches in trainDataStream: {}'.format(trainDataStream.get_num_batch()))
161 |     sys.stdout.flush()
162 | 
163 |     devDataStream = SentenceMatchDataStream(dev_path, word_vocab=word_vocab, char_vocab=char_vocab, label_vocab=label_vocab,
164 |                                             isShuffle=False, isLoop=True, isSort=True, options=FLAGS)
165 |     print('Number of instances in devDataStream: {}'.format(devDataStream.get_num_instance()))
166 |     print('Number of batches in devDataStream: {}'.format(devDataStream.get_num_batch()))
167 |     sys.stdout.flush()
168 | 
169 |     init_scale = 0.01
170 |     with tf.Graph().as_default():
171 |         initializer = tf.random_uniform_initializer(-init_scale, init_scale)
172 |         global_step = tf.train.get_or_create_global_step()
173 |         with tf.variable_scope("Model", reuse=None, initializer=initializer):
174 |             train_graph = SentenceMatchModelGraph(num_classes, word_vocab=word_vocab, char_vocab=char_vocab,
175 |                                                   is_training=True, options=FLAGS, global_step=global_step)
176 | 
177 |         with tf.variable_scope("Model", reuse=True, initializer=initializer):
178 |             valid_graph = SentenceMatchModelGraph(num_classes, word_vocab=word_vocab, char_vocab=char_vocab,
179 |                                                   is_training=False, options=FLAGS)
180 | 
181 | 
182 |         initializer = tf.global_variables_initializer()
183 |         vars_ = {}
184 |         for var in tf.global_variables():
185 |             if "word_embedding" in var.name: continue
186 |             # if not var.name.startswith("Model"): continue
187 |             vars_[var.name.split(":")[0]] = var
188 |         saver = tf.train.Saver(vars_)
189 | 
190 |         sess = tf.Session()
191 |         sess.run(initializer)
192 |         if has_pre_trained_model:
193 |             print("Restoring model from " + best_path)
194 |             saver.restore(sess, best_path)
195 |             print("DONE!")
196 | 
197 |         # training
198 |         train(sess, saver, train_graph, valid_graph, trainDataStream, devDataStream, FLAGS, best_path)
199 | 
200 | def enrich_options(options):
201 |     if not options.__dict__.has_key("in_format"):
202 |         options.__dict__["in_format"] = 'tsv'
203 | 
204 |     return options
205 | 
206 | if __name__ == '__main__':
207 |     parser = argparse.ArgumentParser()
208 |     parser.add_argument('--train_path', type=str, help='Path to the train set.')
209 |     parser.add_argument('--dev_path', type=str, help='Path to the dev set.')
210 |     parser.add_argument('--test_path', type=str, help='Path to the test set.')
211 |     parser.add_argument('--word_vec_path', type=str, help='Path to the pre-trained word 
vector model.') 212 | parser.add_argument('--model_dir', type=str, help='Directory to save model files.') 213 | parser.add_argument('--batch_size', type=int, default=60, help='Number of instances in each batch.') 214 | parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate.') 215 | parser.add_argument('--lambda_l2', type=float, default=0.0, help='The coefficient of L2 regularizer.') 216 | parser.add_argument('--dropout_rate', type=float, default=0.1, help='Dropout ratio.') 217 | parser.add_argument('--max_epochs', type=int, default=10, help='Maximum epochs for training.') 218 | parser.add_argument('--optimize_type', type=str, default='adam', help='Optimizer type.') 219 | parser.add_argument('--char_emb_dim', type=int, default=20, help='Number of dimension for character embeddings.') 220 | parser.add_argument('--char_lstm_dim', type=int, default=100, help='Number of dimension for character-composed embeddings.') 221 | parser.add_argument('--context_lstm_dim', type=int, default=100, help='Number of dimension for context representation layer.') 222 | parser.add_argument('--aggregation_lstm_dim', type=int, default=100, help='Number of dimension for aggregation layer.') 223 | parser.add_argument('--max_char_per_word', type=int, default=10, help='Maximum number of characters for each word.') 224 | parser.add_argument('--max_sent_length', type=int, default=100, help='Maximum number of words within each sentence.') 225 | parser.add_argument('--aggregation_layer_num', type=int, default=1, help='Number of LSTM layers for aggregation layer.') 226 | parser.add_argument('--context_layer_num', type=int, default=1, help='Number of LSTM layers for context representation layer.') 227 | parser.add_argument('--highway_layer_num', type=int, default=1, help='Number of highway layers.') 228 | parser.add_argument('--suffix', type=str, default='normal', help='Suffix of the model name.') 229 | parser.add_argument('--fix_word_vec', default=False, help='Fix pre-trained word embeddings during training.', action='store_true') 230 | parser.add_argument('--with_highway', default=False, help='Utilize highway layers.', action='store_true') 231 | parser.add_argument('--with_match_highway', default=False, help='Utilize highway layers for matching layer.', action='store_true') 232 | parser.add_argument('--with_aggregation_highway', default=False, help='Utilize highway layers for aggregation layer.', action='store_true') 233 | parser.add_argument('--with_full_match', default=False, help='With full matching.', action='store_true') 234 | parser.add_argument('--with_maxpool_match', default=False, help='With maxpooling matching', action='store_true') 235 | parser.add_argument('--with_attentive_match', default=False, help='With attentive matching', action='store_true') 236 | parser.add_argument('--with_max_attentive_match', default=False, help='With max attentive matching.', action='store_true') 237 | parser.add_argument('--with_char', default=False, help='With character-composed embeddings.', action='store_true') 238 | 239 | parser.add_argument('--config_path', type=str, help='Configuration file.') 240 | 241 | # print("CUDA_VISIBLE_DEVICES " + os.environ['CUDA_VISIBLE_DEVICES']) 242 | args, unparsed = parser.parse_known_args() 243 | if args.config_path is not None: 244 | print('Loading the configuration from ' + args.config_path) 245 | FLAGS = namespace_utils.load_namespace(args.config_path) 246 | else: 247 | FLAGS = args 248 | sys.stdout.flush() 249 | 250 | # enrich arguments to backwards 
compatibility 251 | FLAGS = enrich_options(FLAGS) 252 | 253 | main(FLAGS) 254 | 255 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/__init__.py -------------------------------------------------------------------------------- /src/layer_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.ops import nn_ops 3 | 4 | def my_lstm_layer(input_reps, lstm_dim, input_lengths=None, scope_name=None, reuse=False, is_training=True, 5 | dropout_rate=0.2, use_cudnn=True): 6 | ''' 7 | :param inputs: [batch_size, seq_len, feature_dim] 8 | :param lstm_dim: 9 | :param scope_name: 10 | :param reuse: 11 | :param is_training: 12 | :param dropout_rate: 13 | :return: 14 | ''' 15 | input_reps = dropout_layer(input_reps, dropout_rate, is_training=is_training) 16 | with tf.variable_scope(scope_name, reuse=reuse): 17 | if use_cudnn: 18 | inputs = tf.transpose(input_reps, [1, 0, 2]) 19 | lstm = tf.contrib.cudnn_rnn.CudnnLSTM(1, lstm_dim, direction="bidirectional", 20 | name="{}_cudnn_bi_lstm".format(scope_name), dropout=dropout_rate if is_training else 0) 21 | outputs, _ = lstm(inputs) 22 | outputs = tf.transpose(outputs, [1, 0, 2]) 23 | f_rep = outputs[:, :, 0:lstm_dim] 24 | b_rep = outputs[:, :, lstm_dim:2*lstm_dim] 25 | else: 26 | context_lstm_cell_fw = tf.nn.rnn_cell.BasicLSTMCell(lstm_dim) 27 | context_lstm_cell_bw = tf.nn.rnn_cell.BasicLSTMCell(lstm_dim) 28 | if is_training: 29 | context_lstm_cell_fw = tf.nn.rnn_cell.DropoutWrapper(context_lstm_cell_fw, output_keep_prob=(1 - dropout_rate)) 30 | context_lstm_cell_bw = tf.nn.rnn_cell.DropoutWrapper(context_lstm_cell_bw, output_keep_prob=(1 - dropout_rate)) 31 | context_lstm_cell_fw = tf.nn.rnn_cell.MultiRNNCell([context_lstm_cell_fw]) 32 | context_lstm_cell_bw = tf.nn.rnn_cell.MultiRNNCell([context_lstm_cell_bw]) 33 | 34 | (f_rep, b_rep), _ = tf.nn.bidirectional_dynamic_rnn( 35 | context_lstm_cell_fw, context_lstm_cell_bw, input_reps, dtype=tf.float32, 36 | sequence_length=input_lengths) # [batch_size, question_len, context_lstm_dim] 37 | outputs = tf.concat(axis=2, values=[f_rep, b_rep]) 38 | return (f_rep,b_rep, outputs) 39 | 40 | def dropout_layer(input_reps, dropout_rate, is_training=True): 41 | if is_training: 42 | output_repr = tf.nn.dropout(input_reps, (1 - dropout_rate)) 43 | else: 44 | output_repr = input_reps 45 | return output_repr 46 | 47 | def cosine_distance(y1,y2, cosine_norm=True, eps=1e-6): 48 | # cosine_norm = True 49 | # y1 [....,a, 1, d] 50 | # y2 [....,1, b, d] 51 | cosine_numerator = tf.reduce_sum(tf.multiply(y1, y2), axis=-1) 52 | if not cosine_norm: 53 | return tf.tanh(cosine_numerator) 54 | y1_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y1), axis=-1), eps)) 55 | y2_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y2), axis=-1), eps)) 56 | return cosine_numerator / y1_norm / y2_norm 57 | 58 | def euclidean_distance(y1, y2, eps=1e-6): 59 | distance = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y1 - y2), axis=-1), eps)) 60 | return distance 61 | 62 | def cross_entropy(logits, truth, mask=None): 63 | # logits: [batch_size, passage_len] 64 | # truth: [batch_size, passage_len] 65 | # mask: [batch_size, passage_len] 66 | if mask is not None: logits = tf.multiply(logits, mask) 67 | xdev = tf.subtract(logits, 
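# subtracting the per-row max (the tf.expand_dims term continuing below) is the
# standard log-sum-exp stabilization, so the next line computes
# log_softmax(x) = (x - max(x)) - log(sum(exp(x - max(x)))) without overflow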
tf.expand_dims(tf.reduce_max(logits, 1), -1)) 68 | log_predictions = tf.subtract(xdev, tf.expand_dims(tf.log(tf.reduce_sum(tf.exp(xdev),-1)),-1)) 69 | result = tf.multiply(truth, log_predictions) # [batch_size, passage_len] 70 | if mask is not None: result = tf.multiply(result, mask) # [batch_size, passage_len] 71 | return tf.multiply(-1.0,tf.reduce_sum(result, -1)) # [batch_size] 72 | 73 | def projection_layer(in_val, input_size, output_size, activation_func=tf.tanh, scope=None): 74 | # in_val: [batch_size, passage_len, dim] 75 | input_shape = tf.shape(in_val) 76 | batch_size = input_shape[0] 77 | passage_len = input_shape[1] 78 | # feat_dim = input_shape[2] 79 | in_val = tf.reshape(in_val, [batch_size * passage_len, input_size]) 80 | with tf.variable_scope(scope or "projection_layer"): 81 | full_w = tf.get_variable("full_w", [input_size, output_size], dtype=tf.float32) 82 | full_b = tf.get_variable("full_b", [output_size], dtype=tf.float32) 83 | outputs = activation_func(tf.nn.xw_plus_b(in_val, full_w, full_b)) 84 | outputs = tf.reshape(outputs, [batch_size, passage_len, output_size]) 85 | return outputs # [batch_size, passage_len, output_size] 86 | 87 | def highway_layer(in_val, output_size, activation_func=tf.tanh, scope=None): 88 | # in_val: [batch_size, passage_len, dim] 89 | input_shape = tf.shape(in_val) 90 | batch_size = input_shape[0] 91 | passage_len = input_shape[1] 92 | # feat_dim = input_shape[2] 93 | in_val = tf.reshape(in_val, [batch_size * passage_len, output_size]) 94 | with tf.variable_scope(scope or "highway_layer"): 95 | highway_w = tf.get_variable("highway_w", [output_size, output_size], dtype=tf.float32) 96 | highway_b = tf.get_variable("highway_b", [output_size], dtype=tf.float32) 97 | full_w = tf.get_variable("full_w", [output_size, output_size], dtype=tf.float32) 98 | full_b = tf.get_variable("full_b", [output_size], dtype=tf.float32) 99 | trans = activation_func(tf.nn.xw_plus_b(in_val, full_w, full_b)) 100 | gate = tf.nn.sigmoid(tf.nn.xw_plus_b(in_val, highway_w, highway_b)) 101 | outputs = tf.add(tf.multiply(trans, gate), tf.multiply(in_val, tf.subtract(1.0, gate)), "y") 102 | outputs = tf.reshape(outputs, [batch_size, passage_len, output_size]) 103 | return outputs 104 | 105 | def multi_highway_layer(in_val, output_size, num_layers, activation_func=tf.tanh, scope_name=None, reuse=False): 106 | with tf.variable_scope(scope_name, reuse=reuse): 107 | for i in xrange(num_layers): 108 | cur_scope_name = scope_name + "-{}".format(i) 109 | in_val = highway_layer(in_val, output_size,activation_func=activation_func, scope=cur_scope_name) 110 | return in_val 111 | 112 | def collect_representation(representation, positions): 113 | # representation: [batch_size, node_num, feature_dim] 114 | # positions: [batch_size, neigh_num] 115 | return collect_probs(representation, positions) 116 | 117 | def collect_final_step_of_lstm(lstm_representation, lengths): 118 | # lstm_representation: [batch_size, passsage_length, dim] 119 | # lengths: [batch_size] 120 | lengths = tf.maximum(lengths, tf.zeros_like(lengths, dtype=tf.int32)) 121 | 122 | batch_size = tf.shape(lengths)[0] 123 | batch_nums = tf.range(0, limit=batch_size) # shape (batch_size) 124 | indices = tf.stack((batch_nums, lengths), axis=1) # shape (batch_size, 2) 125 | result = tf.gather_nd(lstm_representation, indices, name='last-forwar-lstm') 126 | return result # [batch_size, dim] 127 | 128 | def collect_probs(probs, positions): 129 | # probs [batch_size, chunks_size] 130 | # positions [batch_size, pair_size] 131 | 
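# Editor's sketch (illustrative, not in the original source): collect_probs
# gathers probs[b, positions[b, k]] for every batch row b and position k
# via tf.gather_nd. For example, with
#   probs     = [[0.1, 0.2, 0.7],
#                [0.5, 0.4, 0.1]]
#   positions = [[2, 0],
#                [1, 1]]
# the result is [[0.7, 0.1],
#                [0.4, 0.4]].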
batch_size = tf.shape(probs)[0] 132 | pair_size = tf.shape(positions)[1] 133 | batch_nums = tf.range(0, limit=batch_size) # shape (batch_size) 134 | batch_nums = tf.reshape(batch_nums, shape=[-1, 1]) # [batch_size, 1] 135 | batch_nums = tf.tile(batch_nums, multiples=[1, pair_size]) # [batch_size, pair_size] 136 | 137 | indices = tf.stack((batch_nums, positions), axis=2) # shape (batch_size, pair_size, 2) 138 | pair_probs = tf.gather_nd(probs, indices) 139 | # pair_probs = tf.reshape(pair_probs, shape=[batch_size, pair_size]) 140 | return pair_probs 141 | 142 | 143 | def calcuate_attention(in_value_1, in_value_2, feature_dim1, feature_dim2, scope_name='att', 144 | att_type='symmetric', att_dim=20, remove_diagnoal=False, mask1=None, mask2=None, is_training=False, dropout_rate=0.2): 145 | input_shape = tf.shape(in_value_1) 146 | batch_size = input_shape[0] 147 | len_1 = input_shape[1] 148 | len_2 = tf.shape(in_value_2)[1] 149 | 150 | in_value_1 = dropout_layer(in_value_1, dropout_rate, is_training=is_training) 151 | in_value_2 = dropout_layer(in_value_2, dropout_rate, is_training=is_training) 152 | with tf.variable_scope(scope_name): 153 | # calculate attention ==> a: [batch_size, len_1, len_2] 154 | atten_w1 = tf.get_variable("atten_w1", [feature_dim1, att_dim], dtype=tf.float32) 155 | if feature_dim1 == feature_dim2: atten_w2 = atten_w1 156 | else: atten_w2 = tf.get_variable("atten_w2", [feature_dim2, att_dim], dtype=tf.float32) 157 | atten_value_1 = tf.matmul(tf.reshape(in_value_1, [batch_size * len_1, feature_dim1]), atten_w1) # [batch_size*len_1, feature_dim] 158 | atten_value_1 = tf.reshape(atten_value_1, [batch_size, len_1, att_dim]) 159 | atten_value_2 = tf.matmul(tf.reshape(in_value_2, [batch_size * len_2, feature_dim2]), atten_w2) # [batch_size*len_2, feature_dim] 160 | atten_value_2 = tf.reshape(atten_value_2, [batch_size, len_2, att_dim]) 161 | 162 | 163 | if att_type == 'additive': 164 | atten_b = tf.get_variable("atten_b", [att_dim], dtype=tf.float32) 165 | atten_v = tf.get_variable("atten_v", [1, att_dim], dtype=tf.float32) 166 | atten_value_1 = tf.expand_dims(atten_value_1, axis=2, name="atten_value_1") # [batch_size, len_1, 'x', feature_dim] 167 | atten_value_2 = tf.expand_dims(atten_value_2, axis=1, name="atten_value_2") # [batch_size, 'x', len_2, feature_dim] 168 | atten_value = atten_value_1 + atten_value_2 # + tf.expand_dims(tf.expand_dims(tf.expand_dims(atten_b, axis=0), axis=0), axis=0) 169 | atten_value = nn_ops.bias_add(atten_value, atten_b) 170 | atten_value = tf.tanh(atten_value) # [batch_size, len_1, len_2, feature_dim] 171 | atten_value = tf.reshape(atten_value, [-1, att_dim]) * atten_v # tf.expand_dims(atten_v, axis=0) # [batch_size*len_1*len_2, feature_dim] 172 | atten_value = tf.reduce_sum(atten_value, axis=-1) 173 | atten_value = tf.reshape(atten_value, [batch_size, len_1, len_2]) 174 | else: 175 | atten_value_1 = tf.tanh(atten_value_1) 176 | # atten_value_1 = tf.nn.relu(atten_value_1) 177 | atten_value_2 = tf.tanh(atten_value_2) 178 | # atten_value_2 = tf.nn.relu(atten_value_2) 179 | diagnoal_params = tf.get_variable("diagnoal_params", [1, 1, att_dim], dtype=tf.float32) 180 | atten_value_1 = atten_value_1 * diagnoal_params 181 | atten_value = tf.matmul(atten_value_1, atten_value_2, transpose_b=True) # [batch_size, len_1, len_2] 182 | 183 | # normalize 184 | if remove_diagnoal: 185 | diagnoal = tf.ones([len_1], tf.float32) # [len1] 186 | diagnoal = 1.0 - tf.diag(diagnoal) # [len1, len1] 187 | diagnoal = tf.expand_dims(diagnoal, axis=0) # ['x', len1, len1] 188 
| atten_value = atten_value * diagnoal 189 | if mask1 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask1, axis=-1)) 190 | if mask2 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask2, axis=1)) 191 | atten_value = tf.nn.softmax(atten_value, name='atten_value') # [batch_size, len_1, len_2] 192 | if remove_diagnoal: atten_value = atten_value * diagnoal 193 | if mask1 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask1, axis=-1)) 194 | if mask2 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask2, axis=1)) 195 | 196 | return atten_value 197 | 198 | def weighted_sum(atten_scores, in_values): 199 | ''' 200 | 201 | :param atten_scores: # [batch_size, len1, len2] 202 | :param in_values: [batch_size, len2, dim] 203 | :return: 204 | ''' 205 | return tf.matmul(atten_scores, in_values) 206 | 207 | def cal_relevancy_matrix(in_question_repres, in_passage_repres): 208 | in_question_repres_tmp = tf.expand_dims(in_question_repres, 1) # [batch_size, 1, question_len, dim] 209 | in_passage_repres_tmp = tf.expand_dims(in_passage_repres, 2) # [batch_size, passage_len, 1, dim] 210 | relevancy_matrix = cosine_distance(in_question_repres_tmp,in_passage_repres_tmp) # [batch_size, passage_len, question_len] 211 | return relevancy_matrix 212 | 213 | def mask_relevancy_matrix(relevancy_matrix, question_mask, passage_mask): 214 | # relevancy_matrix: [batch_size, passage_len, question_len] 215 | # question_mask: [batch_size, question_len] 216 | # passage_mask: [batch_size, passsage_len] 217 | if question_mask is not None: 218 | relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(question_mask, 1)) 219 | relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(passage_mask, 2)) 220 | return relevancy_matrix 221 | 222 | def compute_gradients(tensor, var_list): 223 | grads = tf.gradients(tensor, var_list) 224 | return [grad if grad is not None else tf.zeros_like(var) for var, grad in zip(var_list, grads)] 225 | -------------------------------------------------------------------------------- /src/match_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import layer_utils 3 | 4 | eps = 1e-6 5 | def cosine_distance(y1,y2): 6 | # y1 [....,a, 1, d] 7 | # y2 [....,1, b, d] 8 | cosine_numerator = tf.reduce_sum(tf.multiply(y1, y2), axis=-1) 9 | y1_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y1), axis=-1), eps)) 10 | y2_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y2), axis=-1), eps)) 11 | return cosine_numerator / y1_norm / y2_norm 12 | 13 | def cal_relevancy_matrix(in_question_repres, in_passage_repres): 14 | in_question_repres_tmp = tf.expand_dims(in_question_repres, 1) # [batch_size, 1, question_len, dim] 15 | in_passage_repres_tmp = tf.expand_dims(in_passage_repres, 2) # [batch_size, passage_len, 1, dim] 16 | relevancy_matrix = cosine_distance(in_question_repres_tmp,in_passage_repres_tmp) # [batch_size, passage_len, question_len] 17 | return relevancy_matrix 18 | 19 | def mask_relevancy_matrix(relevancy_matrix, question_mask, passage_mask): 20 | # relevancy_matrix: [batch_size, passage_len, question_len] 21 | # question_mask: [batch_size, question_len] 22 | # passage_mask: [batch_size, passsage_len] 23 | relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(question_mask, 1)) 24 | relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(passage_mask, 2)) 25 | return relevancy_matrix 26 | 27 | def 
multi_perspective_expand_for_3D(in_tensor, decompose_params): 28 | in_tensor = tf.expand_dims(in_tensor, axis=2) # [batch_size, passage_len, 'x', dim] 29 | decompose_params = tf.expand_dims(tf.expand_dims(decompose_params, axis=0), axis=0) # [1, 1, decompose_dim, dim] 30 | return tf.multiply(in_tensor, decompose_params) # [batch_size, passage_len, decompose_dim, dim] 31 | 32 | def multi_perspective_expand_for_2D(in_tensor, decompose_params): 33 | in_tensor = tf.expand_dims(in_tensor, axis=1) # [batch_size, 'x', dim] 34 | decompose_params = tf.expand_dims(decompose_params, axis=0) # [1, decompose_dim, dim] 35 | return tf.multiply(in_tensor, decompose_params) # [batch_size, decompose_dim, dim] 36 | 37 | 38 | def cal_maxpooling_matching(passage_rep, question_rep, decompose_params): 39 | # passage_representation: [batch_size, passage_len, dim] 40 | # question_representation: [batch_size, question_len, dim] 41 | # decompose_params: [decompose_dim, dim] 42 | 43 | def single_instance(x): 44 | p = x[0] 45 | q = x[1] 46 | # p: [passage_len, dim], q: [question_len, dim] 47 | p = multi_perspective_expand_for_2D(p, decompose_params) # [passage_len, decompose_dim, dim] 48 | q = multi_perspective_expand_for_2D(q, decompose_params) # [question_len, decompose_dim, dim] 49 | p = tf.expand_dims(p, 1) # [passage_len, 1, decompose_dim, dim] 50 | q = tf.expand_dims(q, 0) # [1, question_len, decompose_dim, dim] 51 | return cosine_distance(p, q) # [passage_len, question_len, decompose_dim] 52 | elems = (passage_rep, question_rep) 53 | matching_matrix = tf.map_fn(single_instance, elems, dtype=tf.float32) # [batch_size, passage_len, question_len, decompose_dim] 54 | return tf.concat(axis=2, values=[tf.reduce_max(matching_matrix, axis=2), tf.reduce_mean(matching_matrix, axis=2)]) # [batch_size, passage_len, 2*decompose_dim] 55 | 56 | def cross_entropy(logits, truth, mask): 57 | # logits: [batch_size, passage_len] 58 | # truth: [batch_size, passage_len] 59 | # mask: [batch_size, passage_len] 60 | 61 | # xdev = x - x.max() 62 | # return xdev - T.log(T.sum(T.exp(xdev))) 63 | logits = tf.multiply(logits, mask) 64 | xdev = tf.subtract(logits, tf.expand_dims(tf.reduce_max(logits, 1), -1)) 65 | log_predictions = tf.subtract(xdev, tf.expand_dims(tf.log(tf.reduce_sum(tf.exp(xdev), -1)), -1)) 66 | # return -T.sum(targets * log_predictions) 67 | result = tf.multiply(tf.multiply(truth, log_predictions), mask) # [batch_size, passage_len] 68 | return tf.multiply(-1.0, tf.reduce_sum(result, -1)) # [batch_size] 69 | 70 | def highway_layer(in_val, output_size, scope=None): 71 | # in_val: [batch_size, passage_len, dim] 72 | input_shape = tf.shape(in_val) 73 | batch_size = input_shape[0] 74 | passage_len = input_shape[1] 75 | # feat_dim = input_shape[2] 76 | in_val = tf.reshape(in_val, [batch_size * passage_len, output_size]) 77 | with tf.variable_scope(scope or "highway_layer"): 78 | highway_w = tf.get_variable("highway_w", [output_size, output_size], dtype=tf.float32) 79 | highway_b = tf.get_variable("highway_b", [output_size], dtype=tf.float32) 80 | full_w = tf.get_variable("full_w", [output_size, output_size], dtype=tf.float32) 81 | full_b = tf.get_variable("full_b", [output_size], dtype=tf.float32) 82 | trans = tf.nn.tanh(tf.nn.xw_plus_b(in_val, full_w, full_b)) 83 | gate = tf.nn.sigmoid(tf.nn.xw_plus_b(in_val, highway_w, highway_b)) 84 | outputs = trans * gate + in_val * (1.0 - gate) 85 | outputs = tf.reshape(outputs, [batch_size, passage_len, output_size]) 86 | return outputs 87 | 88 | def multi_highway_layer(in_val, output_size, num_layers, scope=None): 89 
| scope_name = 'highway_layer' 90 | if scope is not None: scope_name = scope 91 | for i in xrange(num_layers): 92 | cur_scope_name = scope_name + "-{}".format(i) 93 | in_val = highway_layer(in_val, output_size, scope=cur_scope_name) 94 | return in_val 95 | 96 | def cal_max_question_representation(question_representation, atten_scores): 97 | atten_positions = tf.argmax(atten_scores, axis=2, output_type=tf.int32) # [batch_size, passage_len] 98 | max_question_reps = layer_utils.collect_representation(question_representation, atten_positions) 99 | return max_question_reps 100 | 101 | def multi_perspective_match(feature_dim, repres1, repres2, is_training=True, dropout_rate=0.2, 102 | options=None, scope_name='mp-match', reuse=False): 103 | ''' 104 | :param repres1: [batch_size, len, feature_dim] 105 | :param repres2: [batch_size, len, feature_dim] 106 | :return: 107 | ''' 108 | input_shape = tf.shape(repres1) 109 | batch_size = input_shape[0] 110 | seq_length = input_shape[1] 111 | matching_result = [] 112 | with tf.variable_scope(scope_name, reuse=reuse): 113 | match_dim = 0 114 | if options.with_cosine: 115 | cosine_value = layer_utils.cosine_distance(repres1, repres2, cosine_norm=False) 116 | cosine_value = tf.reshape(cosine_value, [batch_size, seq_length, 1]) 117 | matching_result.append(cosine_value) 118 | match_dim += 1 119 | 120 | if options.with_mp_cosine: 121 | mp_cosine_params = tf.get_variable("mp_cosine", shape=[options.cosine_MP_dim, feature_dim], dtype=tf.float32) 122 | mp_cosine_params = tf.expand_dims(mp_cosine_params, axis=0) 123 | mp_cosine_params = tf.expand_dims(mp_cosine_params, axis=0) 124 | repres1_flat = tf.expand_dims(repres1, axis=2) 125 | repres2_flat = tf.expand_dims(repres2, axis=2) 126 | mp_cosine_matching = layer_utils.cosine_distance(tf.multiply(repres1_flat, mp_cosine_params), 127 | repres2_flat,cosine_norm=False) 128 | matching_result.append(mp_cosine_matching) 129 | match_dim += options.cosine_MP_dim 130 | 131 | matching_result = tf.concat(axis=2, values=matching_result) 132 | return (matching_result, match_dim) 133 | 134 | 135 | def match_passage_with_question(passage_reps, question_reps, passage_mask, question_mask, passage_lengths, question_lengths, 136 | context_lstm_dim, scope=None, 137 | with_full_match=True, with_maxpool_match=True, with_attentive_match=True, with_max_attentive_match=True, 138 | is_training=True, options=None, dropout_rate=0, forward=True): 139 | passage_reps = tf.multiply(passage_reps, tf.expand_dims(passage_mask,-1)) 140 | question_reps = tf.multiply(question_reps, tf.expand_dims(question_mask,-1)) 141 | all_question_aware_representatins = [] 142 | dim = 0 143 | with tf.variable_scope(scope or "match_passage_with_question"): 144 | relevancy_matrix = cal_relevancy_matrix(question_reps, passage_reps) 145 | relevancy_matrix = mask_relevancy_matrix(relevancy_matrix, question_mask, passage_mask) 146 | # relevancy_matrix = layer_utils.calcuate_attention(passage_reps, question_reps, context_lstm_dim, context_lstm_dim, 147 | # scope_name="fw_attention", att_type=options.att_type, att_dim=options.att_dim, 148 | # remove_diagnoal=False, mask1=passage_mask, mask2=question_mask, is_training=is_training, dropout_rate=dropout_rate) 149 | 150 | all_question_aware_representatins.append(tf.reduce_max(relevancy_matrix, axis=2,keep_dims=True)) 151 | all_question_aware_representatins.append(tf.reduce_mean(relevancy_matrix, axis=2,keep_dims=True)) 152 | dim += 2 153 | if with_full_match: 154 | if forward: 155 | question_full_rep = 
layer_utils.collect_final_step_of_lstm(question_reps, question_lengths - 1) 156 | else: 157 | question_full_rep = question_reps[:,0,:] 158 | 159 | passage_len = tf.shape(passage_reps)[1] 160 | question_full_rep = tf.expand_dims(question_full_rep, axis=1) 161 | question_full_rep = tf.tile(question_full_rep, [1, passage_len, 1]) # [batch_size, pasasge_len, feature_dim] 162 | 163 | (attentive_rep, match_dim) = multi_perspective_match(context_lstm_dim, 164 | passage_reps, question_full_rep, is_training=is_training, dropout_rate=options.dropout_rate, 165 | options=options, scope_name='mp-match-full-match') 166 | all_question_aware_representatins.append(attentive_rep) 167 | dim += match_dim 168 | 169 | if with_maxpool_match: 170 | maxpooling_decomp_params = tf.get_variable("maxpooling_matching_decomp", 171 | shape=[options.cosine_MP_dim, context_lstm_dim], dtype=tf.float32) 172 | maxpooling_rep = cal_maxpooling_matching(passage_reps, question_reps, maxpooling_decomp_params) 173 | all_question_aware_representatins.append(maxpooling_rep) 174 | dim += 2*options.cosine_MP_dim 175 | 176 | if with_attentive_match: 177 | atten_scores = layer_utils.calcuate_attention(passage_reps, question_reps, context_lstm_dim, context_lstm_dim, 178 | scope_name="attention", att_type=options.att_type, att_dim=options.att_dim, 179 | remove_diagnoal=False, mask1=passage_mask, mask2=question_mask, is_training=is_training, dropout_rate=dropout_rate) 180 | att_question_contexts = tf.matmul(atten_scores, question_reps) 181 | (attentive_rep, match_dim) = multi_perspective_match(context_lstm_dim, 182 | passage_reps, att_question_contexts, is_training=is_training, dropout_rate=options.dropout_rate, 183 | options=options, scope_name='mp-match-att_question') 184 | all_question_aware_representatins.append(attentive_rep) 185 | dim += match_dim 186 | 187 | if with_max_attentive_match: 188 | max_att = cal_max_question_representation(question_reps, relevancy_matrix) 189 | (max_attentive_rep, match_dim) = multi_perspective_match(context_lstm_dim, 190 | passage_reps, max_att, is_training=is_training, dropout_rate=options.dropout_rate, 191 | options=options, scope_name='mp-match-max-att') 192 | all_question_aware_representatins.append(max_attentive_rep) 193 | dim += match_dim 194 | 195 | all_question_aware_representatins = tf.concat(axis=2, values=all_question_aware_representatins) 196 | return (all_question_aware_representatins, dim) 197 | 198 | def bilateral_match_func(in_question_repres, in_passage_repres, 199 | question_lengths, passage_lengths, question_mask, passage_mask, input_dim, is_training, options=None): 200 | 201 | question_aware_representatins = [] 202 | question_aware_dim = 0 203 | passage_aware_representatins = [] 204 | passage_aware_dim = 0 205 | 206 | # ====word level matching====== 207 | (match_reps, match_dim) = match_passage_with_question(in_passage_repres, in_question_repres, passage_mask, question_mask, passage_lengths, 208 | question_lengths, input_dim, scope="word_match_forward", 209 | with_full_match=False, with_maxpool_match=options.with_maxpool_match, 210 | with_attentive_match=options.with_attentive_match, 211 | with_max_attentive_match=options.with_max_attentive_match, 212 | is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True) 213 | question_aware_representatins.append(match_reps) 214 | question_aware_dim += match_dim 215 | 216 | (match_reps, match_dim) = match_passage_with_question(in_question_repres, in_passage_repres, question_mask, passage_mask, question_lengths, 
217 | passage_lengths, input_dim, scope="word_match_backward", 218 | with_full_match=False, with_maxpool_match=options.with_maxpool_match, 219 | with_attentive_match=options.with_attentive_match, 220 | with_max_attentive_match=options.with_max_attentive_match, 221 | is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False) 222 | passage_aware_representatins.append(match_reps) 223 | passage_aware_dim += match_dim 224 | 225 | with tf.variable_scope('context_MP_matching'): 226 | for i in xrange(options.context_layer_num): # support multiple context layer 227 | with tf.variable_scope('layer-{}'.format(i)): 228 | # contextual lstm for both passage and question 229 | in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1)) 230 | in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(passage_mask, axis=-1)) 231 | (question_context_representation_fw, question_context_representation_bw, 232 | in_question_repres) = layer_utils.my_lstm_layer( 233 | in_question_repres, options.context_lstm_dim, input_lengths= question_lengths,scope_name="context_represent", 234 | reuse=False, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) 235 | (passage_context_representation_fw, passage_context_representation_bw, 236 | in_passage_repres) = layer_utils.my_lstm_layer( 237 | in_passage_repres, options.context_lstm_dim, input_lengths=passage_lengths, scope_name="context_represent", 238 | reuse=True, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) 239 | 240 | # Multi-perspective matching 241 | with tf.variable_scope('left_MP_matching'): 242 | (match_reps, match_dim) = match_passage_with_question(passage_context_representation_fw, 243 | question_context_representation_fw, passage_mask, question_mask, passage_lengths, 244 | question_lengths, options.context_lstm_dim, scope="forward_match", 245 | with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match, 246 | with_attentive_match=options.with_attentive_match, 247 | with_max_attentive_match=options.with_max_attentive_match, 248 | is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True) 249 | question_aware_representatins.append(match_reps) 250 | question_aware_dim += match_dim 251 | (match_reps, match_dim) = match_passage_with_question(passage_context_representation_bw, 252 | question_context_representation_bw, passage_mask, question_mask, passage_lengths, 253 | question_lengths, options.context_lstm_dim, scope="backward_match", 254 | with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match, 255 | with_attentive_match=options.with_attentive_match, 256 | with_max_attentive_match=options.with_max_attentive_match, 257 | is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False) 258 | question_aware_representatins.append(match_reps) 259 | question_aware_dim += match_dim 260 | 261 | with tf.variable_scope('right_MP_matching'): 262 | (match_reps, match_dim) = match_passage_with_question(question_context_representation_fw, 263 | passage_context_representation_fw, question_mask, passage_mask, question_lengths, 264 | passage_lengths, options.context_lstm_dim, scope="forward_match", 265 | with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match, 266 | with_attentive_match=options.with_attentive_match, 267 | with_max_attentive_match=options.with_max_attentive_match, 268 | 
is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True) 269 | passage_aware_representatins.append(match_reps) 270 | passage_aware_dim += match_dim 271 | (match_reps, match_dim) = match_passage_with_question(question_context_representation_bw, 272 | passage_context_representation_bw, question_mask, passage_mask, question_lengths, 273 | passage_lengths, options.context_lstm_dim, scope="backward_match", 274 | with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match, 275 | with_attentive_match=options.with_attentive_match, 276 | with_max_attentive_match=options.with_max_attentive_match, 277 | is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False) 278 | passage_aware_representatins.append(match_reps) 279 | passage_aware_dim += match_dim 280 | 281 | question_aware_representatins = tf.concat(axis=2, values=question_aware_representatins) # [batch_size, passage_len, question_aware_dim] 282 | passage_aware_representatins = tf.concat(axis=2, values=passage_aware_representatins) # [batch_size, question_len, question_aware_dim] 283 | 284 | if is_training: 285 | question_aware_representatins = tf.nn.dropout(question_aware_representatins, (1 - options.dropout_rate)) 286 | passage_aware_representatins = tf.nn.dropout(passage_aware_representatins, (1 - options.dropout_rate)) 287 | 288 | # ======Highway layer====== 289 | if options.with_match_highway: 290 | with tf.variable_scope("left_matching_highway"): 291 | question_aware_representatins = multi_highway_layer(question_aware_representatins, question_aware_dim, 292 | options.highway_layer_num) 293 | with tf.variable_scope("right_matching_highway"): 294 | passage_aware_representatins = multi_highway_layer(passage_aware_representatins, passage_aware_dim, 295 | options.highway_layer_num) 296 | 297 | #========Aggregation Layer====== 298 | aggregation_representation = [] 299 | aggregation_dim = 0 300 | 301 | qa_aggregation_input = question_aware_representatins 302 | pa_aggregation_input = passage_aware_representatins 303 | with tf.variable_scope('aggregation_layer'): 304 | for i in xrange(options.aggregation_layer_num): # support multiple aggregation layer 305 | qa_aggregation_input = tf.multiply(qa_aggregation_input, tf.expand_dims(passage_mask, axis=-1)) 306 | (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer( 307 | qa_aggregation_input, options.aggregation_lstm_dim, input_lengths=passage_lengths, scope_name='left_layer-{}'.format(i), 308 | reuse=False, is_training=is_training, dropout_rate=options.dropout_rate,use_cudnn=options.use_cudnn) 309 | fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, passage_lengths - 1) 310 | bw_rep = bw_rep[:, 0, :] 311 | aggregation_representation.append(fw_rep) 312 | aggregation_representation.append(bw_rep) 313 | aggregation_dim += 2* options.aggregation_lstm_dim 314 | qa_aggregation_input = cur_aggregation_representation# [batch_size, passage_len, 2*aggregation_lstm_dim] 315 | 316 | pa_aggregation_input = tf.multiply(pa_aggregation_input, tf.expand_dims(question_mask, axis=-1)) 317 | (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer( 318 | pa_aggregation_input, options.aggregation_lstm_dim, 319 | input_lengths=question_lengths, scope_name='right_layer-{}'.format(i), 320 | reuse=False, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) 321 | fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, question_lengths - 1) 322 | bw_rep 
= bw_rep[:, 0, :] 323 | aggregation_representation.append(fw_rep) 324 | aggregation_representation.append(bw_rep) 325 | aggregation_dim += 2 * options.aggregation_lstm_dim 326 | pa_aggregation_input = cur_aggregation_representation # [batch_size, question_len, 2*aggregation_lstm_dim] 327 | 328 | aggregation_representation = tf.concat(axis=1, values=aggregation_representation) # [batch_size, aggregation_dim] 329 | 330 | # ======Highway layer====== 331 | if options.with_aggregation_highway: 332 | with tf.variable_scope("aggregation_highway"): 333 | agg_shape = tf.shape(aggregation_representation) 334 | batch_size = agg_shape[0] 335 | aggregation_representation = tf.reshape(aggregation_representation, [1, batch_size, aggregation_dim]) 336 | aggregation_representation = multi_highway_layer(aggregation_representation, aggregation_dim, options.highway_layer_num) 337 | aggregation_representation = tf.reshape(aggregation_representation, [batch_size, aggregation_dim]) 338 | 339 | return (aggregation_representation, aggregation_dim) 340 | 341 | -------------------------------------------------------------------------------- /src/match_utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/match_utils.pyc -------------------------------------------------------------------------------- /src/my_rnn.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from tensorflow.python.framework import constant_op 6 | from tensorflow.python.framework import dtypes 7 | from tensorflow.python.framework import ops 8 | from tensorflow.python.framework import tensor_shape 9 | from tensorflow.python.ops import array_ops 10 | from tensorflow.python.ops import control_flow_ops 11 | from tensorflow.python.ops import math_ops 12 | from tensorflow.python.ops import rnn_cell 13 | from tensorflow.python.ops import tensor_array_ops 14 | from tensorflow.python.ops import variable_scope as vs 15 | from tensorflow.python.util import nest 16 | from tensorflow.python.ops import rnn 17 | import tensorflow as tf 18 | 19 | _state_size_with_prefix = rnn_cell._state_size_with_prefix 20 | 21 | def _dynamic_rnn_loop(cell, inputs, initial_state, parallel_iterations, swap_memory, sequence_length=None, dtype=None): 22 | """Internal implementation of Dynamic RNN. 23 | 24 | Args: 25 | cell: An instance of RNNCell. 26 | inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested 27 | tuple of such elements. 28 | initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if 29 | `cell.state_size` is a tuple, then this should be a tuple of 30 | tensors having shapes `[batch_size, s] for s in cell.state_size`. 31 | parallel_iterations: Positive Python int. 32 | swap_memory: A Python boolean. 33 | sequence_length: (optional) An `int32` `Tensor` of shape [batch_size]. 34 | dtype: (optional) Expected dtype of output. If not specified, inferred from 35 | initial_state. 36 | 37 | Returns: 38 | Tuple `(final_outputs, final_state)`. 39 | final_outputs: 40 | A `Tensor` of shape `[time, batch_size, cell.output_size]`. If 41 | `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape` 42 | objects, then this returns a (possibly nested) tuple of Tensors matching 43 | the corresponding shapes. 
44 | final_state: 45 | A `Tensor`, or possibly nested tuple of Tensors, matching in length 46 | and shapes to `initial_state`. 47 | 48 | Raises: 49 | ValueError: If the input depth cannot be inferred via shape inference 50 | from the inputs. 51 | """ 52 | state = initial_state 53 | assert isinstance(parallel_iterations, int), "parallel_iterations must be int" 54 | 55 | state_size = cell.state_size 56 | 57 | flat_input = nest.flatten(inputs) 58 | flat_output_size = nest.flatten(cell.output_size) 59 | 60 | # Construct an initial output 61 | input_shape = array_ops.shape(flat_input[0]) 62 | time_steps = input_shape[0] 63 | batch_size = input_shape[1] 64 | 65 | inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3) for input_ in flat_input) 66 | 67 | const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2] 68 | 69 | for shape in inputs_got_shape: 70 | if not shape[2:].is_fully_defined(): 71 | raise ValueError( 72 | "Input size (depth of inputs) must be accessible via shape inference," 73 | " but saw value None.") 74 | got_time_steps = shape[0].value 75 | got_batch_size = shape[1].value 76 | if const_time_steps != got_time_steps: 77 | raise ValueError( 78 | "Time steps is not the same for all the elements in the input in a " 79 | "batch.") 80 | if const_batch_size != got_batch_size: 81 | raise ValueError( 82 | "Batch_size is not the same for all the elements in the input.") 83 | 84 | # Prepare dynamic conditional copying of state & output 85 | def _create_zero_arrays(size): 86 | size = _state_size_with_prefix(size, prefix=[batch_size]) 87 | return array_ops.zeros(array_ops.pack(size), rnn._infer_state_dtype(dtype, state)) 88 | 89 | flat_zero_output = tuple(_create_zero_arrays(output) for output in flat_output_size) 90 | zero_output = nest.pack_sequence_as(structure=cell.output_size, flat_sequence=flat_zero_output) 91 | 92 | if sequence_length is not None: 93 | min_sequence_length = math_ops.reduce_min(sequence_length) 94 | max_sequence_length = math_ops.reduce_max(sequence_length) 95 | 96 | time = array_ops.constant(0, dtype=dtypes.int32, name="time") 97 | 98 | with ops.name_scope("dynamic_rnn") as scope: 99 | base_name = scope 100 | 101 | def _create_ta(name, dtype): 102 | return tensor_array_ops.TensorArray(dtype=dtype, size=time_steps, tensor_array_name=base_name + name,clear_after_read=False) 103 | 104 | output_ta = tuple(_create_ta("output_%d" % i, rnn._infer_state_dtype(dtype, state)) for i in range(len(flat_output_size))) 105 | input_ta = tuple(_create_ta("input_%d" % i, flat_input[0].dtype) for i in range(len(flat_input))) 106 | 107 | input_ta = tuple(ta.unpack(input_) for ta, input_ in zip(input_ta, flat_input)) 108 | 109 | def _time_step(time, output_ta_t, state): 110 | """Take a time step of the dynamic RNN. 111 | 112 | Args: 113 | time: int32 scalar Tensor. 114 | output_ta_t: List of `TensorArray`s that represent the output. 115 | state: nested tuple of vector tensors that represent the state. 116 | 117 | Returns: 118 | The tuple (time + 1, output_ta_t with updated flow, new_state). 
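    Editor's note (a sketch, not in the original docstring): this function is
    the body handed to control_flow_ops.while_loop below, i.e. conceptually

        time, output_ta_t, state = _time_step(time, output_ta_t, state)

    is applied repeatedly until `time == time_steps`.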
119 | """ 120 | 121 | input_t = tuple(ta.read(time) for ta in input_ta) 122 | # Restore some shape information 123 | for input_, shape in zip(input_t, inputs_got_shape): 124 | input_.set_shape(shape[1:]) 125 | 126 | input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t) 127 | call_cell = lambda: cell(input_t, state) 128 | 129 | def f1(): return zero_output 130 | def f2(): return tuple(ta.read(tf.subtract(time, 1)) for ta in output_ta_t)#output_ta_t.read(tf.subtract(time, 1)) 131 | cur_zero_output = tf.cond(tf.less(time, 1), f1, f2) 132 | 133 | if sequence_length is not None: 134 | (output, new_state) = rnn._rnn_step( 135 | time=time, 136 | sequence_length=sequence_length, 137 | min_sequence_length=min_sequence_length, 138 | max_sequence_length=max_sequence_length, 139 | zero_output=cur_zero_output, # TODO 140 | state=state, 141 | call_cell=call_cell, 142 | state_size=state_size, 143 | skip_conditionals=True) 144 | else: 145 | (output, new_state) = call_cell() 146 | 147 | # Pack state if using state tuples 148 | output = nest.flatten(output) 149 | 150 | output_ta_t = tuple(ta.write(time, out) for ta, out in zip(output_ta_t, output)) 151 | 152 | return (time + 1, output_ta_t, new_state) 153 | 154 | _, output_final_ta, final_state = control_flow_ops.while_loop( 155 | cond=lambda time, *_: time < time_steps, 156 | body=_time_step, 157 | loop_vars=(time, output_ta, state), 158 | parallel_iterations=parallel_iterations, 159 | swap_memory=swap_memory) 160 | 161 | # Unpack final output if not using output tuples. 162 | final_outputs = tuple(ta.pack() for ta in output_final_ta) 163 | 164 | # Restore some shape information 165 | for output, output_size in zip(final_outputs, flat_output_size): 166 | shape = _state_size_with_prefix( 167 | output_size, prefix=[const_time_steps, const_batch_size]) 168 | output.set_shape(shape) 169 | 170 | final_outputs = nest.pack_sequence_as( 171 | structure=cell.output_size, flat_sequence=final_outputs) 172 | 173 | return (final_outputs, final_state) 174 | 175 | 176 | def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None, 177 | initial_state_fw=None, initial_state_bw=None, 178 | dtype=None, parallel_iterations=None, 179 | swap_memory=False, time_major=False, scope=None): 180 | """Creates a dynamic version of bidirectional recurrent neural network. 181 | 182 | Similar to the unidirectional case above (rnn) but takes input and builds 183 | independent forward and backward RNNs. The input_size of forward and 184 | backward cell must match. The initial state for both directions is zero by 185 | default (but can be set optionally) and no intermediate states are ever 186 | returned -- the network is fully unrolled for the given (passed in) 187 | length(s) of the sequence(s) or completely unrolled if length(s) is not 188 | given. 189 | 190 | Args: 191 | cell_fw: An instance of RNNCell, to be used for forward direction. 192 | cell_bw: An instance of RNNCell, to be used for backward direction. 193 | inputs: The RNN inputs. 194 | If time_major == False (default), this must be a tensor of shape: 195 | `[batch_size, max_time, input_size]`. 196 | If time_major == True, this must be a tensor of shape: 197 | `[max_time, batch_size, input_size]`. 198 | [batch_size, input_size]. 199 | sequence_length: An int32/int64 vector, size `[batch_size]`, 200 | containing the actual lengths for each of the sequences. 201 | initial_state_fw: (optional) An initial state for the forward RNN. 
202 | This must be a tensor of appropriate type and shape 203 | `[batch_size, cell_fw.state_size]`. 204 | If `cell_fw.state_size` is a tuple, this should be a tuple of 205 | tensors having shapes `[batch_size, s] for s in cell_fw.state_size`. 206 | initial_state_bw: (optional) Same as for `initial_state_fw`, but using 207 | the corresponding properties of `cell_bw`. 208 | dtype: (optional) The data type for the initial states and expected output. 209 | Required if initial_states are not provided or RNN states have a 210 | heterogeneous dtype. 211 | parallel_iterations: (Default: 32). The number of iterations to run in 212 | parallel. Those operations which do not have any temporal dependency 213 | and can be run in parallel, will be. This parameter trades off 214 | time for space. Values >> 1 use more memory but take less time, 215 | while smaller values use less memory but computations take longer. 216 | swap_memory: Transparently swap the tensors produced in forward inference 217 | but needed for back prop from GPU to CPU. This allows training RNNs 218 | which would typically not fit on a single GPU, with very minimal (or no) 219 | performance penalty. 220 | time_major: The shape format of the `inputs` and `outputs` Tensors. 221 | If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. 222 | If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. 223 | Using `time_major = True` is a bit more efficient because it avoids 224 | transposes at the beginning and end of the RNN calculation. However, 225 | most TensorFlow data is batch-major, so by default this function 226 | accepts input and emits output in batch-major form. 227 | dtype: (optional) The data type for the initial state. Required if 228 | either of the initial states are not provided. 229 | scope: VariableScope for the created subgraph; defaults to "BiRNN" 230 | 231 | Returns: 232 | A tuple (outputs, output_states) where: 233 | outputs: A tuple (output_fw, output_bw) containing the forward and 234 | the backward rnn output `Tensor`. 235 | If time_major == False (default), 236 | output_fw will be a `Tensor` shaped: 237 | `[batch_size, max_time, cell_fw.output_size]` 238 | and output_bw will be a `Tensor` shaped: 239 | `[batch_size, max_time, cell_bw.output_size]`. 240 | If time_major == True, 241 | output_fw will be a `Tensor` shaped: 242 | `[max_time, batch_size, cell_fw.output_size]` 243 | and output_bw will be a `Tensor` shaped: 244 | `[max_time, batch_size, cell_bw.output_size]`. 245 | It returns a tuple instead of a single concatenated `Tensor`, unlike 246 | in the `bidirectional_rnn`. If the concatenated one is preferred, 247 | the forward and backward outputs can be concatenated as 248 | `tf.concat(2, outputs)`. 249 | output_states: A tuple (output_state_fw, output_state_bw) containing 250 | the forward and the backward final states of bidirectional rnn. 251 | 252 | Raises: 253 | TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`. 
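    Example (editor's sketch, not from the original docstring; it assumes the
    same pre-1.0 TensorFlow API used throughout this module, and `inputs` and
    `lengths` are hypothetical tensors):

        # inputs: [batch_size, max_time, input_size], lengths: [batch_size]
        cell_fw = rnn_cell.BasicLSTMCell(16)
        cell_bw = rnn_cell.BasicLSTMCell(16)
        (out_fw, out_bw), _ = bidirectional_dynamic_rnn(
            cell_fw, cell_bw, inputs, sequence_length=lengths, dtype=tf.float32)
        # out_fw, out_bw: [batch_size, max_time, 16] each; concatenate with
        # tf.concat(2, [out_fw, out_bw]) if a single tensor is preferred.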
254 | """ 255 | 256 | if not isinstance(cell_fw, rnn_cell.RNNCell): 257 | raise TypeError("cell_fw must be an instance of RNNCell") 258 | if not isinstance(cell_bw, rnn_cell.RNNCell): 259 | raise TypeError("cell_bw must be an instance of RNNCell") 260 | 261 | with vs.variable_scope(scope or "BiRNN"): 262 | # Forward direction 263 | with vs.variable_scope("FW") as fw_scope: 264 | output_fw, output_state_fw = dynamic_rnn( 265 | cell=cell_fw, inputs=inputs, sequence_length=sequence_length, 266 | initial_state=initial_state_fw, dtype=dtype, 267 | parallel_iterations=parallel_iterations, swap_memory=swap_memory, 268 | time_major=time_major, scope=fw_scope) 269 | 270 | # Backward direction 271 | if not time_major: 272 | time_dim = 1 273 | batch_dim = 0 274 | else: 275 | time_dim = 0 276 | batch_dim = 1 277 | 278 | with vs.variable_scope("BW") as bw_scope: 279 | inputs_reverse = array_ops.reverse_sequence( 280 | input=inputs, seq_lengths=sequence_length, 281 | seq_dim=time_dim, batch_dim=batch_dim) 282 | tmp, output_state_bw = dynamic_rnn( 283 | cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length, 284 | initial_state=initial_state_bw, dtype=dtype, 285 | parallel_iterations=parallel_iterations, swap_memory=swap_memory, 286 | time_major=time_major, scope=bw_scope) 287 | 288 | output_bw = array_ops.reverse_sequence( 289 | input=tmp, seq_lengths=sequence_length, 290 | seq_dim=time_dim, batch_dim=batch_dim) 291 | 292 | outputs = (output_fw, output_bw) 293 | output_states = (output_state_fw, output_state_bw) 294 | 295 | return (outputs, output_states) 296 | 297 | def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None, 298 | dtype=None, parallel_iterations=None, swap_memory=False, 299 | time_major=False, scope=None): 300 | """Creates a recurrent neural network specified by RNNCell `cell`. 301 | 302 | This function is functionally identical to the function `rnn` above, but 303 | performs fully dynamic unrolling of `inputs`. 304 | 305 | Unlike `rnn`, the input `inputs` is not a Python list of `Tensors`, one for 306 | each frame. Instead, `inputs` may be a single `Tensor` where 307 | the maximum time is either the first or second dimension (see the parameter 308 | `time_major`). Alternatively, it may be a (possibly nested) tuple of 309 | Tensors, each of them having matching batch and time dimensions. 310 | The corresponding output is either a single `Tensor` having the same number 311 | of time steps and batch size, or a (possibly nested) tuple of such tensors, 312 | matching the nested structure of `cell.output_size`. 313 | 314 | The parameter `sequence_length` is optional and is used to copy-through state 315 | and zero-out outputs when past a batch element's sequence length. So it's more 316 | for correctness than performance, unlike in rnn(). 317 | 318 | Args: 319 | cell: An instance of RNNCell. 320 | inputs: The RNN inputs. 321 | 322 | If `time_major == False` (default), this must be a `Tensor` of shape: 323 | `[batch_size, max_time, ...]`, or a nested tuple of such 324 | elements. 325 | 326 | If `time_major == True`, this must be a `Tensor` of shape: 327 | `[max_time, batch_size, ...]`, or a nested tuple of such 328 | elements. 329 | 330 | This may also be a (possibly nested) tuple of Tensors satisfying 331 | this property. The first two dimensions must match across all the inputs, 332 | but otherwise the ranks and other shape components may differ. 
333 | In this case, input to `cell` at each time-step will replicate the 334 | structure of these tuples, except for the time dimension (from which the 335 | time is taken). 336 | 337 | The input to `cell` at each time step will be a `Tensor` or (possibly 338 | nested) tuple of Tensors each with dimensions `[batch_size, ...]`. 339 | sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. 340 | initial_state: (optional) An initial state for the RNN. 341 | If `cell.state_size` is an integer, this must be 342 | a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. 343 | If `cell.state_size` is a tuple, this should be a tuple of 344 | tensors having shapes `[batch_size, s] for s in cell.state_size`. 345 | dtype: (optional) The data type for the initial state and expected output. 346 | Required if initial_state is not provided or RNN state has a heterogeneous 347 | dtype. 348 | parallel_iterations: (Default: 32). The number of iterations to run in 349 | parallel. Those operations which do not have any temporal dependency 350 | and can be run in parallel, will be. This parameter trades off 351 | time for space. Values >> 1 use more memory but take less time, 352 | while smaller values use less memory but computations take longer. 353 | swap_memory: Transparently swap the tensors produced in forward inference 354 | but needed for back prop from GPU to CPU. This allows training RNNs 355 | which would typically not fit on a single GPU, with very minimal (or no) 356 | performance penalty. 357 | time_major: The shape format of the `inputs` and `outputs` Tensors. 358 | If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. 359 | If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. 360 | Using `time_major = True` is a bit more efficient because it avoids 361 | transposes at the beginning and end of the RNN calculation. However, 362 | most TensorFlow data is batch-major, so by default this function 363 | accepts input and emits output in batch-major form. 364 | scope: VariableScope for the created subgraph; defaults to "RNN". 365 | 366 | Returns: 367 | A pair (outputs, state) where: 368 | 369 | outputs: The RNN output `Tensor`. 370 | 371 | If time_major == False (default), this will be a `Tensor` shaped: 372 | `[batch_size, max_time, cell.output_size]`. 373 | 374 | If time_major == True, this will be a `Tensor` shaped: 375 | `[max_time, batch_size, cell.output_size]`. 376 | 377 | Note, if `cell.output_size` is a (possibly nested) tuple of integers 378 | or `TensorShape` objects, then `outputs` will be a tuple having the 379 | same structure as `cell.output_size`, containing Tensors having shapes 380 | corresponding to the shape data in `cell.output_size`. 381 | 382 | state: The final state. If `cell.state_size` is an int, this 383 | will be shaped `[batch_size, cell.state_size]`. If it is a 384 | `TensorShape`, this will be shaped `[batch_size] + cell.state_size`. 385 | If it is a (possibly nested) tuple of ints or `TensorShape`, this will 386 | be a tuple having the corresponding shapes. 387 | 388 | Raises: 389 | TypeError: If `cell` is not an instance of RNNCell. 390 | ValueError: If inputs is None or an empty list. 
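    Example (editor's sketch, not from the original docstring; `inputs` and
    `lengths` are hypothetical tensors shaped as described above):

        cell = rnn_cell.GRUCell(32)
        outputs, state = dynamic_rnn(cell, inputs,
                                     sequence_length=lengths, dtype=tf.float32)
        # outputs: [batch_size, max_time, 32] when time_major == False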
391 | """ 392 | 393 | if not isinstance(cell, rnn_cell.RNNCell): 394 | raise TypeError("cell must be an instance of RNNCell") 395 | 396 | # By default, time_major==False and inputs are batch-major: shaped 397 | # [batch, time, depth] 398 | # For internal calculations, we transpose to [time, batch, depth] 399 | flat_input = nest.flatten(inputs) 400 | 401 | if not time_major: 402 | # (B,T,D) => (T,B,D) 403 | flat_input = tuple(array_ops.transpose(input_, [1, 0, 2]) for input_ in flat_input) 404 | 405 | parallel_iterations = parallel_iterations or 32 406 | if sequence_length is not None: 407 | sequence_length = math_ops.to_int32(sequence_length) 408 | if sequence_length.get_shape().ndims not in (None, 1): 409 | raise ValueError( 410 | "sequence_length must be a vector of length batch_size, " 411 | "but saw shape: %s" % sequence_length.get_shape()) 412 | sequence_length = array_ops.identity( # Just to find it in the graph. 413 | sequence_length, name="sequence_length") 414 | 415 | # Create a new scope in which the caching device is either 416 | # determined by the parent scope, or is set to place the cached 417 | # Variable using the same placement as for the rest of the RNN. 418 | with vs.variable_scope(scope or "RNN") as varscope: 419 | if varscope.caching_device is None: 420 | varscope.set_caching_device(lambda op: op.device) 421 | input_shape = tuple(array_ops.shape(input_) for input_ in flat_input) 422 | batch_size = input_shape[0][1] 423 | 424 | for input_ in input_shape: 425 | if input_[1].get_shape() != batch_size.get_shape(): 426 | raise ValueError("All inputs should have the same batch size") 427 | 428 | if initial_state is not None: 429 | state = initial_state 430 | else: 431 | if not dtype: 432 | raise ValueError("If no initial_state is provided, dtype must be.") 433 | state = cell.zero_state(batch_size, dtype) 434 | 435 | def _assert_has_shape(x, shape): 436 | x_shape = array_ops.shape(x) 437 | packed_shape = array_ops.pack(shape) 438 | return control_flow_ops.Assert( 439 | math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), 440 | ["Expected shape for Tensor %s is " % x.name, 441 | packed_shape, " but saw shape: ", x_shape]) 442 | 443 | if sequence_length is not None: 444 | # Perform some shape validation 445 | with ops.control_dependencies([_assert_has_shape(sequence_length, [batch_size])]): 446 | sequence_length = array_ops.identity(sequence_length, name="CheckSeqLen") 447 | 448 | inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input) 449 | 450 | (outputs, final_state) = _dynamic_rnn_loop( 451 | cell, 452 | inputs, 453 | state, 454 | parallel_iterations=parallel_iterations, 455 | swap_memory=swap_memory, 456 | sequence_length=sequence_length, 457 | dtype=dtype) 458 | 459 | # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth]. 
460 | # If we are performing batch-major calculations, transpose output back 461 | # to shape [batch, time, depth] 462 | if not time_major: 463 | # (T,B,D) => (B,T,D) 464 | flat_output = nest.flatten(outputs) 465 | flat_output = [array_ops.transpose(output, [1, 0, 2]) 466 | for output in flat_output] 467 | outputs = nest.pack_sequence_as(structure=outputs, flat_sequence=flat_output) 468 | 469 | return (outputs, final_state) 470 | 471 | -------------------------------------------------------------------------------- /src/my_rnn.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/my_rnn.pyc -------------------------------------------------------------------------------- /src/namespace_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | class Bunch(object): 4 | def __init__(self, adict): 5 | self.__dict__.update(adict) 6 | 7 | def save_namespace(FLAGS, out_path): 8 | FLAGS_dict = vars(FLAGS) 9 | with open(out_path, 'w') as fp: 10 | #json.dump(FLAGS_dict, fp) 11 | json.dump(FLAGS_dict, fp, indent=4, sort_keys=True) 12 | 13 | def load_namespace(in_path): 14 | with open(in_path, 'r') as fp: 15 | FLAGS_dict = json.load(fp) 16 | return Bunch(FLAGS_dict) -------------------------------------------------------------------------------- /src/namespace_utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/namespace_utils.pyc -------------------------------------------------------------------------------- /src/vocab_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function 3 | import numpy as np 4 | import re 5 | 6 | # import math 7 | class Vocab(object): 8 | def __init__(self, vec_path=None, dim=100, fileformat='bin',voc=None, word2id=None, word_vecs=None, unk_mapping_path=None): 9 | self.unk_label = '' 10 | self.stoplist = None 11 | if fileformat == 'bin': 12 | self.fromBinary(vec_path,voc=voc) 13 | elif fileformat == 'txt': 14 | self.fromText(vec_path,voc=voc) 15 | elif fileformat == 'txt2': 16 | self.fromText_format2(vec_path,voc=voc,pre_word_vecs=word_vecs) 17 | elif fileformat == 'txt3': 18 | self.fromText_format3(vec_path,voc=voc) 19 | elif fileformat == 'map': 20 | self.fromMap(word2id, word_vecs, word_dim=dim) 21 | else: # build a vocabulary with a word set 22 | self.fromVocabualry(voc, dim=dim) 23 | 24 | self.__unk_mapping = None 25 | if unk_mapping_path is not None: 26 | self.__unk_mapping = {} 27 | in_file = open(unk_mapping_path, 'rt') 28 | for line in in_file: 29 | items = re.split('\t', line) 30 | self.__unk_mapping[items[0]] = items[1] 31 | in_file.close() 32 | 33 | 34 | def fromVocabualry(self, voc, dim=100): 35 | # load freq table and build index for each word 36 | self.word2id = {} 37 | self.id2word = {} 38 | 39 | self.vocab_size = len(voc) 40 | self.word_dim = dim 41 | for word in voc: 42 | cur_index = len(self.word2id) 43 | self.word2id[word] = cur_index 44 | self.id2word[cur_index] = word 45 | 46 | # self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32) # the last dimension is all zero 47 | shape = (self.vocab_size+1, self.word_dim) 48 | scale = 0.05 49 | self.word_vecs = np.array(np.random.uniform(low=-scale, high=scale, size=shape), 
dtype=np.float32) 50 | # self.word_vecs = None 51 | 52 | def fromMap(self, word2id, word_vecs, word_dim=100): 53 | self.word2id = word2id 54 | self.id2word = dict(zip(word2id.values(),word2id.keys())) 55 | 56 | self.vocab_size = len(word2id) 57 | self.word_dim = word_dim 58 | self.word_vecs = word_vecs 59 | 60 | 61 | 62 | def fromText(self, vec_path,voc=None): 63 | # load freq table and build index for each word 64 | self.word2id = {} 65 | self.id2word = {} 66 | 67 | vec_file = open(vec_path, 'rt') 68 | header = vec_file.readline() 69 | self.vocab_size, self.word_dim = map(int, header.split()) 70 | word_vecs = {} 71 | for line in vec_file: 72 | line = line.decode('utf-8').strip() 73 | parts = line.split(' ') 74 | word = parts[0] 75 | if (voc is not None) and (word not in voc): continue 76 | vector = np.array(parts[1:], dtype='float32') 77 | cur_index = len(self.word2id) 78 | self.word2id[word] = cur_index 79 | self.id2word[cur_index] = word 80 | word_vecs[cur_index] = vector 81 | vec_file.close() 82 | 83 | self.vocab_size = len(self.word2id) 84 | self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32) # the last dimension is all zero 85 | for cur_index in xrange(self.vocab_size): 86 | self.word_vecs[cur_index] = word_vecs[cur_index] 87 | 88 | 89 | def fromText_format2(self, vec_path,voc=None,pre_word_vecs=None): 90 | # load freq table and build index for each word 91 | self.word2id = {} 92 | self.id2word = {} 93 | 94 | vec_file = open(vec_path, 'rt') 95 | word_vecs = {} 96 | for line in vec_file: 97 | line = line.decode('utf-8').strip() 98 | parts = line.split('\t') 99 | cur_index = int(parts[0]) 100 | word = parts[1] 101 | vector = np.array(map(float,re.split('\\s+', parts[2])), dtype='float32') 102 | self.word2id[word] = cur_index 103 | self.id2word[cur_index] = word 104 | word_vecs[cur_index] = vector 105 | self.word_dim = vector.size 106 | vec_file.close() 107 | 108 | self.vocab_size = len(self.word2id) 109 | 110 | if pre_word_vecs is not None: 111 | self.word_vecs = pre_word_vecs 112 | else: 113 | self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32) # the last dimension is all zero 114 | for cur_index in xrange(self.vocab_size): 115 | self.word_vecs[cur_index] = word_vecs[cur_index] 116 | 117 | 118 | def fromText_format3(self, vec_path,voc=None): 119 | # load freq table and build index for each word 120 | self.word2id = {} 121 | self.id2word = {} 122 | 123 | vec_file = open(vec_path, 'rt') 124 | # header = vec_file.readline() 125 | # self.vocab_size, self.word_dim = map(int, header.split()) 126 | word_vecs = {} 127 | for line in vec_file: 128 | line = line.decode('utf-8') 129 | if line[0] == line[1] == ' ': 130 | word = ' ' 131 | parts = [' '] + line.strip().split(' ') 132 | else: 133 | parts = line.split(' ') 134 | word = parts[0] 135 | self.word_dim = len(parts[1:]) 136 | if (voc is not None) and (word not in voc): continue 137 | vector = np.array(parts[1:], dtype='float32') 138 | cur_index = len(self.word2id) 139 | self.word2id[word] = cur_index 140 | self.id2word[cur_index] = word 141 | word_vecs[cur_index] = vector 142 | vec_file.close() 143 | 144 | self.vocab_size = len(self.word2id) 145 | self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32) # the last dimension is all zero 146 | for cur_index in xrange(self.vocab_size): 147 | self.word_vecs[cur_index] = word_vecs[cur_index] 148 | 149 | 150 | 151 | def fromText_bak(self, vec_path,voc=None): 152 | # load freq table and build index for each word 153 | 
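# Editor's sketch (illustrative, not in the original source): the text format
# parsed here mirrors word2vec's .txt output -- a "vocab_size word_dim" header
# followed by one "word v1 v2 ..." line per word, e.g. for two words of
# dimension three:
#     2 3
#     hello 0.1 -0.2 0.3
#     world 0.4 0.5 -0.6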
    def fromText_bak(self, vec_path, voc=None):
        # older text loader: same as fromText, but fills the matrix in place
        self.word2id = {}
        self.id2word = {}

        vec_file = open(vec_path, 'rt')
        header = vec_file.readline()
        self.vocab_size, self.word_dim = map(int, header.split())
        self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32)  # the extra last row stays all-zero
        for line in vec_file:
            line = line.decode('utf-8').strip()
            parts = line.split(' ')
            word = parts[0]
            if (voc is not None) and (word not in voc): continue
            vector = np.array(parts[1:], dtype='float32')
            cur_index = len(self.word2id)
            self.word2id[word] = cur_index
            self.id2word[cur_index] = word
            self.word_vecs[cur_index] = vector
        vec_file.close()

    def fromBinary_with_voc(self, fname, voc, scale=0.05, stop_num=50):
        # treat the stop_num most frequent words as a stoplist, then merge the
        # remaining vocabulary with the pretrained binary embeddings
        self.stoplist = voc[0:stop_num]
        voc = voc[stop_num:]
        voc.append(self.unk_label)
        self.word2id = {}
        self.id2word = {}
        for word in voc:
            curIndex = len(self.word2id)
            self.word2id[word] = curIndex
            self.id2word[curIndex] = word

        with open(fname, "rb") as f:
            header = f.readline()
            cur_vocab_size, self.word_dim = map(int, header.split())
            word_vecs = {}
            binary_len = np.dtype('float32').itemsize * self.word_dim
            for idx in xrange(cur_vocab_size):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == ' ':
                        word = ''.join(word)
                        break
                    if ch != '\n':
                        word.append(ch)
                if word in self.word2id:
                    curIndex = self.word2id[word]
                else:
                    curIndex = len(self.word2id)
                    self.word2id[word] = curIndex
                    self.id2word[curIndex] = word
                word_vecs[curIndex] = np.fromstring(f.read(binary_len), dtype='float32')

        self.vocab_size = len(self.word2id)
        # random init for words without pretrained vectors; zero out the extra last row
        self.word_vecs = np.random.uniform(low=-scale, high=scale, size=(self.vocab_size+1, self.word_dim)).astype('float32')
        self.word_vecs[self.vocab_size] = 0.0
        for cur_index in word_vecs.keys():
            self.word_vecs[cur_index] = word_vecs[cur_index]
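    # The binary layout both loaders expect is word2vec's: an ASCII header line
    # "vocab_size word_dim", then for each word its characters up to a single
    # space, followed immediately by word_dim raw float32 values. A hypothetical
    # sketch of reading one entry, equivalent to the loop above:
    #
    #   word = ''
    #   while True:
    #       ch = f.read(1)
    #       if ch == ' ': break
    #       if ch != '\n': word += ch
    #   vec = np.fromstring(f.read(4 * word_dim), dtype='float32')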
    def fromBinary(self, fname, scale=0.05, voc=None):
        self.word2id = {}
        self.id2word = {}
        self.word2id[self.unk_label] = 0
        self.id2word[0] = self.unk_label
        # load word vectors from a word2vec-style binary file
        with open(fname, "rb") as f:
            header = f.readline()
            self.vocab_size, self.word_dim = map(int, header.split())
            word_vecs = {}
            binary_len = np.dtype('float32').itemsize * self.word_dim
            for idx in xrange(self.vocab_size):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == ' ':
                        word = ''.join(word)
                        break
                    if ch != '\n':
                        word.append(ch)
                if word == '</s>':
                    f.read(binary_len)  # consume its vector so the stream stays aligned
                    continue
                curIndex = len(self.word2id)
                self.word2id[word] = curIndex
                self.id2word[curIndex] = word
                word_vecs[curIndex] = np.fromstring(f.read(binary_len), dtype='float32')

        # add vocabulary words that have no pretrained vector
        if voc is not None:
            for word in voc:
                if word == '': continue
                if self.word2id.has_key(word): continue
                curIndex = len(self.word2id)
                self.word2id[word] = curIndex
                self.id2word[curIndex] = word
                word_vecs[curIndex] = np.random.uniform(low=-scale, high=scale, size=(self.word_dim,)).astype('float32')

        self.vocab_size = len(self.word2id)
        self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32)  # the extra last row stays all-zero
        for cur_index in xrange(self.vocab_size):
            if cur_index == 0: continue
            self.word_vecs[cur_index] = word_vecs[cur_index]
        self.word_vecs[0] = np.random.uniform(low=-scale, high=scale, size=(self.word_dim,)).astype('float32')

    def setWordvec(self, word_vecs):
        self.word_vecs = word_vecs

    def hasWord(self, word):
        return self.word2id.has_key(word)

    def size(self):
        return len(self.word2id)

    def getIndex(self, word):
        if self.stoplist is not None:
            if word in self.stoplist:
                return None
        if self.word2id.has_key(word):
            return self.word2id.get(word)
        else:
            return self.vocab_size  # OOV words map to the extra last row

    def getWord(self, idx):
        return self.id2word.get(idx)

    def getVector(self, word):
        if self.word2id.has_key(word):
            idx = self.word2id.get(word)
            return self.word_vecs[idx]
        return None

    def to_index_sequence(self, sentence):
        sentence = sentence.strip()
        seq = []
        for word in re.split('\\s+', sentence):
            idx = self.getIndex(word)
            if idx is None and self.__unk_mapping is not None and self.__unk_mapping.has_key(word):
                simWord = self.__unk_mapping[word]
                idx = self.getIndex(simWord)
            if idx is None: idx = self.vocab_size
            seq.append(idx)
        return seq

    def to_index_sequence_for_list(self, words):
        seq = []
        for word in words:
            idx = self.getIndex(word)
            if idx is None and self.__unk_mapping is not None and self.__unk_mapping.has_key(word):
                simWord = self.__unk_mapping[word]
                idx = self.getIndex(simWord)
            if idx is None: idx = self.vocab_size
            seq.append(idx)
        return seq

    def to_character_matrix(self, sentence, max_char_per_word=-1):
        sentence = sentence.strip()
        seq = []
        for word in re.split('\\s+', sentence):
            cur_seq = []
            for i in xrange(len(word)):
                cur_char = word[i]
                idx = self.getIndex(cur_char)
                if idx is None and self.__unk_mapping is not None and self.__unk_mapping.has_key(cur_char):
                    simWord = self.__unk_mapping[cur_char]
                    idx = self.getIndex(simWord)
                if idx is None: idx = self.vocab_size
                cur_seq.append(idx)
            if max_char_per_word != -1 and len(cur_seq) > max_char_per_word:
                cur_seq = cur_seq[:max_char_per_word]
            seq.append(cur_seq)
        return seq

    def to_index_sequence4binary_features(self, sentence):
        sentence = sentence.strip().lower()
        seq = []
        for word in re.split(' ', sentence):
            idx = self.getIndex(word)
            if idx is None: continue
            seq.append(idx)
        return seq

    def to_char_ngram_index_sequence(self, sentence):
        sentence = sentence.strip().lower()
        seq = []
        words = re.split(' ', sentence)
        for word in words:
            sub_words = collect_char_ngram(word)
            for sub_word in sub_words:
                idx = self.getIndex(sub_word)
                if idx is None: continue
                seq.append(idx)
        return seq

    def to_sparse_feature_sequence(self, sentence1, sentence2):
        # indices of the words shared by both sentences
        words1 = set(re.split(' ', sentence1.strip().lower()))
        words2 = set(re.split(' ', sentence2.strip().lower()))
        intersection_words = words1.intersection(words2)
        seq = []
        for word in intersection_words:
            idx = self.getIndex(word)
            if idx is None: continue
            seq.append(idx)
        return seq
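    # Hypothetical examples of the two main lookup helpers above, assuming a
    # vocabulary where 'a' -> 0, 'cat' -> 1 and vocab_size == 2:
    #
    #   vocab.to_index_sequence('a cat')    # -> [0, 1]
    #   vocab.to_index_sequence('a dog')    # -> [0, 2]  (OOV falls back to vocab_size)
    #   vocab.to_character_matrix('a cat')  # -> one list of character ids per word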
    def get_sentence_vector(self, sentence):
        # average the vectors of all in-vocabulary words in the sentence
        sent_vec = np.zeros((self.word_dim,), dtype='float32')
        sentence = sentence.strip().lower()
        total = 0.0
        for word in re.split(' ', sentence):
            cur_vec = self.getVector(word)
            if cur_vec is None: continue
            sent_vec += cur_vec
            total += 1.0
        if total != 0.0: sent_vec /= total
        return sent_vec

    def dump_to_txt2(self, outpath):
        # write "id<TAB>word<TAB>vector" lines, the format read by fromText_format2
        outfile = open(outpath, 'wt')
        for word in self.word2id.keys():
            cur_id = self.word2id[word]
            cur_vector = self.getVector(word)
            word = word.encode('utf-8')
            outline = "{}\t{}\t{}".format(cur_id, word, vec2string(cur_vector))
            outfile.write(outline + "\n")
        outfile.close()

    def dump_to_txt3(self, outpath):
        # write "word vector" lines, the format read by fromText_format3
        outfile = open(outpath, 'wt')
        for word in self.word2id.keys():
            cur_vector = self.getVector(word)
            word = word.encode('utf-8')
            outline = word + " {}".format(vec2string(cur_vector))
            outfile.write(outline + "\n")
        outfile.close()

def vec2string(val):
    result = ""
    for v in val:
        result += " {}".format(v)
    return result.strip()

def collect_all_ngram(words, n=2):
    all_ngrams = set()
    for i in xrange(len(words)-n+1):  # include the final n-gram
        cur_ngram = words[i:i+n]
        all_ngrams.add(' '.join(cur_ngram))
    return all_ngrams

def collect_char_ngram(word, n=3):
    all_words = []
    if len(word) <= n:
        all_words.append(word)
    else:
        for i in xrange(len(word)-n+1):
            cur_word = word[i:i+n]  # use the requested n, not a hardcoded width
            all_words.append(cur_word)
    return all_words

def to_char_ngram_sequence(sentence, n=3):
    seq = []
    words = re.split(' ', sentence)
    for word in words:
        sub_words = collect_char_ngram(word, n)
        seq.extend(sub_words)
    return ' '.join(seq)

def collectVoc(trainpath):
    vocab = set()
    inputFile = open(trainpath, 'rt')
    for line in inputFile:
        line = line.strip()
        label, sentence = re.split('\t', line)
        sentence = sentence.lower()
        for word in re.split(' ', sentence):
            vocab.add(word)
    inputFile.close()
    return vocab

def collect_word_count(sentences, unk_num=1):
    # count word frequencies and keep words occurring more than unk_num times,
    # sorted by frequency in descending order
    word_count_map = {}
    for sentence in sentences:
        sentence = sentence.strip().lower()
        for word in re.split(' ', sentence):
            cur_count = 0
            if word_count_map.has_key(word):
                cur_count = word_count_map.get(word)
            word_count_map[word] = cur_count + 1
    word_count_list = []
    for word in word_count_map.keys():
        count = word_count_map.get(word)
        word_count_list.append((count, word))

    word_count_list = sorted(word_count_list, key=(lambda a: a[0]), reverse=True)
    return [word for count, word in word_count_list if count > unk_num]
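# For illustration:
#   collect_char_ngram('where')                      # -> ['whe', 'her', 'ere']
#   sorted(collect_all_ngram(['a', 'b', 'c'], n=2))  # -> ['a b', 'b c']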
def collect_word_count_with_max_vocab(sentences, max_vocab=600000):
    # same counting as collect_word_count, but capped at max_vocab words
    word_count_map = {}
    for sentence in sentences:
        sentence = sentence.strip().lower()
        for word in re.split(' ', sentence):
            cur_count = 0
            if word_count_map.has_key(word):
                cur_count = word_count_map.get(word)
            word_count_map[word] = cur_count + 1
    word_count_list = []
    for word in word_count_map.keys():
        count = word_count_map.get(word)
        word_count_list.append((count, word))

    word_count_list = sorted(word_count_list, key=(lambda a: a[0]), reverse=True)
    # the source dump is truncated here; the cap below is an inferred completion
    if len(word_count_list) > max_vocab:
        word_count_list = word_count_list[:max_vocab]
    return [word for count, word in word_count_list]
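# A minimal usage sketch (hypothetical, not part of the original repo): build a
# vocabulary from raw sentences, then index a new sentence. Unseen words map to
# the OOV row at index vocab_size.
if __name__ == '__main__':
    sentences = ['a dog chases a cat', 'a cat chases a mouse']
    words = collect_word_count(sentences, unk_num=0)
    vocab = Vocab(voc=set(words), dim=50, fileformat='voc')
    print(vocab.to_index_sequence('a dog chases a bird'))  # 'bird' -> OOV index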