├── .project
├── .pydevproject
├── .settings
│   └── org.eclipse.core.resources.prefs
├── LICENSE.txt
├── README.md
├── configs
│   ├── quora.sample.config
│   └── snli.sample.config
└── src
    ├── SentenceMatchDataStream.py
    ├── SentenceMatchDataStream.pyc
    ├── SentenceMatchDecoder.py
    ├── SentenceMatchModelGraph.py
    ├── SentenceMatchModelGraph.pyc
    ├── SentenceMatchTrainer.py
    ├── __init__.py
    ├── layer_utils.py
    ├── match_utils.py
    ├── match_utils.pyc
    ├── my_rnn.py
    ├── my_rnn.pyc
    ├── namespace_utils.py
    ├── namespace_utils.pyc
    ├── vocab_utils.py
    └── vocab_utils.pyc
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 | 	<name>BiMPM</name>
4 | 	<comment></comment>
5 | 	<projects>
6 | 	</projects>
7 | 	<buildSpec>
8 | 		<buildCommand>
9 | 			<name>org.python.pydev.PyDevBuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 	</buildSpec>
14 | 	<natures>
15 | 		<nature>org.python.pydev.pythonNature</nature>
16 | 	</natures>
17 | </projectDescription>
18 |
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?><pydev_project>
3 | <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
4 | <path>/${PROJECT_DIR_NAME}</path>
5 | <path>/${PROJECT_DIR_NAME}/src</path>
6 | </pydev_pathproperty>
7 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
8 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
9 | <pydev_pathproperty name="org.python.pydev.PROJECT_EXTERNAL_SOURCE_PATH">
10 | <path>/u/zhigwang/.local/lib/python2.7/site-packages</path>
11 | </pydev_pathproperty>
12 | </pydev_project>
13 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//src/SentenceMatchDecoder.py=utf-8
3 | encoding//src/SentenceMatchTrainer.py=utf-8
4 | encoding//src/vocab_utils.py=utf-8
5 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BiMPM: Bilateral Multi-Perspective Matching for Natural Language Sentences
2 |
3 | ## Updates (Jan 28, 2018)
4 | * This repository has been updated to TensorFlow 1.5.
5 | * The training process is 15+ times faster without losing accuracy.
6 | * All code has been restructured for better readability and adaptability.
7 |
8 | ## Description
9 | This repository includes the source code for natural language sentence matching.
10 | Basically, the program takes two sentences as input and predicts a label for the pair.
11 | You can use this program for tasks like [paraphrase identification](https://aclweb.org/aclwiki/index.php?title=Paraphrase_Identification_%28State_of_the_art%29), [natural language inference](http://nlp.stanford.edu/projects/snli/), and [duplicate question identification](https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs). More details about the underlying model can be found in our [paper](https://arxiv.org/pdf/1702.03814.pdf) published in IJCAI 2017. Please cite our paper when you use this program! :heart_eyes:
12 |
13 | ## Requirements
14 | * python 2.7
15 | * tensorflow 1.5
16 |
17 | ## Data format
18 | Both the train and test sets require a tab-separated format.
19 | Each line in the train (or test) file corresponds to an instance, and it should be arranged as
20 | > label sentence#1 sentence#2 instanceID
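
For illustration, a single (hypothetical) NLI instance would look like this, with the four fields separated by tabs:

```
entailment	a man is playing a guitar on stage .	a person is playing an instrument .	instance_1
```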
21 |
22 | For more details about the data format, you can download the [SNLI](https://drive.google.com/file/d/1CxjKsaM6YgZPRKmJhNn7WcIC3gISehcS/view?usp=sharing) and the [Quora Question Pair](https://drive.google.com/file/d/0B0PlTAo--BnaQWlsZl9FZ3l1c28/view?usp=sharing) datasets used in our [paper](https://arxiv.org/pdf/1702.03814.pdf).
23 |
24 |
25 | ## Training
26 | You can find the training script at BiMPM/src/SentenceMatchTrainer.py
27 |
28 | First, edit the configuration file at ${workspace}/BiMPM/configs/snli.sample.config (or ${workspace}/BiMPM/configs/quora.sample.config).
29 | You need to change "train\_path", "dev\_path", "word\_vec\_path", "model\_dir", and "suffix" to your own settings, as in the snippet below.
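
A minimal sketch of the relevant entries (the paths below are placeholders to replace with your own):

```json
{
    "train_path": "/path/to/train.tsv",
    "dev_path": "/path/to/dev.tsv",
    "word_vec_path": "/path/to/wordvec.txt",
    "model_dir": "/path/to/logs",
    "suffix": "snli"
}
```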
30 |
31 | Second, launch the job with the following command:
32 | > python ${workspace}/BiMPM/src/SentenceMatchTrainer.py --config\_path ${workspace}/BiMPM/configs/snli.sample.config
33 |
34 |
35 | ## Testing
36 | You can find the testing script at BiMPM/src/SentenceMatchDecoder.py
37 | > python ${workspace}/BiMPM/src/SentenceMatchDecoder.py --in\_path ${your\_path\_to}/dev.tsv --word\_vec\_path ${your\_path\_to}/wordvec.txt --out\_path ${your\_path\_to}/result.json --model\_prefix ${model\_dir}/SentenceMatch.${suffix}
38 |
39 | Here, "model\_dir" and "suffix" are the variables set in your configuration file.
40 |
41 | The output file is a JSON file with the following format, keyed by instance ID:
42 |
43 | ```json
44 | {
45 |     "instanceID_1": {
46 |         "ID": "instanceID_1",
47 |         "truth": label,
48 |         "sent1": sentence1,
49 |         "sent2": sentence2,
50 |         "prediction": prediction,
51 |         "probs": probs_for_all_possible_labels
52 |     },
53 |     "instanceID_2": {
54 |         "ID": "instanceID_2",
55 |         "truth": label,
56 |         "sent1": sentence1,
57 |         "sent2": sentence2,
58 |         "prediction": prediction,
59 |         "probs": probs_for_all_possible_labels
60 |     }
61 | }
62 | ```
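
A minimal sketch for loading and inspecting this output (plain Python, matching the structure written by SentenceMatchTrainer.evaluation):

```python
import json

with open('result.json') as f:
    results = json.load(f)  # dict keyed by instance ID

for instance_id, r in results.items():
    # each entry carries the gold label and the model's prediction
    print('{}\t{}\t{}'.format(instance_id, r['truth'], r['prediction']))
```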
63 |
64 |
65 | ## Reporting issues
66 | Please let [me](https://zhiguowang.github.io/) know if you encounter any problems.
67 |
--------------------------------------------------------------------------------
/configs/quora.sample.config:
--------------------------------------------------------------------------------
1 | {
2 | "train_path": "/u/zhigwang/zhigwang1/sentence_match/quora/data/train.tsv",
3 | "dev_path": "/u/zhigwang/zhigwang1/sentence_match/quora/data/dev.tsv",
4 | "word_vec_path": "/u/zhigwang/zhigwang1/sentence_match/quora/wordvec.txt",
5 | "model_dir": "/u/zhigwang/zhigwang1/sentence_match/quora/logs",
6 | "suffix": "quora",
7 | "fix_word_vec": true,
8 | "isLower": true,
9 | "max_sent_length": 50,
10 | "max_char_per_word": 10,
11 |
12 | "with_char": true,
13 | "char_emb_dim": 20,
14 | "char_lstm_dim": 40,
15 |
16 |
17 | "batch_size": 60,
18 | "max_epochs": 20,
19 | "dropout_rate": 0.1,
20 | "learning_rate": 0.0005,
21 | "optimize_type": "adam",
22 | "lambda_l2": 0.0,
23 | "grad_clipper": 10.0,
24 |
25 | "context_layer_num": 1,
26 | "context_lstm_dim": 100,
27 | "aggregation_layer_num": 1,
28 | "aggregation_lstm_dim": 100,
29 |
30 | "with_full_match": true,
31 | "with_maxpool_match": false,
32 | "with_max_attentive_match": false,
33 | "with_attentive_match": true,
34 |
35 | "with_cosine": true,
36 | "with_mp_cosine": true,
37 | "cosine_MP_dim": 5,
38 |
39 | "att_dim": 50,
40 | "att_type": "symmetric",
41 |
42 | "highway_layer_num": 1,
43 | "with_highway": true,
44 | "with_match_highway": true,
45 | "with_aggregation_highway": true,
46 |
47 | "use_cudnn": true,
48 |
49 | "with_moving_average": false
50 | }
51 |
--------------------------------------------------------------------------------
/configs/snli.sample.config:
--------------------------------------------------------------------------------
1 | {
2 | "train_path": "/u/zhigwang/zhigwang1/sentence_match/snli/train.tsv",
3 | "dev_path": "/u/zhigwang/zhigwang1/sentence_match/snli/dev.tsv",
4 | "word_vec_path": "/u/zhigwang/zhigwang1/sentence_match/snli/wordvec.txt",
5 | "model_dir": "/u/zhigwang/zhigwang1/sentence_match/snli/logs",
6 |     "suffix": "snli",
7 | "fix_word_vec": true,
8 | "isLower": true,
9 | "max_sent_length": 100,
10 | "max_char_per_word": 10,
11 |
12 | "with_char": true,
13 | "char_emb_dim": 20,
14 | "char_lstm_dim": 40,
15 |
16 | "batch_size": 100,
17 | "max_epochs": 10,
18 | "dropout_rate": 0.2,
19 | "learning_rate": 0.001,
20 | "optimize_type": "adam",
21 | "lambda_l2": 0.0,
22 | "grad_clipper": 10.0,
23 |
24 | "context_layer_num": 1,
25 | "context_lstm_dim": 100,
26 | "aggregation_layer_num": 1,
27 | "aggregation_lstm_dim": 100,
28 |
29 | "with_full_match": true,
30 | "with_maxpool_match": false,
31 | "with_max_attentive_match": false,
32 | "with_attentive_match": true,
33 |
34 | "with_cosine": true,
35 | "with_mp_cosine": true,
36 | "cosine_MP_dim": 5,
37 |
38 | "att_dim": 50,
39 | "att_type": "symmetric",
40 |
41 | "highway_layer_num": 1,
42 | "with_highway": true,
43 | "with_match_highway": true,
44 | "with_aggregation_highway": true,
45 |
46 | "use_cudnn": true,
47 |
48 | "with_moving_average": false
49 | }
50 |
--------------------------------------------------------------------------------
/src/SentenceMatchDataStream.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import re
3 |
4 | def make_batches(size, batch_size):
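    # Split `size` items into consecutive (start, end) spans of at most `batch_size`,
    # e.g. make_batches(10, 3) -> [(0, 3), (3, 6), (6, 9), (9, 10)].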
5 | nb_batch = int(np.ceil(size/float(batch_size)))
6 | return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)] # zgwang: starting point of each batch
7 |
8 | def pad_2d_vals(in_vals, dim1_size, dim2_size, dtype=np.int32):
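    # Pad (or truncate) a ragged list of int lists into a fixed (dim1_size, dim2_size) array,
    # e.g. pad_2d_vals([[1, 2], [3]], 2, 3) -> [[1, 2, 0], [3, 0, 0]].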
9 | out_val = np.zeros((dim1_size, dim2_size), dtype=dtype)
10 | if dim1_size > len(in_vals): dim1_size = len(in_vals)
11 | for i in xrange(dim1_size):
12 | cur_in_vals = in_vals[i]
13 | cur_dim2_size = dim2_size
14 | if cur_dim2_size > len(cur_in_vals): cur_dim2_size = len(cur_in_vals)
15 | out_val[i,:cur_dim2_size] = cur_in_vals[:cur_dim2_size]
16 | return out_val
17 |
18 | def pad_3d_vals(in_vals, dim1_size, dim2_size, dim3_size, dtype=np.int32):
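    # Same idea as pad_2d_vals, one level deeper: pads ragged character-index matrices
    # into a fixed (dim1_size, dim2_size, dim3_size) array.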
19 | out_val = np.zeros((dim1_size, dim2_size, dim3_size), dtype=dtype)
20 | if dim1_size > len(in_vals): dim1_size = len(in_vals)
21 | for i in xrange(dim1_size):
22 | in_vals_i = in_vals[i]
23 | cur_dim2_size = dim2_size
24 | if cur_dim2_size > len(in_vals_i): cur_dim2_size = len(in_vals_i)
25 | for j in xrange(cur_dim2_size):
26 | in_vals_ij = in_vals_i[j]
27 | cur_dim3_size = dim3_size
28 | if cur_dim3_size > len(in_vals_ij): cur_dim3_size = len(in_vals_ij)
29 | out_val[i, j, :cur_dim3_size] = in_vals_ij[:cur_dim3_size]
30 | return out_val
31 |
32 |
33 | def read_all_instances(inpath, word_vocab=None, label_vocab=None, char_vocab=None, max_sent_length=100,
34 | max_char_per_word=10, isLower=True):
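    # Each input line is expected to be: label \t sentence1 \t sentence2 [\t instanceID]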
35 | instances = []
36 | infile = open(inpath, 'rt')
37 | idx = -1
38 | for line in infile:
39 | idx += 1
40 | line = line.decode('utf-8').strip()
41 | if line.startswith('-'): continue
42 | items = re.split("\t", line)
43 | label = items[0]
44 | sentence1 = items[1].strip()
45 | sentence2 = items[2].strip()
46 | cur_ID = "{}".format(idx)
47 | if len(items)>=4: cur_ID = items[3]
48 | if isLower:
49 | sentence1 = sentence1.lower()
50 | sentence2 = sentence2.lower()
51 | if label_vocab is not None:
52 | label_id = label_vocab.getIndex(label)
53 | if label_id >= label_vocab.vocab_size: label_id = 0
54 | else:
55 | label_id = int(label)
56 | word_idx_1 = word_vocab.to_index_sequence(sentence1)
57 | word_idx_2 = word_vocab.to_index_sequence(sentence2)
58 | if char_vocab is not None:
59 | char_matrix_idx_1 = char_vocab.to_character_matrix(sentence1, max_char_per_word=max_char_per_word)
60 | char_matrix_idx_2 = char_vocab.to_character_matrix(sentence2, max_char_per_word=max_char_per_word)
61 | else:
62 | char_matrix_idx_1 = None
63 | char_matrix_idx_2 = None
64 | if len(word_idx_1) > max_sent_length:
65 | word_idx_1 = word_idx_1[:max_sent_length]
66 | if char_vocab is not None: char_matrix_idx_1 = char_matrix_idx_1[:max_sent_length]
67 | if len(word_idx_2) > max_sent_length:
68 | word_idx_2 = word_idx_2[:max_sent_length]
69 | if char_vocab is not None: char_matrix_idx_2 = char_matrix_idx_2[:max_sent_length]
70 | instances.append((label, sentence1, sentence2, label_id, word_idx_1, word_idx_2, char_matrix_idx_1, char_matrix_idx_2, cur_ID))
71 | infile.close()
72 | return instances
73 |
74 | class SentenceMatchDataStream(object):
75 | def __init__(self, inpath, word_vocab=None, char_vocab=None, label_vocab=None,
76 | isShuffle=False, isLoop=False, isSort=True, options=None):
77 | instances = read_all_instances(inpath, word_vocab=word_vocab, label_vocab=label_vocab,
78 | char_vocab=char_vocab, max_sent_length=options.max_sent_length, max_char_per_word=options.max_char_per_word,
79 | isLower=options.isLower)
80 |
81 | # sort instances based on sentence length
82 |         if isSort: instances = sorted(instances, key=lambda instance: (len(instance[4]), len(instance[5])))
83 | self.num_instances = len(instances)
84 |
85 | # distribute into different buckets
86 | batch_spans = make_batches(self.num_instances, options.batch_size)
87 | self.batches = []
88 | for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
89 | cur_instances = []
90 | for i in xrange(batch_start, batch_end):
91 | cur_instances.append(instances[i])
92 | cur_batch = InstanceBatch(cur_instances, with_char=options.with_char)
93 | self.batches.append(cur_batch)
94 |
95 | instances = None
96 | self.num_batch = len(self.batches)
97 | self.index_array = np.arange(self.num_batch)
98 | self.isShuffle = isShuffle
99 | if self.isShuffle: np.random.shuffle(self.index_array)
100 | self.isLoop = isLoop
101 | self.cur_pointer = 0
102 |
103 | def nextBatch(self):
104 | if self.cur_pointer>=self.num_batch:
105 | if not self.isLoop: return None
106 | self.cur_pointer = 0
107 | if self.isShuffle: np.random.shuffle(self.index_array)
108 | # print('{} '.format(self.index_array[self.cur_pointer]))
109 | cur_batch = self.batches[self.index_array[self.cur_pointer]]
110 | self.cur_pointer += 1
111 | return cur_batch
112 |
113 | def shuffle(self):
114 | if self.isShuffle: np.random.shuffle(self.index_array)
115 |
116 | def reset(self):
117 | self.cur_pointer = 0
118 |
119 | def get_num_batch(self):
120 | return self.num_batch
121 |
122 | def get_num_instance(self):
123 | return self.num_instances
124 |
125 | def get_batch(self, i):
126 | if i >= self.num_batch: return None
127 | return self.batches[self.index_array[i]]
128 |
129 |
130 | class InstanceBatch(object):
131 | def __init__(self, instances, with_char=False):
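        # Collect raw instances, track the max sentence lengths in the batch,
        # then pad everything into fixed-size numpy arrays matching the TF placeholders.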
132 | self.instances = instances
133 | self.batch_size = len(instances)
134 | self.question_len = 0
135 | self.passage_len = 0
136 |
137 | self.question_lengths = [] # tf.placeholder(tf.int32, [None])
138 | self.in_question_words = [] # tf.placeholder(tf.int32, [None, None]) # [batch_size, question_len]
139 | self.passage_lengths = [] # tf.placeholder(tf.int32, [None])
140 | self.in_passage_words = [] # tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len]
141 | self.label_truth = [] # [batch_size]
142 |
143 | if with_char:
144 | self.in_question_chars = [] # tf.placeholder(tf.int32, [None, None, None]) # [batch_size, question_len, q_char_len]
145 | self.question_char_lengths = [] # tf.placeholder(tf.int32, [None, None]) # [batch_size, question_len]
146 | self.in_passage_chars = [] # tf.placeholder(tf.int32, [None, None, None]) # [batch_size, passage_len, p_char_len]
147 | self.passage_char_lengths = [] # tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len]
148 |
149 | for (label, sentence1, sentence2, label_id, word_idx_1, word_idx_2, char_matrix_idx_1, char_matrix_idx_2, cur_ID) in instances:
150 | cur_question_length = len(word_idx_1)
151 | cur_passage_length = len(word_idx_2)
152 | if self.question_len < cur_question_length: self.question_len = cur_question_length
153 | if self.passage_len < cur_passage_length: self.passage_len = cur_passage_length
154 | self.question_lengths.append(cur_question_length)
155 | self.in_question_words.append(word_idx_1)
156 | self.passage_lengths.append(cur_passage_length)
157 | self.in_passage_words.append(word_idx_2)
158 | self.label_truth.append(label_id)
159 | if with_char:
160 | self.in_question_chars.append(char_matrix_idx_1)
161 | self.in_passage_chars.append(char_matrix_idx_2)
162 | self.question_char_lengths.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_1])
163 | self.passage_char_lengths.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_2])
164 |
165 |         # pad all values into np arrays
166 | self.question_lengths = np.array(self.question_lengths, dtype=np.int32)
167 | self.in_question_words = pad_2d_vals(self.in_question_words, self.batch_size, self.question_len, dtype=np.int32)
168 | self.passage_lengths = np.array(self.passage_lengths, dtype=np.int32)
169 | self.in_passage_words = pad_2d_vals(self.in_passage_words, self.batch_size, self.passage_len, dtype=np.int32)
170 | self.label_truth = np.array(self.label_truth, dtype=np.int32)
171 | if with_char:
172 | max_char_length1 = np.max([np.max(aa) for aa in self.question_char_lengths])
173 | self.in_question_chars = pad_3d_vals(self.in_question_chars, self.batch_size, self.question_len,
174 | max_char_length1, dtype=np.int32)
175 | max_char_length2 = np.max([np.max(aa) for aa in self.passage_char_lengths])
176 | self.in_passage_chars = pad_3d_vals(self.in_passage_chars, self.batch_size, self.passage_len,
177 | max_char_length2, dtype=np.int32)
178 |
179 | self.question_char_lengths = pad_2d_vals(self.question_char_lengths, self.batch_size, self.question_len)
180 | self.passage_char_lengths = pad_2d_vals(self.passage_char_lengths, self.batch_size, self.passage_len)
181 |
--------------------------------------------------------------------------------
/src/SentenceMatchDataStream.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/SentenceMatchDataStream.pyc
--------------------------------------------------------------------------------
/src/SentenceMatchDecoder.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import print_function
3 | import argparse
4 | import sys
5 | from vocab_utils import Vocab
6 | import namespace_utils
7 |
8 | import tensorflow as tf
9 | import SentenceMatchTrainer
10 | from SentenceMatchModelGraph import SentenceMatchModelGraph
11 | from SentenceMatchDataStream import SentenceMatchDataStream
12 |
13 |
14 | if __name__ == '__main__':
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('--model_prefix', type=str, required=True, help='Prefix to the models.')
17 | parser.add_argument('--in_path', type=str, required=True, help='the path to the test file.')
18 | parser.add_argument('--out_path', type=str, required=True, help='The path to the output file.')
19 | parser.add_argument('--word_vec_path', type=str, help='word embedding file for the input file.')
20 |
21 | args, unparsed = parser.parse_known_args()
22 |
23 | # load the configuration file
24 | print('Loading configurations.')
25 | options = namespace_utils.load_namespace(args.model_prefix + ".config.json")
26 |
27 | if args.word_vec_path is None: args.word_vec_path = options.word_vec_path
28 |
29 |
30 | # load vocabs
31 | print('Loading vocabs.')
32 | word_vocab = Vocab(args.word_vec_path, fileformat='txt3')
33 | label_vocab = Vocab(args.model_prefix + ".label_vocab", fileformat='txt2')
34 | print('word_vocab: {}'.format(word_vocab.word_vecs.shape))
35 | print('label_vocab: {}'.format(label_vocab.word_vecs.shape))
36 | num_classes = label_vocab.size()
37 |
38 |     char_vocab = Vocab(args.model_prefix + ".char_vocab", fileformat='txt2') if options.with_char else None  # None avoids a NameError below when with_char is False
39 |     if options.with_char:
40 |         print('char_vocab: {}'.format(char_vocab.word_vecs.shape))
41 |
42 | print('Build SentenceMatchDataStream ... ')
43 | testDataStream = SentenceMatchDataStream(args.in_path, word_vocab=word_vocab, char_vocab=char_vocab,
44 | label_vocab=label_vocab,
45 | isShuffle=False, isLoop=True, isSort=True, options=options)
46 |     print('Number of instances in testDataStream: {}'.format(testDataStream.get_num_instance()))
47 |     print('Number of batches in testDataStream: {}'.format(testDataStream.get_num_batch()))
48 | sys.stdout.flush()
49 |
50 | best_path = args.model_prefix + ".best.model"
51 | init_scale = 0.01
52 | with tf.Graph().as_default():
53 | initializer = tf.random_uniform_initializer(-init_scale, init_scale)
54 | global_step = tf.train.get_or_create_global_step()
55 | with tf.variable_scope("Model", reuse=False, initializer=initializer):
56 | valid_graph = SentenceMatchModelGraph(num_classes, word_vocab=word_vocab, char_vocab=char_vocab,
57 | is_training=False, options=options)
58 |
59 | initializer = tf.global_variables_initializer()
60 | vars_ = {}
61 | for var in tf.global_variables():
62 | if "word_embedding" in var.name: continue
63 | if not var.name.startswith("Model"): continue
64 | vars_[var.name.split(":")[0]] = var
65 | saver = tf.train.Saver(vars_)
66 |
67 | sess = tf.Session()
68 | sess.run(initializer)
69 | print("Restoring model from " + best_path)
70 | saver.restore(sess, best_path)
71 | print("DONE!")
72 | acc = SentenceMatchTrainer.evaluation(sess, valid_graph, testDataStream, outpath=args.out_path,
73 | label_vocab=label_vocab)
74 | print("Accuracy for test set is %.2f" % acc)
75 |
76 |
77 |
--------------------------------------------------------------------------------
/src/SentenceMatchModelGraph.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import layer_utils
3 | import match_utils
4 |
5 |
6 | class SentenceMatchModelGraph(object):
7 | def __init__(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, options=None, global_step=None):
8 | self.options = options
9 | self.create_placeholders()
10 | self.create_model_graph(num_classes, word_vocab, char_vocab, is_training, global_step=global_step)
11 |
12 | def create_placeholders(self):
13 | self.question_lengths = tf.placeholder(tf.int32, [None])
14 | self.passage_lengths = tf.placeholder(tf.int32, [None])
15 | self.truth = tf.placeholder(tf.int32, [None]) # [batch_size]
16 | self.in_question_words = tf.placeholder(tf.int32, [None, None]) # [batch_size, question_len]
17 | self.in_passage_words = tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len]
18 |
19 | if self.options.with_char:
20 | self.question_char_lengths = tf.placeholder(tf.int32, [None,None]) # [batch_size, question_len]
21 | self.passage_char_lengths = tf.placeholder(tf.int32, [None,None]) # [batch_size, passage_len]
22 | self.in_question_chars = tf.placeholder(tf.int32, [None, None, None]) # [batch_size, question_len, q_char_len]
23 | self.in_passage_chars = tf.placeholder(tf.int32, [None, None, None]) # [batch_size, passage_len, p_char_len]
24 |
25 | def create_feed_dict(self, cur_batch, is_training=False):
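        # Map a padded InstanceBatch onto the placeholders defined in create_placeholders().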
26 | feed_dict = {
27 | self.question_lengths: cur_batch.question_lengths,
28 | self.passage_lengths: cur_batch.passage_lengths,
29 | self.in_question_words: cur_batch.in_question_words,
30 | self.in_passage_words: cur_batch.in_passage_words,
31 | self.truth : cur_batch.label_truth,
32 | }
33 |
34 | if self.options.with_char:
35 | feed_dict[self.question_char_lengths] = cur_batch.question_char_lengths
36 | feed_dict[self.passage_char_lengths] = cur_batch.passage_char_lengths
37 | feed_dict[self.in_question_chars] = cur_batch.in_question_chars
38 | feed_dict[self.in_passage_chars] = cur_batch.in_passage_chars
39 |
40 | return feed_dict
41 |
42 |
43 | def create_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None):
44 | options = self.options
45 | # ======word representation layer======
46 | in_question_repres = []
47 | in_passage_repres = []
48 | input_dim = 0
49 | if word_vocab is not None:
50 | word_vec_trainable = True
51 | cur_device = '/gpu:0'
52 | if options.fix_word_vec:
53 | word_vec_trainable = False
54 | cur_device = '/cpu:0'
55 | with tf.device(cur_device):
56 | self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable,
57 | initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)
58 |
59 | in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim]
60 | in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim]
61 | in_question_repres.append(in_question_word_repres)
62 | in_passage_repres.append(in_passage_word_repres)
63 |
64 | input_shape = tf.shape(self.in_question_words)
65 | batch_size = input_shape[0]
66 | question_len = input_shape[1]
67 | input_shape = tf.shape(self.in_passage_words)
68 | passage_len = input_shape[1]
69 | input_dim += word_vocab.word_dim
70 |
71 | if options.with_char and char_vocab is not None:
72 | input_shape = tf.shape(self.in_question_chars)
73 | batch_size = input_shape[0]
74 | question_len = input_shape[1]
75 | q_char_len = input_shape[2]
76 | input_shape = tf.shape(self.in_passage_chars)
77 | passage_len = input_shape[1]
78 | p_char_len = input_shape[2]
79 | char_dim = char_vocab.word_dim
80 | self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32)
81 |
82 | in_question_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_question_chars) # [batch_size, question_len, q_char_len, char_dim]
83 | in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim])
84 | question_char_lengths = tf.reshape(self.question_char_lengths, [-1])
85 |             question_char_mask = tf.sequence_mask(question_char_lengths, q_char_len, dtype=tf.float32) # [batch_size*question_len, q_char_len]
86 |             in_question_char_repres = tf.multiply(in_question_char_repres, tf.expand_dims(question_char_mask, axis=-1))
87 |
88 |
89 | in_passage_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_passage_chars) # [batch_size, passage_len, p_char_len, char_dim]
90 | in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim])
91 | passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
92 | passage_char_mask = tf.sequence_mask(passage_char_lengths, p_char_len, dtype=tf.float32) # [batch_size*passage_len, p_char_len]
93 | in_passage_char_repres = tf.multiply(in_passage_char_repres, tf.expand_dims(passage_char_mask, axis=-1))
94 |
95 | (question_char_outputs_fw, question_char_outputs_bw, _) = layer_utils.my_lstm_layer(in_question_char_repres, options.char_lstm_dim,
96 | input_lengths=question_char_lengths,scope_name="char_lstm", reuse=False,
97 | is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
98 | question_char_outputs_fw = layer_utils.collect_final_step_of_lstm(question_char_outputs_fw, question_char_lengths - 1)
99 | question_char_outputs_bw = question_char_outputs_bw[:, 0, :]
100 | question_char_outputs = tf.concat(axis=1, values=[question_char_outputs_fw, question_char_outputs_bw])
101 | question_char_outputs = tf.reshape(question_char_outputs, [batch_size, question_len, 2*options.char_lstm_dim])
102 |
103 | (passage_char_outputs_fw, passage_char_outputs_bw, _) = layer_utils.my_lstm_layer(in_passage_char_repres, options.char_lstm_dim,
104 | input_lengths=passage_char_lengths, scope_name="char_lstm", reuse=True,
105 | is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
106 | passage_char_outputs_fw = layer_utils.collect_final_step_of_lstm(passage_char_outputs_fw, passage_char_lengths - 1)
107 | passage_char_outputs_bw = passage_char_outputs_bw[:, 0, :]
108 | passage_char_outputs = tf.concat(axis=1, values=[passage_char_outputs_fw, passage_char_outputs_bw])
109 | passage_char_outputs = tf.reshape(passage_char_outputs, [batch_size, passage_len, 2*options.char_lstm_dim])
110 |
111 | in_question_repres.append(question_char_outputs)
112 | in_passage_repres.append(passage_char_outputs)
113 |
114 | input_dim += 2*options.char_lstm_dim
115 |
116 | in_question_repres = tf.concat(axis=2, values=in_question_repres) # [batch_size, question_len, dim]
117 | in_passage_repres = tf.concat(axis=2, values=in_passage_repres) # [batch_size, passage_len, dim]
118 |
119 | if is_training:
120 | in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
121 | in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))
122 |
123 | mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len]
124 | question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len]
125 |
126 | # ======Highway layer======
127 | if options.with_highway:
128 | with tf.variable_scope("input_highway"):
129 | in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, options.highway_layer_num)
130 | tf.get_variable_scope().reuse_variables()
131 | in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, options.highway_layer_num)
132 |
133 | # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
134 | # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))
135 |
136 | # ========Bilateral Matching=====
137 | (match_representation, match_dim) = match_utils.bilateral_match_func(in_question_repres, in_passage_repres,
138 | self.question_lengths, self.passage_lengths, question_mask, mask, input_dim, is_training, options=options)
139 |
140 | #========Prediction Layer=========
141 | # match_dim = 4 * self.options.aggregation_lstm_dim
142 | w_0 = tf.get_variable("w_0", [match_dim, match_dim/2], dtype=tf.float32)
143 | b_0 = tf.get_variable("b_0", [match_dim/2], dtype=tf.float32)
144 | w_1 = tf.get_variable("w_1", [match_dim/2, num_classes],dtype=tf.float32)
145 | b_1 = tf.get_variable("b_1", [num_classes],dtype=tf.float32)
146 |
147 | # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
148 | logits = tf.matmul(match_representation, w_0) + b_0
149 | logits = tf.tanh(logits)
150 | if is_training: logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
151 | logits = tf.matmul(logits, w_1) + b_1
152 |
153 | self.prob = tf.nn.softmax(logits)
154 |
155 | gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
156 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))
157 |
158 | correct = tf.nn.in_top_k(logits, self.truth, 1)
159 | self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
160 | self.predictions = tf.argmax(self.prob, 1)
161 |
162 | if not is_training: return
163 |
164 | tvars = tf.trainable_variables()
165 | if self.options.lambda_l2>0.0:
166 | l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
167 | self.loss = self.loss + self.options.lambda_l2 * l2_loss
168 |
169 | if self.options.optimize_type == 'adadelta':
170 | optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
171 | elif self.options.optimize_type == 'adam':
172 | optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)
173 |
174 | grads = layer_utils.compute_gradients(self.loss, tvars)
175 | grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
176 | self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
177 | # self.train_op = optimizer.apply_gradients(zip(grads, tvars))
178 |
179 | if self.options.with_moving_average:
180 | # Track the moving averages of all trainable variables.
181 | MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average.
182 | variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
183 | variables_averages_op = variable_averages.apply(tf.trainable_variables())
184 | train_ops = [self.train_op, variables_averages_op]
185 | self.train_op = tf.group(*train_ops)
186 |
187 |
--------------------------------------------------------------------------------
/src/SentenceMatchModelGraph.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/SentenceMatchModelGraph.pyc
--------------------------------------------------------------------------------
/src/SentenceMatchTrainer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import print_function
3 | import argparse
4 | import os
5 | import sys
6 | import time
7 | import re
8 | import tensorflow as tf
9 | import json
10 |
11 | from vocab_utils import Vocab
12 | from SentenceMatchDataStream import SentenceMatchDataStream
13 | from SentenceMatchModelGraph import SentenceMatchModelGraph
14 | import namespace_utils
15 |
16 | def collect_vocabs(train_path, with_POS=False, with_NER=False):
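    # Scan the training file once to gather the label, word, and character vocabularies
    # (POS/NER columns are only read when with_POS / with_NER are set).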
17 | all_labels = set()
18 | all_words = set()
19 | all_POSs = None
20 | all_NERs = None
21 | if with_POS: all_POSs = set()
22 | if with_NER: all_NERs = set()
23 | infile = open(train_path, 'rt')
24 | for line in infile:
25 | line = line.decode('utf-8').strip()
26 | if line.startswith('-'): continue
27 | items = re.split("\t", line)
28 | label = items[0]
29 | sentence1 = re.split("\\s+",items[1].lower())
30 | sentence2 = re.split("\\s+",items[2].lower())
31 | all_labels.add(label)
32 | all_words.update(sentence1)
33 | all_words.update(sentence2)
34 | if with_POS:
35 | all_POSs.update(re.split("\\s+",items[3]))
36 | all_POSs.update(re.split("\\s+",items[4]))
37 | if with_NER:
38 | all_NERs.update(re.split("\\s+",items[5]))
39 | all_NERs.update(re.split("\\s+",items[6]))
40 | infile.close()
41 |
42 | all_chars = set()
43 | for word in all_words:
44 | for char in word:
45 | all_chars.add(char)
46 | return (all_words, all_chars, all_labels, all_POSs, all_NERs)
47 |
48 | def output_probs(probs, label_vocab):
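    # Render a probability vector as space-separated "label:prob" pairs,
    # e.g. "entailment:0.91 neutral:0.06 contradiction:0.03".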
49 | out_string = ""
50 | for i in xrange(probs.size):
51 | out_string += " {}:{}".format(label_vocab.getWord(i), probs[i])
52 | return out_string.strip()
53 |
54 | def evaluation(sess, valid_graph, devDataStream, outpath=None, label_vocab=None):
55 | if outpath is not None:
56 | result_json = {}
57 | total = 0
58 | correct = 0
59 | for batch_index in xrange(devDataStream.get_num_batch()): # for each batch
60 | cur_batch = devDataStream.get_batch(batch_index)
61 | total += cur_batch.batch_size
62 |         feed_dict = valid_graph.create_feed_dict(cur_batch, is_training=False)
63 | [cur_correct, probs, predictions] = sess.run([valid_graph.eval_correct, valid_graph.prob, valid_graph.predictions], feed_dict=feed_dict)
64 | correct += cur_correct
65 | if outpath is not None:
66 | for i in xrange(cur_batch.batch_size):
67 | (label, sentence1, sentence2, _, _, _, _, _, cur_ID) = cur_batch.instances[i]
68 | result_json[cur_ID] = {
69 | "ID": cur_ID,
70 | "truth": label,
71 | "sent1": sentence1,
72 | "sent2": sentence2,
73 | "prediction": label_vocab.getWord(predictions[i]),
74 | "probs": output_probs(probs[i], label_vocab),
75 | }
76 | accuracy = correct / float(total) * 100
77 | if outpath is not None:
78 | with open(outpath, 'w') as outfile:
79 | json.dump(result_json, outfile)
80 | return accuracy
81 |
82 | def train(sess, saver, train_graph, valid_graph, trainDataStream, devDataStream, options, best_path):
83 | best_accuracy = -1
84 | for epoch in range(options.max_epochs):
85 | print('Train in epoch %d' % epoch)
86 | # training
87 | trainDataStream.shuffle()
88 | num_batch = trainDataStream.get_num_batch()
89 | start_time = time.time()
90 | total_loss = 0
91 | for batch_index in xrange(num_batch): # for each batch
92 | cur_batch = trainDataStream.get_batch(batch_index)
93 | feed_dict = train_graph.create_feed_dict(cur_batch, is_training=True)
94 | _, loss_value = sess.run([train_graph.train_op, train_graph.loss], feed_dict=feed_dict)
95 | total_loss += loss_value
96 | if batch_index % 100 == 0:
97 | print('{} '.format(batch_index), end="")
98 | sys.stdout.flush()
99 |
100 | print()
101 | duration = time.time() - start_time
102 | print('Epoch %d: loss = %.4f (%.3f sec)' % (epoch, total_loss / num_batch, duration))
103 | # evaluation
104 | start_time = time.time()
105 | acc = evaluation(sess, valid_graph, devDataStream)
106 | duration = time.time() - start_time
107 | print("Accuracy: %.2f" % acc)
108 | print('Evaluation time: %.3f sec' % (duration))
109 |         if acc >= best_accuracy:
110 | best_accuracy = acc
111 | saver.save(sess, best_path)
112 |
113 |
114 | def main(FLAGS):
115 | train_path = FLAGS.train_path
116 | dev_path = FLAGS.dev_path
117 | word_vec_path = FLAGS.word_vec_path
118 | log_dir = FLAGS.model_dir
119 | if not os.path.exists(log_dir):
120 | os.makedirs(log_dir)
121 |
122 | path_prefix = log_dir + "/SentenceMatch.{}".format(FLAGS.suffix)
123 |
124 | namespace_utils.save_namespace(FLAGS, path_prefix + ".config.json")
125 |
126 | # build vocabs
127 | word_vocab = Vocab(word_vec_path, fileformat='txt3')
128 |
129 | best_path = path_prefix + '.best.model'
130 | char_path = path_prefix + ".char_vocab"
131 | label_path = path_prefix + ".label_vocab"
132 | has_pre_trained_model = False
133 | char_vocab = None
134 | if os.path.exists(best_path + ".index"):
135 | has_pre_trained_model = True
136 | print('Loading vocabs from a pre-trained model ...')
137 | label_vocab = Vocab(label_path, fileformat='txt2')
138 | if FLAGS.with_char: char_vocab = Vocab(char_path, fileformat='txt2')
139 | else:
140 | print('Collecting words, chars and labels ...')
141 | (all_words, all_chars, all_labels, all_POSs, all_NERs) = collect_vocabs(train_path)
142 | print('Number of words: {}'.format(len(all_words)))
143 | label_vocab = Vocab(fileformat='voc', voc=all_labels,dim=2)
144 | label_vocab.dump_to_txt2(label_path)
145 |
146 | if FLAGS.with_char:
147 | print('Number of chars: {}'.format(len(all_chars)))
148 | char_vocab = Vocab(fileformat='voc', voc=all_chars,dim=FLAGS.char_emb_dim)
149 | char_vocab.dump_to_txt2(char_path)
150 |
151 | print('word_vocab shape is {}'.format(word_vocab.word_vecs.shape))
152 | num_classes = label_vocab.size()
153 | print("Number of labels: {}".format(num_classes))
154 | sys.stdout.flush()
155 |
156 | print('Build SentenceMatchDataStream ... ')
157 | trainDataStream = SentenceMatchDataStream(train_path, word_vocab=word_vocab, char_vocab=char_vocab, label_vocab=label_vocab,
158 | isShuffle=True, isLoop=True, isSort=True, options=FLAGS)
159 | print('Number of instances in trainDataStream: {}'.format(trainDataStream.get_num_instance()))
160 | print('Number of batches in trainDataStream: {}'.format(trainDataStream.get_num_batch()))
161 | sys.stdout.flush()
162 |
163 | devDataStream = SentenceMatchDataStream(dev_path, word_vocab=word_vocab, char_vocab=char_vocab, label_vocab=label_vocab,
164 | isShuffle=False, isLoop=True, isSort=True, options=FLAGS)
165 | print('Number of instances in devDataStream: {}'.format(devDataStream.get_num_instance()))
166 | print('Number of batches in devDataStream: {}'.format(devDataStream.get_num_batch()))
167 | sys.stdout.flush()
168 |
169 | init_scale = 0.01
170 | with tf.Graph().as_default():
171 | initializer = tf.random_uniform_initializer(-init_scale, init_scale)
172 | global_step = tf.train.get_or_create_global_step()
173 | with tf.variable_scope("Model", reuse=None, initializer=initializer):
174 | train_graph = SentenceMatchModelGraph(num_classes, word_vocab=word_vocab, char_vocab=char_vocab,
175 | is_training=True, options=FLAGS, global_step=global_step)
176 |
177 | with tf.variable_scope("Model", reuse=True, initializer=initializer):
178 | valid_graph = SentenceMatchModelGraph(num_classes, word_vocab=word_vocab, char_vocab=char_vocab,
179 | is_training=False, options=FLAGS)
180 |
181 |
182 | initializer = tf.global_variables_initializer()
183 | vars_ = {}
184 | for var in tf.global_variables():
185 | if "word_embedding" in var.name: continue
186 | # if not var.name.startswith("Model"): continue
187 | vars_[var.name.split(":")[0]] = var
188 | saver = tf.train.Saver(vars_)
189 |
190 | sess = tf.Session()
191 | sess.run(initializer)
192 | if has_pre_trained_model:
193 | print("Restoring model from " + best_path)
194 | saver.restore(sess, best_path)
195 | print("DONE!")
196 |
197 | # training
198 | train(sess, saver, train_graph, valid_graph, trainDataStream, devDataStream, FLAGS, best_path)
199 |
200 | def enrich_options(options):
201 |     if "in_format" not in options.__dict__:
202 | options.__dict__["in_format"] = 'tsv'
203 |
204 | return options
205 |
206 | if __name__ == '__main__':
207 | parser = argparse.ArgumentParser()
208 | parser.add_argument('--train_path', type=str, help='Path to the train set.')
209 | parser.add_argument('--dev_path', type=str, help='Path to the dev set.')
210 | parser.add_argument('--test_path', type=str, help='Path to the test set.')
211 | parser.add_argument('--word_vec_path', type=str, help='Path the to pre-trained word vector model.')
212 | parser.add_argument('--model_dir', type=str, help='Directory to save model files.')
213 | parser.add_argument('--batch_size', type=int, default=60, help='Number of instances in each batch.')
214 | parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate.')
215 | parser.add_argument('--lambda_l2', type=float, default=0.0, help='The coefficient of L2 regularizer.')
216 | parser.add_argument('--dropout_rate', type=float, default=0.1, help='Dropout ratio.')
217 | parser.add_argument('--max_epochs', type=int, default=10, help='Maximum epochs for training.')
218 | parser.add_argument('--optimize_type', type=str, default='adam', help='Optimizer type.')
219 | parser.add_argument('--char_emb_dim', type=int, default=20, help='Number of dimension for character embeddings.')
220 | parser.add_argument('--char_lstm_dim', type=int, default=100, help='Number of dimension for character-composed embeddings.')
221 | parser.add_argument('--context_lstm_dim', type=int, default=100, help='Number of dimension for context representation layer.')
222 | parser.add_argument('--aggregation_lstm_dim', type=int, default=100, help='Number of dimension for aggregation layer.')
223 | parser.add_argument('--max_char_per_word', type=int, default=10, help='Maximum number of characters for each word.')
224 | parser.add_argument('--max_sent_length', type=int, default=100, help='Maximum number of words within each sentence.')
225 | parser.add_argument('--aggregation_layer_num', type=int, default=1, help='Number of LSTM layers for aggregation layer.')
226 | parser.add_argument('--context_layer_num', type=int, default=1, help='Number of LSTM layers for context representation layer.')
227 | parser.add_argument('--highway_layer_num', type=int, default=1, help='Number of highway layers.')
228 | parser.add_argument('--suffix', type=str, default='normal', help='Suffix of the model name.')
229 | parser.add_argument('--fix_word_vec', default=False, help='Fix pre-trained word embeddings during training.', action='store_true')
230 | parser.add_argument('--with_highway', default=False, help='Utilize highway layers.', action='store_true')
231 | parser.add_argument('--with_match_highway', default=False, help='Utilize highway layers for matching layer.', action='store_true')
232 | parser.add_argument('--with_aggregation_highway', default=False, help='Utilize highway layers for aggregation layer.', action='store_true')
233 | parser.add_argument('--with_full_match', default=False, help='With full matching.', action='store_true')
234 | parser.add_argument('--with_maxpool_match', default=False, help='With maxpooling matching', action='store_true')
235 | parser.add_argument('--with_attentive_match', default=False, help='With attentive matching', action='store_true')
236 | parser.add_argument('--with_max_attentive_match', default=False, help='With max attentive matching.', action='store_true')
237 | parser.add_argument('--with_char', default=False, help='With character-composed embeddings.', action='store_true')
238 |
239 | parser.add_argument('--config_path', type=str, help='Configuration file.')
240 |
241 | # print("CUDA_VISIBLE_DEVICES " + os.environ['CUDA_VISIBLE_DEVICES'])
242 | args, unparsed = parser.parse_known_args()
243 | if args.config_path is not None:
244 | print('Loading the configuration from ' + args.config_path)
245 | FLAGS = namespace_utils.load_namespace(args.config_path)
246 | else:
247 | FLAGS = args
248 | sys.stdout.flush()
249 |
250 |     # enrich arguments for backwards compatibility
251 | FLAGS = enrich_options(FLAGS)
252 |
253 | main(FLAGS)
254 |
255 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/__init__.py
--------------------------------------------------------------------------------
/src/layer_utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.ops import nn_ops
3 |
4 | def my_lstm_layer(input_reps, lstm_dim, input_lengths=None, scope_name=None, reuse=False, is_training=True,
5 | dropout_rate=0.2, use_cudnn=True):
6 | '''
7 |     :param input_reps: [batch_size, seq_len, feature_dim]
8 | :param lstm_dim:
9 | :param scope_name:
10 | :param reuse:
11 | :param is_training:
12 | :param dropout_rate:
13 | :return:
14 | '''
15 | input_reps = dropout_layer(input_reps, dropout_rate, is_training=is_training)
16 | with tf.variable_scope(scope_name, reuse=reuse):
17 | if use_cudnn:
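            # NOTE: input_lengths is not passed to CudnnLSTM; this branch runs over the
            # full padded length, and only the non-cudnn branch below honors lengths.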
18 | inputs = tf.transpose(input_reps, [1, 0, 2])
19 | lstm = tf.contrib.cudnn_rnn.CudnnLSTM(1, lstm_dim, direction="bidirectional",
20 | name="{}_cudnn_bi_lstm".format(scope_name), dropout=dropout_rate if is_training else 0)
21 | outputs, _ = lstm(inputs)
22 | outputs = tf.transpose(outputs, [1, 0, 2])
23 | f_rep = outputs[:, :, 0:lstm_dim]
24 | b_rep = outputs[:, :, lstm_dim:2*lstm_dim]
25 | else:
26 | context_lstm_cell_fw = tf.nn.rnn_cell.BasicLSTMCell(lstm_dim)
27 | context_lstm_cell_bw = tf.nn.rnn_cell.BasicLSTMCell(lstm_dim)
28 | if is_training:
29 | context_lstm_cell_fw = tf.nn.rnn_cell.DropoutWrapper(context_lstm_cell_fw, output_keep_prob=(1 - dropout_rate))
30 | context_lstm_cell_bw = tf.nn.rnn_cell.DropoutWrapper(context_lstm_cell_bw, output_keep_prob=(1 - dropout_rate))
31 | context_lstm_cell_fw = tf.nn.rnn_cell.MultiRNNCell([context_lstm_cell_fw])
32 | context_lstm_cell_bw = tf.nn.rnn_cell.MultiRNNCell([context_lstm_cell_bw])
33 |
34 | (f_rep, b_rep), _ = tf.nn.bidirectional_dynamic_rnn(
35 | context_lstm_cell_fw, context_lstm_cell_bw, input_reps, dtype=tf.float32,
36 | sequence_length=input_lengths) # [batch_size, question_len, context_lstm_dim]
37 | outputs = tf.concat(axis=2, values=[f_rep, b_rep])
38 | return (f_rep,b_rep, outputs)
39 |
40 | def dropout_layer(input_reps, dropout_rate, is_training=True):
41 | if is_training:
42 | output_repr = tf.nn.dropout(input_reps, (1 - dropout_rate))
43 | else:
44 | output_repr = input_reps
45 | return output_repr
46 |
47 | def cosine_distance(y1,y2, cosine_norm=True, eps=1e-6):
48 | # cosine_norm = True
49 | # y1 [....,a, 1, d]
50 | # y2 [....,1, b, d]
51 | cosine_numerator = tf.reduce_sum(tf.multiply(y1, y2), axis=-1)
52 | if not cosine_norm:
53 | return tf.tanh(cosine_numerator)
54 | y1_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y1), axis=-1), eps))
55 | y2_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y2), axis=-1), eps))
56 | return cosine_numerator / y1_norm / y2_norm
57 |
58 | def euclidean_distance(y1, y2, eps=1e-6):
59 | distance = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y1 - y2), axis=-1), eps))
60 | return distance
61 |
62 | def cross_entropy(logits, truth, mask=None):
63 | # logits: [batch_size, passage_len]
64 | # truth: [batch_size, passage_len]
65 | # mask: [batch_size, passage_len]
66 | if mask is not None: logits = tf.multiply(logits, mask)
67 | xdev = tf.subtract(logits, tf.expand_dims(tf.reduce_max(logits, 1), -1))
68 | log_predictions = tf.subtract(xdev, tf.expand_dims(tf.log(tf.reduce_sum(tf.exp(xdev),-1)),-1))
69 | result = tf.multiply(truth, log_predictions) # [batch_size, passage_len]
70 | if mask is not None: result = tf.multiply(result, mask) # [batch_size, passage_len]
71 | return tf.multiply(-1.0,tf.reduce_sum(result, -1)) # [batch_size]
72 |
73 | def projection_layer(in_val, input_size, output_size, activation_func=tf.tanh, scope=None):
74 | # in_val: [batch_size, passage_len, dim]
75 | input_shape = tf.shape(in_val)
76 | batch_size = input_shape[0]
77 | passage_len = input_shape[1]
78 | # feat_dim = input_shape[2]
79 | in_val = tf.reshape(in_val, [batch_size * passage_len, input_size])
80 | with tf.variable_scope(scope or "projection_layer"):
81 | full_w = tf.get_variable("full_w", [input_size, output_size], dtype=tf.float32)
82 | full_b = tf.get_variable("full_b", [output_size], dtype=tf.float32)
83 | outputs = activation_func(tf.nn.xw_plus_b(in_val, full_w, full_b))
84 | outputs = tf.reshape(outputs, [batch_size, passage_len, output_size])
85 | return outputs # [batch_size, passage_len, output_size]
86 |
87 | def highway_layer(in_val, output_size, activation_func=tf.tanh, scope=None):
88 | # in_val: [batch_size, passage_len, dim]
89 | input_shape = tf.shape(in_val)
90 | batch_size = input_shape[0]
91 | passage_len = input_shape[1]
92 | # feat_dim = input_shape[2]
93 | in_val = tf.reshape(in_val, [batch_size * passage_len, output_size])
94 | with tf.variable_scope(scope or "highway_layer"):
95 | highway_w = tf.get_variable("highway_w", [output_size, output_size], dtype=tf.float32)
96 | highway_b = tf.get_variable("highway_b", [output_size], dtype=tf.float32)
97 | full_w = tf.get_variable("full_w", [output_size, output_size], dtype=tf.float32)
98 | full_b = tf.get_variable("full_b", [output_size], dtype=tf.float32)
99 | trans = activation_func(tf.nn.xw_plus_b(in_val, full_w, full_b))
100 | gate = tf.nn.sigmoid(tf.nn.xw_plus_b(in_val, highway_w, highway_b))
101 | outputs = tf.add(tf.multiply(trans, gate), tf.multiply(in_val, tf.subtract(1.0, gate)), "y")
102 | outputs = tf.reshape(outputs, [batch_size, passage_len, output_size])
103 | return outputs
104 |
105 | def multi_highway_layer(in_val, output_size, num_layers, activation_func=tf.tanh, scope_name=None, reuse=False):
106 | with tf.variable_scope(scope_name, reuse=reuse):
107 | for i in xrange(num_layers):
108 | cur_scope_name = scope_name + "-{}".format(i)
109 | in_val = highway_layer(in_val, output_size,activation_func=activation_func, scope=cur_scope_name)
110 | return in_val
111 |
112 | def collect_representation(representation, positions):
113 | # representation: [batch_size, node_num, feature_dim]
114 | # positions: [batch_size, neigh_num]
115 | return collect_probs(representation, positions)
116 |
117 | def collect_final_step_of_lstm(lstm_representation, lengths):
118 | # lstm_representation: [batch_size, passage_length, dim]
119 | # lengths: [batch_size]
120 | lengths = tf.maximum(lengths, tf.zeros_like(lengths, dtype=tf.int32))
121 |
122 | batch_size = tf.shape(lengths)[0]
123 | batch_nums = tf.range(0, limit=batch_size) # shape (batch_size)
124 | indices = tf.stack((batch_nums, lengths), axis=1) # shape (batch_size, 2)
125 | result = tf.gather_nd(lstm_representation, indices, name='last-forward-lstm')
126 | return result # [batch_size, dim]
127 |
128 | def collect_probs(probs, positions):
129 | # probs [batch_size, chunks_size]
130 | # positions [batch_size, pair_size]
131 | batch_size = tf.shape(probs)[0]
132 | pair_size = tf.shape(positions)[1]
133 | batch_nums = tf.range(0, limit=batch_size) # shape (batch_size)
134 | batch_nums = tf.reshape(batch_nums, shape=[-1, 1]) # [batch_size, 1]
135 | batch_nums = tf.tile(batch_nums, multiples=[1, pair_size]) # [batch_size, pair_size]
136 |
137 | indices = tf.stack((batch_nums, positions), axis=2) # shape (batch_size, pair_size, 2)
138 | pair_probs = tf.gather_nd(probs, indices)
139 | # pair_probs = tf.reshape(pair_probs, shape=[batch_size, pair_size])
140 | return pair_probs
141 |
142 |
143 | def calcuate_attention(in_value_1, in_value_2, feature_dim1, feature_dim2, scope_name='att',
144 | att_type='symmetric', att_dim=20, remove_diagnoal=False, mask1=None, mask2=None, is_training=False, dropout_rate=0.2):
145 | input_shape = tf.shape(in_value_1)
146 | batch_size = input_shape[0]
147 | len_1 = input_shape[1]
148 | len_2 = tf.shape(in_value_2)[1]
149 |
150 | in_value_1 = dropout_layer(in_value_1, dropout_rate, is_training=is_training)
151 | in_value_2 = dropout_layer(in_value_2, dropout_rate, is_training=is_training)
152 | with tf.variable_scope(scope_name):
153 | # calculate attention ==> a: [batch_size, len_1, len_2]
154 | atten_w1 = tf.get_variable("atten_w1", [feature_dim1, att_dim], dtype=tf.float32)
155 | if feature_dim1 == feature_dim2: atten_w2 = atten_w1
156 | else: atten_w2 = tf.get_variable("atten_w2", [feature_dim2, att_dim], dtype=tf.float32)
157 | atten_value_1 = tf.matmul(tf.reshape(in_value_1, [batch_size * len_1, feature_dim1]), atten_w1) # [batch_size*len_1, feature_dim]
158 | atten_value_1 = tf.reshape(atten_value_1, [batch_size, len_1, att_dim])
159 | atten_value_2 = tf.matmul(tf.reshape(in_value_2, [batch_size * len_2, feature_dim2]), atten_w2) # [batch_size*len_2, feature_dim]
160 | atten_value_2 = tf.reshape(atten_value_2, [batch_size, len_2, att_dim])
161 |
162 |
163 | if att_type == 'additive':
164 | atten_b = tf.get_variable("atten_b", [att_dim], dtype=tf.float32)
165 | atten_v = tf.get_variable("atten_v", [1, att_dim], dtype=tf.float32)
166 | atten_value_1 = tf.expand_dims(atten_value_1, axis=2, name="atten_value_1") # [batch_size, len_1, 'x', att_dim]
167 | atten_value_2 = tf.expand_dims(atten_value_2, axis=1, name="atten_value_2") # [batch_size, 'x', len_2, att_dim]
168 | atten_value = atten_value_1 + atten_value_2 # + tf.expand_dims(tf.expand_dims(tf.expand_dims(atten_b, axis=0), axis=0), axis=0)
169 | atten_value = nn_ops.bias_add(atten_value, atten_b)
170 | atten_value = tf.tanh(atten_value) # [batch_size, len_1, len_2, att_dim]
171 | atten_value = tf.reshape(atten_value, [-1, att_dim]) * atten_v # [batch_size*len_1*len_2, att_dim]
172 | atten_value = tf.reduce_sum(atten_value, axis=-1)
173 | atten_value = tf.reshape(atten_value, [batch_size, len_1, len_2])
174 | else:
175 | atten_value_1 = tf.tanh(atten_value_1)
176 | # atten_value_1 = tf.nn.relu(atten_value_1)
177 | atten_value_2 = tf.tanh(atten_value_2)
178 | # atten_value_2 = tf.nn.relu(atten_value_2)
179 | diagnoal_params = tf.get_variable("diagnoal_params", [1, 1, att_dim], dtype=tf.float32)
180 | atten_value_1 = atten_value_1 * diagnoal_params
181 | atten_value = tf.matmul(atten_value_1, atten_value_2, transpose_b=True) # [batch_size, len_1, len_2]
182 |
183 | # normalize
184 | if remove_diagnoal:
185 | diagnoal = tf.ones([len_1], tf.float32) # [len1]
186 | diagnoal = 1.0 - tf.diag(diagnoal) # [len1, len1]
187 | diagnoal = tf.expand_dims(diagnoal, axis=0) # ['x', len1, len1]
188 | atten_value = atten_value * diagnoal
189 | if mask1 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask1, axis=-1))
190 | if mask2 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask2, axis=1))
191 | atten_value = tf.nn.softmax(atten_value, name='atten_value') # [batch_size, len_1, len_2]
192 | if remove_diagnoal: atten_value = atten_value * diagnoal
193 | if mask1 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask1, axis=-1))
194 | if mask2 is not None: atten_value = tf.multiply(atten_value, tf.expand_dims(mask2, axis=1))
195 |
196 | return atten_value
197 |
198 | def weighted_sum(atten_scores, in_values):
199 | '''
200 |
201 | :param atten_scores: # [batch_size, len1, len2]
202 | :param in_values: [batch_size, len2, dim]
203 | :return:
204 | '''
205 | return tf.matmul(atten_scores, in_values)
206 |
207 | def cal_relevancy_matrix(in_question_repres, in_passage_repres):
208 | in_question_repres_tmp = tf.expand_dims(in_question_repres, 1) # [batch_size, 1, question_len, dim]
209 | in_passage_repres_tmp = tf.expand_dims(in_passage_repres, 2) # [batch_size, passage_len, 1, dim]
210 | relevancy_matrix = cosine_distance(in_question_repres_tmp,in_passage_repres_tmp) # [batch_size, passage_len, question_len]
211 | return relevancy_matrix
212 |
213 | def mask_relevancy_matrix(relevancy_matrix, question_mask, passage_mask):
214 | # relevancy_matrix: [batch_size, passage_len, question_len]
215 | # question_mask: [batch_size, question_len]
216 | # passage_mask: [batch_size, passsage_len]
217 | if question_mask is not None:
218 | relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(question_mask, 1))
219 | relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(passage_mask, 2))
220 | return relevancy_matrix
221 |
222 | def compute_gradients(tensor, var_list):
223 | grads = tf.gradients(tensor, var_list)
224 | return [grad if grad is not None else tf.zeros_like(var) for var, grad in zip(var_list, grads)]
225 |
--------------------------------------------------------------------------------
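
The cosine_distance helper above is written to broadcast: given y1 of shape [..., a, 1, d] and y2 of shape [..., 1, b, d] it returns the full a-by-b similarity map, which is exactly how cal_relevancy_matrix produces its [batch_size, passage_len, question_len] output. A minimal sketch, assuming TensorFlow 1.x (the same API family as the tf.variable_scope and tf.contrib calls above):

    import tensorflow as tf
    import layer_utils

    question = tf.placeholder(tf.float32, [None, None, 100])  # [batch, q_len, dim]
    passage = tf.placeholder(tf.float32, [None, None, 100])   # [batch, p_len, dim]

    # expand so the two length axes broadcast against each other
    q = tf.expand_dims(question, 1)  # [batch, 1, q_len, dim]
    p = tf.expand_dims(passage, 2)   # [batch, p_len, 1, dim]
    relevancy = layer_utils.cosine_distance(q, p)  # [batch, p_len, q_len]
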
/src/match_utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import layer_utils
3 |
4 | eps = 1e-6
5 | def cosine_distance(y1,y2):
6 | # y1 [....,a, 1, d]
7 | # y2 [....,1, b, d]
8 | cosine_numerator = tf.reduce_sum(tf.multiply(y1, y2), axis=-1)
9 | y1_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y1), axis=-1), eps))
10 | y2_norm = tf.sqrt(tf.maximum(tf.reduce_sum(tf.square(y2), axis=-1), eps))
11 | return cosine_numerator / y1_norm / y2_norm
12 |
13 | def cal_relevancy_matrix(in_question_repres, in_passage_repres):
14 | in_question_repres_tmp = tf.expand_dims(in_question_repres, 1) # [batch_size, 1, question_len, dim]
15 | in_passage_repres_tmp = tf.expand_dims(in_passage_repres, 2) # [batch_size, passage_len, 1, dim]
16 | relevancy_matrix = cosine_distance(in_question_repres_tmp,in_passage_repres_tmp) # [batch_size, passage_len, question_len]
17 | return relevancy_matrix
18 |
19 | def mask_relevancy_matrix(relevancy_matrix, question_mask, passage_mask):
20 | # relevancy_matrix: [batch_size, passage_len, question_len]
21 | # question_mask: [batch_size, question_len]
22 | # passage_mask: [batch_size, passsage_len]
23 | relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(question_mask, 1))
24 | relevancy_matrix = tf.multiply(relevancy_matrix, tf.expand_dims(passage_mask, 2))
25 | return relevancy_matrix
26 |
27 | def multi_perspective_expand_for_3D(in_tensor, decompose_params):
28 | in_tensor = tf.expand_dims(in_tensor, axis=2) #[batch_size, passage_len, 'x', dim]
29 | decompose_params = tf.expand_dims(tf.expand_dims(decompose_params, axis=0), axis=0) # [1, 1, decompose_dim, dim]
30 | return tf.multiply(in_tensor, decompose_params)#[batch_size, passage_len, decompose_dim, dim]
31 |
32 | def multi_perspective_expand_for_2D(in_tensor, decompose_params):
33 | in_tensor = tf.expand_dims(in_tensor, axis=1) #[batch_size, 'x', dim]
34 | decompose_params = tf.expand_dims(decompose_params, axis=0) # [1, decompose_dim, dim]
35 | return tf.multiply(in_tensor, decompose_params) # [batch_size, decompose_dim, dim]
36 |
37 |
38 | def cal_maxpooling_matching(passage_rep, question_rep, decompose_params):
39 | # passage_representation: [batch_size, passage_len, dim]
40 | # question_representation: [batch_size, question_len, dim]
41 | # decompose_params: [decompose_dim, dim]
42 |
43 | def singel_instance(x):
44 | p = x[0]
45 | q = x[1]
46 | # p: [passage_len, dim], q: [question_len, dim]
47 | p = multi_perspective_expand_for_2D(p, decompose_params) # [passage_len, decompose_dim, dim]
48 | q = multi_perspective_expand_for_2D(q, decompose_params) # [question_len, decompose_dim, dim]
49 | p = tf.expand_dims(p, 1) # [passage_len, 1, decompose_dim, dim]
50 | q = tf.expand_dims(q, 0) # [1, question_len, decompose_dim, dim]
51 | return cosine_distance(p, q) # [passage_len, question_len, decompose_dim]
52 | elems = (passage_rep, question_rep)
53 | matching_matrix = tf.map_fn(singel_instance, elems, dtype=tf.float32) # [batch_size, passage_len, question_len, decompose_dim]
54 | return tf.concat(axis=2, values=[tf.reduce_max(matching_matrix, axis=2), tf.reduce_mean(matching_matrix, axis=2)])# [batch_size, passage_len, 2*decompose_dim]
55 |
56 | def cross_entropy(logits, truth, mask):
57 | # logits: [batch_size, passage_len]
58 | # truth: [batch_size, passage_len]
59 | # mask: [batch_size, passage_len]
60 |
61 | # xdev = x - x.max()
62 | # return xdev - T.log(T.sum(T.exp(xdev)))
63 | logits = tf.multiply(logits, mask)
64 | xdev = tf.subtract(logits, tf.expand_dims(tf.reduce_max(logits, 1), -1))
65 | log_predictions = tf.subtract(xdev, tf.expand_dims(tf.log(tf.reduce_sum(tf.exp(xdev),-1)),-1))
66 | # return -T.sum(targets * log_predictions)
67 | result = tf.multiply(tf.multiply(truth, log_predictions), mask) # [batch_size, passage_len]
68 | return tf.multiply(-1.0,tf.reduce_sum(result, -1)) # [batch_size]
69 |
70 | def highway_layer(in_val, output_size, scope=None):
71 | # in_val: [batch_size, passage_len, dim]
72 | input_shape = tf.shape(in_val)
73 | batch_size = input_shape[0]
74 | passage_len = input_shape[1]
75 | # feat_dim = input_shape[2]
76 | in_val = tf.reshape(in_val, [batch_size * passage_len, output_size])
77 | with tf.variable_scope(scope or "highway_layer"):
78 | highway_w = tf.get_variable("highway_w", [output_size, output_size], dtype=tf.float32)
79 | highway_b = tf.get_variable("highway_b", [output_size], dtype=tf.float32)
80 | full_w = tf.get_variable("full_w", [output_size, output_size], dtype=tf.float32)
81 | full_b = tf.get_variable("full_b", [output_size], dtype=tf.float32)
82 | trans = tf.nn.tanh(tf.nn.xw_plus_b(in_val, full_w, full_b))
83 | gate = tf.nn.sigmoid(tf.nn.xw_plus_b(in_val, highway_w, highway_b))
84 | outputs = trans*gate + in_val* (1.0- gate)
85 | outputs = tf.reshape(outputs, [batch_size, passage_len, output_size])
86 | return outputs
87 |
88 | def multi_highway_layer(in_val, output_size, num_layers, scope=None):
89 | scope_name = 'highway_layer'
90 | if scope is not None: scope_name = scope
91 | for i in xrange(num_layers):
92 | cur_scope_name = scope_name + "-{}".format(i)
93 | in_val = highway_layer(in_val, output_size, scope=cur_scope_name)
94 | return in_val
95 |
96 | def cal_max_question_representation(question_representation, atten_scores):
97 | atten_positions = tf.argmax(atten_scores, axis=2, output_type=tf.int32) # [batch_size, passage_len]
98 | max_question_reps = layer_utils.collect_representation(question_representation, atten_positions)
99 | return max_question_reps
100 |
101 | def multi_perspective_match(feature_dim, repres1, repres2, is_training=True, dropout_rate=0.2,
102 | options=None, scope_name='mp-match', reuse=False):
103 | '''
104 | :param repres1: [batch_size, len, feature_dim]
105 | :param repres2: [batch_size, len, feature_dim]
106 | :return: (matching_result [batch_size, len, match_dim], match_dim)
107 | '''
108 | input_shape = tf.shape(repres1)
109 | batch_size = input_shape[0]
110 | seq_length = input_shape[1]
111 | matching_result = []
112 | with tf.variable_scope(scope_name, reuse=reuse):
113 | match_dim = 0
114 | if options.with_cosine:
115 | cosine_value = layer_utils.cosine_distance(repres1, repres2, cosine_norm=False)
116 | cosine_value = tf.reshape(cosine_value, [batch_size, seq_length, 1])
117 | matching_result.append(cosine_value)
118 | match_dim += 1
119 |
120 | if options.with_mp_cosine:
121 | mp_cosine_params = tf.get_variable("mp_cosine", shape=[options.cosine_MP_dim, feature_dim], dtype=tf.float32)
122 | mp_cosine_params = tf.expand_dims(mp_cosine_params, axis=0)
123 | mp_cosine_params = tf.expand_dims(mp_cosine_params, axis=0)
124 | repres1_flat = tf.expand_dims(repres1, axis=2)
125 | repres2_flat = tf.expand_dims(repres2, axis=2)
126 | mp_cosine_matching = layer_utils.cosine_distance(tf.multiply(repres1_flat, mp_cosine_params),
127 | repres2_flat,cosine_norm=False)
128 | matching_result.append(mp_cosine_matching)
129 | match_dim += options.cosine_MP_dim
130 |
131 | matching_result = tf.concat(axis=2, values=matching_result)
132 | return (matching_result, match_dim)
133 |
134 |
135 | def match_passage_with_question(passage_reps, question_reps, passage_mask, question_mask, passage_lengths, question_lengths,
136 | context_lstm_dim, scope=None,
137 | with_full_match=True, with_maxpool_match=True, with_attentive_match=True, with_max_attentive_match=True,
138 | is_training=True, options=None, dropout_rate=0, forward=True):
139 | passage_reps = tf.multiply(passage_reps, tf.expand_dims(passage_mask,-1))
140 | question_reps = tf.multiply(question_reps, tf.expand_dims(question_mask,-1))
141 | all_question_aware_representatins = []
142 | dim = 0
143 | with tf.variable_scope(scope or "match_passage_with_question"):
144 | relevancy_matrix = cal_relevancy_matrix(question_reps, passage_reps)
145 | relevancy_matrix = mask_relevancy_matrix(relevancy_matrix, question_mask, passage_mask)
146 | # relevancy_matrix = layer_utils.calcuate_attention(passage_reps, question_reps, context_lstm_dim, context_lstm_dim,
147 | # scope_name="fw_attention", att_type=options.att_type, att_dim=options.att_dim,
148 | # remove_diagnoal=False, mask1=passage_mask, mask2=question_mask, is_training=is_training, dropout_rate=dropout_rate)
149 |
150 | all_question_aware_representatins.append(tf.reduce_max(relevancy_matrix, axis=2,keep_dims=True))
151 | all_question_aware_representatins.append(tf.reduce_mean(relevancy_matrix, axis=2,keep_dims=True))
152 | dim += 2
153 | if with_full_match:
154 | if forward:
155 | question_full_rep = layer_utils.collect_final_step_of_lstm(question_reps, question_lengths - 1)
156 | else:
157 | question_full_rep = question_reps[:,0,:]
158 |
159 | passage_len = tf.shape(passage_reps)[1]
160 | question_full_rep = tf.expand_dims(question_full_rep, axis=1)
161 | question_full_rep = tf.tile(question_full_rep, [1, passage_len, 1]) # [batch_size, passage_len, feature_dim]
162 |
163 | (attentive_rep, match_dim) = multi_perspective_match(context_lstm_dim,
164 | passage_reps, question_full_rep, is_training=is_training, dropout_rate=options.dropout_rate,
165 | options=options, scope_name='mp-match-full-match')
166 | all_question_aware_representatins.append(attentive_rep)
167 | dim += match_dim
168 |
169 | if with_maxpool_match:
170 | maxpooling_decomp_params = tf.get_variable("maxpooling_matching_decomp",
171 | shape=[options.cosine_MP_dim, context_lstm_dim], dtype=tf.float32)
172 | maxpooling_rep = cal_maxpooling_matching(passage_reps, question_reps, maxpooling_decomp_params)
173 | all_question_aware_representatins.append(maxpooling_rep)
174 | dim += 2*options.cosine_MP_dim
175 |
176 | if with_attentive_match:
177 | atten_scores = layer_utils.calcuate_attention(passage_reps, question_reps, context_lstm_dim, context_lstm_dim,
178 | scope_name="attention", att_type=options.att_type, att_dim=options.att_dim,
179 | remove_diagnoal=False, mask1=passage_mask, mask2=question_mask, is_training=is_training, dropout_rate=dropout_rate)
180 | att_question_contexts = tf.matmul(atten_scores, question_reps)
181 | (attentive_rep, match_dim) = multi_perspective_match(context_lstm_dim,
182 | passage_reps, att_question_contexts, is_training=is_training, dropout_rate=options.dropout_rate,
183 | options=options, scope_name='mp-match-att_question')
184 | all_question_aware_representatins.append(attentive_rep)
185 | dim += match_dim
186 |
187 | if with_max_attentive_match:
188 | max_att = cal_max_question_representation(question_reps, relevancy_matrix)
189 | (max_attentive_rep, match_dim) = multi_perspective_match(context_lstm_dim,
190 | passage_reps, max_att, is_training=is_training, dropout_rate=options.dropout_rate,
191 | options=options, scope_name='mp-match-max-att')
192 | all_question_aware_representatins.append(max_attentive_rep)
193 | dim += match_dim
194 |
195 | all_question_aware_representatins = tf.concat(axis=2, values=all_question_aware_representatins)
196 | return (all_question_aware_representatins, dim)
197 |
198 | def bilateral_match_func(in_question_repres, in_passage_repres,
199 | question_lengths, passage_lengths, question_mask, passage_mask, input_dim, is_training, options=None):
200 |
201 | question_aware_representatins = []
202 | question_aware_dim = 0
203 | passage_aware_representatins = []
204 | passage_aware_dim = 0
205 |
206 | # ====word level matching======
207 | (match_reps, match_dim) = match_passage_with_question(in_passage_repres, in_question_repres, passage_mask, question_mask, passage_lengths,
208 | question_lengths, input_dim, scope="word_match_forward",
209 | with_full_match=False, with_maxpool_match=options.with_maxpool_match,
210 | with_attentive_match=options.with_attentive_match,
211 | with_max_attentive_match=options.with_max_attentive_match,
212 | is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True)
213 | question_aware_representatins.append(match_reps)
214 | question_aware_dim += match_dim
215 |
216 | (match_reps, match_dim) = match_passage_with_question(in_question_repres, in_passage_repres, question_mask, passage_mask, question_lengths,
217 | passage_lengths, input_dim, scope="word_match_backward",
218 | with_full_match=False, with_maxpool_match=options.with_maxpool_match,
219 | with_attentive_match=options.with_attentive_match,
220 | with_max_attentive_match=options.with_max_attentive_match,
221 | is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False)
222 | passage_aware_representatins.append(match_reps)
223 | passage_aware_dim += match_dim
224 |
225 | with tf.variable_scope('context_MP_matching'):
226 | for i in xrange(options.context_layer_num): # support multiple context layer
227 | with tf.variable_scope('layer-{}'.format(i)):
228 | # contextual lstm for both passage and question
229 | in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
230 | in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(passage_mask, axis=-1))
231 | (question_context_representation_fw, question_context_representation_bw,
232 | in_question_repres) = layer_utils.my_lstm_layer(
233 | in_question_repres, options.context_lstm_dim, input_lengths= question_lengths,scope_name="context_represent",
234 | reuse=False, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
235 | (passage_context_representation_fw, passage_context_representation_bw,
236 | in_passage_repres) = layer_utils.my_lstm_layer(
237 | in_passage_repres, options.context_lstm_dim, input_lengths=passage_lengths, scope_name="context_represent",
238 | reuse=True, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
239 |
240 | # Multi-perspective matching
241 | with tf.variable_scope('left_MP_matching'):
242 | (match_reps, match_dim) = match_passage_with_question(passage_context_representation_fw,
243 | question_context_representation_fw, passage_mask, question_mask, passage_lengths,
244 | question_lengths, options.context_lstm_dim, scope="forward_match",
245 | with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match,
246 | with_attentive_match=options.with_attentive_match,
247 | with_max_attentive_match=options.with_max_attentive_match,
248 | is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True)
249 | question_aware_representatins.append(match_reps)
250 | question_aware_dim += match_dim
251 | (match_reps, match_dim) = match_passage_with_question(passage_context_representation_bw,
252 | question_context_representation_bw, passage_mask, question_mask, passage_lengths,
253 | question_lengths, options.context_lstm_dim, scope="backward_match",
254 | with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match,
255 | with_attentive_match=options.with_attentive_match,
256 | with_max_attentive_match=options.with_max_attentive_match,
257 | is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False)
258 | question_aware_representatins.append(match_reps)
259 | question_aware_dim += match_dim
260 |
261 | with tf.variable_scope('right_MP_matching'):
262 | (match_reps, match_dim) = match_passage_with_question(question_context_representation_fw,
263 | passage_context_representation_fw, question_mask, passage_mask, question_lengths,
264 | passage_lengths, options.context_lstm_dim, scope="forward_match",
265 | with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match,
266 | with_attentive_match=options.with_attentive_match,
267 | with_max_attentive_match=options.with_max_attentive_match,
268 | is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True)
269 | passage_aware_representatins.append(match_reps)
270 | passage_aware_dim += match_dim
271 | (match_reps, match_dim) = match_passage_with_question(question_context_representation_bw,
272 | passage_context_representation_bw, question_mask, passage_mask, question_lengths,
273 | passage_lengths, options.context_lstm_dim, scope="backward_match",
274 | with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match,
275 | with_attentive_match=options.with_attentive_match,
276 | with_max_attentive_match=options.with_max_attentive_match,
277 | is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False)
278 | passage_aware_representatins.append(match_reps)
279 | passage_aware_dim += match_dim
280 |
281 | question_aware_representatins = tf.concat(axis=2, values=question_aware_representatins) # [batch_size, passage_len, question_aware_dim]
282 | passage_aware_representatins = tf.concat(axis=2, values=passage_aware_representatins) # [batch_size, question_len, passage_aware_dim]
283 |
284 | if is_training:
285 | question_aware_representatins = tf.nn.dropout(question_aware_representatins, (1 - options.dropout_rate))
286 | passage_aware_representatins = tf.nn.dropout(passage_aware_representatins, (1 - options.dropout_rate))
287 |
288 | # ======Highway layer======
289 | if options.with_match_highway:
290 | with tf.variable_scope("left_matching_highway"):
291 | question_aware_representatins = multi_highway_layer(question_aware_representatins, question_aware_dim,
292 | options.highway_layer_num)
293 | with tf.variable_scope("right_matching_highway"):
294 | passage_aware_representatins = multi_highway_layer(passage_aware_representatins, passage_aware_dim,
295 | options.highway_layer_num)
296 |
297 | #========Aggregation Layer======
298 | aggregation_representation = []
299 | aggregation_dim = 0
300 |
301 | qa_aggregation_input = question_aware_representatins
302 | pa_aggregation_input = passage_aware_representatins
303 | with tf.variable_scope('aggregation_layer'):
304 | for i in xrange(options.aggregation_layer_num): # support multiple aggregation layer
305 | qa_aggregation_input = tf.multiply(qa_aggregation_input, tf.expand_dims(passage_mask, axis=-1))
306 | (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
307 | qa_aggregation_input, options.aggregation_lstm_dim, input_lengths=passage_lengths, scope_name='left_layer-{}'.format(i),
308 | reuse=False, is_training=is_training, dropout_rate=options.dropout_rate,use_cudnn=options.use_cudnn)
309 | fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, passage_lengths - 1)
310 | bw_rep = bw_rep[:, 0, :]
311 | aggregation_representation.append(fw_rep)
312 | aggregation_representation.append(bw_rep)
313 | aggregation_dim += 2* options.aggregation_lstm_dim
314 | qa_aggregation_input = cur_aggregation_representation# [batch_size, passage_len, 2*aggregation_lstm_dim]
315 |
316 | pa_aggregation_input = tf.multiply(pa_aggregation_input, tf.expand_dims(question_mask, axis=-1))
317 | (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
318 | pa_aggregation_input, options.aggregation_lstm_dim,
319 | input_lengths=question_lengths, scope_name='right_layer-{}'.format(i),
320 | reuse=False, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
321 | fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, question_lengths - 1)
322 | bw_rep = bw_rep[:, 0, :]
323 | aggregation_representation.append(fw_rep)
324 | aggregation_representation.append(bw_rep)
325 | aggregation_dim += 2* options.aggregation_lstm_dim
326 | pa_aggregation_input = cur_aggregation_representation# [batch_size, question_len, 2*aggregation_lstm_dim]
327 |
328 | aggregation_representation = tf.concat(axis=1, values=aggregation_representation) # [batch_size, aggregation_dim]
329 |
330 | # ======Highway layer======
331 | if options.with_aggregation_highway:
332 | with tf.variable_scope("aggregation_highway"):
333 | agg_shape = tf.shape(aggregation_representation)
334 | batch_size = agg_shape[0]
335 | aggregation_representation = tf.reshape(aggregation_representation, [1, batch_size, aggregation_dim])
336 | aggregation_representation = multi_highway_layer(aggregation_representation, aggregation_dim, options.highway_layer_num)
337 | aggregation_representation = tf.reshape(aggregation_representation, [batch_size, aggregation_dim])
338 |
339 | return (aggregation_representation, aggregation_dim)
340 |
341 |
--------------------------------------------------------------------------------
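
bilateral_match_func and match_passage_with_question read all of their switches off a single options namespace. A minimal sketch of one built with namespace_utils.Bunch, listing exactly the attributes this file dereferences (the values are illustrative, not the shipped defaults; see the sample configs for those):

    import namespace_utils

    options = namespace_utils.Bunch({
        'with_cosine': True, 'with_mp_cosine': True, 'cosine_MP_dim': 5,
        'att_type': 'symmetric', 'att_dim': 50, 'dropout_rate': 0.1,
        'with_full_match': True, 'with_maxpool_match': True,
        'with_attentive_match': True, 'with_max_attentive_match': True,
        'context_layer_num': 1, 'context_lstm_dim': 100, 'use_cudnn': False,
        'with_match_highway': True, 'highway_layer_num': 1,
        'aggregation_layer_num': 1, 'aggregation_lstm_dim': 100,
        'with_aggregation_highway': True,
    })
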
/src/match_utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/match_utils.pyc
--------------------------------------------------------------------------------
/src/my_rnn.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | from tensorflow.python.framework import constant_op
6 | from tensorflow.python.framework import dtypes
7 | from tensorflow.python.framework import ops
8 | from tensorflow.python.framework import tensor_shape
9 | from tensorflow.python.ops import array_ops
10 | from tensorflow.python.ops import control_flow_ops
11 | from tensorflow.python.ops import math_ops
12 | from tensorflow.python.ops import rnn_cell
13 | from tensorflow.python.ops import tensor_array_ops
14 | from tensorflow.python.ops import variable_scope as vs
15 | from tensorflow.python.util import nest
16 | from tensorflow.python.ops import rnn
17 | import tensorflow as tf
18 |
19 | _state_size_with_prefix = rnn_cell._state_size_with_prefix
20 |
21 | def _dynamic_rnn_loop(cell, inputs, initial_state, parallel_iterations, swap_memory, sequence_length=None, dtype=None):
22 | """Internal implementation of Dynamic RNN.
23 |
24 | Args:
25 | cell: An instance of RNNCell.
26 | inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested
27 | tuple of such elements.
28 | initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if
29 | `cell.state_size` is a tuple, then this should be a tuple of
30 | tensors having shapes `[batch_size, s] for s in cell.state_size`.
31 | parallel_iterations: Positive Python int.
32 | swap_memory: A Python boolean
33 | sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].
34 | dtype: (optional) Expected dtype of output. If not specified, inferred from
35 | initial_state.
36 |
37 | Returns:
38 | Tuple `(final_outputs, final_state)`.
39 | final_outputs:
40 | A `Tensor` of shape `[time, batch_size, cell.output_size]`. If
41 | `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape`
42 | objects, then this returns a (possibly nested) tuple of Tensors matching
43 | the corresponding shapes.
44 | final_state:
45 | A `Tensor`, or possibly nested tuple of Tensors, matching in length
46 | and shapes to `initial_state`.
47 |
48 | Raises:
49 | ValueError: If the input depth cannot be inferred via shape inference
50 | from the inputs.
51 | """
52 | state = initial_state
53 | assert isinstance(parallel_iterations, int), "parallel_iterations must be int"
54 |
55 | state_size = cell.state_size
56 |
57 | flat_input = nest.flatten(inputs)
58 | flat_output_size = nest.flatten(cell.output_size)
59 |
60 | # Construct an initial output
61 | input_shape = array_ops.shape(flat_input[0])
62 | time_steps = input_shape[0]
63 | batch_size = input_shape[1]
64 |
65 | inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3) for input_ in flat_input)
66 |
67 | const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2]
68 |
69 | for shape in inputs_got_shape:
70 | if not shape[2:].is_fully_defined():
71 | raise ValueError(
72 | "Input size (depth of inputs) must be accessible via shape inference,"
73 | " but saw value None.")
74 | got_time_steps = shape[0].value
75 | got_batch_size = shape[1].value
76 | if const_time_steps != got_time_steps:
77 | raise ValueError(
78 | "Time steps is not the same for all the elements in the input in a "
79 | "batch.")
80 | if const_batch_size != got_batch_size:
81 | raise ValueError(
82 | "Batch_size is not the same for all the elements in the input.")
83 |
84 | # Prepare dynamic conditional copying of state & output
85 | def _create_zero_arrays(size):
86 | size = _state_size_with_prefix(size, prefix=[batch_size])
87 | return array_ops.zeros(array_ops.pack(size), rnn._infer_state_dtype(dtype, state))
88 |
89 | flat_zero_output = tuple(_create_zero_arrays(output) for output in flat_output_size)
90 | zero_output = nest.pack_sequence_as(structure=cell.output_size, flat_sequence=flat_zero_output)
91 |
92 | if sequence_length is not None:
93 | min_sequence_length = math_ops.reduce_min(sequence_length)
94 | max_sequence_length = math_ops.reduce_max(sequence_length)
95 |
96 | time = array_ops.constant(0, dtype=dtypes.int32, name="time")
97 |
98 | with ops.name_scope("dynamic_rnn") as scope:
99 | base_name = scope
100 |
101 | def _create_ta(name, dtype):
102 | return tensor_array_ops.TensorArray(dtype=dtype, size=time_steps, tensor_array_name=base_name + name,clear_after_read=False)
103 |
104 | output_ta = tuple(_create_ta("output_%d" % i, rnn._infer_state_dtype(dtype, state)) for i in range(len(flat_output_size)))
105 | input_ta = tuple(_create_ta("input_%d" % i, flat_input[0].dtype) for i in range(len(flat_input)))
106 |
107 | input_ta = tuple(ta.unpack(input_) for ta, input_ in zip(input_ta, flat_input))
108 |
109 | def _time_step(time, output_ta_t, state):
110 | """Take a time step of the dynamic RNN.
111 |
112 | Args:
113 | time: int32 scalar Tensor.
114 | output_ta_t: List of `TensorArray`s that represent the output.
115 | state: nested tuple of vector tensors that represent the state.
116 |
117 | Returns:
118 | The tuple (time + 1, output_ta_t with updated flow, new_state).
119 | """
120 |
121 | input_t = tuple(ta.read(time) for ta in input_ta)
122 | # Restore some shape information
123 | for input_, shape in zip(input_t, inputs_got_shape):
124 | input_.set_shape(shape[1:])
125 |
126 | input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t)
127 | call_cell = lambda: cell(input_t, state)
128 |
129 | def f1(): return zero_output
130 | def f2(): return tuple(ta.read(tf.subtract(time, 1)) for ta in output_ta_t)#output_ta_t.read(tf.subtract(time, 1))
131 | cur_zero_output = tf.cond(tf.less(time, 1), f1, f2)
132 |
133 | if sequence_length is not None:
134 | (output, new_state) = rnn._rnn_step(
135 | time=time,
136 | sequence_length=sequence_length,
137 | min_sequence_length=min_sequence_length,
138 | max_sequence_length=max_sequence_length,
139 | zero_output=cur_zero_output, # TODO
140 | state=state,
141 | call_cell=call_cell,
142 | state_size=state_size,
143 | skip_conditionals=True)
144 | else:
145 | (output, new_state) = call_cell()
146 |
147 | # Pack state if using state tuples
148 | output = nest.flatten(output)
149 |
150 | output_ta_t = tuple(ta.write(time, out) for ta, out in zip(output_ta_t, output))
151 |
152 | return (time + 1, output_ta_t, new_state)
153 |
154 | _, output_final_ta, final_state = control_flow_ops.while_loop(
155 | cond=lambda time, *_: time < time_steps,
156 | body=_time_step,
157 | loop_vars=(time, output_ta, state),
158 | parallel_iterations=parallel_iterations,
159 | swap_memory=swap_memory)
160 |
161 | # Unpack final output if not using output tuples.
162 | final_outputs = tuple(ta.pack() for ta in output_final_ta)
163 |
164 | # Restore some shape information
165 | for output, output_size in zip(final_outputs, flat_output_size):
166 | shape = _state_size_with_prefix(
167 | output_size, prefix=[const_time_steps, const_batch_size])
168 | output.set_shape(shape)
169 |
170 | final_outputs = nest.pack_sequence_as(
171 | structure=cell.output_size, flat_sequence=final_outputs)
172 |
173 | return (final_outputs, final_state)
174 |
175 |
176 | def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
177 | initial_state_fw=None, initial_state_bw=None,
178 | dtype=None, parallel_iterations=None,
179 | swap_memory=False, time_major=False, scope=None):
180 | """Creates a dynamic version of bidirectional recurrent neural network.
181 |
182 | Similar to the unidirectional case above (rnn) but takes input and builds
183 | independent forward and backward RNNs. The input_size of forward and
184 | backward cell must match. The initial state for both directions is zero by
185 | default (but can be set optionally) and no intermediate states are ever
186 | returned -- the network is fully unrolled for the given (passed in)
187 | length(s) of the sequence(s) or completely unrolled if length(s) is not
188 | given.
189 |
190 | Args:
191 | cell_fw: An instance of RNNCell, to be used for forward direction.
192 | cell_bw: An instance of RNNCell, to be used for backward direction.
193 | inputs: The RNN inputs.
194 | If time_major == False (default), this must be a tensor of shape:
195 | `[batch_size, max_time, input_size]`.
196 | If time_major == True, this must be a tensor of shape:
197 | `[max_time, batch_size, input_size]`.
198 |
199 | sequence_length: An int32/int64 vector, size `[batch_size]`,
200 | containing the actual lengths for each of the sequences.
201 | initial_state_fw: (optional) An initial state for the forward RNN.
202 | This must be a tensor of appropriate type and shape
203 | `[batch_size, cell_fw.state_size]`.
204 | If `cell_fw.state_size` is a tuple, this should be a tuple of
205 | tensors having shapes `[batch_size, s] for s in cell_fw.state_size`.
206 | initial_state_bw: (optional) Same as for `initial_state_fw`, but using
207 | the corresponding properties of `cell_bw`.
208 | dtype: (optional) The data type for the initial states and expected output.
209 | Required if initial_states are not provided or RNN states have a
210 | heterogeneous dtype.
211 | parallel_iterations: (Default: 32). The number of iterations to run in
212 | parallel. Those operations which do not have any temporal dependency
213 | and can be run in parallel, will be. This parameter trades off
214 | time for space. Values >> 1 use more memory but take less time,
215 | while smaller values use less memory but computations take longer.
216 | swap_memory: Transparently swap the tensors produced in forward inference
217 | but needed for back prop from GPU to CPU. This allows training RNNs
218 | which would typically not fit on a single GPU, with very minimal (or no)
219 | performance penalty.
220 | time_major: The shape format of the `inputs` and `outputs` Tensors.
221 | If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
222 | If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
223 | Using `time_major = True` is a bit more efficient because it avoids
224 | transposes at the beginning and end of the RNN calculation. However,
225 | most TensorFlow data is batch-major, so by default this function
226 | accepts input and emits output in batch-major form.
227 |
228 |
229 | scope: VariableScope for the created subgraph; defaults to "BiRNN"
230 |
231 | Returns:
232 | A tuple (outputs, output_states) where:
233 | outputs: A tuple (output_fw, output_bw) containing the forward and
234 | the backward rnn output `Tensor`.
235 | If time_major == False (default),
236 | output_fw will be a `Tensor` shaped:
237 | `[batch_size, max_time, cell_fw.output_size]`
238 | and output_bw will be a `Tensor` shaped:
239 | `[batch_size, max_time, cell_bw.output_size]`.
240 | If time_major == True,
241 | output_fw will be a `Tensor` shaped:
242 | `[max_time, batch_size, cell_fw.output_size]`
243 | and output_bw will be a `Tensor` shaped:
244 | `[max_time, batch_size, cell_bw.output_size]`.
245 | It returns a tuple instead of a single concatenated `Tensor`, unlike
246 | in the `bidirectional_rnn`. If the concatenated one is preferred,
247 | the forward and backward outputs can be concatenated as
248 | `tf.concat(2, outputs)`.
249 | output_states: A tuple (output_state_fw, output_state_bw) containing
250 | the forward and the backward final states of bidirectional rnn.
251 |
252 | Raises:
253 | TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
254 | """
255 |
256 | if not isinstance(cell_fw, rnn_cell.RNNCell):
257 | raise TypeError("cell_fw must be an instance of RNNCell")
258 | if not isinstance(cell_bw, rnn_cell.RNNCell):
259 | raise TypeError("cell_bw must be an instance of RNNCell")
260 |
261 | with vs.variable_scope(scope or "BiRNN"):
262 | # Forward direction
263 | with vs.variable_scope("FW") as fw_scope:
264 | output_fw, output_state_fw = dynamic_rnn(
265 | cell=cell_fw, inputs=inputs, sequence_length=sequence_length,
266 | initial_state=initial_state_fw, dtype=dtype,
267 | parallel_iterations=parallel_iterations, swap_memory=swap_memory,
268 | time_major=time_major, scope=fw_scope)
269 |
270 | # Backward direction
271 | if not time_major:
272 | time_dim = 1
273 | batch_dim = 0
274 | else:
275 | time_dim = 0
276 | batch_dim = 1
277 |
278 | with vs.variable_scope("BW") as bw_scope:
279 | inputs_reverse = array_ops.reverse_sequence(
280 | input=inputs, seq_lengths=sequence_length,
281 | seq_dim=time_dim, batch_dim=batch_dim)
282 | tmp, output_state_bw = dynamic_rnn(
283 | cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length,
284 | initial_state=initial_state_bw, dtype=dtype,
285 | parallel_iterations=parallel_iterations, swap_memory=swap_memory,
286 | time_major=time_major, scope=bw_scope)
287 |
288 | output_bw = array_ops.reverse_sequence(
289 | input=tmp, seq_lengths=sequence_length,
290 | seq_dim=time_dim, batch_dim=batch_dim)
291 |
292 | outputs = (output_fw, output_bw)
293 | output_states = (output_state_fw, output_state_bw)
294 |
295 | return (outputs, output_states)
296 |
297 | def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
298 | dtype=None, parallel_iterations=None, swap_memory=False,
299 | time_major=False, scope=None):
300 | """Creates a recurrent neural network specified by RNNCell `cell`.
301 |
302 | This function is functionally identical to the function `rnn` above, but
303 | performs fully dynamic unrolling of `inputs`.
304 |
305 | Unlike `rnn`, the input `inputs` is not a Python list of `Tensors`, one for
306 | each frame. Instead, `inputs` may be a single `Tensor` where
307 | the maximum time is either the first or second dimension (see the parameter
308 | `time_major`). Alternatively, it may be a (possibly nested) tuple of
309 | Tensors, each of them having matching batch and time dimensions.
310 | The corresponding output is either a single `Tensor` having the same number
311 | of time steps and batch size, or a (possibly nested) tuple of such tensors,
312 | matching the nested structure of `cell.output_size`.
313 |
314 | The parameter `sequence_length` is optional and is used to copy-through state
315 | and zero-out outputs when past a batch element's sequence length. So it's more
316 | for correctness than performance, unlike in rnn().
317 |
318 | Args:
319 | cell: An instance of RNNCell.
320 | inputs: The RNN inputs.
321 |
322 | If `time_major == False` (default), this must be a `Tensor` of shape:
323 | `[batch_size, max_time, ...]`, or a nested tuple of such
324 | elements.
325 |
326 | If `time_major == True`, this must be a `Tensor` of shape:
327 | `[max_time, batch_size, ...]`, or a nested tuple of such
328 | elements.
329 |
330 | This may also be a (possibly nested) tuple of Tensors satisfying
331 | this property. The first two dimensions must match across all the inputs,
332 | but otherwise the ranks and other shape components may differ.
333 | In this case, input to `cell` at each time-step will replicate the
334 | structure of these tuples, except for the time dimension (from which the
335 | time is taken).
336 |
337 | The input to `cell` at each time step will be a `Tensor` or (possibly
338 | nested) tuple of Tensors each with dimensions `[batch_size, ...]`.
339 | sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
340 | initial_state: (optional) An initial state for the RNN.
341 | If `cell.state_size` is an integer, this must be
342 | a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`.
343 | If `cell.state_size` is a tuple, this should be a tuple of
344 | tensors having shapes `[batch_size, s] for s in cell.state_size`.
345 | dtype: (optional) The data type for the initial state and expected output.
346 | Required if initial_state is not provided or RNN state has a heterogeneous
347 | dtype.
348 | parallel_iterations: (Default: 32). The number of iterations to run in
349 | parallel. Those operations which do not have any temporal dependency
350 | and can be run in parallel, will be. This parameter trades off
351 | time for space. Values >> 1 use more memory but take less time,
352 | while smaller values use less memory but computations take longer.
353 | swap_memory: Transparently swap the tensors produced in forward inference
354 | but needed for back prop from GPU to CPU. This allows training RNNs
355 | which would typically not fit on a single GPU, with very minimal (or no)
356 | performance penalty.
357 | time_major: The shape format of the `inputs` and `outputs` Tensors.
358 | If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
359 | If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
360 | Using `time_major = True` is a bit more efficient because it avoids
361 | transposes at the beginning and end of the RNN calculation. However,
362 | most TensorFlow data is batch-major, so by default this function
363 | accepts input and emits output in batch-major form.
364 | scope: VariableScope for the created subgraph; defaults to "RNN".
365 |
366 | Returns:
367 | A pair (outputs, state) where:
368 |
369 | outputs: The RNN output `Tensor`.
370 |
371 | If time_major == False (default), this will be a `Tensor` shaped:
372 | `[batch_size, max_time, cell.output_size]`.
373 |
374 | If time_major == True, this will be a `Tensor` shaped:
375 | `[max_time, batch_size, cell.output_size]`.
376 |
377 | Note, if `cell.output_size` is a (possibly nested) tuple of integers
378 | or `TensorShape` objects, then `outputs` will be a tuple having the
379 | same structure as `cell.output_size`, containing Tensors having shapes
380 | corresponding to the shape data in `cell.output_size`.
381 |
382 | state: The final state. If `cell.state_size` is an int, this
383 | will be shaped `[batch_size, cell.state_size]`. If it is a
384 | `TensorShape`, this will be shaped `[batch_size] + cell.state_size`.
385 | If it is a (possibly nested) tuple of ints or `TensorShape`, this will
386 | be a tuple having the corresponding shapes.
387 |
388 | Raises:
389 | TypeError: If `cell` is not an instance of RNNCell.
390 | ValueError: If inputs is None or an empty list.
391 | """
392 |
393 | if not isinstance(cell, rnn_cell.RNNCell):
394 | raise TypeError("cell must be an instance of RNNCell")
395 |
396 | # By default, time_major==False and inputs are batch-major: shaped
397 | # [batch, time, depth]
398 | # For internal calculations, we transpose to [time, batch, depth]
399 | flat_input = nest.flatten(inputs)
400 |
401 | if not time_major:
402 | # (B,T,D) => (T,B,D)
403 | flat_input = tuple(array_ops.transpose(input_, [1, 0, 2]) for input_ in flat_input)
404 |
405 | parallel_iterations = parallel_iterations or 32
406 | if sequence_length is not None:
407 | sequence_length = math_ops.to_int32(sequence_length)
408 | if sequence_length.get_shape().ndims not in (None, 1):
409 | raise ValueError(
410 | "sequence_length must be a vector of length batch_size, "
411 | "but saw shape: %s" % sequence_length.get_shape())
412 | sequence_length = array_ops.identity( # Just to find it in the graph.
413 | sequence_length, name="sequence_length")
414 |
415 | # Create a new scope in which the caching device is either
416 | # determined by the parent scope, or is set to place the cached
417 | # Variable using the same placement as for the rest of the RNN.
418 | with vs.variable_scope(scope or "RNN") as varscope:
419 | if varscope.caching_device is None:
420 | varscope.set_caching_device(lambda op: op.device)
421 | input_shape = tuple(array_ops.shape(input_) for input_ in flat_input)
422 | batch_size = input_shape[0][1]
423 |
424 | for input_ in input_shape:
425 | if input_[1].get_shape() != batch_size.get_shape():
426 | raise ValueError("All inputs should have the same batch size")
427 |
428 | if initial_state is not None:
429 | state = initial_state
430 | else:
431 | if not dtype:
432 | raise ValueError("If no initial_state is provided, dtype must be.")
433 | state = cell.zero_state(batch_size, dtype)
434 |
435 | def _assert_has_shape(x, shape):
436 | x_shape = array_ops.shape(x)
437 | packed_shape = array_ops.pack(shape)
438 | return control_flow_ops.Assert(
439 | math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)),
440 | ["Expected shape for Tensor %s is " % x.name,
441 | packed_shape, " but saw shape: ", x_shape])
442 |
443 | if sequence_length is not None:
444 | # Perform some shape validation
445 | with ops.control_dependencies([_assert_has_shape(sequence_length, [batch_size])]):
446 | sequence_length = array_ops.identity(sequence_length, name="CheckSeqLen")
447 |
448 | inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input)
449 |
450 | (outputs, final_state) = _dynamic_rnn_loop(
451 | cell,
452 | inputs,
453 | state,
454 | parallel_iterations=parallel_iterations,
455 | swap_memory=swap_memory,
456 | sequence_length=sequence_length,
457 | dtype=dtype)
458 |
459 | # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth].
460 | # If we are performing batch-major calculations, transpose output back
461 | # to shape [batch, time, depth]
462 | if not time_major:
463 | # (T,B,D) => (B,T,D)
464 | flat_output = nest.flatten(outputs)
465 | flat_output = [array_ops.transpose(output, [1, 0, 2])
466 | for output in flat_output]
467 | outputs = nest.pack_sequence_as(structure=outputs, flat_sequence=flat_output)
468 |
469 | return (outputs, final_state)
470 |
471 |
--------------------------------------------------------------------------------
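
my_rnn is a vendored variant of TensorFlow's dynamic RNN machinery; the TensorArray pack/unpack and array_ops.pack calls pin it to the pre-1.0 API. A minimal usage sketch under that assumption (the cell size is illustrative):

    import tensorflow as tf
    import my_rnn

    inputs = tf.placeholder(tf.float32, [None, None, 100])  # [batch, time, dim]
    lengths = tf.placeholder(tf.int32, [None])               # [batch]
    cell_fw = tf.nn.rnn_cell.BasicLSTMCell(128)
    cell_bw = tf.nn.rnn_cell.BasicLSTMCell(128)

    (output_fw, output_bw), _ = my_rnn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, inputs, sequence_length=lengths, dtype=tf.float32)
    # concatenate the two directions, as the docstring above suggests
    outputs = tf.concat(2, [output_fw, output_bw])  # [batch, time, 256]
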
/src/my_rnn.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/my_rnn.pyc
--------------------------------------------------------------------------------
/src/namespace_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | class Bunch(object):
4 | def __init__(self, adict):
5 | self.__dict__.update(adict)
6 |
7 | def save_namespace(FLAGS, out_path):
8 | FLAGS_dict = vars(FLAGS)
9 | with open(out_path, 'w') as fp:
10 | #json.dump(FLAGS_dict, fp)
11 | json.dump(FLAGS_dict, fp, indent=4, sort_keys=True)
12 |
13 | def load_namespace(in_path):
14 | with open(in_path, 'r') as fp:
15 | FLAGS_dict = json.load(fp)
16 | return Bunch(FLAGS_dict)
--------------------------------------------------------------------------------
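
A round-trip sketch for these helpers: save_namespace serializes any argparse-style namespace to indented, key-sorted JSON, and load_namespace rebuilds it as a Bunch with attribute access (the flag and file name below are illustrative):

    import argparse
    import namespace_utils

    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=60)
    FLAGS = parser.parse_args([])

    namespace_utils.save_namespace(FLAGS, 'sample.config')
    FLAGS2 = namespace_utils.load_namespace('sample.config')
    print(FLAGS2.batch_size)  # -> 60
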
/src/namespace_utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhiguowang/BiMPM/33cc8fe5d450f432a6843bc05cad29c6ce9f5714/src/namespace_utils.pyc
--------------------------------------------------------------------------------
/src/vocab_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import print_function
3 | import numpy as np
4 | import re
5 |
6 | # import math
7 | class Vocab(object):
8 | def __init__(self, vec_path=None, dim=100, fileformat='bin',voc=None, word2id=None, word_vecs=None, unk_mapping_path=None):
9 | self.unk_label = ''
10 | self.stoplist = None
11 | if fileformat == 'bin':
12 | self.fromBinary(vec_path,voc=voc)
13 | elif fileformat == 'txt':
14 | self.fromText(vec_path,voc=voc)
15 | elif fileformat == 'txt2':
16 | self.fromText_format2(vec_path,voc=voc,pre_word_vecs=word_vecs)
17 | elif fileformat == 'txt3':
18 | self.fromText_format3(vec_path,voc=voc)
19 | elif fileformat == 'map':
20 | self.fromMap(word2id, word_vecs, word_dim=dim)
21 | else: # build a vocabulary with a word set
22 | self.fromVocabualry(voc, dim=dim)
23 |
24 | self.__unk_mapping = None
25 | if unk_mapping_path is not None:
26 | self.__unk_mapping = {}
27 | in_file = open(unk_mapping_path, 'rt')
28 | for line in in_file:
29 | items = re.split('\t', line.rstrip('\n'))
30 | self.__unk_mapping[items[0]] = items[1]
31 | in_file.close()
32 |
33 |
34 | def fromVocabualry(self, voc, dim=100):
35 | # build an index for each word in the given word set
36 | self.word2id = {}
37 | self.id2word = {}
38 |
39 | self.vocab_size = len(voc)
40 | self.word_dim = dim
41 | for word in voc:
42 | cur_index = len(self.word2id)
43 | self.word2id[word] = cur_index
44 | self.id2word[cur_index] = word
45 |
46 | # self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32) # the last dimension is all zero
47 | shape = (self.vocab_size+1, self.word_dim)
48 | scale = 0.05
49 | self.word_vecs = np.array(np.random.uniform(low=-scale, high=scale, size=shape), dtype=np.float32)
50 | # self.word_vecs = None
51 |
52 | def fromMap(self, word2id, word_vecs, word_dim=100):
53 | self.word2id = word2id
54 | self.id2word = dict(zip(word2id.values(),word2id.keys()))
55 |
56 | self.vocab_size = len(word2id)
57 | self.word_dim = word_dim
58 | self.word_vecs = word_vecs
59 |
60 |
61 |
62 | def fromText(self, vec_path,voc=None):
63 | # load pre-trained word vectors and build an index for each word
64 | self.word2id = {}
65 | self.id2word = {}
66 |
67 | vec_file = open(vec_path, 'rt')
68 | header = vec_file.readline()
69 | self.vocab_size, self.word_dim = map(int, header.split())
70 | word_vecs = {}
71 | for line in vec_file:
72 | line = line.decode('utf-8').strip()
73 | parts = line.split(' ')
74 | word = parts[0]
75 | if (voc is not None) and (word not in voc): continue
76 | vector = np.array(parts[1:], dtype='float32')
77 | cur_index = len(self.word2id)
78 | self.word2id[word] = cur_index
79 | self.id2word[cur_index] = word
80 | word_vecs[cur_index] = vector
81 | vec_file.close()
82 |
83 | self.vocab_size = len(self.word2id)
84 |         self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32) # the extra last row (index vocab_size) is all zeros and serves as the OOV vector
85 | for cur_index in xrange(self.vocab_size):
86 | self.word_vecs[cur_index] = word_vecs[cur_index]
87 |
88 |
89 | def fromText_format2(self, vec_path,voc=None,pre_word_vecs=None):
90 |         # load pretrained embeddings from a tab-separated "id<TAB>word<TAB>vector" text file
91 | self.word2id = {}
92 | self.id2word = {}
93 |
94 | vec_file = open(vec_path, 'rt')
95 | word_vecs = {}
96 | for line in vec_file:
97 | line = line.decode('utf-8').strip()
98 | parts = line.split('\t')
99 | cur_index = int(parts[0])
100 | word = parts[1]
101 | vector = np.array(map(float,re.split('\\s+', parts[2])), dtype='float32')
102 | self.word2id[word] = cur_index
103 | self.id2word[cur_index] = word
104 | word_vecs[cur_index] = vector
105 | self.word_dim = vector.size
106 | vec_file.close()
107 |
108 | self.vocab_size = len(self.word2id)
109 |
110 | if pre_word_vecs is not None:
111 | self.word_vecs = pre_word_vecs
112 | else:
113 |             self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32) # the extra last row (index vocab_size) is all zeros and serves as the OOV vector
114 | for cur_index in xrange(self.vocab_size):
115 | self.word_vecs[cur_index] = word_vecs[cur_index]
116 |
117 |
118 | def fromText_format3(self, vec_path,voc=None):
119 |         # load pretrained embeddings from a headerless "word v1 v2 ..." text file (e.g. GloVe format)
120 | self.word2id = {}
121 | self.id2word = {}
122 |
123 | vec_file = open(vec_path, 'rt')
124 | # header = vec_file.readline()
125 | # self.vocab_size, self.word_dim = map(int, header.split())
126 | word_vecs = {}
127 | for line in vec_file:
128 | line = line.decode('utf-8')
129 | if line[0] == line[1] == ' ':
130 | word = ' '
131 | parts = [' '] + line.strip().split(' ')
132 | else:
133 | parts = line.split(' ')
134 | word = parts[0]
135 | self.word_dim = len(parts[1:])
136 | if (voc is not None) and (word not in voc): continue
137 | vector = np.array(parts[1:], dtype='float32')
138 | cur_index = len(self.word2id)
139 | self.word2id[word] = cur_index
140 | self.id2word[cur_index] = word
141 | word_vecs[cur_index] = vector
142 | vec_file.close()
143 |
144 | self.vocab_size = len(self.word2id)
145 |         self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32) # the extra last row (index vocab_size) is all zeros and serves as the OOV vector
146 | for cur_index in xrange(self.vocab_size):
147 | self.word_vecs[cur_index] = word_vecs[cur_index]
148 |
149 |
150 |
151 | def fromText_bak(self, vec_path,voc=None):
152 |         # backup variant of fromText: load embeddings from a word2vec-style text file
153 | self.word2id = {}
154 | self.id2word = {}
155 |
156 | vec_file = open(vec_path, 'rt')
157 | header = vec_file.readline()
158 | self.vocab_size, self.word_dim = map(int, header.split())
159 |         self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32) # the extra last row (index vocab_size) is all zeros and serves as the OOV vector
160 | for line in vec_file:
161 | line = line.decode('utf-8').strip()
162 | parts = line.split(' ')
163 | word = parts[0]
164 | if (voc is not None) and (word not in voc): continue
165 | vector = np.array(parts[1:], dtype='float32')
166 | cur_index = len(self.word2id)
167 | self.word2id[word] = cur_index
168 | self.id2word[cur_index] = word
169 | self.word_vecs[cur_index] = vector
170 | vec_file.close()
171 |
172 | def fromBinary_with_voc(self, fname, voc, scale=0.05, stop_num=50):
173 | self.stoplist = voc[0:stop_num]
174 | voc = voc[stop_num:]
175 | voc.append(self.unk_label)
176 | self.word2id = {}
177 | self.id2word = {}
178 | for word in voc:
179 | curIndex = len(self.word2id)
180 | self.word2id[word] = curIndex
181 | self.id2word[curIndex] = word
182 |
183 | with open(fname, "rb") as f:
184 | header = f.readline()
185 | cur_vocab_size, self.word_dim = map(int, header.split())
186 | word_vecs = {}
187 | binary_len = np.dtype('float32').itemsize * self.word_dim
188 | for idx in xrange(cur_vocab_size):
189 | word = []
190 | while True:
191 | ch = f.read(1)
192 | if ch == ' ':
193 | word = ''.join(word)
194 | break
195 | if ch != '\n':
196 | word.append(ch)
197 |                 if word in self.word2id:  # direct dict lookup, no key-list scan
198 | curIndex = self.word2id[word]
199 | else:
200 | curIndex = len(self.word2id)
201 | self.word2id[word] = curIndex
202 | self.id2word[curIndex] = word
203 | word_vecs[curIndex] = np.fromstring(f.read(binary_len), dtype='float32')
204 |
205 | self.vocab_size = len(self.word2id)
206 | self.word_vecs = np.random.uniform(low=-scale, high=scale, size=(self.vocab_size+1, self.word_dim)).astype('float32')
207 | self.word_vecs[self.vocab_size] = self.word_vecs[self.vocab_size] * 0.0
208 | for cur_index in word_vecs.keys():
209 | self.word_vecs[cur_index] = word_vecs[cur_index]
210 |
211 | def fromBinary(self, fname, scale=0.05, voc=None):
212 | self.word2id = {}
213 | self.id2word = {}
214 | self.word2id[self.unk_label] = 0
215 | self.id2word[0] = self.unk_label
216 | # load word vector
217 | with open(fname, "rb") as f:
218 | header = f.readline()
219 | self.vocab_size, self.word_dim = map(int, header.split())
220 | word_vecs = {}
221 | binary_len = np.dtype('float32').itemsize * self.word_dim
222 | for idx in xrange(self.vocab_size):
223 | word = []
224 | while True:
225 | ch = f.read(1)
226 | if ch == ' ':
227 | word = ''.join(word)
228 | break
229 | if ch != '\n':
230 | word.append(ch)
231 |                 if word == '': f.read(binary_len); continue  # still consume the vector bytes so the stream stays aligned
232 | curIndex = len(self.word2id)
233 | self.word2id[word] = curIndex
234 | self.id2word[curIndex] = word
235 | word_vecs[curIndex] = np.fromstring(f.read(binary_len), dtype='float32')
236 |
237 |         # add words from voc that are missing from the binary file, giving them random-init vectors
238 | if voc is not None:
239 | for word in voc:
240 | if word == '': continue
241 |                 if word in self.word2id: continue
242 | curIndex = len(self.word2id)
243 | self.word2id[word] = curIndex
244 | self.id2word[curIndex] = word
245 | word_vecs[curIndex] = np.random.uniform(low=-scale, high=scale, size=(self.word_dim,)).astype('float32')
246 |
247 | self.vocab_size = len(self.word2id)
248 |         self.word_vecs = np.zeros((self.vocab_size+1, self.word_dim), dtype=np.float32) # the extra last row (index vocab_size) is all zeros and serves as the OOV vector
249 |         for cur_index in xrange(self.vocab_size):
250 |             if cur_index == 0: continue # row 0 (the unk label) is filled below
251 |             self.word_vecs[cur_index] = word_vecs[cur_index]
252 |         self.word_vecs[0] = np.random.uniform(low=-scale, high=scale, size=(self.word_dim,)).astype('float32')
253 |
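    # Both binary loaders above assume the word2vec .bin layout: an ASCII
    # header line "vocab_size word_dim", then one entry per word consisting of
    # a space-terminated word string followed by word_dim raw float32 values.
    # A sketch of loading such a file (the path is illustrative):
    #
    #     vocab = Vocab('GoogleNews-vectors-negative300.bin', fileformat='bin')
    #     # vocab.word_dim == 300 for that particular file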
254 | def setWordvec(self,word_vecs):
255 | self.word_vecs = word_vecs
256 |
257 | def hasWord(self, word):
258 |         return word in self.word2id
259 |
260 | def size(self):
261 | return len(self.word2id)
262 |
263 | def getIndex(self, word):
264 | if self.stoplist is not None:
265 | if word in self.stoplist:
266 | return None
267 |         if word in self.word2id:
268 | return self.word2id.get(word)
269 | else:
270 | return self.vocab_size
271 |
272 | def getWord(self, idx):
273 | return self.id2word.get(idx)
274 |
275 | def getVector(self, word):
276 |         if word in self.word2id:
277 | idx = self.word2id.get(word)
278 | return self.word_vecs[idx]
279 | return None
280 |
281 | def to_index_sequence(self, sentence):
282 | # sentence = sentence.strip().lower()
283 | sentence = sentence.strip()
284 | seq = []
285 | for word in re.split('\\s+', sentence):
286 | idx = self.getIndex(word)
287 |             if idx is None and self.__unk_mapping is not None and word in self.__unk_mapping:
288 |                 simWord = self.__unk_mapping[word]
289 |                 idx = self.getIndex(simWord)
290 |             if idx is None: idx = self.vocab_size
291 | seq.append(idx)
292 | return seq
293 |
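    # Sketch (words illustrative): known words map to their ids, while OOV and
    # stop-listed words fall back to the reserved index vocab_size:
    #
    #     vocab = Vocab(fileformat='voc', voc={'hello', 'world'}, dim=50)
    #     vocab.to_index_sequence('hello there world')
    #     # -> [vocab.word2id['hello'], vocab.vocab_size, vocab.word2id['world']]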
294 | def to_index_sequence_for_list(self, words):
295 | seq = []
296 | for word in words:
297 | idx = self.getIndex(word)
298 |             if idx is None and self.__unk_mapping is not None and word in self.__unk_mapping:
299 |                 simWord = self.__unk_mapping[word]
300 |                 idx = self.getIndex(simWord)
301 |             if idx is None: idx = self.vocab_size
302 | seq.append(idx)
303 | return seq
304 |
305 | def to_character_matrix(self, sentence, max_char_per_word=-1):
306 | sentence = sentence.strip()
307 | seq = []
308 | for word in re.split('\\s+', sentence):
309 | cur_seq = []
310 | for i in xrange(len(word)):
311 | cur_char = word[i]
312 | idx = self.getIndex(cur_char)
313 |                 if idx is None and self.__unk_mapping is not None and cur_char in self.__unk_mapping:
314 |                     simWord = self.__unk_mapping[cur_char]
315 |                     idx = self.getIndex(simWord)
316 |                 if idx is None: idx = self.vocab_size
317 | cur_seq.append(idx)
318 | if max_char_per_word != -1 and len(cur_seq) > max_char_per_word:
319 | cur_seq = cur_seq[:max_char_per_word]
320 | seq.append(cur_seq)
321 | return seq
322 |
323 | def to_index_sequence4binary_features(self, sentence):
324 | sentence = sentence.strip().lower()
325 | seq = []
326 | for word in re.split(' ', sentence):
327 | idx = self.getIndex(word)
328 |             if idx is None: continue
329 | seq.append(idx)
330 | return seq
331 |
332 | def to_char_ngram_index_sequence(self, sentence):
333 | sentence = sentence.strip().lower()
334 | seq = []
335 | words = re.split(' ', sentence)
336 | for word in words:
337 | sub_words = collect_char_ngram(word)
338 | for sub_word in sub_words:
339 | idx = self.getIndex(sub_word)
340 |                 if idx is None: continue
341 | seq.append(idx)
342 | return seq
343 |
344 | def to_sparse_feature_sequence(self, sentence1, sentence2):
345 | words1 = set(re.split(' ', sentence1.strip().lower()))
346 | words2 = set(re.split(' ', sentence2.strip().lower()))
347 | intersection_words = words1.intersection(words2)
348 | seq = []
349 | for word in intersection_words:
350 | idx = self.getIndex(word)
351 |             if idx is None: continue
352 | seq.append(idx)
353 | return seq
354 |
355 | def get_sentence_vector(self, sentence):
356 | sent_vec = np.zeros((self.word_dim,), dtype='float32')
357 | sentence = sentence.strip().lower()
358 | total = 0.0
359 | for word in re.split(' ', sentence):
360 | cur_vec = self.getVector(word)
361 | if cur_vec is None: continue
362 | sent_vec += cur_vec
363 | total += 1.0
364 | if total != 0.0: sent_vec /= total
365 | return sent_vec
366 |
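    # get_sentence_vector is a simple bag-of-words embedding: the mean of the
    # vectors of those words that have one (words without a vector are
    # skipped). Sketch, reusing the illustrative vocab from above:
    #
    #     v = vocab.get_sentence_vector('hello world')
    #     # v == (vocab.getVector('hello') + vocab.getVector('world')) / 2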
367 | def dump_to_txt2(self, outpath):
368 | outfile = open(outpath, 'wt')
369 | for word in self.word2id.keys():
370 | cur_id = self.word2id[word]
371 | cur_vector = self.getVector(word)
372 | # print(word)
373 | word= word.encode('utf-8')
374 | outline = "{}\t{}\t{}".format(cur_id, word, vec2string(cur_vector))
375 | outfile.write(outline + "\n")
376 | outfile.close()
377 |
378 | def dump_to_txt3(self, outpath):
379 | outfile = open(outpath, 'wt')
380 | for word in self.word2id.keys():
381 | cur_vector = self.getVector(word)
382 | word= word.encode('utf-8')
383 | outline = word + " {}".format(vec2string(cur_vector))
384 | outfile.write(outline + "\n")
385 | outfile.close()
386 |
387 | def vec2string(val):
388 | result = ""
389 | for v in val:
390 | result += " {}".format(v)
391 | return result.strip()
392 |
393 |
394 | def collect_all_ngram(words, n=2):
395 | all_ngrams = set()
396 |     for i in xrange(len(words)-n+1):  # +1 so the final n-gram is included
397 | cur_ngram = words[i:i+n]
398 | all_ngrams.add(' '.join(cur_ngram))
399 | return all_ngrams
400 |
401 | def collect_char_ngram(word, n=3):
402 | all_words = []
403 | if len(word)<=n: all_words.append(word)
404 | else:
405 | for i in xrange(len(word)-n+1):
406 |             cur_word = word[i:i+n]  # honor the n argument rather than assuming trigrams
407 | all_words.append(cur_word)
408 | return all_words
409 |
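# Sketch of collect_char_ngram with the default n=3 (words illustrative):
#
#     collect_char_ngram('hello')   # -> ['hel', 'ell', 'llo']
#     collect_char_ngram('hi')      # -> ['hi'] (words of length <= n are kept whole)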
410 | def to_char_ngram_sequence(sentence, n=3):
411 | seq = []
412 | words = re.split(' ', sentence)
413 | for word in words:
414 | sub_words = collect_char_ngram(word)
415 | seq.extend(sub_words)
416 | return ' '.join(seq)
417 |
418 | def collectVoc(trainpath):
419 | vocab = set()
420 |     inputFile = open(trainpath, 'rt')
421 | for line in inputFile:
422 | line = line.strip()
423 | label, sentence = re.split('\t', line)
424 | sentence = sentence.lower()
425 | for word in re.split(' ', sentence):
426 | vocab.add(word)
427 | inputFile.close()
428 | return vocab
429 |
430 | def collect_word_count(sentences, unk_num=1):
431 | word_count_map = {}
432 | for sentence in sentences:
433 | sentence = sentence.strip().lower()
434 | for word in re.split(' ', sentence):
435 | cur_count = 0
436 |             if word in word_count_map:
437 | cur_count = word_count_map.get(word)
438 | word_count_map[word] = cur_count + 1
439 | word_count_list = []
440 | for word in word_count_map.keys():
441 | count = word_count_map.get(word)
442 | word_count_list.append((count, word))
443 |
444 | word_count_list = sorted(word_count_list,key=(lambda a:a[0]), reverse=True)
445 | # for i in xrange(50):
446 | # word, count = word_count_list[i]
447 | # print('{}\t{}'.format(word, count))
448 | # return word_count_list
449 | return [word for count, word in word_count_list if count>unk_num ]
450 |
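# Sketch (sentences illustrative): collect_word_count keeps only words seen
# more than unk_num times, most frequent first; the relative order of words
# with equal counts is unspecified:
#
#     collect_word_count(['a b a', 'b c'])   # -> ['a', 'b'] ('c' occurs only once)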
451 | def collect_word_count_with_max_vocab(sentences, max_vocab=600000):
452 | word_count_map = {}
453 | for sentence in sentences:
454 | sentence = sentence.strip().lower()
455 | for word in re.split(' ', sentence):
456 | cur_count = 0
457 |             if word in word_count_map:
458 | cur_count = word_count_map.get(word)
459 | word_count_map[word] = cur_count + 1
460 | word_count_list = []
461 | for word in word_count_map.keys():
462 | count = word_count_map.get(word)
463 | word_count_list.append((count, word))
464 |
465 | word_count_list = sorted(word_count_list,key=(lambda a:a[0]), reverse=True)
466 | # for i in xrange(50):
467 | # word, count = word_count_list[i]
468 | # print('{}\t{}'.format(word, count))
469 | # return word_count_list
470 | # return [word for count, word in word_count_list if count>unk_num ]
471 | if len(word_count_list)