├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── __init__.py ├── create_pretraining_data.py ├── data └── lm │ ├── poetry.tsv │ ├── poetry2.tsv │ ├── result.zh.json │ ├── test.en.tsv │ └── test.zh.tsv ├── extract_features.py ├── modeling.py ├── modeling_test.py ├── multilingual.md ├── optimization.py ├── optimization_test.py ├── requirements.txt ├── run_classifier.py ├── run_lm_predict.py ├── run_pretraining.py ├── run_squad.py ├── sample_text.txt ├── tmp ├── lm_output │ └── test_results.json └── utils.py ├── tokenization.py └── tokenization_test.py /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | BERT needs to maintain permanent compatibility with the pre-trained model files, 4 | so we do not plan to make any major changes to this library (other than what was 5 | promised in the README). However, we can accept small patches related to 6 | re-factoring and documentation. To submit contributions, there are just a few 7 | small guidelines you need to follow. 8 | 9 | ## Contributor License Agreement 10 | 11 | Contributions to this project must be accompanied by a Contributor License 12 | Agreement. You (or your employer) retain the copyright to your contribution; 13 | this simply gives us permission to use and redistribute your contributions as 14 | part of the project. Head over to https://cla.developers.google.com/ to see 15 | your current agreements on file or to sign a new one. 16 | 17 | You generally only need to submit a CLA once, so if you've already submitted one 18 | (even if it was for a different project), you probably don't need to do it 19 | again. 20 | 21 | ## Code reviews 22 | 23 | All submissions, including submissions by project members, require review. We 24 | use GitHub pull requests for this purpose. Consult 25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 26 | information on using pull requests. 27 | 28 | ## Community Guidelines 29 | 30 | This project follows 31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## BERT as Language Model 2 | 3 | For a sentence S = w_1, w_2, ..., w_k, we have 4 | 5 | p(S) = \prod_{i=1}^{k} p(w_i | context) 6 | 7 | 8 | In a traditional language model, such as an RNN, the context is the left-hand history, context = w_1, ..., w_{i-1}, so 9 | 10 | p(S) = \prod_{i=1}^{k} p(w_i | w_1, ..., w_{i-1}) 11 | 12 | 13 | A bidirectional language model has a larger context, context = w_1, ..., w_{i-1}, w_{i+1}, ..., w_k. 14 | 15 | In this implementation, we simply adopt the following approximation: 16 | 17 | p(S) \approx \prod_{i=1}^{k} p(w_i | w_1, ..., w_{i-1}, w_{i+1}, ..., w_k). 18 | 19 | 20 | 24 | 25 | 30 | 31 | 32 | ### test-case: vertical domain (poetry) 33 | 34 | 35 | 36 | ```bash 37 | export BERT_BASE_DIR=model/chinese_L-12_H-768_A-12 38 | export INPUT_FILE=data/lm/poetry2.tsv 39 | python run_lm_predict.py \ 40 | --input_file=$INPUT_FILE \ 41 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 42 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 43 | --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ 44 | --max_seq_length=128 \ 45 | --output_dir=./tmp/lm_output/ 46 | ``` 47 | 48 | 49 | $ cat ./tmp/lm_output/test_results.json 50 | 51 | Each token's predicted probability and the sentence-level ppl (the exponential of the average negative log-probability) are written to the output JSON; see data/lm/result.zh.json for an example. 52 | 53 | ### The code above goes through the whole BERT code base and is fairly heavyweight; there is also a much simpler way to call BERT: 54 | 55 | ```python 56 | '''Use BERT as a language model: compute a sentence score to check whether the sentence is plausible. 57 | It is essentially similar to BERT-MLM-based Chinese error correction, where each character is masked and contributes one loss term.''' 58 | from torch.multiprocessing import TimeoutError, Pool, set_start_method, Queue 59 | import torch.multiprocessing as mp 60 | import torch 61 | import numpy as np 62 | # from transformers import DistilBertTokenizer,DistilBertForMaskedLM 63 | from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM 64 | import json, math 65 | 66 | try: 67 | set_start_method('spawn') 68 | except RuntimeError: 69 | pass 70 | 71 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 72 | 73 | 74 | def load_model(): 75 | ## Load the BERT model; the directory at this path contains the bert_config.json config file and the model.bin weight file 76 | # 'bert-base-uncased' is the English model; here we use the Chinese one 77 | model = BertForMaskedLM.from_pretrained('bert-base-chinese').to(device) 78 | model.eval() 79 | ## Load the BERT tokenizer 80 | tokenizer = BertTokenizer.from_pretrained('bert-base-chinese') 81 | return tokenizer, model 82 | 83 | 84 | tokenizer, model = load_model() 85 | 86 | ''' 87 | Use the averaged loss as the sentence's perplexity (ppl) score. 88 | Limitations: 89 | 1. Scoring every word properly requires a separate inference pass per word, which is computationally heavy and redundant; there is room for optimization. 90 | 2. The sentence probability used in this implementation is an approximation, so it is not fully rigorous. 91 | ''' 92 | def get_score(sentence): 93 | tokenize_input = tokenizer.tokenize(sentence) 94 | tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)]).to(device) 95 | # Predict all tokens 96 | predictions = model(tensor_input) # model(masked_ids) 97 | # nn.CrossEntropyLoss(size_average=False) 98 | # Per the official PyTorch docs, size_average defaults to True and averages the loss over the mini-batch; if size_average is False the losses are summed instead, and the flag is ignored when reduce=False 99 | loss_fct = torch.nn.CrossEntropyLoss() 100 | loss = loss_fct(predictions.squeeze(), tensor_input.squeeze()).data # the already-averaged loss, returned as the sentence's ppl score 101 | return math.exp(loss) 102 | 103 | 104 | print(get_score("杜甫是什么的诗词是有哪些")) 105 | print(get_score("杜甫的诗词有哪些")) 106 | ``` 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /create_pretraining_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Create masked LM/next sentence masked_lm TF examples for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import random 23 | 24 | import tokenization 25 | import tensorflow as tf 26 | 27 | flags = tf.flags 28 | 29 | FLAGS = flags.FLAGS 30 | 31 | flags.DEFINE_string("input_file", None, 32 | "Input raw text file (or comma-separated list of files).") 33 | 34 | flags.DEFINE_string( 35 | "output_file", None, 36 | "Output TF example file (or comma-separated list of files).") 37 | 38 | flags.DEFINE_string("vocab_file", None, 39 | "The vocabulary file that the BERT model was trained on.") 40 | 41 | flags.DEFINE_bool( 42 | "do_lower_case", True, 43 | "Whether to lower case the input text. 
Should be True for uncased " 44 | "models and False for cased models.") 45 | 46 | flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") 47 | 48 | flags.DEFINE_integer("max_predictions_per_seq", 20, 49 | "Maximum number of masked LM predictions per sequence.") 50 | 51 | flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") 52 | 53 | flags.DEFINE_integer( 54 | "dupe_factor", 10, 55 | "Number of times to duplicate the input data (with different masks).") 56 | 57 | flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") 58 | 59 | flags.DEFINE_float( 60 | "short_seq_prob", 0.1, 61 | "Probability of creating sequences which are shorter than the " 62 | "maximum length.") 63 | 64 | 65 | class TrainingInstance(object): 66 | """A single training instance (sentence pair).""" 67 | 68 | def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, 69 | is_random_next): 70 | self.tokens = tokens 71 | self.segment_ids = segment_ids 72 | self.is_random_next = is_random_next 73 | self.masked_lm_positions = masked_lm_positions 74 | self.masked_lm_labels = masked_lm_labels 75 | 76 | def __str__(self): 77 | s = "" 78 | s += "tokens: %s\n" % (" ".join( 79 | [tokenization.printable_text(x) for x in self.tokens])) 80 | s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) 81 | s += "is_random_next: %s\n" % self.is_random_next 82 | s += "masked_lm_positions: %s\n" % (" ".join( 83 | [str(x) for x in self.masked_lm_positions])) 84 | s += "masked_lm_labels: %s\n" % (" ".join( 85 | [tokenization.printable_text(x) for x in self.masked_lm_labels])) 86 | s += "\n" 87 | return s 88 | 89 | def __repr__(self): 90 | return self.__str__() 91 | 92 | 93 | def write_instance_to_example_files(instances, tokenizer, max_seq_length, 94 | max_predictions_per_seq, output_files): 95 | """Create TF example files from `TrainingInstance`s.""" 96 | writers = [] 97 | for output_file in output_files: 98 | writers.append(tf.python_io.TFRecordWriter(output_file)) 99 | 100 | writer_index = 0 101 | 102 | total_written = 0 103 | for (inst_index, instance) in enumerate(instances): 104 | input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) 105 | input_mask = [1] * len(input_ids) 106 | segment_ids = list(instance.segment_ids) 107 | assert len(input_ids) <= max_seq_length 108 | 109 | while len(input_ids) < max_seq_length: 110 | input_ids.append(0) 111 | input_mask.append(0) 112 | segment_ids.append(0) 113 | 114 | assert len(input_ids) == max_seq_length 115 | assert len(input_mask) == max_seq_length 116 | assert len(segment_ids) == max_seq_length 117 | 118 | masked_lm_positions = list(instance.masked_lm_positions) 119 | masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) 120 | masked_lm_weights = [1.0] * len(masked_lm_ids) 121 | 122 | while len(masked_lm_positions) < max_predictions_per_seq: 123 | masked_lm_positions.append(0) 124 | masked_lm_ids.append(0) 125 | masked_lm_weights.append(0.0) 126 | 127 | next_sentence_label = 1 if instance.is_random_next else 0 128 | 129 | features = collections.OrderedDict() 130 | features["input_ids"] = create_int_feature(input_ids) 131 | features["input_mask"] = create_int_feature(input_mask) 132 | features["segment_ids"] = create_int_feature(segment_ids) 133 | features["masked_lm_positions"] = create_int_feature(masked_lm_positions) 134 | features["masked_lm_ids"] = create_int_feature(masked_lm_ids) 135 | features["masked_lm_weights"] = create_float_feature(masked_lm_weights) 136 | 
features["next_sentence_labels"] = create_int_feature([next_sentence_label]) 137 | 138 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 139 | 140 | writers[writer_index].write(tf_example.SerializeToString()) 141 | writer_index = (writer_index + 1) % len(writers) 142 | 143 | total_written += 1 144 | 145 | if inst_index < 20: 146 | tf.logging.info("*** Example ***") 147 | tf.logging.info("tokens: %s" % " ".join( 148 | [tokenization.printable_text(x) for x in instance.tokens])) 149 | 150 | for feature_name in features.keys(): 151 | feature = features[feature_name] 152 | values = [] 153 | if feature.int64_list.value: 154 | values = feature.int64_list.value 155 | elif feature.float_list.value: 156 | values = feature.float_list.value 157 | tf.logging.info( 158 | "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) 159 | 160 | for writer in writers: 161 | writer.close() 162 | 163 | tf.logging.info("Wrote %d total instances", total_written) 164 | 165 | 166 | def create_int_feature(values): 167 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 168 | return feature 169 | 170 | 171 | def create_float_feature(values): 172 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) 173 | return feature 174 | 175 | 176 | def create_training_instances(input_files, tokenizer, max_seq_length, 177 | dupe_factor, short_seq_prob, masked_lm_prob, 178 | max_predictions_per_seq, rng): 179 | """Create `TrainingInstance`s from raw text.""" 180 | all_documents = [[]] 181 | 182 | # Input file format: 183 | # (1) One sentence per line. These should ideally be actual sentences, not 184 | # entire paragraphs or arbitrary spans of text. (Because we use the 185 | # sentence boundaries for the "next sentence prediction" task). 186 | # (2) Blank lines between documents. Document boundaries are needed so 187 | # that the "next sentence prediction" task doesn't span between documents. 188 | for input_file in input_files: 189 | with tf.gfile.GFile(input_file, "r") as reader: 190 | while True: 191 | line = tokenization.convert_to_unicode(reader.readline()) 192 | if not line: 193 | break 194 | line = line.strip() 195 | 196 | # Empty lines are used as document delimiters 197 | if not line: 198 | all_documents.append([]) 199 | tokens = tokenizer.tokenize(line) 200 | if tokens: 201 | all_documents[-1].append(tokens) 202 | 203 | # Remove empty documents 204 | all_documents = [x for x in all_documents if x] 205 | rng.shuffle(all_documents) 206 | 207 | vocab_words = list(tokenizer.vocab.keys()) 208 | instances = [] 209 | for _ in range(dupe_factor): 210 | for document_index in range(len(all_documents)): 211 | instances.extend( 212 | create_instances_from_document( 213 | all_documents, document_index, max_seq_length, short_seq_prob, 214 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) 215 | 216 | rng.shuffle(instances) 217 | return instances 218 | 219 | 220 | def create_instances_from_document( 221 | all_documents, document_index, max_seq_length, short_seq_prob, 222 | masked_lm_prob, max_predictions_per_seq, vocab_words, rng): 223 | """Creates `TrainingInstance`s for a single document.""" 224 | document = all_documents[document_index] 225 | 226 | # Account for [CLS], [SEP], [SEP] 227 | max_num_tokens = max_seq_length - 3 228 | 229 | # We *usually* want to fill up the entire sequence since we are padding 230 | # to `max_seq_length` anyways, so short sequences are generally wasted 231 | # computation. 
However, we *sometimes* 232 | # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter 233 | # sequences to minimize the mismatch between pre-training and fine-tuning. 234 | # The `target_seq_length` is just a rough target however, whereas 235 | # `max_seq_length` is a hard limit. 236 | target_seq_length = max_num_tokens 237 | if rng.random() < short_seq_prob: 238 | target_seq_length = rng.randint(2, max_num_tokens) 239 | 240 | # We DON'T just concatenate all of the tokens from a document into a long 241 | # sequence and choose an arbitrary split point because this would make the 242 | # next sentence prediction task too easy. Instead, we split the input into 243 | # segments "A" and "B" based on the actual "sentences" provided by the user 244 | # input. 245 | instances = [] 246 | current_chunk = [] 247 | current_length = 0 248 | i = 0 249 | while i < len(document): 250 | segment = document[i] 251 | current_chunk.append(segment) 252 | current_length += len(segment) 253 | if i == len(document) - 1 or current_length >= target_seq_length: 254 | if current_chunk: 255 | # `a_end` is how many segments from `current_chunk` go into the `A` 256 | # (first) sentence. 257 | a_end = 1 258 | if len(current_chunk) >= 2: 259 | a_end = rng.randint(1, len(current_chunk) - 1) 260 | 261 | tokens_a = [] 262 | for j in range(a_end): 263 | tokens_a.extend(current_chunk[j]) 264 | 265 | tokens_b = [] 266 | # Random next 267 | is_random_next = False 268 | if len(current_chunk) == 1 or rng.random() < 0.5: 269 | is_random_next = True 270 | target_b_length = target_seq_length - len(tokens_a) 271 | 272 | # This should rarely go for more than one iteration for large 273 | # corpora. However, just to be careful, we try to make sure that 274 | # the random document is not the same as the document 275 | # we're processing. 276 | for _ in range(10): 277 | random_document_index = rng.randint(0, len(all_documents) - 1) 278 | if random_document_index != document_index: 279 | break 280 | 281 | random_document = all_documents[random_document_index] 282 | random_start = rng.randint(0, len(random_document) - 1) 283 | for j in range(random_start, len(random_document)): 284 | tokens_b.extend(random_document[j]) 285 | if len(tokens_b) >= target_b_length: 286 | break 287 | # We didn't actually use these segments so we "put them back" so 288 | # they don't go to waste. 
289 | num_unused_segments = len(current_chunk) - a_end 290 | i -= num_unused_segments 291 | # Actual next 292 | else: 293 | is_random_next = False 294 | for j in range(a_end, len(current_chunk)): 295 | tokens_b.extend(current_chunk[j]) 296 | truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) 297 | 298 | assert len(tokens_a) >= 1 299 | assert len(tokens_b) >= 1 300 | 301 | tokens = [] 302 | segment_ids = [] 303 | tokens.append("[CLS]") 304 | segment_ids.append(0) 305 | for token in tokens_a: 306 | tokens.append(token) 307 | segment_ids.append(0) 308 | 309 | tokens.append("[SEP]") 310 | segment_ids.append(0) 311 | 312 | for token in tokens_b: 313 | tokens.append(token) 314 | segment_ids.append(1) 315 | tokens.append("[SEP]") 316 | segment_ids.append(1) 317 | 318 | (tokens, masked_lm_positions, 319 | masked_lm_labels) = create_masked_lm_predictions( 320 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) 321 | instance = TrainingInstance( 322 | tokens=tokens, 323 | segment_ids=segment_ids, 324 | is_random_next=is_random_next, 325 | masked_lm_positions=masked_lm_positions, 326 | masked_lm_labels=masked_lm_labels) 327 | instances.append(instance) 328 | current_chunk = [] 329 | current_length = 0 330 | i += 1 331 | 332 | return instances 333 | 334 | 335 | def create_masked_lm_predictions(tokens, masked_lm_prob, 336 | max_predictions_per_seq, vocab_words, rng): 337 | """Creates the predictions for the masked LM objective.""" 338 | 339 | cand_indexes = [] 340 | for (i, token) in enumerate(tokens): 341 | if token == "[CLS]" or token == "[SEP]": 342 | continue 343 | cand_indexes.append(i) 344 | 345 | rng.shuffle(cand_indexes) 346 | 347 | output_tokens = list(tokens) 348 | 349 | masked_lm = collections.namedtuple("masked_lm", ["index", "label"]) # pylint: disable=invalid-name 350 | 351 | num_to_predict = min(max_predictions_per_seq, 352 | max(1, int(round(len(tokens) * masked_lm_prob)))) 353 | 354 | masked_lms = [] 355 | covered_indexes = set() 356 | for index in cand_indexes: 357 | if len(masked_lms) >= num_to_predict: 358 | break 359 | if index in covered_indexes: 360 | continue 361 | covered_indexes.add(index) 362 | 363 | masked_token = None 364 | # 80% of the time, replace with [MASK] 365 | if rng.random() < 0.8: 366 | masked_token = "[MASK]" 367 | else: 368 | # 10% of the time, keep original 369 | if rng.random() < 0.5: 370 | masked_token = tokens[index] 371 | # 10% of the time, replace with random word 372 | else: 373 | masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] 374 | 375 | output_tokens[index] = masked_token 376 | 377 | masked_lms.append(masked_lm(index=index, label=tokens[index])) 378 | 379 | masked_lms = sorted(masked_lms, key=lambda x: x.index) 380 | 381 | masked_lm_positions = [] 382 | masked_lm_labels = [] 383 | for p in masked_lms: 384 | masked_lm_positions.append(p.index) 385 | masked_lm_labels.append(p.label) 386 | 387 | return (output_tokens, masked_lm_positions, masked_lm_labels) 388 | 389 | 390 | def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): 391 | """Truncates a pair of sequences to a maximum sequence length.""" 392 | while True: 393 | total_length = len(tokens_a) + len(tokens_b) 394 | if total_length <= max_num_tokens: 395 | break 396 | 397 | trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b 398 | assert len(trunc_tokens) >= 1 399 | 400 | # We want to sometimes truncate from the front and sometimes from the 401 | # back to add more randomness and avoid biases. 
402 | if rng.random() < 0.5: 403 | del trunc_tokens[0] 404 | else: 405 | trunc_tokens.pop() 406 | 407 | 408 | def main(_): 409 | tf.logging.set_verbosity(tf.logging.INFO) 410 | 411 | tokenizer = tokenization.FullTokenizer( 412 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 413 | 414 | input_files = [] 415 | for input_pattern in FLAGS.input_file.split(","): 416 | input_files.extend(tf.gfile.Glob(input_pattern)) 417 | 418 | tf.logging.info("*** Reading from input files ***") 419 | for input_file in input_files: 420 | tf.logging.info(" %s", input_file) 421 | 422 | rng = random.Random(FLAGS.random_seed) 423 | instances = create_training_instances( 424 | input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, 425 | FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, 426 | rng) 427 | 428 | output_files = FLAGS.output_file.split(",") 429 | tf.logging.info("*** Writing to output files ***") 430 | for output_file in output_files: 431 | tf.logging.info(" %s", output_file) 432 | 433 | write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, 434 | FLAGS.max_predictions_per_seq, output_files) 435 | 436 | 437 | if __name__ == "__main__": 438 | flags.mark_flag_as_required("input_file") 439 | flags.mark_flag_as_required("output_file") 440 | flags.mark_flag_as_required("vocab_file") 441 | tf.app.run() 442 | -------------------------------------------------------------------------------- /data/lm/poetry2.tsv: -------------------------------------------------------------------------------- 1 | 儿童学必备古诗100首 2 | 李白与的诗句内容简介 3 | 走进秋天初中文章初中作文 4 | 关于春晓的有古诗文 5 | 关于春天的现代诗我 6 | 李白写的诗窗前明月光 7 | 写草地那首诗 8 | 关于金秋古诗文 9 | 量春天来了那首诗 10 | 诗经中的春天 11 | 世外桃源陶渊明内容概括 12 | 儿童诗集 13 | 关于李白的词语全集 14 | 描写春天相关古诗文 15 | 月亮圆好害羞的那首诗 16 | 众里寻你千百度下一句 17 | 爱国的名言和诗 18 | 和孩子有关的诗古诗文 19 | 爱国诗篇名人名言 20 | 望庐山瀑的诗意视频 21 | 母亲河唐诗中 22 | 写腊梅的好句和诗歌有 23 | 李白全集赏析 24 | 范仲淹词的词语鉴赏 25 | 绝句那首诗 26 | 多学习古诗文 27 | 李白有现代诗歌 28 | 采菊花东篱的下一句歇后语 29 | 描写月下或的月亮那首诗 30 | 李白有关的诗歌写黄河的那首诗 31 | 形容儿童小学古诗大全 32 | 冬天景色中华诗词 33 | 忽如一夜春风来是里面那首诗 34 | 桃花源到陶渊明简要 35 | 关于对现实无奈的那首诗 36 | 书关于陆游 37 | 中华诗词池上唐白居易是 38 | 秋天到了现代诗 39 | 秋日的作文初中初中作文 40 | 男女说话方式爱情句子 41 | 专业诗朗诵关于望庐山瀑布 42 | 有关月的古诗文 43 | 有关黄河的现代诗 44 | 静夜思古诗文 45 | 李白有的诗词全集内容简介 46 | 秋天树作文初中初中作文 47 | 李白诗集词 48 | 冮南古诗文 49 | 关于杜甫的长诗里有哪些 50 | 王维的板桥小离别文言文字词注释 51 | 古典诗词与古诗大全王伟 52 | 写季节的现代诗 53 | 关春诗经 54 | 王维怎么怎么那首诗 55 | 李白写诗的诗词黄河的那首诗 56 | 贾岛有什么出名那首诗 57 | 春夏诗作300首 58 | 李白写诗望炉山瀑布 59 | 描写李白的作者 60 | 含有写江南的那首诗 61 | 用一首关于黄河 62 | 李白诗集 63 | 把酒月李白 64 | 思念的诗词鉴赏 65 | 写明月求圆周长诗歌散文 66 | 李白写伤感句子 67 | 王维是的板桥小别原文及注释 68 | 杜甫的长诗哪些有名 69 | 望卢山瀑布朗诵背景音乐 70 | 辛弃疾诗豪放派与婉约派 71 | 将进酒赏析翻译诗歌鉴赏原文及 72 | 青年人励志古诗词 73 | 恋爱诗经 74 | 伤感离别诗词歌赋 75 | 关于辛弃疾牵挂词 76 | 全唐诗泊秦淮中 77 | 写花的现代诗 78 | 江雪这首诗的赏析 79 | 关于描写久别重逢的古诗 80 | 儿童学古诗忘洞庭 81 | 辛弃疾有哪些关于思念词 82 | 采菊花东篱的后面一句 83 | 寻找秋天的初中文章作文 84 | 李白古诗古诗送别40首诗歌鉴赏 85 | 李白有什么的诗窗前明月光 86 | 关于辛弃疾豪放派 87 | 李白有句子赏析 88 | 陋室铭刘禹锡诗歌鉴赏 89 | 李白做山 90 | 李白关于朗诵 91 | 春天形容诗歌鉴赏 92 | 望月怀远古诗题目诗歌鉴赏 93 | 情感与在爱的中华诗词 94 | 关于爱情写景抒情散文 95 | 苏轼写流放说写的现代诗 96 | 侠客行中现代诗 97 | 中国常用诗经 98 | 苏东坡流放关于写的诗词歌赋 99 | 李白有诵读 100 | 赞美母亲河古诗句 101 | 描写学院关于秋天的古诗文 102 | 诗朗诵李白诗的诗歌 103 | 马致远古诗文经典 104 | 宋词那首诗古诗有 105 | 中秋节有关现代诗名句 106 | 王之焕词句 107 | 停车做爱枫林晚的后一句是什么 108 | 忽如一夜东风过去是这首诗 109 | 王国维现代诗 110 | 桃花缘陶渊明文章简介 111 | 思乡诗现代诗 112 | 范仲淹是什么的词原文及翻译 113 | 含有江南的现代诗 114 | 把酒话月有关李白 115 | 那首诗夜雨寄北中 116 | 学习语文的诗歌鉴赏 117 | 李白诗词诗歌鉴赏 118 | 苏轼什么关中秋节 119 | 静夜思古诗皇家唱诗班 120 | 王国维谈现代诗 121 | 幼儿写现代诗 122 | 有关黄河的那首诗 123 | 儿童天地诗歌集唐诗宋词儿童 124 | 李白写的句子全集 125 | 秋天到了古诗文 126 | 苏轼中秋 127 | 唐有关春天的李白古诗 128 | 李商隐诗集有哪里 129 | 李白有有名人名言 130 | 杜牧登高诗诗歌鉴赏 131 | 曹操文学的创作 132 | 是水的现代诗李白和 133 | 
爱情抒情文 134 | 和孩子相关小学古诗大全 135 | 五言绝句是爱情 136 | 打起来黄莺儿的前一句是什么 137 | 古诗大全古诗文大春晓古诗 138 | 与梅花相关的全唐诗 139 | 前出赛杜甫诗意 140 | 把酒话月李白有什么 141 | 世外桃源陶渊明写具体内容 142 | 唐朝文人送孟浩然的句子 143 | 梅花代表这首诗 144 | 爱国主义情怀的古诗 145 | 李白静夜思唐代诗人李白 146 | 诗歌大全那首诗大关于春晓 147 | 和苏轼有关的诗集内容简介 148 | 唐诗草诗歌鉴赏 149 | 李白写的诗句内容简介 150 | 唐代文人大全李白有 151 | 关于读书的现代诗背古诗 152 | 白居易有哪些短内容简介 153 | 李白最有哪些过有哪些词 154 | 夏沫诗歌赏析 155 | 有关苏轼古诗文经典 156 | 含有梅花糕的俗语 157 | 李白诗集中学 158 | 三字字那首诗3 159 | 屈原写的一首词 160 | 赞美春天的这首诗 161 | 朝辞白帝彩云的后一句 162 | 李白诗的诗词全集内容简介 163 | 经典儿童小学古诗大全 164 | 夜来风雨声这句话这首诗写给谁 165 | 那首诗韩愈写的两句古诗 166 | 杜甫有哪些有名的俗语 167 | 来几首关于爱情得诗 168 | 辛弃疾有哪些寂寞的词 169 | 游子吟这首诗唐诗宋词 170 | 诗歌表达关于爱情的 171 | 王维写的板桥小别文言文及注释 172 | 王昌龄写词有哪些 173 | 写给李白的文章 174 | 辛弃疾带思念的词 175 | 登鹳鹊楼-王之涣 176 | 秦时明月汉时相关的下一句是什么 177 | 读过书现代诗 178 | 诗朗诵稿琵琶行是什么 179 | 李白古诗送别诗及40首诗歌鉴赏 180 | 李白作诗 181 | 灯鹳雀楼朗诵技巧视频下载 182 | 忆江南这首诗朗诵 183 | 描写男子用功学习的句子 184 | 青少年爱国那首诗 185 | 春天的写诗有什么 186 | 王维写的作有什么 187 | 李白写诗忘庐山瀑布 188 | 关于春晓的里面古诗文 189 | 诗朗诵稿李白作诗的李白将进酒 190 | 辛弃疾有哪些著名词 191 | 有爱情的那首诗现代诗人 192 | 古诗文白日依山尽 193 | 关于读书古典诗词 194 | 鲁迅写过笔下的句子 195 | 诗人李白山随平野 196 | 忆少年诗词鉴赏 197 | 春晓这首诗翻译和 198 | 有关于李白的作者 199 | 王维写的文章用是什么 200 | 和李白有关的作者 201 | 我的秋天作文初中初中作文 202 | 关于爱情抒情散文精选 203 | 辛弃疾豪放派和婉约派 204 | 忆少年现代诗 205 | 有关李白的词全集 206 | 飞流是什么直下三千丈是谁的诗 207 | 给宝宝诗朗诵的唐诗宋词 208 | 关于苏轼中秋 209 | 写江南带拼音那首诗 210 | 李白古诗的诗句全集内容简介 211 | 李白有著名现代诗 212 | 白帝城里李白二 213 | 关于读书的现代诗有古诗 214 | 爱国情怀的句子 215 | 儿童学背古文词 216 | 形容青少年勤奋读书的句子 217 | 黄河有关那首诗 218 | 范仲淹是名句 219 | 李白词名句 220 | 有关李白的好散文 221 | 苏东坡的词和诗 222 | 屈原给父亲的那首诗 223 | 五言绝句爱情幸福 224 | 给娃娃朗诵视频的古诗文 225 | 关于秋天古诗文 226 | 苏轼有的爱情诗内容简介 227 | 李白有的词全集 228 | 写苏轼有关中秋节 229 | 思乡古典诗词 230 | 描写春天里诗句大全 231 | 王维是的作什么 232 | 专业朗诵望卢山瀑布 233 | 春天的主要人物介绍 234 | 苏东坡写中秋 235 | 王维是你怎样的这首诗 236 | 李白和月有什么古诗文 237 | 静夜思这首诗的有国语版 238 | 折戟水沉沙铁未注销的作者是谁 239 | 王昌龄古文 240 | 过中秋节小诗学生 241 | 五言绝句新爱情的 242 | 有关李白语句赏析 243 | 李白有读书名言 244 | 儿童学农耕那首诗 245 | 苏轼与有关中秋 246 | 描写草地现代诗 247 | 与李白相关的作 248 | 与秋天的初中文章初中作文 249 | 有古诗秦时明月汉时时光 250 | 徒带相思的现代诗 251 | 唐代诗人李白独坐敬亭山表达 252 | 与梅花有关的诗词古诗文 253 | 贺知章诗那首诗 254 | 赞美爱情誓言古诗和 255 | 唐诗宋词三百首杜甫写 256 | 李白藏头诗用环境描写 257 | 当代大学生爱国的古典诗词中 258 | 望卢山瀑布古诗文朗诵 259 | 学习快诗歌鉴赏 260 | 王维古诗的诗歌诗歌鉴赏 261 | 儿童天地诗歌集配图 262 | 杜甫最好有的句子 263 | 曾刘景文宋苏轼写 264 | 关于梅花的全唐诗 265 | 辛弃疾有哪些婉约派与豪放派 266 | 赞美是爱情的古诗文 267 | 诗朗诵稿李白有的诗词 268 | 歌颂中秋节形容词大全 269 | 王维写的诗词诗歌鉴赏 270 | 李白写诗朗诵稿 271 | 诗词加诗歌鉴赏 272 | 范仲淹是什么的词语原文及翻译 273 | 青春的现代诗有什么 274 | 岳阳楼区记朗诵稿 275 | 关于爱情方面著名有古诗 276 | 李白有朗诵 277 | 王维是怎样地那首诗 278 | 冬季景色中华诗词 279 | 春天写山水诗 280 | 净化水的现代诗有关李白 281 | 美丽秋天诗歌散文现代诗 282 | 李白有的文章 283 | 带有九月份的爱恋那首诗 284 | 儿童学农耕古诗文 285 | 李白有那些过有哪些词 286 | 李白写诗的诗句内容简介 287 | 关于爱情深情用古诗词 288 | 描写春天的句子 289 | 秋高气爽初中记叙文作文 290 | 辛弃疾关于想念词 291 | 关于李白全唐诗 292 | 辛弃疾词作 293 | 王国维论那首诗 294 | 朱熹的那首诗 295 | 梅相关都现代诗 296 | 秋光现代诗优美 297 | 忆江南白居易那首诗 298 | 有关春天的诗文 299 | 诗人李白诗朗诵稿 300 | 李白有的诗歌内容简介 301 | 静夜思这首诗的中文网 302 | 诗朗诵散文 303 | 飞流是直下三千尺是谁的诗 304 | 这首诗关于韩愈的二句话 305 | 李清照夏天的诗歌鉴赏 306 | 母亲河爱情诗 307 | 李白和名言及 308 | 母亲河古诗名句 309 | 诗朗诵稿李白古诗的诗词 310 | 用春的诗词歌赋 311 | 描写腊梅花糕的句子 312 | 来首甜蜜爱情诗 313 | 关于李白名人名言带 314 | 关于描写春天的唯美句子 315 | 描写李白有哪些过有哪些词 316 | 桃花园记陶渊明写 317 | 关于辛弃疾眷恋词 318 | 有爱情的古诗词短现代诗 319 | 杜甫的长诗那些 320 | 明月光及时有古诗词 321 | 五言绝句恋情 322 | 给baby诗朗诵的唐诗中 323 | 爱情抒情作文 324 | 有关于中秋山水诗 325 | 全唐诗送孟浩然 326 | 关于辛弃疾思恋的词 327 | 读书节那首诗 328 | 李白诗作环境描写加 329 | 苏轼和的饮湖上初晴雨 330 | 那首诗登高 331 | 与李白有关的诗词内容简介 332 | 望卢山瀑布朗诵版 333 | 走进秋天的小学古诗大全20首 334 | 静夜思这首诗全文阅读 335 | 相思诗重阳古诗文 336 | 李白写诗望庐山瀑 337 | 李白的散文有 338 | 苏轼写的饮湖上初晴雨 339 | 王昌龄写诗词歌赋 340 | 思乡诗词歌赋 341 | 关于爱情深情诗词有 342 | 春晓的全部那首诗 343 | 中学生爱国诗歌鉴赏 344 | 学习中的那首诗 345 | 那首诗韩愈简介的两句古诗 346 | 江南那首诗 347 | 那首诗柳中元 348 | 宋苏轼最曾刘景文 349 | 春晓古诗注释和翻译及诗歌鉴赏 350 | 关于李白名人 351 | 孩子学习好中华诗词 352 | 王维和怎样地那首诗 353 | 关于李白的小散文 354 | 忘庐山瀑布朗诵视频 355 | 小孩和大人的俗语那首诗 356 | 
描梅花糕的诗 357 | 幼儿园古诗文背诵词 358 | 秋天秋天的作文初中初中作文 359 | 王维是你要怎样的这首诗 360 | 王维你要怎样的这首诗 361 | 赞美青春现代诗有 362 | 草中华诗词 363 | 停车坐爰枫林晚的前一句是什么 364 | 全唐诗出塞世界背景 365 | 诗人王维古典诗词 366 | 描写少年时期用功的句子 367 | 王维经典名句 368 | 五言绝句将爱情的 369 | 李白写名言警句和 370 | 有关李白的独坐敬亭 371 | 唐代大诗人李白独坐敬亭山 372 | 徒泪相思的现代诗 373 | 杜甫写的长诗分哪些 374 | 马致远经典词语 375 | 望卢山瀑布朗诵稿 376 | 描写老师的致伟大的现代诗 377 | 古诗文朗诵内容 378 | 明月光及时带有古诗 379 | 少年古典诗词 380 | 苏轼写给中秋节 381 | 古诗文全文阅读 382 | 情和像爱的古典诗词 383 | 朝发白帝李白有 384 | 幼儿诗歌全唐诗儿童学 385 | 王维写有什么有名的是哪里一首词 386 | 王昌龄写形容词 387 | 母亲河有关古代诗歌 388 | 陶渊明写桃花源到记五柳先生转 389 | 爱国主义精神的诗句 390 | 有关李白关于离别 391 | 关系与有关月的古诗文 392 | 忆少年古典诗词 393 | 秋天丰收的初中文章初中作文 394 | 行路难其一中诗歌鉴赏 395 | 写相思得到最好诗词歌赋 396 | 李白创作的诗歌内容简介 397 | 关于杜甫的长诗哪几个 398 | 李白关于山随平野 399 | 王维的板桥小别全文注释 400 | 当代爱国的古代长诗 401 | 江南全文翻译 402 | 王维和是怎样的那首诗 403 | 专业诗朗诵稿关于望庐山瀑布 404 | 那首诗泊秦淮 405 | 我和春天的唯美句子 406 | 明月光及时古诗中有 407 | 秦时明月汉时相关的前一句是什么 408 | 曹操文学的代表作有 409 | 有关春天的儿童诗 410 | 形容少年用功的句子 411 | 关于月亮害羞的那首诗 412 | 骆滨女王的古诗文 413 | 游子吟这首诗的是什么意思 414 | 李白诗的诗窗前明月光 415 | 王维的句子 416 | 王维写词语中的禅境 417 | 赞美梅花糕的诗句 418 | 白居易有哪些名句 419 | 江南这句话有 420 | 飞流吃直下三千英尺有谁的诗 421 | 李白关于词朗诵 422 | 读现代诗的句子 423 | 关于春天的那首诗 424 | 清明祭祀杜牧 425 | 白居易有哪些诗集 426 | 李白写离别词 427 | 关于爱祖国那首诗 428 | 思乡用诗作 429 | 杜牧诗登高诗歌鉴赏 430 | 苏轼写明月几时 431 | 李白和的诗歌内容简介 432 | 背诵全唐诗 433 | 苏东坡关中秋节 434 | 范仲淹的词语 435 | 下雨想一个人现代诗 436 | 春天里的古诗大全 437 | 古诗大全那首诗大春晓 438 | 苏轼咏有关中秋 439 | 这首诗韩愈的二句话 440 | 唐诗秦时明月汉时时光 441 | 关于苏轼眉州古诗文 442 | 赞美求爱情那首诗 443 | 关于母亲河的古诗有哪些 444 | 五言绝句关于爱情之 445 | 清明这首诗注音版 446 | 月亮圆很害羞的那首诗 447 | 有关春天古诗题目 448 | 关于的儿童诗集 449 | 古诗文思念的 450 | 给宝贝诗朗诵的古诗文 451 | 关于思乡的现代诗古诗词 452 | 关于读书节那首诗 453 | 中秋节有关诗歌鉴赏诗词名句 454 | 王之焕佳句 455 | 李贺最著名的四句话 456 | 王维要怎样的这首诗 457 | 有关苏轼关于中秋节 458 | 关于苏轼明月几 459 | 杜甫名句鉴赏 460 | 爱情诗散文随笔 461 | 贾岛现在有名那首诗 462 | 形容季节的现代诗我 463 | 有关李白古诗送别40首诗歌鉴赏 464 | 饮湖上初晴雨/苏轼最 465 | 有关春天的诗句 466 | 有关李白励志句子 467 | 离别时伤感中华诗词 468 | 赞扬梅花糕的诗 469 | 苏轼写题西岭壁 470 | 白居易有哪些著名的句子 471 | 思乡诗词鉴赏 472 | 关于辛弃疾的句子作 473 | 朝发白帝彩云和的下一句 474 | 写春天的句子 475 | 儿童学农耕古诗和 476 | 李白有山随平野 477 | 相思这首诗词 478 | 苏轼写作中秋节 479 | 儿童学古诗背诵词 480 | 说爱情的最有名古诗文 481 | 诗朗诵李白有什么的诗句 482 | 和梅花梅有关诗 483 | 水调歌头原文那首诗 484 | 杜牧清明 485 | 秋天捡全唐诗 486 | 叶少翁有哪首古诗文 487 | 李白有什么的诗句全集内容简介 488 | 梅花象征什么这首诗 489 | 关于思乡的经典诗词那首诗 490 | 静夜思这首诗经典诗歌 491 | 这首诗登高诗 492 | 净化水的现代诗李白 493 | 李白有写送别诗40首诗歌鉴赏 494 | 青年人励志古代诗词中 495 | 思念的形容词 496 | 范仲淹词的句子原文及翻译 497 | 辛弃疾有哪些的现代诗 498 | 忘庐山瀑布 499 | 描写初冬用雪的古诗文 500 | 短歌行古典诗词 501 | 忆江南这首诗朗诵稿 502 | 从古典诗词中明知范仲淹什么人 503 | 忽如一夜东风来时是这首诗 504 | 撑起来黄莺儿的前一句是什么 505 | 苏东坡眉州中华诗词 506 | 关于苏轼的句子 507 | 带青春的现代诗人 508 | 王维是怎样地这首诗 509 | 出塞这首诗的意思是什么 510 | 李白写名人 511 | 苏轼写诗歌精选 512 | 描写夏日古诗山行 513 | 关于杜甫有名气的句子 514 | 有关朱熹的用诗作 515 | 古诗词这首诗那首诗 516 | 李白古诗的诗词窗前明月光 517 | 五言绝句城市爱情 518 | 有关爱情散文随笔 519 | 秋后作文初中作文 520 | 王维诗这首诗画家 521 | 唐李白敬亭山 522 | 辛弃疾有哪些我思念的词 523 | 鹳雀楼朗诵比赛视频下载 524 | 有秋天的作文初中初中作文 525 | 陶渊明写惜时 526 | 打得起黄莺儿的后面一句是什么 527 | 儿童天地诗歌集有古诗给宝宝 528 | 诗人李白著名诗句 529 | 忘庐山瀑布根据课文内容 530 | 贺知章写诗词鉴赏 531 | 李白给的诗写黄河的现代诗 532 | 思乡诗中华诗词 533 | 关于苏轼的句子和诗 534 | 前出赛唐关于杜甫翻译和 535 | 杜甫写的长诗哪有 536 | 关于李白的词全集 537 | 李白赞美母亲河古诗和 538 | 秋季古诗文 539 | 关于李白警句 540 | 李白赠汪伦这首诗书怎么写 541 | 春天可以的句子 542 | 关于李白语句赏析 543 | 全唐诗李白 544 | 清明这首诗的含义 545 | 诗朗诵稿李白诗的诗词 546 | 描写春景诗歌鉴赏 547 | 读书节古典诗词 548 | 李白和精典用诗作 549 | 爱国主义与的古诗 550 | 关于描写的儿童诗集 551 | 深秋的那首诗深秋的诗词 552 | 唐代人送孟浩然 553 | 母亲河有关古代诗词 554 | 这首诗韩愈写的二句话 555 | 今年秋天的作文初中初中作文 556 | 这首诗韩愈写的两首古诗 557 | 中秋节的小诗集小学生 558 | 五言绝句写爱情的 559 | 有关读书中华诗词 560 | 游子吟这首诗的寓意是什么 561 | 表现诗人关于爱情的 562 | 雪的诗经 563 | 写黄鹤楼古诗文鉴赏 564 | 写儿童诗全唐诗给宝宝 565 | 与秋有关古典诗词 566 | 诗朗诵稿有关李白的诗句 567 | 停车坐爱木风林晚的后面一句是什么 568 | 李白诗作 569 | 唐代诗人李白独坐敬亭山写作 570 | 北宋苏轼最的曾刘景文 571 | 形容初春现代诗古代诗歌 572 | 忽如一夜春风来时是那首诗 573 | 李白写过哪些过有那些词 574 | 古诗词这首诗山水诗 575 | 找秋天诗歌鉴赏优美 576 | 白居易有哪些名人名言 577 | 诗朗诵写岳阳楼记 578 | 古诗大全古诗文大关于春晓 579 | 
黄河全唐诗 580 | 写月亮电影月光的那首诗 581 | 秋天捡现代诗 582 | 赞美母亲河背古诗 583 | 五言绝句关于爱情是什么 584 | 友人关于归来的那首诗 585 | 李清照夏日绝句古诗文 586 | 关于青春的现代诗有 587 | 李白创作年代 588 | 朝发白帝城李白和二 589 | 李白写有哪些过有哪些词 590 | 望卢山瀑布的诗意视频下载 591 | 李白描写的诗词内容简介 592 | 爱国情感的句子 593 | 描写春天的的句子 594 | 望庐山瀑背诵课文 595 | 静夜思这首诗的用中文 596 | 水调歌头原文古诗文苏轼 597 | 黄河母亲现代诗 598 | 李白有什么的词语全集 599 | 春天的朗诵稿 600 | 三字开头古诗文3 601 | 古典诗词词典 602 | 杜甫有那些著名的句子 603 | 朝发白帝城李白有 604 | 介绍中秋这首诗 605 | 诗朗诵李白诗作的将进酒朗诵 606 | 母亲河的名诗句 607 | 岳阳楼区记范仲淹是什么 608 | 把酒月李白有什么 609 | 关于儿童节的句子有古诗 610 | 杜甫较出名的俗语 611 | 李清照夏日绝句那首诗 612 | 飞流是直下三千丈你谁的诗 613 | 江南诗词鉴赏 614 | 爱国主义情怀的诗 615 | 少年夫妻关于爱情得关于古诗 616 | 有九月份的爱情观诗词句 617 | 杜牧山行著名现代诗作 618 | 明月光及时古诗有 619 | 表示对人怀念的现代诗 620 | 韩愈写美句 621 | 停车坐爰枫林晚的前面一句 622 | 王维诗的诗句诗歌鉴赏 623 | 形容对现实无奈的那首诗 624 | 李白有什么的敬亭山 625 | 范仲淹是什么名句 626 | 李白和的敬亭山 627 | 李白将进酒中华诗词吗 628 | 当代爱国的好句那首诗 629 | 形容李白的创作 630 | 王昌龄名句有哪些 631 | 望庐山瀑文本内容 632 | 秋季初中文章作文 633 | 无边萧萧落木下 634 | 送汪伦关于李白 635 | 幼儿园古诗古诗文给宝宝 636 | 李白有词朗诵 637 | 李白最哪些过有哪些词 638 | 王维诗古典诗词 639 | 形容学院关于秋天那首诗 640 | 秋天有初中记叙文作文 641 | 辛弃疾思恋词 642 | 王昌龄古词 643 | 高中生必背古诗文词书 644 | 季节有关的写现代诗 645 | 诗朗诵琵琶行中 646 | 青少年爱国这首诗 647 | 学生爱国山水诗 648 | 关于爱情的深情好诗句 649 | 关于读书的诗歌有那首诗 650 | 来几首甜蜜爱情诗 651 | 有关李白的作 652 | 有关春天的古代诗句 653 | 贺知章那首诗 654 | 李白和名言名 655 | 杜甫登高诗歌鉴赏 656 | 王维和该如何这首诗 657 | 来首关于爱情诗 658 | 读书节小学古诗大全 659 | 有关于爱情的诗歌散文现代诗诗 660 | 白居易那首诗 661 | 诗朗诵稿李白诗文的李白将进酒 662 | 口水的现代诗有关李白 663 | 关于屈原现代诗 664 | 那首诗韩愈简介的两句话 665 | 唐有关春天的古诗大全 666 | 描述青少年勤奋读书的句子 667 | 介绍梅花糕的诗词 668 | 游子吟这首诗的含义 669 | 课外古诗文 670 | 关于写梅花的俗语古诗文 671 | 关于爱情抒情类散文 672 | 春天的译文 673 | 爱国诗词鉴赏 674 | 苏轼有关关于中秋节 675 | 暮江吟古诗文 676 | 宋词精选那首诗诗歌散文 677 | 介绍梅花糕的句子 678 | 有关爱情深情诗词歌赋 679 | 中学生爱国山水诗 680 | 写黄鹤楼诗歌鉴赏中 681 | 王昌龄写古诗文 682 | 曾刘景文宋苏轼 683 | 江雪这首诗的诗歌赏析 684 | 辛弃疾有哪些豪放派和婉约派 685 | 诗人王维的著名现代诗 686 | 李白诗集侠行 687 | 古人爱国情诗 688 | 关于辛弃疾思念想念词 689 | 范仲淹是什么名言名句 690 | 王维写的板桥小离别正文和注释 691 | 杜甫诗作作品是全集 692 | 李白写月有什么古诗文 693 | 有梅花的诗词古诗文 694 | 关于爱情的深情古诗文 695 | 关于爱情的深情诗词关于 696 | 杜甫的长诗具有哪些 697 | 王维这首诗画家 698 | 儿童学农耕唐诗宋词 699 | 梅花有关于红梅诗 700 | 李白有的律诗有哪几 701 | 李白月的古诗文 702 | 关于描写秋天的小学古诗大全二十首 703 | 王维和你要怎样的那首诗 704 | 儿童学农耕有古诗 705 | 爱国主义情怀的诗词 706 | 有关爱情抒情文章 707 | 儿童学农耕诗句有 708 | 白居易有哪些短作者简介 709 | 带江南中华诗词 710 | 苏轼写关于中秋 711 | 王维该如何这首诗 712 | 全唐诗小池杨万里 713 | 杜甫的长诗在哪里 714 | 苏轼写过的饮湖上初晴雨 715 | 李清照诗夏日 716 | 苏轼咏关中秋 717 | 朝发白帝彩云和的前面一句是什么 718 | 和秋天有关诗歌散文诗词名句 719 | 韩愈的描写春田诗 720 | 学习学诗歌鉴赏 721 | 描写中秋那首诗 722 | 诗朗诵稿李白诗歌中的将进酒 723 | 关于春天里的小学古诗大全300首 724 | 形容青少年读书用功的句子 725 | 李白的的一首词 726 | 李白诗及诗歌鉴赏 727 | 关中秋这首诗名句 728 | 古诗文经典诵读内容 729 | 苏轼为官中秋 730 | 李白有什么经典名句 731 | 关于儿童节的俗语古诗句 732 | 古诗词这首诗诗句有 733 | 夏日绝句那首诗 734 | 望炉山瀑布根据课文 735 | 李白写朗诵稿 736 | 王国维这首诗 737 | 唐春天写古诗文 738 | 古诗文全文 739 | 辛弃疾有哪些山水诗那首诗 740 | 李白写的律诗上有哪些 741 | 儿童诗歌唐诗儿童学 742 | 孟浩然山水田园诗 743 | 过中秋节诗小小学生 744 | 望庐山瀑布诗歌鉴赏 745 | 李白有什么的霸气形容词大全 746 | 江南水现代诗 747 | 牵挂诗词歌赋 748 | 李白写过有那些过哪些词 749 | 和秋天有关抒情散文现代诗 750 | 关于思乡的律诗翻译和 751 | 关于月亮的那首诗 752 | 描述春山水诗 753 | 赞美梅花的诗那首诗 754 | 词语大会评关于苏轼 755 | 儿童现代诗新唐诗儿童诗 756 | 范仲淹是什么的句子鉴赏 757 | 古诗文朗诵 758 | 古诗文春天的 759 | 关于母亲河的诗歌有 760 | 李白有英语名人名言 761 | 清明杜牧注音版 762 | 古代文人孟浩然 763 | 描写月亮的古诗文简单 764 | 有关母亲河唐诗宋词 765 | 关于形容久别重逢的古诗 766 | 李白x的文章 767 | 儿童学农耕古诗词 768 | 唐诗宋词这首诗爱情诗 769 | 爱国主义诗歌著名诗句 770 | 苏轼什么的饮湖上初晴雨 771 | 梅花相关的全唐诗 772 | 关于读书的那首诗古诗和 773 | 入门古诗文 774 | 全唐诗收入出塞王昌龄诗原诗 775 | 这首诗韩愈简介的两句诗 776 | 童诗唐诗关于儿童 777 | 记承天寺游记原文及翻译 778 | 春晓的中那首诗 779 | 李白有古诗文 780 | 出塞古诗文 781 | 小学生古诗词望洞庭这首诗 782 | 朝辞白帝彩云和的下一句是什么 783 | 宝宝可以朗诵视频的古诗文 784 | 李白写的律诗哪里有 785 | 望明月前面一句是什么 786 | 赞美女教师那首诗 787 | 忽如一夜春风过去是这首诗 788 | 草现代诗 789 | 飞流是什么直下三千尺你谁的诗 790 | 停车坐爰枫林晚的下一句是什么 791 | 用古诗表达关于爱情的 792 | 离离草原上有关草的全文阅读 793 | 关于春晓我的全部古诗文 794 | 这首诗中带有春字的古诗 795 | 关于写秋天的现代诗 796 | 关于春唯美句子 797 | 形容词大全苏轼最 798 | 有关秋诗经 799 | 有关爱情的那首诗现在诗 800 | 陶渊明诗歌诗歌鉴赏 801 | 朗诵词我们祖国真大 
802 | 郑汪伦那首诗简写 803 | 忽如一夜东风过去是那首诗 804 | 中秋给水调歌头那首诗 805 | 唐朝人孟浩然的句子 806 | 形容季节的现代诗有什么 807 | 母亲河有关李白古诗 808 | 王维是现代诗 809 | 关于李白现代诗歌 810 | 李白有经典诗集 811 | 杜甫写的诗句全集诗词鉴赏 812 | 诗朗诵稿李白诗歌的李白将进酒 813 | 描述季节的现代散文 814 | 古诗文将进酒朗诵 815 | 带梅花梅有关诗 816 | 爱祖国名句和诗 817 | 古诗文浪淘沙 818 | 五言绝句关于爱情得 819 | 关于李白经典词句 820 | 桃花源和陶渊明相关内容 821 | 诗朗诵稿有关李白的诗 822 | 关于写日月的古诗文 823 | 关于李白的散文朗诵 824 | 忽如一夜春风来时是这首诗 825 | 忽如一夜春风过去是那首诗 826 | 关于孩童小学古诗大全 827 | 全唐诗庐山瀑布 828 | 写黄鹤楼诗词鉴赏 829 | 写苏轼关于中秋节 830 | 离别伤感诗词鉴赏 831 | 杜甫写的长诗哪几个 832 | 学龄前儿童学习诗词歌赋 833 | 望炉山瀑布有关李白朗诵稿 834 | 江南古诗文全拼音 835 | 静夜有关李白 836 | 李白诗集朗诵 837 | 赐予我们背悯农 838 | 描写麦苗诗句那首诗 839 | 有关李白关于送别40首诗歌鉴赏 840 | 侠客行那首诗 841 | 离别伤感优美词句 842 | 黄河相关现代诗 843 | 关于母亲河的古诗有 844 | 诗朗诵稿抒情散文 845 | 诗朗诵李白古诗的诗句 846 | 全唐诗山中送别 847 | 李白写诗的诗歌内容简介 848 | 写孩子的句子 849 | 现代诗带赏析 850 | 那首诗三字经全文朗读视频 851 | 春天来了的句子 852 | 写春天现代诗歌大全 853 | 王维写山水田园诗 854 | 李白写过母亲河古代诗词 855 | 关于小儿小学古诗大全 856 | 李白赞美的诗内容简介 857 | 关于秋天初中文章初中作文 858 | 苏东坡写关于中秋 859 | 王维古诗现代诗 860 | 摘抄爱国主义情怀经典诗词10句 861 | 酒问月关于李白 862 | 写关于春天的那首诗 863 | 描写春光的句子 864 | 思念的形容词大全 865 | 秋天写抒情散文那首诗 866 | 有关李白关于黄河的那首诗 867 | 关于母亲河的诗句 868 | 这首诗韩愈简介的两句古诗 869 | 古诗词那首诗诗歌 870 | 望明月前一句 871 | 唐代诗人李白敬亭山 872 | 关于春季古诗文 873 | 草白居易爱情 874 | 儿童学背诵古文词 875 | 小学古诗大全李白诗 876 | 李白送别诗带40首诗歌鉴赏 877 | 中中国的古典诗词 878 | 关于爱情的古诗句现代词 879 | 赠王伦这首诗通过怎么讲 880 | 春天来了阅读的句子 881 | 全唐诗爱 882 | 王昌龄形容词大全 883 | 离离草原上的草的全文 884 | 给我我念静夜思这首诗 885 | 停车做爱枫林晚的一句什么 886 | 李贺诗歌诗歌鉴赏 887 | 唐诗朗诵表演忆江南 888 | 杜甫写的长诗具有哪些 889 | 春晓这首诗意思 890 | 关于相思最好拿古诗文 891 | 陶渊明桃花源和记五柳先生转 892 | 古诗文词典 893 | 爱国爱现代诗 894 | 儿童学背古诗有什么词 895 | 关于爱情的诗歌有诗文 896 | 诗人王维的诗诗歌鉴赏 897 | 读书诗词歌赋 898 | 李清照有夏日赏析 899 | 古诗文杨万里小池 900 | 江南拼音用诗作 901 | 王维那首诗画家 902 | 望月怀远这首诗赏析 903 | 全唐诗夜泊秦淮 904 | 关于杜甫的诗诗歌鉴赏 905 | 梅花这首诗 906 | 母亲河的诗歌有 907 | 出塞这首诗的寓意 908 | 写给苏轼的饮湖上初晴雨 909 | 白居易是经典现代诗作 910 | 春晓这首诗全文翻译 911 | 岳阳记范仲淹词 912 | 有关春天的诗 913 | 古诗文泊秦淮中 914 | 白居易是四句教 915 | 爱情的名诗句关于现代诗 916 | 望月怀远诗题诗歌鉴赏 917 | 李白关于坏境描写 918 | 李白有的小散文 919 | 停车坐爱木风林晚的前一句是什么 920 | 描写清明杜牧 921 | 早发白帝李白二 922 | 少儿背古诗文词 923 | 爱情的爱情诗现代诗歌 924 | 关于思乡的诗词句那首诗 925 | 歌颂中秋中华诗词 926 | 李白朗诵 927 | 歌颂梅花糕的古诗 928 | 李峤有写的风这首诗 929 | 春天里的古代诗词中 930 | 中秋节写这首诗 931 | 清明这首诗的寓意是什么 932 | 写给李白有哪些过有那些词 933 | 关于辛弃疾思念你的词 934 | 有明月苏轼 935 | 有关朱熹的中华诗词 936 | 杜甫作诗作品全集 937 | 跟季节有关的现代诗歌及 938 | 停车坐爰枫林晚的一句什么 939 | 诗人王维那首诗 940 | 与黄河有关诗歌集 941 | 杜甫写比较著名的句子 942 | 范仲淹词名言名句 943 | 月亮有古诗文 944 | 有关李白的诗窗前明月光 945 | 唐诗中秦时明月韩时光 946 | 梅花象征这首诗 947 | 陶渊明诗集 948 | 春天来了小学古诗大全300首 949 | 李白和诗作 950 | 杜甫的长诗有哪些关于 951 | 给我的快乐朗诵下一步曹植的七步 952 | 王国维说这首诗 953 | 诗朗诵琵琶行里 954 | 古诗文经典诵读内容简介 955 | 关于春天的现代诗 956 | 表现春天的古句 957 | 母亲河新唐诗 958 | 写校园秋天古诗文 959 | 有古诗秦时明月汉时光 960 | 飞流是直下三千英尺有谁的诗 961 | 量关于春天那首诗 962 | 飞流直下三千英尺是谁的诗 963 | 李白写有那些过哪些词 964 | 关于苏轼的词语 965 | 唐代诗人李白独坐敬亭山中 966 | 悯农古诗两首 967 | 李白和的著名散文 968 | 有关花那首诗 969 | 杜甫写诗代表作全集 970 | 描写秋天景色散文诗那首诗 971 | 有关于爱情的古诗名句短诗 972 | 写给李白有那些过哪些词 973 | 辛弃疾词赏析 974 | 五言绝句我爱情的 975 | 爱国爱诗词鉴赏 976 | 爱国词名言名句 977 | 母亲河的诗句中 978 | 诵诗祖国真大 979 | 母亲河有关古诗带 980 | 母亲河诗歌有 981 | 爱国人物那首诗 982 | 屈原写给父亲的那首诗 983 | 诗朗诵稿写岳阳楼记 984 | 侠客行谁那首诗 985 | 陶渊明写名言名句 986 | 白居易是什么最有名的句子 987 | 送汪伦那首诗怎么读 988 | 诗朗诵岳阳楼记 989 | 打得起黄莺儿的前一句 990 | 春晓这首诗译文 991 | 打起来黄莺儿的上一句是什么 992 | 关于母亲河的古诗词 993 | 停车坐爰枫林晚的后面一句是什么 994 | 亲情和爱的歌古诗文 995 | 中秋节有关诗歌赏析名句 996 | 望庐山瀑朗诵会 997 | 描写春天来临山水诗 998 | 纳兰性德词 999 | 五言绝句爱情誓言 1000 | 和梅花有关都那首诗 1001 | 关于母亲河的爱情诗 1002 | 离别的中华诗词 1003 | 李白有什么格言大全 1004 | 母亲河现代诗 1005 | 现代人爱国的句子 1006 | 李商隐诗集哪些有 1007 | 关于李白有那些过有哪些词 1008 | 高中古诗文鉴赏 1009 | 王昌龄词句哪些 1010 | 李白写诗有哪些过哪些词 1011 | 李白望庐山瀑 1012 | 关于杜甫的长诗有哪些 1013 | 老师写诗 1014 | 辛弃疾词 1015 | 母亲河古诗和 1016 | 有关于爱情的诗句现代诗与 1017 | 母亲河的诗词句 1018 | 少儿背古文词 1019 | 李白古诗的诗句窗前明月光 1020 | 有关秋天的全唐诗 1021 | 有关于爱情的诗歌有现在诗 1022 | 李白作诗环境描写和 1023 | 爱情借景抒情散文 1024 | 老师那首诗 1025 | 朝发白帝彩云间的后一句是什么 
1026 | 关于母亲河那首诗 1027 | 思念的重阳诗词歌赋 1028 | 春晓这首诗朗诵版 1029 | 夏天那首诗 1030 | 李白有关母亲河经典诗句 1031 | 关于母亲河的诗歌散文 1032 | 多读书中华诗词 1033 | 用春的写诗有什么 1034 | 飞流吃直下三千英尺是谁的诗 1035 | 王维现代诗 1036 | 赞颂春天的古代情诗 1037 | 宋词这首诗诗歌鉴赏 1038 | 江南原文及 1039 | 关于辛弃疾经典词语 1040 | 有关李白离别时 1041 | 叶绍翁写关于春天里的诗 1042 | 李白关于的诗句内容简介 1043 | 给女宝宝诗朗诵的古诗和 1044 | 李白有什么的独坐敬亭 1045 | 李白和佳句赏析 1046 | 古诗文春天 1047 | 形容梅花糕的诗句 1048 | 草白居易名句 1049 | 关于春天里的诗词大全300首 1050 | 给孩子诗朗诵的唐诗 1051 | 李白有佳句赏析 1052 | 是春天的小学古诗大全300首 1053 | 诗朗诵稿琵琶行原文 1054 | 带给我的朗诵稿下一下曹植的七步 1055 | 王维的板桥小离别古文注释 1056 | 诗朗诵李白诗的诗句 1057 | 鲁迅诗集 1058 | 李白藏头诗朗诵 1059 | 母亲河古诗有 1060 | 短歌行背古诗文 1061 | 表示对人思念的现代诗 1062 | 关于读书的名诗句古诗文 1063 | 写出春天这首诗 1064 | 苏轼最的著名现代诗10首 1065 | 叶绍翁春天里诗 1066 | 描写春天景色现代诗歌大全 1067 | 静夜诗皇家教堂唱诗班 1068 | 王昌龄写用诗作 1069 | 王之涣词句 1070 | 望庐山瀑的诗意视频下载 1071 | 我古诗文 1072 | 李白写赏析句子 1073 | 关于写春天诗经 1074 | 全唐诗春晓古诗 1075 | 静夜诗唐代诗人李白 1076 | 飞流打直下三千尺有谁的诗 1077 | 多阅读诗词歌赋 1078 | 忘庐山瀑布朗诵作品 1079 | 用月亮有关圆的现代诗 1080 | 夸赞教师那首诗 1081 | 李白关于黄河上游那首诗 1082 | 李白有什么精典诗词歌赋 1083 | 写月的现代诗 1084 | 关于月亮的古诗文 1085 | 关中秋诗歌鉴赏 1086 | 李白诗的诗词窗前明月光 1087 | 牵挂古诗文 1088 | 李白诗的诗词内容简介 1089 | 李白的独坐敬亭 1090 | 李白山 1091 | 飞流打直下三千英尺有谁的诗 1092 | 儿童学古诗望洞庭是什么 1093 | 全唐诗李白诗 1094 | 那首诗长相思 1095 | 过中秋节古诗小小学生 1096 | 赞美春天诗歌鉴赏 1097 | 古诗文山中送别 1098 | 游子吟这首诗关于阅读 1099 | 冬天有关古诗文 1100 | 江南原文及翻译 1101 | 王维律诗中的有禅境 1102 | 王昌龄形容词 1103 | 李白说有哪些过哪些词 1104 | 秋天的树叶初中文章初中作文 1105 | 和季节有关的写现代诗 1106 | 母亲河古诗里 1107 | 王昌龄写句子有哪些 1108 | 关于李白的律诗有哪几 1109 | 江南朗诵稿 1110 | 季节类的短现代诗 1111 | 春天的古词 1112 | 关于爱人著名新唐诗 1113 | 李白藏头诗朗诵稿 1114 | 有关黄河那首诗 1115 | 形容春天来临诗歌鉴赏 1116 | 江南水那首诗 1117 | 水调歌头原文诗词歌赋关于苏轼 1118 | 五言绝句关于爱情誓言 1119 | 苏轼写明月几 1120 | 王维和你怎样的那首诗 1121 | 爱情的经典诗词现代诗写 1122 | 李白有语句赏析 1123 | 忘庐山瀑布文本内容 1124 | 中小学古诗比赛课 1125 | 李白相关的文章 1126 | 王维是怎样的这首诗 1127 | 诗朗诵稿李白写诗的将进酒朗诵 1128 | 有关李白送别诗歌40首诗歌鉴赏 1129 | 桃花源里陶渊明作者简 1130 | 有关李白最著名 1131 | 有关青春的现代诗歌精选 1132 | 好文采现代诗 1133 | 杜甫写的诗词全集中华诗词 1134 | 望庐山瀑语文内容 1135 | 赞美梅花的古诗文 1136 | 描写月下或月亮几古诗文 1137 | 游子吟这首诗诗歌散文 1138 | 古诗文出塞 1139 | 情和最爱的现代诗 1140 | 桃花源人陶渊明主要内容及 1141 | 小孩和大人的诗古诗文 1142 | 关于杜甫的长诗有那些 1143 | 母亲河的古诗有 1144 | 爱情诗散文集 1145 | 古典诗词苏东坡 1146 | 给春天的现代诗 1147 | 关于爱情得最著名背古诗 1148 | 关于描写久别重逢是什么的诗句 1149 | 白居易是有哪些知名的俗语 1150 | 早发白帝关于李白二 1151 | 李白有什么人生名言 1152 | 现代诗加诗歌赏析 1153 | 李白有朝发白帝城 1154 | 飞流吃直下三千尺是谁的诗 1155 | 代表离开时经典伤感的那首诗 1156 | 古诗春晓注释和翻译及诗歌鉴赏 1157 | 王维的板桥小别全文加注释 1158 | 撑起来黄莺儿的后面一句 1159 | 关于青春现代诗歌精选 1160 | 李白的抒情散文精选 1161 | 像风的唯美句子 1162 | 形容词大全关于苏轼 1163 | 有关描写跟月亮的古代名句 1164 | 王昌龄写古文 1165 | 白居易写忆江南那首诗 1166 | 青少年爱国现代诗 1167 | 明月光及时古诗词有 1168 | 古诗中表达关于爱情 1169 | 有关李白的律诗哪些有 1170 | 王国维说那首诗 1171 | 王国维诗歌鉴赏 1172 | 与黄河有关诗歌有 1173 | 李白有的诗词黄河的现代诗 1174 | 文笔很好现代诗 1175 | 李白诗写送别诗40首诗歌鉴赏 1176 | 关于宝宝诗作 1177 | 关于辛弃疾山水那首诗 1178 | 诗朗诵琵琶行及 1179 | 朝辞白帝彩云的前面一句是什么 1180 | 辛弃疾有哪些牵挂词 1181 | 关于母亲河的那首诗 1182 | 有关李白诗作 1183 | 草白居易那首诗 1184 | 李白描写哪些过有那些词 1185 | 李白有八律古诗文 1186 | 王维写经典语句 1187 | 带江南这首诗中 1188 | 诗歌鉴赏将进酒 1189 | 王维知名的句子 1190 | 李白写诗名句 1191 | 李清照有夏天的 1192 | 那首诗韩愈的两首古诗 1193 | 李白有什么有关于月的那首诗 1194 | 古诗文朗诵表演江南 1195 | 江南这首诗通过 1196 | 李白写的词语全集 1197 | 描写夏日古诗词和 1198 | 关于青春的现代诗我 1199 | 苏轼为官关中秋节 1200 | 写苏东坡关中秋节 1201 | 思念的重阳古典诗词 1202 | 写月亮古诗文简单 1203 | 中秋节这首诗 1204 | 青春的现代诗与 1205 | 望月怀远题目什么诗歌鉴赏 1206 | 赞美关于爱情古诗和 1207 | 李贺有什么著名的四句 1208 | 秦时明月汉时相关的下一句 1209 | 诗人李白诗朗诵 1210 | 杜甫写的长诗上有哪些 1211 | 那首诗王维诗 1212 | 草白居易什么最出名的俗语 1213 | 像秋天的初中记叙文作文 1214 | 爱国精神的句子 1215 | 李白望庐山瀑布诗歌鉴赏 1216 | 爱情抒情类散文 1217 | 鲁迅写笔下的俗语 1218 | 送汪伦这首诗中为什么写 1219 | 水调歌头古典诗词苏东坡 1220 | 有关苏轼关中秋 1221 | 李白写的独坐静亭山 1222 | 古诗文泊秦淮原文及 1223 | 赞美梅花糕的俗语 1224 | 杜牧诗词著名现代诗作 1225 | 苏轼写给关于中秋 1226 | 赞美甜蜜爱情古诗 1227 | 李清照词夏日 1228 | 古代诗词那首诗诗句有 1229 | 李商隐诗集目前有哪些 1230 | 李白写歌德名言 1231 | 王维写的作者用什么 1232 | 幼儿园诗歌古诗有儿童学 1233 | 夏末的那首诗 1234 | 王之涣诗经名句 1235 | 形容春天的那首诗 1236 | 关于月亮月光曲现代诗 1237 | 
李白写哪些过有哪些词 1238 | 诗朗诵李白古诗的将进酒朗诵 1239 | 关于李白有名人名言 1240 | 新唐诗秦时明月汉时时光 1241 | 苏轼最眉州诗词鉴赏 1242 | 关于辛弃疾想念的词 1243 | 思乡的现代诗古诗词 1244 | 秋日的初中记叙文初中作文 1245 | 静夜诗的那首诗 1246 | 李白经典语句 1247 | 折戟沉沙铁未注销的作者谁 1248 | 写话秋天小学古诗大全20首 1249 | 王昌龄诗经 1250 | 李白赠汪伦这首诗中怎么说 1251 | 王之焕诗经名句 1252 | 古女子感怀那首诗 1253 | 静夜思中华诗词 1254 | 朝发白帝彩云的下一句 1255 | 有关花的诗句那首诗 1256 | 古诗文望月怀远 1257 | 李白诗的诗内容简介 1258 | 有春的写诗有什么 1259 | 母亲河那首诗 1260 | 秦时明月汉关的后面一句是什么 1261 | 与少年有关用功的句子 1262 | 记承天寺游记全文阅读 1263 | 关于苏轼的词语和诗 1264 | 五言绝句爱恋 1265 | 写教师的句子 1266 | 忆江南这首诗中 1267 | 描写春天的现代诗 1268 | 曹操文学的作者 1269 | 李白写诗望卢山瀑布 1270 | 专业诗朗诵稿望庐山瀑布 1271 | 明月光及时有诗词 1272 | 小学古诗大全春不觉晓 1273 | 王维是有名的是在哪里一首词 1274 | 白居易现代诗歌作 1275 | 郑汪伦这首诗写上 1276 | 罗隐现代诗 1277 | 颂中秋诗词歌赋 1278 | 诗朗诵稿琵琶行是 1279 | 描写春天古文 1280 | 有关李白句子分析 1281 | 关于思乡的现代诗李白古诗 1282 | 男女说话情诗 1283 | 关于家国的句子古诗文 1284 | 青春的现代诗词 1285 | 李白与哪些过有哪些词 1286 | 王昌龄出塞诗歌鉴赏 1287 | 有关于夏天的关于现代诗 1288 | 母亲河唐诗宋词 1289 | 朝辞白帝彩云的上一句是什么 1290 | 关于杜甫的长诗有有哪些 1291 | 母亲河古诗词 1292 | 秋天到作文初中作文 1293 | 范仲淹词名句 1294 | 中秋之夜水调歌头古诗文 1295 | 有关梅花糕的句子 1296 | 关于爱情深情有古诗词 1297 | 母亲河诗句有 1298 | 诗朗诵稿儿诗的李白将进酒 1299 | 关于爱情的抒情性散文 1300 | 王维写的板桥小别翻译和注释 1301 | 李白有的律诗有什么 1302 | 赞美爱的那首诗 1303 | 介绍黄河那首诗 1304 | 中秋夜水调歌头古诗大全 1305 | 有关于中秋这首诗 1306 | 韩愈名句 1307 | 诗朗诵琵琶行是什么 1308 | 李白的小散文 1309 | 关于春晓所有的那首诗 1310 | 李白写古郎月行 1311 | 与中秋有关诗歌集 1312 | 春晓这首诗视频下载 1313 | 青春的诗篇 1314 | 关于黄河颂古代诗词 1315 | 中秋夜水调歌头原文古诗大全 1316 | 苏轼最眉州现代诗 1317 | 陶渊明桃花源是记五柳先生转 1318 | 月亮有关诗歌大全 1319 | 关秋天的初中记叙文作文 1320 | 歌颂梅花都那首诗 1321 | 苏轼有关于中秋 1322 | 李白古诗的诗歌窗前明月光 1323 | 我爱梅花的全唐诗 1324 | 望炉山瀑布关于李白朗诵稿 1325 | 中国诗歌大全及解释 1326 | 宋词精选那首诗山水诗 1327 | 朝辞白帝彩云和的前一句是什么 1328 | 李白和格言大全 1329 | 白居易是那首诗 1330 | 将进酒古诗文原文及 1331 | 有爱情的名诗句现代诗歌 1332 | 诗朗诵李白作诗的将进酒 1333 | 描写春山水诗 1334 | 苏轼写的句子和诗 1335 | 李清照词作 1336 | 李白有关有哪些过有那些词 1337 | 秋天的作文初中初中作文 1338 | 王之涣出塞秦时明月汉时 1339 | 苏轼最流放关于写的形容词 1340 | 相思诗重阳那首诗 1341 | 关于苏轼的饮湖上初晴雨 1342 | 带草现代诗 1343 | 王之焕名言名句 1344 | 读书节诗作 1345 | 有关于爱情的古代诗词写诗 1346 | 采菊花东篱的一句 1347 | 韩愈经典词句 1348 | 形容孩子的句子 1349 | 带青春的现代散文 1350 | 忘庐山瀑布朗诵稿 1351 | 唐代女子感怀那首诗 1352 | 白居易是什么最有名的俗语 1353 | 苏轼那首诗 1354 | 李白写有那些过有哪些词 1355 | 唐诗宋词那首诗那首诗 1356 | 宋词这首诗山水诗 1357 | 王昌龄写形容词大全 1358 | 那首诗关于韩愈的二句话 1359 | 李白有环境描写 1360 | 给我们朗诵一下子曹植的七步 1361 | 秦时明月汉关的下一句是什么 1362 | 李白有什么的句子全集 1363 | 关于离别时感伤诗句有 1364 | 贺知章过哪些古诗文 1365 | 夏沫这首诗 1366 | 忘庐山瀑布有关李白朗诵 1367 | 李白最山 1368 | 关于思乡诗词鉴赏 1369 | 全唐诗王维送别 1370 | 诗句表达关于爱情的 1371 | 李白什么哪些过哪些词 1372 | 春晓这首诗原文及翻译 1373 | 王维你怎样的这首诗 1374 | 辛弃疾诗著名现代诗 1375 | 有关于爱情的诗句有新诗 1376 | 牵挂那首诗 1377 | 朱熹的诗词鉴赏 1378 | 杜甫写的诗全集古典诗词 1379 | 李白古诗八律那首诗 1380 | 梅花象征那首诗 1381 | 陶渊明写题诗 1382 | 带江南注音版古诗文 1383 | 秋天秋天的古词 1384 | 关于辛弃疾精选 1385 | 古人写给泪相思的现代诗 1386 | 春古诗文 1387 | 李白赞美山 1388 | 诗仙李白环境描述 1389 | 少年用夫妻爱情誓言诗词句 1390 | 发给我背悯农 1391 | 竹里馆古诗文 1392 | 夏末的现代诗 1393 | 幼儿古诗全唐诗关于儿童 1394 | 水调歌头中华诗词苏东坡 1395 | 夏日绝句李清照诗歌鉴赏 1396 | 专业朗诵稿望庐山瀑 1397 | 静夜思这首诗好诗词 1398 | 关于李白名句 1399 | 飞流打直下三千尺谁是谁的诗 1400 | 关于秋天思乡的古诗文 1401 | 李白有名人名言 1402 | 中华古现代诗 1403 | 关于李白的美文 1404 | 王维写的题诗 1405 | 打起来黄莺儿的下一句是什么 1406 | 诗经中春天的 1407 | 李白静夜思那首诗 1408 | 关于李白送别词40首诗歌鉴赏 1409 | -------------------------------------------------------------------------------- /data/lm/result.zh.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "tokens": [ 4 | { 5 | "token": "2016", 6 | "prob": 0.10201895982027054 7 | }, 8 | { 9 | "token": "全", 10 | "prob": 0.8663827180862427 11 | }, 12 | { 13 | "token": "国", 14 | "prob": 0.9864780902862549 15 | }, 16 | { 17 | "token": "高", 18 | "prob": 0.34686800837516785 19 | }, 20 | { 21 | "token": "考", 22 | "prob": 0.9513261914253235 23 | }, 24 | { 25 | "token": "卷", 26 | "prob": 0.002143713878467679 27 | }, 28 | { 29 | "token": "答", 30 | "prob": 
0.04595280811190605 31 | }, 32 | { 33 | "token": "题", 34 | "prob": 0.052179522812366486 35 | }, 36 | { 37 | "token": "模", 38 | "prob": 0.9097260236740112 39 | }, 40 | { 41 | "token": "板", 42 | "prob": 0.5003664493560791 43 | } 44 | ], 45 | "ppl": 5.214168292180062 46 | }, 47 | { 48 | "tokens": [ 49 | { 50 | "token": "2016", 51 | "prob": 0.07322586327791214 52 | }, 53 | { 54 | "token": "全", 55 | "prob": 0.4594285190105438 56 | }, 57 | { 58 | "token": "国", 59 | "prob": 0.9693033695220947 60 | }, 61 | { 62 | "token": "大", 63 | "prob": 0.002398423384875059 64 | }, 65 | { 66 | "token": "考", 67 | "prob": 0.35024359822273254 68 | }, 69 | { 70 | "token": "卷", 71 | "prob": 0.005357644520699978 72 | }, 73 | { 74 | "token": "答", 75 | "prob": 0.05084587633609772 76 | }, 77 | { 78 | "token": "题", 79 | "prob": 0.08499259501695633 80 | }, 81 | { 82 | "token": "模", 83 | "prob": 0.8091200590133667 84 | }, 85 | { 86 | "token": "板", 87 | "prob": 0.4416927099227905 88 | } 89 | ], 90 | "ppl": 9.214288062044073 91 | }, 92 | { 93 | "tokens": [ 94 | { 95 | "token": "2016", 96 | "prob": 0.06563900411128998 97 | }, 98 | { 99 | "token": "全", 100 | "prob": 0.4981258511543274 101 | }, 102 | { 103 | "token": "国", 104 | "prob": 0.9088247418403625 105 | }, 106 | { 107 | "token": "低", 108 | "prob": 1.6259804397122934e-05 109 | }, 110 | { 111 | "token": "考", 112 | "prob": 0.4023572504520416 113 | }, 114 | { 115 | "token": "卷", 116 | "prob": 0.014088313095271587 117 | }, 118 | { 119 | "token": "答", 120 | "prob": 0.0639762282371521 121 | }, 122 | { 123 | "token": "题", 124 | "prob": 0.09024914354085922 125 | }, 126 | { 127 | "token": "模", 128 | "prob": 0.8052080273628235 129 | }, 130 | { 131 | "token": "板", 132 | "prob": 0.4206086993217468 133 | } 134 | ], 135 | "ppl": 13.400421357093588 136 | }, 137 | { 138 | "tokens": [ 139 | { 140 | "token": "床", 141 | "prob": 0.06141529977321625 142 | }, 143 | { 144 | "token": "前", 145 | "prob": 0.6264788508415222 146 | }, 147 | { 148 | "token": "明", 149 | "prob": 0.19100141525268555 150 | }, 151 | { 152 | "token": "月", 153 | "prob": 0.8327909111976624 154 | }, 155 | { 156 | "token": "光", 157 | "prob": 0.0027572319377213717 158 | }, 159 | { 160 | "token": ",", 161 | "prob": 0.9875602126121521 162 | }, 163 | { 164 | "token": "疑", 165 | "prob": 0.26570719480514526 166 | }, 167 | { 168 | "token": "是", 169 | "prob": 0.20073072612285614 170 | }, 171 | { 172 | "token": "地", 173 | "prob": 0.0013851922703906894 174 | }, 175 | { 176 | "token": "上", 177 | "prob": 0.08247583359479904 178 | }, 179 | { 180 | "token": "霜", 181 | "prob": 1.1996443838313553e-09 182 | } 183 | ], 184 | "ppl": 52.41817724710573 185 | }, 186 | { 187 | "tokens": [ 188 | { 189 | "token": "床", 190 | "prob": 0.007852567359805107 191 | }, 192 | { 193 | "token": "前", 194 | "prob": 0.023406021296977997 195 | }, 196 | { 197 | "token": "星", 198 | "prob": 0.0016379707958549261 199 | }, 200 | { 201 | "token": "星", 202 | "prob": 0.08062248677015305 203 | }, 204 | { 205 | "token": "光", 206 | "prob": 0.003109232522547245 207 | }, 208 | { 209 | "token": ",", 210 | "prob": 0.7677633762359619 211 | }, 212 | { 213 | "token": "疑", 214 | "prob": 0.02753453515470028 215 | }, 216 | { 217 | "token": "是", 218 | "prob": 0.18585893511772156 219 | }, 220 | { 221 | "token": "地", 222 | "prob": 0.006168654654175043 223 | }, 224 | { 225 | "token": "上", 226 | "prob": 0.09490019828081131 227 | }, 228 | { 229 | "token": "霜", 230 | "prob": 5.2765480873517845e-09 231 | } 232 | ], 233 | "ppl": 153.20970629080068 234 | }, 235 | { 236 | "tokens": [ 237 | { 238 | 
"token": "床", 239 | "prob": 0.05734571814537048 240 | }, 241 | { 242 | "token": "前", 243 | "prob": 0.4436746835708618 244 | }, 245 | { 246 | "token": "白", 247 | "prob": 0.0020346769597381353 248 | }, 249 | { 250 | "token": "月", 251 | "prob": 0.23601755499839783 252 | }, 253 | { 254 | "token": "光", 255 | "prob": 0.05537288263440132 256 | }, 257 | { 258 | "token": ",", 259 | "prob": 0.9799799919128418 260 | }, 261 | { 262 | "token": "疑", 263 | "prob": 0.22007052600383759 264 | }, 265 | { 266 | "token": "是", 267 | "prob": 0.21303308010101318 268 | }, 269 | { 270 | "token": "地", 271 | "prob": 0.0022917157039046288 272 | }, 273 | { 274 | "token": "上", 275 | "prob": 0.07564827054738998 276 | }, 277 | { 278 | "token": "霜", 279 | "prob": 6.6843566237650975e-09 280 | } 281 | ], 282 | "ppl": 58.55804131999474 283 | }, 284 | { 285 | "tokens": [ 286 | { 287 | "token": "落", 288 | "prob": 0.14436914026737213 289 | }, 290 | { 291 | "token": "霞", 292 | "prob": 0.4585109353065491 293 | }, 294 | { 295 | "token": "与", 296 | "prob": 0.8038789629936218 297 | }, 298 | { 299 | "token": "孤", 300 | "prob": 0.9961937665939331 301 | }, 302 | { 303 | "token": "鹜", 304 | "prob": 0.7430610060691833 305 | }, 306 | { 307 | "token": "齐", 308 | "prob": 0.5758452415466309 309 | }, 310 | { 311 | "token": "飞", 312 | "prob": 0.8611886501312256 313 | }, 314 | { 315 | "token": ",", 316 | "prob": 0.8926917314529419 317 | }, 318 | { 319 | "token": "秋", 320 | "prob": 0.26929906010627747 321 | }, 322 | { 323 | "token": "水", 324 | "prob": 0.29808318614959717 325 | }, 326 | { 327 | "token": "共", 328 | "prob": 0.16475750505924225 329 | }, 330 | { 331 | "token": "长", 332 | "prob": 0.04301689937710762 333 | }, 334 | { 335 | "token": "天", 336 | "prob": 0.8850820660591125 337 | }, 338 | { 339 | "token": "一", 340 | "prob": 0.675239622592926 341 | }, 342 | { 343 | "token": "色", 344 | "prob": 5.158061867405195e-06 345 | } 346 | ], 347 | "ppl": 5.023060982907458 348 | }, 349 | { 350 | "tokens": [ 351 | { 352 | "token": "落", 353 | "prob": 0.1483132392168045 354 | }, 355 | { 356 | "token": "霞", 357 | "prob": 0.42232587933540344 358 | }, 359 | { 360 | "token": "与", 361 | "prob": 0.8615185022354126 362 | }, 363 | { 364 | "token": "孤", 365 | "prob": 0.9975666999816895 366 | }, 367 | { 368 | "token": "鹜", 369 | "prob": 0.5613960027694702 370 | }, 371 | { 372 | "token": "齐", 373 | "prob": 0.18012434244155884 374 | }, 375 | { 376 | "token": "跑", 377 | "prob": 1.3388593288254924e-05 378 | }, 379 | { 380 | "token": ",", 381 | "prob": 0.8621458411216736 382 | }, 383 | { 384 | "token": "秋", 385 | "prob": 0.24820539355278015 386 | }, 387 | { 388 | "token": "水", 389 | "prob": 0.32748720049858093 390 | }, 391 | { 392 | "token": "共", 393 | "prob": 0.2348739057779312 394 | }, 395 | { 396 | "token": "长", 397 | "prob": 0.040592435747385025 398 | }, 399 | { 400 | "token": "天", 401 | "prob": 0.9231186509132385 402 | }, 403 | { 404 | "token": "一", 405 | "prob": 0.5295999646186829 406 | }, 407 | { 408 | "token": "色", 409 | "prob": 2.7847559067595284e-06 410 | } 411 | ], 412 | "ppl": 11.983086642867598 413 | }, 414 | { 415 | "tokens": [ 416 | { 417 | "token": "落", 418 | "prob": 0.14112578332424164 419 | }, 420 | { 421 | "token": "霞", 422 | "prob": 0.5331157445907593 423 | }, 424 | { 425 | "token": "与", 426 | "prob": 0.41120263934135437 427 | }, 428 | { 429 | "token": "孤", 430 | "prob": 0.9943874478340149 431 | }, 432 | { 433 | "token": "鹜", 434 | "prob": 0.6526917219161987 435 | }, 436 | { 437 | "token": "双", 438 | "prob": 0.03388489782810211 439 | }, 440 | { 441 | 
"token": "飞", 442 | "prob": 0.4864092767238617 443 | }, 444 | { 445 | "token": ",", 446 | "prob": 0.8635039925575256 447 | }, 448 | { 449 | "token": "秋", 450 | "prob": 0.27505722641944885 451 | }, 452 | { 453 | "token": "水", 454 | "prob": 0.2975866198539734 455 | }, 456 | { 457 | "token": "共", 458 | "prob": 0.1109926775097847 459 | }, 460 | { 461 | "token": "长", 462 | "prob": 0.05403125286102295 463 | }, 464 | { 465 | "token": "天", 466 | "prob": 0.8708866238594055 467 | }, 468 | { 469 | "token": "一", 470 | "prob": 0.6170234680175781 471 | }, 472 | { 473 | "token": "色", 474 | "prob": 1.1969897968810983e-05 475 | } 476 | ], 477 | "ppl": 6.352968507499718 478 | }, 479 | { 480 | "tokens": [ 481 | { 482 | "token": "众", 483 | "prob": 0.9997541308403015 484 | }, 485 | { 486 | "token": "里", 487 | "prob": 0.9997285008430481 488 | }, 489 | { 490 | "token": "寻", 491 | "prob": 0.9988683462142944 492 | }, 493 | { 494 | "token": "他", 495 | "prob": 0.7197673916816711 496 | }, 497 | { 498 | "token": "千", 499 | "prob": 0.9998348951339722 500 | }, 501 | { 502 | "token": "百", 503 | "prob": 0.9883707761764526 504 | }, 505 | { 506 | "token": "度", 507 | "prob": 0.9988011121749878 508 | }, 509 | { 510 | "token": ",", 511 | "prob": 0.9991918206214905 512 | }, 513 | { 514 | "token": "蓦", 515 | "prob": 0.9992905855178833 516 | }, 517 | { 518 | "token": "然", 519 | "prob": 0.999387264251709 520 | }, 521 | { 522 | "token": "回", 523 | "prob": 0.9999867677688599 524 | }, 525 | { 526 | "token": "首", 527 | "prob": 0.9995602965354919 528 | }, 529 | { 530 | "token": ",", 531 | "prob": 0.9979262351989746 532 | }, 533 | { 534 | "token": "那", 535 | "prob": 0.957136869430542 536 | }, 537 | { 538 | "token": "人", 539 | "prob": 0.9965313076972961 540 | }, 541 | { 542 | "token": "却", 543 | "prob": 0.9842029809951782 544 | }, 545 | { 546 | "token": "在", 547 | "prob": 0.9936916828155518 548 | }, 549 | { 550 | "token": ",", 551 | "prob": 0.9937483072280884 552 | }, 553 | { 554 | "token": "灯", 555 | "prob": 0.9998267292976379 556 | }, 557 | { 558 | "token": "火", 559 | "prob": 0.9999244213104248 560 | }, 561 | { 562 | "token": "阑", 563 | "prob": 0.9999798536300659 564 | }, 565 | { 566 | "token": "珊", 567 | "prob": 0.9999972581863403 568 | }, 569 | { 570 | "token": "处", 571 | "prob": 1.918866399108765e-09 572 | } 573 | ], 574 | "ppl": 2.4378636592211476 575 | }, 576 | { 577 | "tokens": [ 578 | { 579 | "token": "众", 580 | "prob": 0.9995400905609131 581 | }, 582 | { 583 | "token": "里", 584 | "prob": 0.9989792108535767 585 | }, 586 | { 587 | "token": "寻", 588 | "prob": 0.9996756315231323 589 | }, 590 | { 591 | "token": "她", 592 | "prob": 0.26533517241477966 593 | }, 594 | { 595 | "token": "千", 596 | "prob": 0.9998635053634644 597 | }, 598 | { 599 | "token": "百", 600 | "prob": 0.9769723415374756 601 | }, 602 | { 603 | "token": "度", 604 | "prob": 0.997245192527771 605 | }, 606 | { 607 | "token": ",", 608 | "prob": 0.9979918003082275 609 | }, 610 | { 611 | "token": "蓦", 612 | "prob": 0.9991554021835327 613 | }, 614 | { 615 | "token": "然", 616 | "prob": 0.9993096590042114 617 | }, 618 | { 619 | "token": "回", 620 | "prob": 0.9999856948852539 621 | }, 622 | { 623 | "token": "首", 624 | "prob": 0.9995132684707642 625 | }, 626 | { 627 | "token": ",", 628 | "prob": 0.9980834722518921 629 | }, 630 | { 631 | "token": "那", 632 | "prob": 0.969828724861145 633 | }, 634 | { 635 | "token": "人", 636 | "prob": 0.9686450958251953 637 | }, 638 | { 639 | "token": "却", 640 | "prob": 0.9804452061653137 641 | }, 642 | { 643 | "token": "在", 644 | "prob": 
0.9953302145004272 645 | }, 646 | { 647 | "token": ",", 648 | "prob": 0.9902727007865906 649 | }, 650 | { 651 | "token": "灯", 652 | "prob": 0.999835729598999 653 | }, 654 | { 655 | "token": "火", 656 | "prob": 0.9999294281005859 657 | }, 658 | { 659 | "token": "阑", 660 | "prob": 0.9999833106994629 661 | }, 662 | { 663 | "token": "珊", 664 | "prob": 0.9999969005584717 665 | }, 666 | { 667 | "token": "处", 668 | "prob": 2.7301996219364355e-09 669 | } 670 | ], 671 | "ppl": 2.511098534866279 672 | }, 673 | { 674 | "tokens": [ 675 | { 676 | "token": "众", 677 | "prob": 0.9973291158676147 678 | }, 679 | { 680 | "token": "里", 681 | "prob": 0.9993185997009277 682 | }, 683 | { 684 | "token": "寻", 685 | "prob": 0.9946662187576294 686 | }, 687 | { 688 | "token": "ta", 689 | "prob": 9.90712862858345e-08 690 | }, 691 | { 692 | "token": "千", 693 | "prob": 0.999840259552002 694 | }, 695 | { 696 | "token": "百", 697 | "prob": 0.9573060870170593 698 | }, 699 | { 700 | "token": "度", 701 | "prob": 0.9984676241874695 702 | }, 703 | { 704 | "token": ",", 705 | "prob": 0.998295247554779 706 | }, 707 | { 708 | "token": "蓦", 709 | "prob": 0.9991011619567871 710 | }, 711 | { 712 | "token": "然", 713 | "prob": 0.9991719722747803 714 | }, 715 | { 716 | "token": "回", 717 | "prob": 0.9999833106994629 718 | }, 719 | { 720 | "token": "首", 721 | "prob": 0.999624490737915 722 | }, 723 | { 724 | "token": ",", 725 | "prob": 0.9986977577209473 726 | }, 727 | { 728 | "token": "那", 729 | "prob": 0.9325712323188782 730 | }, 731 | { 732 | "token": "人", 733 | "prob": 0.9955593347549438 734 | }, 735 | { 736 | "token": "却", 737 | "prob": 0.9584773182868958 738 | }, 739 | { 740 | "token": "在", 741 | "prob": 0.9958037734031677 742 | }, 743 | { 744 | "token": ",", 745 | "prob": 0.9910857081413269 746 | }, 747 | { 748 | "token": "灯", 749 | "prob": 0.9998859167098999 750 | }, 751 | { 752 | "token": "火", 753 | "prob": 0.9999510049819946 754 | }, 755 | { 756 | "token": "阑", 757 | "prob": 0.9999864101409912 758 | }, 759 | { 760 | "token": "珊", 761 | "prob": 0.9999982118606567 762 | }, 763 | { 764 | "token": "处", 765 | "prob": 1.1214358330846608e-09 766 | } 767 | ], 768 | "ppl": 4.980078099549084 769 | } -------------------------------------------------------------------------------- /data/lm/test.en.tsv: -------------------------------------------------------------------------------- 1 | there is a book on the desk 2 | there is a plane on the desk 3 | there is a book in the desk 4 | 5 | -------------------------------------------------------------------------------- /data/lm/test.zh.tsv: -------------------------------------------------------------------------------- 1 | 2016全国高考卷答题模板 2 | 2016全国大考卷答题模板 3 | 2016全国低考卷答题模板 4 | 床前明月光,疑是地上霜 5 | 床前星星光,疑是地上霜 6 | 床前白月光,疑是地上霜 7 | 落霞与孤鹜齐飞,秋水共长天一色 8 | 落霞与孤鹜齐跑,秋水共长天一色 9 | 落霞与孤鹜双飞,秋水共长天一色 10 | 众里寻他千百度,蓦然回首,那人却在,灯火阑珊处 11 | 众里寻她千百度,蓦然回首,那人却在,灯火阑珊处 12 | 众里寻ta千百度,蓦然回首,那人却在,灯火阑珊处 -------------------------------------------------------------------------------- /extract_features.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Extract pre-computed feature vectors from BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import codecs 22 | import collections 23 | import json 24 | import re 25 | 26 | import modeling 27 | import tokenization 28 | import tensorflow as tf 29 | 30 | flags = tf.flags 31 | 32 | FLAGS = flags.FLAGS 33 | 34 | flags.DEFINE_string("input_file", None, "") 35 | 36 | flags.DEFINE_string("output_file", None, "") 37 | 38 | flags.DEFINE_string("layers", "-1,-2,-3,-4", "") 39 | 40 | flags.DEFINE_string( 41 | "bert_config_file", None, 42 | "The config json file corresponding to the pre-trained BERT model. " 43 | "This specifies the model architecture.") 44 | 45 | flags.DEFINE_integer( 46 | "max_seq_length", 128, 47 | "The maximum total input sequence length after WordPiece tokenization. " 48 | "Sequences longer than this will be truncated, and sequences shorter " 49 | "than this will be padded.") 50 | 51 | flags.DEFINE_string( 52 | "init_checkpoint", None, 53 | "Initial checkpoint (usually from a pre-trained BERT model).") 54 | 55 | flags.DEFINE_string("vocab_file", None, 56 | "The vocabulary file that the BERT model was trained on.") 57 | 58 | flags.DEFINE_bool( 59 | "do_lower_case", True, 60 | "Whether to lower case the input text. Should be True for uncased " 61 | "models and False for cased models.") 62 | 63 | flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.") 64 | 65 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 66 | 67 | flags.DEFINE_string("master", None, 68 | "If using a TPU, the address of the master.") 69 | 70 | flags.DEFINE_integer( 71 | "num_tpu_cores", 8, 72 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 73 | 74 | flags.DEFINE_bool( 75 | "use_one_hot_embeddings", False, 76 | "If True, tf.one_hot will be used for embedding lookups, otherwise " 77 | "tf.nn.embedding_lookup will be used. 
On TPUs, this should be True " 78 | "since it is much faster.") 79 | 80 | 81 | class InputExample(object): 82 | 83 | def __init__(self, unique_id, text_a, text_b): 84 | self.unique_id = unique_id 85 | self.text_a = text_a 86 | self.text_b = text_b 87 | 88 | 89 | class InputFeatures(object): 90 | """A single set of features of data.""" 91 | 92 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 93 | self.unique_id = unique_id 94 | self.tokens = tokens 95 | self.input_ids = input_ids 96 | self.input_mask = input_mask 97 | self.input_type_ids = input_type_ids 98 | 99 | 100 | def input_fn_builder(features, seq_length): 101 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 102 | 103 | all_unique_ids = [] 104 | all_input_ids = [] 105 | all_input_mask = [] 106 | all_input_type_ids = [] 107 | 108 | for feature in features: 109 | all_unique_ids.append(feature.unique_id) 110 | all_input_ids.append(feature.input_ids) 111 | all_input_mask.append(feature.input_mask) 112 | all_input_type_ids.append(feature.input_type_ids) 113 | 114 | def input_fn(params): 115 | """The actual input function.""" 116 | batch_size = params["batch_size"] 117 | 118 | num_examples = len(features) 119 | 120 | # This is for demo purposes and does NOT scale to large data sets. We do 121 | # not use Dataset.from_generator() because that uses tf.py_func which is 122 | # not TPU compatible. The right way to load data is with TFRecordReader. 123 | d = tf.data.Dataset.from_tensor_slices({ 124 | "unique_ids": 125 | tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), 126 | "input_ids": 127 | tf.constant( 128 | all_input_ids, shape=[num_examples, seq_length], 129 | dtype=tf.int32), 130 | "input_mask": 131 | tf.constant( 132 | all_input_mask, 133 | shape=[num_examples, seq_length], 134 | dtype=tf.int32), 135 | "input_type_ids": 136 | tf.constant( 137 | all_input_type_ids, 138 | shape=[num_examples, seq_length], 139 | dtype=tf.int32), 140 | }) 141 | 142 | d = d.batch(batch_size=batch_size, drop_remainder=False) 143 | return d 144 | 145 | return input_fn 146 | 147 | 148 | def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu, 149 | use_one_hot_embeddings): 150 | """Returns `model_fn` closure for TPUEstimator.""" 151 | 152 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 153 | """The `model_fn` for TPUEstimator.""" 154 | 155 | unique_ids = features["unique_ids"] 156 | input_ids = features["input_ids"] 157 | input_mask = features["input_mask"] 158 | input_type_ids = features["input_type_ids"] 159 | 160 | model = modeling.BertModel( 161 | config=bert_config, 162 | is_training=False, 163 | input_ids=input_ids, 164 | input_mask=input_mask, 165 | token_type_ids=input_type_ids, 166 | use_one_hot_embeddings=use_one_hot_embeddings) 167 | 168 | if mode != tf.estimator.ModeKeys.PREDICT: 169 | raise ValueError("Only PREDICT modes are supported: %s" % (mode)) 170 | 171 | tvars = tf.trainable_variables() 172 | scaffold_fn = None 173 | (assignment_map, 174 | initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( 175 | tvars, init_checkpoint) 176 | if use_tpu: 177 | 178 | def tpu_scaffold(): 179 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 180 | return tf.train.Scaffold() 181 | 182 | scaffold_fn = tpu_scaffold 183 | else: 184 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 185 | 186 | tf.logging.info("**** Trainable Variables ****") 187 | for var in tvars: 188 | init_string = "" 189 | if 
var.name in initialized_variable_names: 190 | init_string = ", *INIT_FROM_CKPT*" 191 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 192 | init_string) 193 | 194 | all_layers = model.get_all_encoder_layers() 195 | 196 | predictions = { 197 | "unique_id": unique_ids, 198 | } 199 | 200 | for (i, layer_index) in enumerate(layer_indexes): 201 | predictions["layer_output_%d" % i] = all_layers[layer_index] 202 | 203 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 204 | mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) 205 | return output_spec 206 | 207 | return model_fn 208 | 209 | 210 | def convert_examples_to_features(examples, seq_length, tokenizer): 211 | """Loads a data file into a list of `InputBatch`s.""" 212 | 213 | features = [] 214 | for (ex_index, example) in enumerate(examples): 215 | tokens_a = tokenizer.tokenize(example.text_a) 216 | 217 | tokens_b = None 218 | if example.text_b: 219 | tokens_b = tokenizer.tokenize(example.text_b) 220 | 221 | if tokens_b: 222 | # Modifies `tokens_a` and `tokens_b` in place so that the total 223 | # length is less than the specified length. 224 | # Account for [CLS], [SEP], [SEP] with "- 3" 225 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 226 | else: 227 | # Account for [CLS] and [SEP] with "- 2" 228 | if len(tokens_a) > seq_length - 2: 229 | tokens_a = tokens_a[0:(seq_length - 2)] 230 | 231 | # The convention in BERT is: 232 | # (a) For sequence pairs: 233 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 234 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 235 | # (b) For single sequences: 236 | # tokens: [CLS] the dog is hairy . [SEP] 237 | # type_ids: 0 0 0 0 0 0 0 238 | # 239 | # Where "type_ids" are used to indicate whether this is the first 240 | # sequence or the second sequence. The embedding vectors for `type=0` and 241 | # `type=1` were learned during pre-training and are added to the wordpiece 242 | # embedding vector (and position vector). This is not *strictly* necessary 243 | # since the [SEP] token unambiguously separates the sequences, but it makes 244 | # it easier for the model to learn the concept of sequences. 245 | # 246 | # For classification tasks, the first vector (corresponding to [CLS]) is 247 | # used as as the "sentence vector". Note that this only makes sense because 248 | # the entire model is fine-tuned. 249 | tokens = [] 250 | input_type_ids = [] 251 | tokens.append("[CLS]") 252 | input_type_ids.append(0) 253 | for token in tokens_a: 254 | tokens.append(token) 255 | input_type_ids.append(0) 256 | tokens.append("[SEP]") 257 | input_type_ids.append(0) 258 | 259 | if tokens_b: 260 | for token in tokens_b: 261 | tokens.append(token) 262 | input_type_ids.append(1) 263 | tokens.append("[SEP]") 264 | input_type_ids.append(1) 265 | 266 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 267 | 268 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 269 | # tokens are attended to. 270 | input_mask = [1] * len(input_ids) 271 | 272 | # Zero-pad up to the sequence length. 
273 | while len(input_ids) < seq_length: 274 | input_ids.append(0) 275 | input_mask.append(0) 276 | input_type_ids.append(0) 277 | 278 | assert len(input_ids) == seq_length 279 | assert len(input_mask) == seq_length 280 | assert len(input_type_ids) == seq_length 281 | 282 | if ex_index < 5: 283 | tf.logging.info("*** Example ***") 284 | tf.logging.info("unique_id: %s" % (example.unique_id)) 285 | tf.logging.info("tokens: %s" % " ".join( 286 | [tokenization.printable_text(x) for x in tokens])) 287 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 288 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 289 | tf.logging.info( 290 | "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) 291 | 292 | features.append( 293 | InputFeatures( 294 | unique_id=example.unique_id, 295 | tokens=tokens, 296 | input_ids=input_ids, 297 | input_mask=input_mask, 298 | input_type_ids=input_type_ids)) 299 | return features 300 | 301 | 302 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 303 | """Truncates a sequence pair in place to the maximum length.""" 304 | 305 | # This is a simple heuristic which will always truncate the longer sequence 306 | # one token at a time. This makes more sense than truncating an equal percent 307 | # of tokens from each, since if one sequence is very short then each token 308 | # that's truncated likely contains more information than a longer sequence. 309 | while True: 310 | total_length = len(tokens_a) + len(tokens_b) 311 | if total_length <= max_length: 312 | break 313 | if len(tokens_a) > len(tokens_b): 314 | tokens_a.pop() 315 | else: 316 | tokens_b.pop() 317 | 318 | 319 | def read_examples(input_file): 320 | """Read a list of `InputExample`s from an input file.""" 321 | examples = [] 322 | unique_id = 0 323 | with tf.gfile.GFile(input_file, "r") as reader: 324 | while True: 325 | line = tokenization.convert_to_unicode(reader.readline()) 326 | if not line: 327 | break 328 | line = line.strip() 329 | text_a = None 330 | text_b = None 331 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 332 | if m is None: 333 | text_a = line 334 | else: 335 | text_a = m.group(1) 336 | text_b = m.group(2) 337 | examples.append( 338 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 339 | unique_id += 1 340 | return examples 341 | 342 | 343 | def main(_): 344 | tf.logging.set_verbosity(tf.logging.INFO) 345 | 346 | layer_indexes = [int(x) for x in FLAGS.layers.split(",")] 347 | 348 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 349 | 350 | tokenizer = tokenization.FullTokenizer( 351 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 352 | 353 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 354 | run_config = tf.contrib.tpu.RunConfig( 355 | master=FLAGS.master, 356 | tpu_config=tf.contrib.tpu.TPUConfig( 357 | num_shards=FLAGS.num_tpu_cores, 358 | per_host_input_for_training=is_per_host)) 359 | 360 | examples = read_examples(FLAGS.input_file) 361 | 362 | features = convert_examples_to_features( 363 | examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) 364 | 365 | unique_id_to_feature = {} 366 | for feature in features: 367 | unique_id_to_feature[feature.unique_id] = feature 368 | 369 | model_fn = model_fn_builder( 370 | bert_config=bert_config, 371 | init_checkpoint=FLAGS.init_checkpoint, 372 | layer_indexes=layer_indexes, 373 | use_tpu=FLAGS.use_tpu, 374 | use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) 375 | 376 | # If TPU is not 
available, this will fall back to normal Estimator on CPU 377 | # or GPU. 378 | estimator = tf.contrib.tpu.TPUEstimator( 379 | use_tpu=FLAGS.use_tpu, 380 | model_fn=model_fn, 381 | config=run_config, 382 | predict_batch_size=FLAGS.batch_size) 383 | 384 | input_fn = input_fn_builder( 385 | features=features, seq_length=FLAGS.max_seq_length) 386 | 387 | with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, 388 | "w")) as writer: 389 | for result in estimator.predict(input_fn, yield_single_examples=True): 390 | unique_id = int(result["unique_id"]) 391 | feature = unique_id_to_feature[unique_id] 392 | output_json = collections.OrderedDict() 393 | output_json["linex_index"] = unique_id 394 | all_features = [] 395 | for (i, token) in enumerate(feature.tokens): 396 | all_layers = [] 397 | for (j, layer_index) in enumerate(layer_indexes): 398 | layer_output = result["layer_output_%d" % j] 399 | layers = collections.OrderedDict() 400 | layers["index"] = layer_index 401 | layers["values"] = [ 402 | round(float(x), 6) for x in layer_output[i:(i + 1)].flat 403 | ] 404 | all_layers.append(layers) 405 | features = collections.OrderedDict() 406 | features["token"] = token 407 | features["layers"] = all_layers 408 | all_features.append(features) 409 | output_json["features"] = all_features 410 | writer.write(json.dumps(output_json) + "\n") 411 | 412 | 413 | if __name__ == "__main__": 414 | flags.mark_flag_as_required("input_file") 415 | flags.mark_flag_as_required("vocab_file") 416 | flags.mark_flag_as_required("bert_config_file") 417 | flags.mark_flag_as_required("init_checkpoint") 418 | flags.mark_flag_as_required("output_file") 419 | tf.app.run() 420 | -------------------------------------------------------------------------------- /modeling_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
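# Tests for modeling.py: these build a small BertModel, check the shapes of its
# outputs, round-trip BertConfig through JSON, and verify that every op in the
# graph is reachable from the model outputs.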
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import collections 20 | import json 21 | import random 22 | import re 23 | 24 | import modeling 25 | import six 26 | import tensorflow as tf 27 | 28 | 29 | class BertModelTest(tf.test.TestCase): 30 | 31 | class BertModelTester(object): 32 | 33 | def __init__(self, 34 | parent, 35 | batch_size=13, 36 | seq_length=7, 37 | is_training=True, 38 | use_input_mask=True, 39 | use_token_type_ids=True, 40 | vocab_size=99, 41 | hidden_size=32, 42 | num_hidden_layers=5, 43 | num_attention_heads=4, 44 | intermediate_size=37, 45 | hidden_act="gelu", 46 | hidden_dropout_prob=0.1, 47 | attention_probs_dropout_prob=0.1, 48 | max_position_embeddings=512, 49 | type_vocab_size=16, 50 | initializer_range=0.02, 51 | scope=None): 52 | self.parent = parent 53 | self.batch_size = batch_size 54 | self.seq_length = seq_length 55 | self.is_training = is_training 56 | self.use_input_mask = use_input_mask 57 | self.use_token_type_ids = use_token_type_ids 58 | self.vocab_size = vocab_size 59 | self.hidden_size = hidden_size 60 | self.num_hidden_layers = num_hidden_layers 61 | self.num_attention_heads = num_attention_heads 62 | self.intermediate_size = intermediate_size 63 | self.hidden_act = hidden_act 64 | self.hidden_dropout_prob = hidden_dropout_prob 65 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 66 | self.max_position_embeddings = max_position_embeddings 67 | self.type_vocab_size = type_vocab_size 68 | self.initializer_range = initializer_range 69 | self.scope = scope 70 | 71 | def create_model(self): 72 | input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], 73 | self.vocab_size) 74 | 75 | input_mask = None 76 | if self.use_input_mask: 77 | input_mask = BertModelTest.ids_tensor( 78 | [self.batch_size, self.seq_length], vocab_size=2) 79 | 80 | token_type_ids = None 81 | if self.use_token_type_ids: 82 | token_type_ids = BertModelTest.ids_tensor( 83 | [self.batch_size, self.seq_length], self.type_vocab_size) 84 | 85 | config = modeling.BertConfig( 86 | vocab_size=self.vocab_size, 87 | hidden_size=self.hidden_size, 88 | num_hidden_layers=self.num_hidden_layers, 89 | num_attention_heads=self.num_attention_heads, 90 | intermediate_size=self.intermediate_size, 91 | hidden_act=self.hidden_act, 92 | hidden_dropout_prob=self.hidden_dropout_prob, 93 | attention_probs_dropout_prob=self.attention_probs_dropout_prob, 94 | max_position_embeddings=self.max_position_embeddings, 95 | type_vocab_size=self.type_vocab_size, 96 | initializer_range=self.initializer_range) 97 | 98 | model = modeling.BertModel( 99 | config=config, 100 | is_training=self.is_training, 101 | input_ids=input_ids, 102 | input_mask=input_mask, 103 | token_type_ids=token_type_ids, 104 | scope=self.scope) 105 | 106 | outputs = { 107 | "embedding_output": model.get_embedding_output(), 108 | "sequence_output": model.get_sequence_output(), 109 | "pooled_output": model.get_pooled_output(), 110 | "all_encoder_layers": model.get_all_encoder_layers(), 111 | } 112 | return outputs 113 | 114 | def check_output(self, result): 115 | self.parent.assertAllEqual( 116 | result["embedding_output"].shape, 117 | [self.batch_size, self.seq_length, self.hidden_size]) 118 | 119 | self.parent.assertAllEqual( 120 | result["sequence_output"].shape, 121 | [self.batch_size, self.seq_length, self.hidden_size]) 122 | 123 | self.parent.assertAllEqual(result["pooled_output"].shape, 124 | [self.batch_size, 
self.hidden_size]) 125 | 126 | def test_default(self): 127 | self.run_tester(BertModelTest.BertModelTester(self)) 128 | 129 | def test_config_to_json_string(self): 130 | config = modeling.BertConfig(vocab_size=99, hidden_size=37) 131 | obj = json.loads(config.to_json_string()) 132 | self.assertEqual(obj["vocab_size"], 99) 133 | self.assertEqual(obj["hidden_size"], 37) 134 | 135 | def run_tester(self, tester): 136 | with self.test_session() as sess: 137 | ops = tester.create_model() 138 | init_op = tf.group(tf.global_variables_initializer(), 139 | tf.local_variables_initializer()) 140 | sess.run(init_op) 141 | output_result = sess.run(ops) 142 | tester.check_output(output_result) 143 | 144 | self.assert_all_tensors_reachable(sess, [init_op, ops]) 145 | 146 | @classmethod 147 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 148 | """Creates a random int32 tensor of the shape within the vocab size.""" 149 | if rng is None: 150 | rng = random.Random() 151 | 152 | total_dims = 1 153 | for dim in shape: 154 | total_dims *= dim 155 | 156 | values = [] 157 | for _ in range(total_dims): 158 | values.append(rng.randint(0, vocab_size - 1)) 159 | 160 | return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name) 161 | 162 | def assert_all_tensors_reachable(self, sess, outputs): 163 | """Checks that all the tensors in the graph are reachable from outputs.""" 164 | graph = sess.graph 165 | 166 | ignore_strings = [ 167 | "^.*/assert_less_equal/.*$", 168 | "^.*/dilation_rate$", 169 | "^.*/Tensordot/concat$", 170 | "^.*/Tensordot/concat/axis$", 171 | "^testing/.*$", 172 | ] 173 | 174 | ignore_regexes = [re.compile(x) for x in ignore_strings] 175 | 176 | unreachable = self.get_unreachable_ops(graph, outputs) 177 | filtered_unreachable = [] 178 | for x in unreachable: 179 | do_ignore = False 180 | for r in ignore_regexes: 181 | m = r.match(x.name) 182 | if m is not None: 183 | do_ignore = True 184 | if do_ignore: 185 | continue 186 | filtered_unreachable.append(x) 187 | unreachable = filtered_unreachable 188 | 189 | self.assertEqual( 190 | len(unreachable), 0, "The following ops are unreachable: %s" % 191 | (" ".join([x.name for x in unreachable]))) 192 | 193 | @classmethod 194 | def get_unreachable_ops(cls, graph, outputs): 195 | """Finds all of the tensors in graph that are unreachable from outputs.""" 196 | outputs = cls.flatten_recursive(outputs) 197 | output_to_op = collections.defaultdict(list) 198 | op_to_all = collections.defaultdict(list) 199 | assign_out_to_in = collections.defaultdict(list) 200 | 201 | for op in graph.get_operations(): 202 | for x in op.inputs: 203 | op_to_all[op.name].append(x.name) 204 | for y in op.outputs: 205 | output_to_op[y.name].append(op.name) 206 | op_to_all[op.name].append(y.name) 207 | if str(op.type) == "Assign": 208 | for y in op.outputs: 209 | for x in op.inputs: 210 | assign_out_to_in[y.name].append(x.name) 211 | 212 | assign_groups = collections.defaultdict(list) 213 | for out_name in assign_out_to_in.keys(): 214 | name_group = assign_out_to_in[out_name] 215 | for n1 in name_group: 216 | assign_groups[n1].append(out_name) 217 | for n2 in name_group: 218 | if n1 != n2: 219 | assign_groups[n1].append(n2) 220 | 221 | seen_tensors = {} 222 | stack = [x.name for x in outputs] 223 | while stack: 224 | name = stack.pop() 225 | if name in seen_tensors: 226 | continue 227 | seen_tensors[name] = True 228 | 229 | if name in output_to_op: 230 | for op_name in output_to_op[name]: 231 | if op_name in op_to_all: 232 | for input_name in 
op_to_all[op_name]: 233 | if input_name not in stack: 234 | stack.append(input_name) 235 | 236 | expanded_names = [] 237 | if name in assign_groups: 238 | for assign_name in assign_groups[name]: 239 | expanded_names.append(assign_name) 240 | 241 | for expanded_name in expanded_names: 242 | if expanded_name not in stack: 243 | stack.append(expanded_name) 244 | 245 | unreachable_ops = [] 246 | for op in graph.get_operations(): 247 | is_unreachable = False 248 | all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs] 249 | for name in all_names: 250 | if name not in seen_tensors: 251 | is_unreachable = True 252 | if is_unreachable: 253 | unreachable_ops.append(op) 254 | return unreachable_ops 255 | 256 | @classmethod 257 | def flatten_recursive(cls, item): 258 | """Flattens (potentially nested) a tuple/dictionary/list to a list.""" 259 | output = [] 260 | if isinstance(item, list): 261 | output.extend(item) 262 | elif isinstance(item, tuple): 263 | output.extend(list(item)) 264 | elif isinstance(item, dict): 265 | for (_, v) in six.iteritems(item): 266 | output.append(v) 267 | else: 268 | return [item] 269 | 270 | flat_output = [] 271 | for x in output: 272 | flat_output.extend(cls.flatten_recursive(x)) 273 | return flat_output 274 | 275 | 276 | if __name__ == "__main__": 277 | tf.test.main() 278 | -------------------------------------------------------------------------------- /multilingual.md: -------------------------------------------------------------------------------- 1 | ## Models 2 | 3 | There are two multilingual models currently available. We do not plan to release 4 | more single-language models, but we may release `BERT-Large` versions of these 5 | two in the future: 6 | 7 | * **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**: 8 | 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters 9 | * **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**: 10 | 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters 11 | * **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**: 12 | Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M 13 | parameters 14 | 15 | **The `Multilingual Cased (New)` model also fixes normalization issues in many 16 | languages, so it is recommended in languages with non-Latin alphabets (and is 17 | often better for most languages with Latin alphabets). When using this model, 18 | make sure to pass `--do_lower_case=false` to `run_pretraining.py` and other 19 | scripts.** 20 | 21 | See the [list of languages](#list-of-languages) that the Multilingual model 22 | supports. The Multilingual model does include Chinese (and English), but if your 23 | fine-tuning data is Chinese-only, then the Chinese model will likely produce 24 | better results. 25 | 26 | ## Results 27 | 28 | To evaluate these systems, we use the 29 | [XNLI dataset](https://github.com/facebookresearch/XNLI) dataset, which is a 30 | version of [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) where the 31 | dev and test sets have been translated (by humans) into 15 languages. Note that 32 | the training set was *machine* translated (we used the translations provided by 33 | XNLI, not Google NMT). 
For clarity, we only report on 6 languages below: 34 | 35 | 36 | 37 | | System | English | Chinese | Spanish | German | Arabic | Urdu | 38 | | --------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- | 39 | | XNLI Baseline - Translate Train | 73.7 | 67.0 | 68.8 | 66.5 | 65.8 | 56.6 | 40 | | XNLI Baseline - Translate Test | 73.7 | 68.3 | 70.7 | 68.7 | 66.8 | 59.3 | 41 | | BERT - Translate Train Cased | **81.9** | **76.6** | **77.8** | **75.9** | **70.7** | 61.6 | 42 | | BERT - Translate Train Uncased | 81.4 | 74.2 | 77.3 | 75.2 | 70.5 | 61.7 | 43 | | BERT - Translate Test Uncased | 81.4 | 70.1 | 74.9 | 74.4 | 70.4 | **62.1** | 44 | | BERT - Zero Shot Uncased | 81.4 | 63.8 | 74.3 | 70.5 | 62.1 | 58.3 | 45 | 46 | 47 | 48 | The first two rows are baselines from the XNLI paper and the last three rows are 49 | our results with BERT. 50 | 51 | **Translate Train** means that the MultiNLI training set was machine translated 52 | from English into the foreign language. So training and evaluation were both 53 | done in the foreign language. Unfortunately, training was done on 54 | machine-translated data, so it is impossible to quantify how much of the lower 55 | accuracy (compared to English) is due to the quality of the machine translation 56 | vs. the quality of the pre-trained model. 57 | 58 | **Translate Test** means that the XNLI test set was machine translated from the 59 | foreign language into English. So training and evaluation were both done on 60 | English. However, test evaluation was done on machine-translated English, so the 61 | accuracy depends on the quality of the machine translation system. 62 | 63 | **Zero Shot** means that the Multilingual BERT system was fine-tuned on English 64 | MultiNLI, and then evaluated on the foreign language XNLI test. In this case, 65 | machine translation was not involved at all in either the pre-training or 66 | fine-tuning. 67 | 68 | Note that the English result is worse than the 84.2 MultiNLI baseline because 69 | this training used Multilingual BERT rather than English-only BERT. This implies 70 | that for high-resource languages, the Multilingual model is somewhat worse than 71 | a single-language model. However, it is not feasible for us to train and 72 | maintain dozens of single-language model. Therefore, if your goal is to maximize 73 | performance with a language other than English or Chinese, you might find it 74 | beneficial to run pre-training for additional steps starting from our 75 | Multilingual model on data from your language of interest. 76 | 77 | Here is a comparison of training Chinese models with the Multilingual 78 | `BERT-Base` and Chinese-only `BERT-Base`: 79 | 80 | System | Chinese 81 | ----------------------- | ------- 82 | XNLI Baseline | 67.0 83 | BERT Multilingual Model | 74.2 84 | BERT Chinese-only Model | 77.2 85 | 86 | Similar to English, the single-language model does 3% better than the 87 | Multilingual model. 88 | 89 | ## Fine-tuning Example 90 | 91 | The multilingual model does **not** require any special consideration or API 92 | changes. We did update the implementation of `BasicTokenizer` in 93 | `tokenization.py` to support Chinese character tokenization, so please update if 94 | you forked it. However, we did not change the tokenization API. 95 | 96 | To test the new models, we did modify `run_classifier.py` to add support for the 97 | [XNLI dataset](https://github.com/facebookresearch/XNLI). 
This is a 15-language 98 | version of MultiNLI where the dev/test sets have been human-translated, and the 99 | training set has been machine-translated. 100 | 101 | To run the fine-tuning code, please download the 102 | [XNLI dev/test set](https://s3.amazonaws.com/xnli/XNLI-1.0.zip) and the 103 | [XNLI machine-translated training set](https://s3.amazonaws.com/xnli/XNLI-MT-1.0.zip) 104 | and then unpack both .zip files into some directory `$XNLI_DIR`. 105 | 106 | To run fine-tuning on XNLI, note that the language is hard-coded into `run_classifier.py` 107 | (Chinese by default), so please modify `XnliProcessor` if you want to run on 108 | another language. 109 | 110 | This is a large dataset, so training will take a few hours on a GPU 111 | (or about 30 minutes on a Cloud TPU). To run an experiment quickly for 112 | debugging, just set `num_train_epochs` to a small value like `0.1`. 113 | 114 | ```shell 115 | export BERT_BASE_DIR=/path/to/bert/chinese_L-12_H-768_A-12 # or multilingual_L-12_H-768_A-12 116 | export XNLI_DIR=/path/to/xnli 117 | 118 | python run_classifier.py \ 119 | --task_name=XNLI \ 120 | --do_train=true \ 121 | --do_eval=true \ 122 | --data_dir=$XNLI_DIR \ 123 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 124 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 125 | --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ 126 | --max_seq_length=128 \ 127 | --train_batch_size=32 \ 128 | --learning_rate=5e-5 \ 129 | --num_train_epochs=2.0 \ 130 | --output_dir=/tmp/xnli_output/ 131 | ``` 132 | 133 | With the Chinese-only model, the results should look something like this: 134 | 135 | ``` 136 | ***** Eval results ***** 137 | eval_accuracy = 0.774116 138 | eval_loss = 0.83554 139 | global_step = 24543 140 | loss = 0.74603 141 | ``` 142 | 143 | ## Details 144 | 145 | ### Data Source and Sampling 146 | 147 | The languages chosen were the 148 | [top 100 languages with the largest Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias). 149 | The entire Wikipedia dump for each language (excluding user and talk pages) was 150 | taken as the training data for each language. 151 | 152 | However, the size of the Wikipedia for a given language varies greatly, and 153 | therefore low-resource languages may be "under-represented" in terms of the 154 | neural network model (under the assumption that languages are "competing" for 155 | limited model capacity to some extent). 156 | 157 | However, the size of a Wikipedia also correlates with the number of speakers of 158 | a language, and we also don't want to overfit the model by performing thousands 159 | of epochs over a tiny Wikipedia for a particular language. 160 | 161 | To balance these two factors, we performed exponentially smoothed weighting of 162 | the data during pre-training data creation (and WordPiece vocab creation). In 163 | other words, let's say that the probability of a language is *P(L)*, e.g., 164 | *P(English) = 0.21* means that after concatenating all of the Wikipedias 165 | together, 21% of our data is English. We exponentiate each probability by some 166 | factor *S* and then re-normalize, and sample from that distribution. In our case 167 | we use *S=0.7*. So, high-resource languages like English will be under-sampled, 168 | and low-resource languages like Icelandic will be over-sampled. E.g., in the 169 | original distribution English would be sampled 1000x more than Icelandic, but 170 | after smoothing it's only sampled 100x more.
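As a concrete illustration of that smoothing step, here is a minimal Python sketch (not code from this repository; the function name and the per-language sizes are made up for the example):

```python
def smoothed_sampling_probs(language_sizes, s=0.7):
  """Turns raw per-language corpus sizes into smoothed sampling probabilities."""
  total = float(sum(language_sizes.values()))
  # Original distribution P(L): each language's share of the concatenated corpus.
  probs = {lang: size / total for lang, size in language_sizes.items()}
  # Exponentiate by S and re-normalize so the probabilities sum to 1 again.
  smoothed = {lang: p ** s for lang, p in probs.items()}
  norm = sum(smoothed.values())
  return {lang: p / norm for lang, p in smoothed.items()}

# With these made-up sizes, English goes from being sampled ~1000x more often
# than Icelandic to only roughly 100x more often after smoothing with S=0.7.
print(smoothed_sampling_probs({"english": 1000.0, "icelandic": 1.0}))
```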
171 | 172 | ### Tokenization 173 | 174 | For tokenization, we use a 110k shared WordPiece vocabulary. The word counts are 175 | weighted the same way as the data, so low-resource languages are upweighted by 176 | some factor. We intentionally do *not* use any marker to denote the input 177 | language (so that zero-shot training can work). 178 | 179 | Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace 180 | characters, we add spaces around every character in the 181 | [CJK Unicode range](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_\(Unicode_block\)) 182 | before applying WordPiece. This means that Chinese is effectively 183 | character-tokenized. Note that the CJK Unicode block only includes 184 | Chinese-origin characters and does *not* include Hangul Korean or 185 | Katakana/Hiragana Japanese, which are tokenized with whitespace+WordPiece like 186 | all other languages. 187 | 188 | For all other languages, we apply the 189 | [same recipe as English](https://github.com/google-research/bert#tokenization): 190 | (a) lower casing+accent removal, (b) punctuation splitting, (c) whitespace 191 | tokenization. We understand that accent markers have substantial meaning in some 192 | languages, but felt that the benefits of reducing the effective vocabulary make 193 | up for this. Generally the strong contextual models of BERT should make up for 194 | any ambiguity introduced by stripping accent markers. 195 | 196 | ### List of Languages 197 | 198 | The multilingual model supports the following languages. These languages were 199 | chosen because they are the top 100 languages with the largest Wikipedias: 200 | 201 | * Afrikaans 202 | * Albanian 203 | * Arabic 204 | * Aragonese 205 | * Armenian 206 | * Asturian 207 | * Azerbaijani 208 | * Bashkir 209 | * Basque 210 | * Bavarian 211 | * Belarusian 212 | * Bengali 213 | * Bishnupriya Manipuri 214 | * Bosnian 215 | * Breton 216 | * Bulgarian 217 | * Burmese 218 | * Catalan 219 | * Cebuano 220 | * Chechen 221 | * Chinese (Simplified) 222 | * Chinese (Traditional) 223 | * Chuvash 224 | * Croatian 225 | * Czech 226 | * Danish 227 | * Dutch 228 | * English 229 | * Estonian 230 | * Finnish 231 | * French 232 | * Galician 233 | * Georgian 234 | * German 235 | * Greek 236 | * Gujarati 237 | * Haitian 238 | * Hebrew 239 | * Hindi 240 | * Hungarian 241 | * Icelandic 242 | * Ido 243 | * Indonesian 244 | * Irish 245 | * Italian 246 | * Japanese 247 | * Javanese 248 | * Kannada 249 | * Kazakh 250 | * Kirghiz 251 | * Korean 252 | * Latin 253 | * Latvian 254 | * Lithuanian 255 | * Lombard 256 | * Low Saxon 257 | * Luxembourgish 258 | * Macedonian 259 | * Malagasy 260 | * Malay 261 | * Malayalam 262 | * Marathi 263 | * Minangkabau 264 | * Nepali 265 | * Newar 266 | * Norwegian (Bokmal) 267 | * Norwegian (Nynorsk) 268 | * Occitan 269 | * Persian (Farsi) 270 | * Piedmontese 271 | * Polish 272 | * Portuguese 273 | * Punjabi 274 | * Romanian 275 | * Russian 276 | * Scots 277 | * Serbian 278 | * Serbo-Croatian 279 | * Sicilian 280 | * Slovak 281 | * Slovenian 282 | * South Azerbaijani 283 | * Spanish 284 | * Sundanese 285 | * Swahili 286 | * Swedish 287 | * Tagalog 288 | * Tajik 289 | * Tamil 290 | * Tatar 291 | * Telugu 292 | * Turkish 293 | * Ukrainian 294 | * Urdu 295 | * Uzbek 296 | * Vietnamese 297 | * Volapük 298 | * Waray-Waray 299 | * Welsh 300 | * West 301 | * Western Punjabi 302 | * Yoruba 303 | 304 | The **Multilingual Cased (New)** release contains additionally **Thai** and 305 | **Mongolian**, which were not included in 
the original release. 306 | -------------------------------------------------------------------------------- /optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 
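# clip_by_global_norm below rescales all gradients together so that their joint
# L2 norm never exceeds clip_norm (1.0 here); gradients are left unchanged when
# the global norm is already smaller than that threshold.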
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | new_global_step = global_step + 1 80 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 81 | return train_op 82 | 83 | 84 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 85 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 86 | 87 | def __init__(self, 88 | learning_rate, 89 | weight_decay_rate=0.0, 90 | beta_1=0.9, 91 | beta_2=0.999, 92 | epsilon=1e-6, 93 | exclude_from_weight_decay=None, 94 | name="AdamWeightDecayOptimizer"): 95 | """Constructs a AdamWeightDecayOptimizer.""" 96 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 97 | 98 | self.learning_rate = learning_rate 99 | self.weight_decay_rate = weight_decay_rate 100 | self.beta_1 = beta_1 101 | self.beta_2 = beta_2 102 | self.epsilon = epsilon 103 | self.exclude_from_weight_decay = exclude_from_weight_decay 104 | 105 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 106 | """See base class.""" 107 | assignments = [] 108 | for (grad, param) in grads_and_vars: 109 | if grad is None or param is None: 110 | continue 111 | 112 | param_name = self._get_variable_name(param.name) 113 | 114 | m = tf.get_variable( 115 | name=param_name + "/adam_m", 116 | shape=param.shape.as_list(), 117 | dtype=tf.float32, 118 | trainable=False, 119 | initializer=tf.zeros_initializer()) 120 | v = tf.get_variable( 121 | name=param_name + "/adam_v", 122 | shape=param.shape.as_list(), 123 | dtype=tf.float32, 124 | trainable=False, 125 | initializer=tf.zeros_initializer()) 126 | 127 | # Standard Adam update. 128 | next_m = ( 129 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 130 | next_v = ( 131 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 132 | tf.square(grad))) 133 | 134 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 135 | 136 | # Just adding the square of the weights to the loss function is *not* 137 | # the correct way of using L2 regularization/weight decay with Adam, 138 | # since that will interact with the m and v parameters in strange ways. 139 | # 140 | # Instead we want ot decay the weights in a manner that doesn't interact 141 | # with the m/v parameters. This is equivalent to adding the square 142 | # of the weights to the loss with plain (non-momentum) SGD. 
143 | if self._do_use_weight_decay(param_name): 144 | update += self.weight_decay_rate * param 145 | 146 | update_with_lr = self.learning_rate * update 147 | 148 | next_param = param - update_with_lr 149 | 150 | assignments.extend( 151 | [param.assign(next_param), 152 | m.assign(next_m), 153 | v.assign(next_v)]) 154 | return tf.group(*assignments, name=name) 155 | 156 | def _do_use_weight_decay(self, param_name): 157 | """Whether to use L2 weight decay for `param_name`.""" 158 | if not self.weight_decay_rate: 159 | return False 160 | if self.exclude_from_weight_decay: 161 | for r in self.exclude_from_weight_decay: 162 | if re.search(r, param_name) is not None: 163 | return False 164 | return True 165 | 166 | def _get_variable_name(self, param_name): 167 | """Get the variable name from the tensor name.""" 168 | m = re.match("^(.*):\\d+$", param_name) 169 | if m is not None: 170 | param_name = m.group(1) 171 | return param_name 172 | -------------------------------------------------------------------------------- /optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import optimization 20 | import tensorflow as tf 21 | 22 | 23 | class OptimizationTest(tf.test.TestCase): 24 | 25 | def test_adam(self): 26 | with self.test_session() as sess: 27 | w = tf.get_variable( 28 | "w", 29 | shape=[3], 30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1])) 31 | x = tf.constant([0.4, 0.2, -0.5]) 32 | loss = tf.reduce_mean(tf.square(x - w)) 33 | tvars = tf.trainable_variables() 34 | grads = tf.gradients(loss, tvars) 35 | global_step = tf.train.get_or_create_global_step() 36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) 37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) 38 | init_op = tf.group(tf.global_variables_initializer(), 39 | tf.local_variables_initializer()) 40 | sess.run(init_op) 41 | for _ in range(100): 42 | sess.run(train_op) 43 | w_np = sess.run(w) 44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) 45 | 46 | 47 | if __name__ == "__main__": 48 | tf.test.main() 49 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow >= 1.11.0 # CPU Version of TensorFlow. 2 | # tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow. 3 | -------------------------------------------------------------------------------- /run_classifier.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """BERT finetuning runner.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import csv 23 | import os 24 | import modeling 25 | import optimization 26 | import tokenization 27 | import tensorflow as tf 28 | 29 | flags = tf.flags 30 | 31 | FLAGS = flags.FLAGS 32 | 33 | ## Required parameters 34 | flags.DEFINE_string( 35 | "data_dir", None, 36 | "The input data dir. Should contain the .tsv files (or other data files) " 37 | "for the task.") 38 | 39 | flags.DEFINE_string( 40 | "bert_config_file", None, 41 | "The config json file corresponding to the pre-trained BERT model. " 42 | "This specifies the model architecture.") 43 | 44 | flags.DEFINE_string("task_name", None, "The name of the task to train.") 45 | 46 | flags.DEFINE_string("vocab_file", None, 47 | "The vocabulary file that the BERT model was trained on.") 48 | 49 | flags.DEFINE_string( 50 | "output_dir", None, 51 | "The output directory where the model checkpoints will be written.") 52 | 53 | ## Other parameters 54 | 55 | flags.DEFINE_string( 56 | "init_checkpoint", None, 57 | "Initial checkpoint (usually from a pre-trained BERT model).") 58 | 59 | flags.DEFINE_bool( 60 | "do_lower_case", True, 61 | "Whether to lower case the input text. Should be True for uncased " 62 | "models and False for cased models.") 63 | 64 | flags.DEFINE_integer( 65 | "max_seq_length", 128, 66 | "The maximum total input sequence length after WordPiece tokenization. " 67 | "Sequences longer than this will be truncated, and sequences shorter " 68 | "than this will be padded.") 69 | 70 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 71 | 72 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 73 | 74 | flags.DEFINE_bool( 75 | "do_predict", False, 76 | "Whether to run the model in inference mode on the test set.") 77 | 78 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 79 | 80 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 81 | 82 | flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") 83 | 84 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 85 | 86 | flags.DEFINE_float("num_train_epochs", 3.0, 87 | "Total number of training epochs to perform.") 88 | 89 | flags.DEFINE_float( 90 | "warmup_proportion", 0.1, 91 | "Proportion of training to perform linear learning rate warmup for. 
" 92 | "E.g., 0.1 = 10% of training.") 93 | 94 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 95 | "How often to save the model checkpoint.") 96 | 97 | flags.DEFINE_integer("iterations_per_loop", 1000, 98 | "How many steps to make in each estimator call.") 99 | 100 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 101 | 102 | tf.flags.DEFINE_string( 103 | "tpu_name", None, 104 | "The Cloud TPU to use for training. This should be either the name " 105 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 106 | "url.") 107 | 108 | tf.flags.DEFINE_string( 109 | "tpu_zone", None, 110 | "[Optional] GCE zone where the Cloud TPU is located in. If not " 111 | "specified, we will attempt to automatically detect the GCE project from " 112 | "metadata.") 113 | 114 | tf.flags.DEFINE_string( 115 | "gcp_project", None, 116 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 117 | "specified, we will attempt to automatically detect the GCE project from " 118 | "metadata.") 119 | 120 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 121 | 122 | flags.DEFINE_integer( 123 | "num_tpu_cores", 8, 124 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 125 | 126 | 127 | class InputExample(object): 128 | """A single training/test example for simple sequence classification.""" 129 | 130 | def __init__(self, guid, text_a, text_b=None, label=None): 131 | """Constructs a InputExample. 132 | 133 | Args: 134 | guid: Unique id for the example. 135 | text_a: string. The untokenized text of the first sequence. For single 136 | sequence tasks, only this sequence must be specified. 137 | text_b: (Optional) string. The untokenized text of the second sequence. 138 | Only must be specified for sequence pair tasks. 139 | label: (Optional) string. The label of the example. This should be 140 | specified for train and dev examples, but not for test examples. 
141 | """ 142 | self.guid = guid 143 | self.text_a = text_a 144 | self.text_b = text_b 145 | self.label = label 146 | 147 | 148 | class InputFeatures(object): 149 | """A single set of features of data.""" 150 | 151 | def __init__(self, input_ids, input_mask, segment_ids, label_id): 152 | self.input_ids = input_ids 153 | self.input_mask = input_mask 154 | self.segment_ids = segment_ids 155 | self.label_id = label_id 156 | 157 | 158 | class DataProcessor(object): 159 | """Base class for data converters for sequence classification data sets.""" 160 | 161 | def get_train_examples(self, data_dir): 162 | """Gets a collection of `InputExample`s for the train set.""" 163 | raise NotImplementedError() 164 | 165 | def get_dev_examples(self, data_dir): 166 | """Gets a collection of `InputExample`s for the dev set.""" 167 | raise NotImplementedError() 168 | 169 | def get_test_examples(self, data_dir): 170 | """Gets a collection of `InputExample`s for prediction.""" 171 | raise NotImplementedError() 172 | 173 | def get_labels(self): 174 | """Gets the list of labels for this data set.""" 175 | raise NotImplementedError() 176 | 177 | @classmethod 178 | def _read_tsv(cls, input_file, quotechar=None): 179 | """Reads a tab separated value file.""" 180 | with tf.gfile.Open(input_file, "r") as f: 181 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 182 | lines = [] 183 | for line in reader: 184 | lines.append(line) 185 | return lines 186 | 187 | 188 | class XnliProcessor(DataProcessor): 189 | """Processor for the XNLI data set.""" 190 | 191 | def __init__(self): 192 | self.language = "zh" 193 | 194 | def get_train_examples(self, data_dir): 195 | """See base class.""" 196 | lines = self._read_tsv( 197 | os.path.join(data_dir, "multinli", 198 | "multinli.train.%s.tsv" % self.language)) 199 | examples = [] 200 | for (i, line) in enumerate(lines): 201 | if i == 0: 202 | continue 203 | guid = "train-%d" % (i) 204 | text_a = tokenization.convert_to_unicode(line[0]) 205 | text_b = tokenization.convert_to_unicode(line[1]) 206 | label = tokenization.convert_to_unicode(line[2]) 207 | if label == tokenization.convert_to_unicode("contradictory"): 208 | label = tokenization.convert_to_unicode("contradiction") 209 | examples.append( 210 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 211 | return examples 212 | 213 | def get_dev_examples(self, data_dir): 214 | """See base class.""" 215 | lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) 216 | examples = [] 217 | for (i, line) in enumerate(lines): 218 | if i == 0: 219 | continue 220 | guid = "dev-%d" % (i) 221 | language = tokenization.convert_to_unicode(line[0]) 222 | if language != tokenization.convert_to_unicode(self.language): 223 | continue 224 | text_a = tokenization.convert_to_unicode(line[6]) 225 | text_b = tokenization.convert_to_unicode(line[7]) 226 | label = tokenization.convert_to_unicode(line[1]) 227 | examples.append( 228 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 229 | return examples 230 | 231 | def get_labels(self): 232 | """See base class.""" 233 | return ["contradiction", "entailment", "neutral"] 234 | 235 | 236 | class MnliProcessor(DataProcessor): 237 | """Processor for the MultiNLI data set (GLUE version).""" 238 | 239 | def get_train_examples(self, data_dir): 240 | """See base class.""" 241 | return self._create_examples( 242 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 243 | 244 | def get_dev_examples(self, data_dir): 245 | """See base class.""" 246 | return 
self._create_examples( 247 | self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), 248 | "dev_matched") 249 | 250 | def get_test_examples(self, data_dir): 251 | """See base class.""" 252 | return self._create_examples( 253 | self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") 254 | 255 | def get_labels(self): 256 | """See base class.""" 257 | return ["contradiction", "entailment", "neutral"] 258 | 259 | def _create_examples(self, lines, set_type): 260 | """Creates examples for the training and dev sets.""" 261 | examples = [] 262 | for (i, line) in enumerate(lines): 263 | if i == 0: 264 | continue 265 | guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) 266 | text_a = tokenization.convert_to_unicode(line[8]) 267 | text_b = tokenization.convert_to_unicode(line[9]) 268 | if set_type == "test": 269 | label = "contradiction" 270 | else: 271 | label = tokenization.convert_to_unicode(line[-1]) 272 | examples.append( 273 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 274 | return examples 275 | 276 | 277 | class MrpcProcessor(DataProcessor): 278 | """Processor for the MRPC data set (GLUE version).""" 279 | 280 | def get_train_examples(self, data_dir): 281 | """See base class.""" 282 | return self._create_examples( 283 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 284 | 285 | def get_dev_examples(self, data_dir): 286 | """See base class.""" 287 | return self._create_examples( 288 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 289 | 290 | def get_test_examples(self, data_dir): 291 | """See base class.""" 292 | return self._create_examples( 293 | self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") 294 | 295 | def get_labels(self): 296 | """See base class.""" 297 | return ["0", "1"] 298 | 299 | def _create_examples(self, lines, set_type): 300 | """Creates examples for the training and dev sets.""" 301 | examples = [] 302 | for (i, line) in enumerate(lines): 303 | if i == 0: 304 | continue 305 | guid = "%s-%s" % (set_type, i) 306 | text_a = tokenization.convert_to_unicode(line[3]) 307 | text_b = tokenization.convert_to_unicode(line[4]) 308 | if set_type == "test": 309 | label = "0" 310 | else: 311 | label = tokenization.convert_to_unicode(line[0]) 312 | examples.append( 313 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 314 | return examples 315 | 316 | 317 | class ColaProcessor(DataProcessor): 318 | """Processor for the CoLA data set (GLUE version).""" 319 | 320 | def get_train_examples(self, data_dir): 321 | """See base class.""" 322 | return self._create_examples( 323 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 324 | 325 | def get_dev_examples(self, data_dir): 326 | """See base class.""" 327 | return self._create_examples( 328 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 329 | 330 | def get_test_examples(self, data_dir): 331 | """See base class.""" 332 | return self._create_examples( 333 | self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") 334 | 335 | def get_labels(self): 336 | """See base class.""" 337 | return ["0", "1"] 338 | 339 | def _create_examples(self, lines, set_type): 340 | """Creates examples for the training and dev sets.""" 341 | examples = [] 342 | for (i, line) in enumerate(lines): 343 | # Only the test set has a header 344 | if set_type == "test" and i == 0: 345 | continue 346 | guid = "%s-%s" % (set_type, i) 347 | if set_type == "test": 348 | text_a = tokenization.convert_to_unicode(line[1]) 349 | label = "0" 350 
| else: 351 | text_a = tokenization.convert_to_unicode(line[3]) 352 | label = tokenization.convert_to_unicode(line[1]) 353 | examples.append( 354 | InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) 355 | return examples 356 | 357 | 358 | def convert_single_example(ex_index, example, label_list, max_seq_length, 359 | tokenizer): 360 | """Converts a single `InputExample` into a single `InputFeatures`.""" 361 | label_map = {} 362 | for (i, label) in enumerate(label_list): 363 | label_map[label] = i 364 | 365 | tokens_a = tokenizer.tokenize(example.text_a) 366 | tokens_b = None 367 | if example.text_b: 368 | tokens_b = tokenizer.tokenize(example.text_b) 369 | 370 | if tokens_b: 371 | # Modifies `tokens_a` and `tokens_b` in place so that the total 372 | # length is less than the specified length. 373 | # Account for [CLS], [SEP], [SEP] with "- 3" 374 | _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) 375 | else: 376 | # Account for [CLS] and [SEP] with "- 2" 377 | if len(tokens_a) > max_seq_length - 2: 378 | tokens_a = tokens_a[0:(max_seq_length - 2)] 379 | 380 | # The convention in BERT is: 381 | # (a) For sequence pairs: 382 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 383 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 384 | # (b) For single sequences: 385 | # tokens: [CLS] the dog is hairy . [SEP] 386 | # type_ids: 0 0 0 0 0 0 0 387 | # 388 | # Where "type_ids" are used to indicate whether this is the first 389 | # sequence or the second sequence. The embedding vectors for `type=0` and 390 | # `type=1` were learned during pre-training and are added to the wordpiece 391 | # embedding vector (and position vector). This is not *strictly* necessary 392 | # since the [SEP] token unambiguously separates the sequences, but it makes 393 | # it easier for the model to learn the concept of sequences. 394 | # 395 | # For classification tasks, the first vector (corresponding to [CLS]) is 396 | # used as as the "sentence vector". Note that this only makes sense because 397 | # the entire model is fine-tuned. 398 | tokens = [] 399 | segment_ids = [] 400 | tokens.append("[CLS]") 401 | segment_ids.append(0) 402 | for token in tokens_a: 403 | tokens.append(token) 404 | segment_ids.append(0) 405 | tokens.append("[SEP]") 406 | segment_ids.append(0) 407 | 408 | if tokens_b: 409 | for token in tokens_b: 410 | tokens.append(token) 411 | segment_ids.append(1) 412 | tokens.append("[SEP]") 413 | segment_ids.append(1) 414 | 415 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 416 | 417 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 418 | # tokens are attended to. 419 | input_mask = [1] * len(input_ids) 420 | 421 | # Zero-pad up to the sequence length. 
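  # Illustrative walkthrough (toy values, not from any dataset): with
  # max_seq_length = 8 and example.text_a = "it is good" (no text_b), the code
  # above plus the padding loop below produce:
  #   tokens      = ["[CLS]", "it", "is", "good", "[SEP]"]
  #   input_ids   = [id("[CLS]"), id("it"), id("is"), id("good"), id("[SEP]"), 0, 0, 0]
  #   input_mask  = [1, 1, 1, 1, 1, 0, 0, 0]
  #   segment_ids = [0, 0, 0, 0, 0, 0, 0, 0]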
422 | while len(input_ids) < max_seq_length: 423 | input_ids.append(0) 424 | input_mask.append(0) 425 | segment_ids.append(0) 426 | 427 | assert len(input_ids) == max_seq_length 428 | assert len(input_mask) == max_seq_length 429 | assert len(segment_ids) == max_seq_length 430 | 431 | label_id = label_map[example.label] 432 | if ex_index < 5: 433 | tf.logging.info("*** Example ***") 434 | tf.logging.info("guid: %s" % (example.guid)) 435 | tf.logging.info("tokens: %s" % " ".join( 436 | [tokenization.printable_text(x) for x in tokens])) 437 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 438 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 439 | tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 440 | tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) 441 | 442 | feature = InputFeatures( 443 | input_ids=input_ids, 444 | input_mask=input_mask, 445 | segment_ids=segment_ids, 446 | label_id=label_id) 447 | return feature 448 | 449 | 450 | def file_based_convert_examples_to_features( 451 | examples, label_list, max_seq_length, tokenizer, output_file): 452 | """Convert a set of `InputExample`s to a TFRecord file.""" 453 | 454 | writer = tf.python_io.TFRecordWriter(output_file) 455 | 456 | for (ex_index, example) in enumerate(examples): 457 | if ex_index % 10000 == 0: 458 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 459 | 460 | feature = convert_single_example(ex_index, example, label_list, 461 | max_seq_length, tokenizer) 462 | 463 | def create_int_feature(values): 464 | f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 465 | return f 466 | 467 | features = collections.OrderedDict() 468 | features["input_ids"] = create_int_feature(feature.input_ids) 469 | features["input_mask"] = create_int_feature(feature.input_mask) 470 | features["segment_ids"] = create_int_feature(feature.segment_ids) 471 | features["label_ids"] = create_int_feature([feature.label_id]) 472 | 473 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 474 | writer.write(tf_example.SerializeToString()) 475 | 476 | 477 | def file_based_input_fn_builder(input_file, seq_length, is_training, 478 | drop_remainder): 479 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 480 | 481 | name_to_features = { 482 | "input_ids": tf.FixedLenFeature([seq_length], tf.int64), 483 | "input_mask": tf.FixedLenFeature([seq_length], tf.int64), 484 | "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), 485 | "label_ids": tf.FixedLenFeature([], tf.int64), 486 | } 487 | 488 | def _decode_record(record, name_to_features): 489 | """Decodes a record to a TensorFlow example.""" 490 | example = tf.parse_single_example(record, name_to_features) 491 | 492 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 493 | # So cast all int64 to int32. 494 | for name in list(example.keys()): 495 | t = example[name] 496 | if t.dtype == tf.int64: 497 | t = tf.to_int32(t) 498 | example[name] = t 499 | 500 | return example 501 | 502 | def input_fn(params): 503 | """The actual input function.""" 504 | batch_size = params["batch_size"] 505 | 506 | # For training, we want a lot of parallel reading and shuffling. 507 | # For eval, we want no shuffling and parallel reading doesn't matter. 
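For reference, a sketch (not part of this file) of inspecting one serialized example from the TFRecord written by `file_based_convert_examples_to_features` above; the path is illustrative:

import tensorflow as tf

for record in tf.python_io.tf_record_iterator("/tmp/output/train.tf_record"):
  example = tf.train.Example.FromString(record)
  print(sorted(example.features.feature.keys()))
  # ['input_ids', 'input_mask', 'label_ids', 'segment_ids']
  break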
508 | d = tf.data.TFRecordDataset(input_file) 509 | if is_training: 510 | d = d.repeat() 511 | d = d.shuffle(buffer_size=100) 512 | 513 | d = d.apply( 514 | tf.contrib.data.map_and_batch( 515 | lambda record: _decode_record(record, name_to_features), 516 | batch_size=batch_size, 517 | drop_remainder=drop_remainder)) 518 | 519 | return d 520 | 521 | return input_fn 522 | 523 | 524 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 525 | """Truncates a sequence pair in place to the maximum length.""" 526 | 527 | # This is a simple heuristic which will always truncate the longer sequence 528 | # one token at a time. This makes more sense than truncating an equal percent 529 | # of tokens from each, since if one sequence is very short then each token 530 | # that's truncated likely contains more information than a longer sequence. 531 | while True: 532 | total_length = len(tokens_a) + len(tokens_b) 533 | if total_length <= max_length: 534 | break 535 | if len(tokens_a) > len(tokens_b): 536 | tokens_a.pop() 537 | else: 538 | tokens_b.pop() 539 | 540 | 541 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 542 | labels, num_labels, use_one_hot_embeddings): 543 | """Creates a classification model.""" 544 | model = modeling.BertModel( 545 | config=bert_config, 546 | is_training=is_training, 547 | input_ids=input_ids, 548 | input_mask=input_mask, 549 | token_type_ids=segment_ids, 550 | use_one_hot_embeddings=use_one_hot_embeddings) 551 | 552 | # In the demo, we are doing a simple classification task on the entire 553 | # segment. 554 | # 555 | # If you want to use the token-level output, use model.get_sequence_output() 556 | # instead. 557 | output_layer = model.get_pooled_output() 558 | 559 | hidden_size = output_layer.shape[-1].value 560 | 561 | output_weights = tf.get_variable( 562 | "output_weights", [num_labels, hidden_size], 563 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 564 | 565 | output_bias = tf.get_variable( 566 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 567 | 568 | with tf.variable_scope("loss"): 569 | if is_training: 570 | # I.e., 0.1 dropout 571 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 572 | 573 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 574 | logits = tf.nn.bias_add(logits, output_bias) 575 | probabilities = tf.nn.softmax(logits, axis=-1) 576 | log_probs = tf.nn.log_softmax(logits, axis=-1) 577 | 578 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 579 | 580 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 581 | loss = tf.reduce_mean(per_example_loss) 582 | 583 | return (loss, per_example_loss, logits, probabilities) 584 | 585 | 586 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 587 | num_train_steps, num_warmup_steps, use_tpu, 588 | use_one_hot_embeddings): 589 | """Returns `model_fn` closure for TPUEstimator.""" 590 | 591 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 592 | """The `model_fn` for TPUEstimator.""" 593 | 594 | tf.logging.info("*** Features ***") 595 | for name in sorted(features.keys()): 596 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 597 | 598 | input_ids = features["input_ids"] 599 | input_mask = features["input_mask"] 600 | segment_ids = features["segment_ids"] 601 | label_ids = features["label_ids"] 602 | 603 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 604 | 605 | (total_loss, 
per_example_loss, logits, probabilities) = create_model( 606 | bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, 607 | num_labels, use_one_hot_embeddings) 608 | 609 | tvars = tf.trainable_variables() 610 | initialized_variable_names = {} 611 | scaffold_fn = None 612 | if init_checkpoint: 613 | (assignment_map, initialized_variable_names 614 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 615 | if use_tpu: 616 | 617 | def tpu_scaffold(): 618 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 619 | return tf.train.Scaffold() 620 | 621 | scaffold_fn = tpu_scaffold 622 | else: 623 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 624 | 625 | tf.logging.info("**** Trainable Variables ****") 626 | for var in tvars: 627 | init_string = "" 628 | if var.name in initialized_variable_names: 629 | init_string = ", *INIT_FROM_CKPT*" 630 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 631 | init_string) 632 | 633 | output_spec = None 634 | if mode == tf.estimator.ModeKeys.TRAIN: 635 | 636 | train_op = optimization.create_optimizer( 637 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 638 | 639 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 640 | mode=mode, 641 | loss=total_loss, 642 | train_op=train_op, 643 | scaffold_fn=scaffold_fn) 644 | elif mode == tf.estimator.ModeKeys.EVAL: 645 | 646 | def metric_fn(per_example_loss, label_ids, logits): 647 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 648 | accuracy = tf.metrics.accuracy(label_ids, predictions) 649 | loss = tf.metrics.mean(per_example_loss) 650 | return { 651 | "eval_accuracy": accuracy, 652 | "eval_loss": loss, 653 | } 654 | 655 | eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) 656 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 657 | mode=mode, 658 | loss=total_loss, 659 | eval_metrics=eval_metrics, 660 | scaffold_fn=scaffold_fn) 661 | else: 662 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 663 | mode=mode, predictions=probabilities, scaffold_fn=scaffold_fn) 664 | return output_spec 665 | 666 | return model_fn 667 | 668 | 669 | # This function is not used by this file but is still used by the Colab and 670 | # people who depend on it. 671 | def input_fn_builder(features, seq_length, is_training, drop_remainder): 672 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 673 | 674 | all_input_ids = [] 675 | all_input_mask = [] 676 | all_segment_ids = [] 677 | all_label_ids = [] 678 | 679 | for feature in features: 680 | all_input_ids.append(feature.input_ids) 681 | all_input_mask.append(feature.input_mask) 682 | all_segment_ids.append(feature.segment_ids) 683 | all_label_ids.append(feature.label_id) 684 | 685 | def input_fn(params): 686 | """The actual input function.""" 687 | batch_size = params["batch_size"] 688 | 689 | num_examples = len(features) 690 | 691 | # This is for demo purposes and does NOT scale to large data sets. We do 692 | # not use Dataset.from_generator() because that uses tf.py_func which is 693 | # not TPU compatible. The right way to load data is with TFRecordReader. 
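A sketch of how this in-memory `input_fn_builder` is typically wired up from a notebook, assuming `examples`, `label_list`, `tokenizer`, and `estimator` have been created the same way `main()` below creates them:

features = convert_examples_to_features(examples, label_list,
                                         FLAGS.max_seq_length, tokenizer)
predict_input_fn = input_fn_builder(
    features=features,
    seq_length=FLAGS.max_seq_length,
    is_training=False,
    drop_remainder=False)
for probabilities in estimator.predict(input_fn=predict_input_fn):
  print(probabilities)  # one softmax distribution over the labels per example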
694 | d = tf.data.Dataset.from_tensor_slices({ 695 | "input_ids": 696 | tf.constant( 697 | all_input_ids, shape=[num_examples, seq_length], 698 | dtype=tf.int32), 699 | "input_mask": 700 | tf.constant( 701 | all_input_mask, 702 | shape=[num_examples, seq_length], 703 | dtype=tf.int32), 704 | "segment_ids": 705 | tf.constant( 706 | all_segment_ids, 707 | shape=[num_examples, seq_length], 708 | dtype=tf.int32), 709 | "label_ids": 710 | tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), 711 | }) 712 | 713 | if is_training: 714 | d = d.repeat() 715 | d = d.shuffle(buffer_size=100) 716 | 717 | d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) 718 | return d 719 | 720 | return input_fn 721 | 722 | 723 | # This function is not used by this file but is still used by the Colab and 724 | # people who depend on it. 725 | def convert_examples_to_features(examples, label_list, max_seq_length, 726 | tokenizer): 727 | """Convert a set of `InputExample`s to a list of `InputFeatures`.""" 728 | 729 | features = [] 730 | for (ex_index, example) in enumerate(examples): 731 | if ex_index % 10000 == 0: 732 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 733 | 734 | feature = convert_single_example(ex_index, example, label_list, 735 | max_seq_length, tokenizer) 736 | 737 | features.append(feature) 738 | return features 739 | 740 | 741 | def main(_): 742 | tf.logging.set_verbosity(tf.logging.INFO) 743 | 744 | processors = { 745 | "cola": ColaProcessor, 746 | "mnli": MnliProcessor, 747 | "mrpc": MrpcProcessor, 748 | "xnli": XnliProcessor, 749 | } 750 | 751 | if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: 752 | raise ValueError( 753 | "At least one of `do_train`, `do_eval` or `do_predict' must be True.") 754 | 755 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 756 | 757 | if FLAGS.max_seq_length > bert_config.max_position_embeddings: 758 | raise ValueError( 759 | "Cannot use sequence length %d because the BERT model " 760 | "was only trained up to sequence length %d" % 761 | (FLAGS.max_seq_length, bert_config.max_position_embeddings)) 762 | 763 | tf.gfile.MakeDirs(FLAGS.output_dir) 764 | 765 | task_name = FLAGS.task_name.lower() 766 | 767 | if task_name not in processors: 768 | raise ValueError("Task not found: %s" % (task_name)) 769 | 770 | processor = processors[task_name]() 771 | 772 | label_list = processor.get_labels() 773 | 774 | tokenizer = tokenization.FullTokenizer( 775 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 776 | 777 | tpu_cluster_resolver = None 778 | if FLAGS.use_tpu and FLAGS.tpu_name: 779 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( 780 | FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) 781 | 782 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 783 | run_config = tf.contrib.tpu.RunConfig( 784 | cluster=tpu_cluster_resolver, 785 | master=FLAGS.master, 786 | model_dir=FLAGS.output_dir, 787 | save_checkpoints_steps=FLAGS.save_checkpoints_steps, 788 | tpu_config=tf.contrib.tpu.TPUConfig( 789 | iterations_per_loop=FLAGS.iterations_per_loop, 790 | num_shards=FLAGS.num_tpu_cores, 791 | per_host_input_for_training=is_per_host)) 792 | 793 | train_examples = None 794 | num_train_steps = None 795 | num_warmup_steps = None 796 | if FLAGS.do_train: 797 | train_examples = processor.get_train_examples(FLAGS.data_dir) 798 | num_train_steps = int( 799 | len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) 800 | 
num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) 801 | 802 | model_fn = model_fn_builder( 803 | bert_config=bert_config, 804 | num_labels=len(label_list), 805 | init_checkpoint=FLAGS.init_checkpoint, 806 | learning_rate=FLAGS.learning_rate, 807 | num_train_steps=num_train_steps, 808 | num_warmup_steps=num_warmup_steps, 809 | use_tpu=FLAGS.use_tpu, 810 | use_one_hot_embeddings=FLAGS.use_tpu) 811 | 812 | # If TPU is not available, this will fall back to normal Estimator on CPU 813 | # or GPU. 814 | estimator = tf.contrib.tpu.TPUEstimator( 815 | use_tpu=FLAGS.use_tpu, 816 | model_fn=model_fn, 817 | config=run_config, 818 | train_batch_size=FLAGS.train_batch_size, 819 | eval_batch_size=FLAGS.eval_batch_size, 820 | predict_batch_size=FLAGS.predict_batch_size) 821 | 822 | if FLAGS.do_train: 823 | train_file = os.path.join(FLAGS.output_dir, "train.tf_record") 824 | file_based_convert_examples_to_features( 825 | train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) 826 | tf.logging.info("***** Running training *****") 827 | tf.logging.info(" Num examples = %d", len(train_examples)) 828 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 829 | tf.logging.info(" Num steps = %d", num_train_steps) 830 | train_input_fn = file_based_input_fn_builder( 831 | input_file=train_file, 832 | seq_length=FLAGS.max_seq_length, 833 | is_training=True, 834 | drop_remainder=True) 835 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 836 | 837 | if FLAGS.do_eval: 838 | eval_examples = processor.get_dev_examples(FLAGS.data_dir) 839 | eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") 840 | file_based_convert_examples_to_features( 841 | eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) 842 | 843 | tf.logging.info("***** Running evaluation *****") 844 | tf.logging.info(" Num examples = %d", len(eval_examples)) 845 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 846 | 847 | # This tells the estimator to run through the entire set. 848 | eval_steps = None 849 | # However, if running eval on the TPU, you will need to specify the 850 | # number of steps. 851 | if FLAGS.use_tpu: 852 | # Eval will be slightly WRONG on the TPU because it will truncate 853 | # the last batch. 
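Concrete numbers (made up) for the two step computations in this file, the training/warmup steps computed above and the TPU `eval_steps` computed just below:

# Training: 10,000 examples, train_batch_size=32, num_train_epochs=3.0,
# warmup_proportion=0.1:
num_train_steps = int(10000 / 32.0 * 3.0)   # 937
num_warmup_steps = int(937 * 0.1)           # 93

# Eval on TPU: 2,005 examples, eval_batch_size=8. The integer division below
# truncates, so the last 5 examples are silently dropped:
eval_steps = int(2005 / 8.0)                # 250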
854 | eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) 855 | 856 | eval_drop_remainder = True if FLAGS.use_tpu else False 857 | eval_input_fn = file_based_input_fn_builder( 858 | input_file=eval_file, 859 | seq_length=FLAGS.max_seq_length, 860 | is_training=False, 861 | drop_remainder=eval_drop_remainder) 862 | 863 | result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) 864 | 865 | output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") 866 | with tf.gfile.GFile(output_eval_file, "w") as writer: 867 | tf.logging.info("***** Eval results *****") 868 | for key in sorted(result.keys()): 869 | tf.logging.info(" %s = %s", key, str(result[key])) 870 | writer.write("%s = %s\n" % (key, str(result[key]))) 871 | 872 | if FLAGS.do_predict: 873 | predict_examples = processor.get_test_examples(FLAGS.data_dir) 874 | predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") 875 | file_based_convert_examples_to_features(predict_examples, label_list, 876 | FLAGS.max_seq_length, tokenizer, 877 | predict_file) 878 | 879 | tf.logging.info("***** Running prediction*****") 880 | tf.logging.info(" Num examples = %d", len(predict_examples)) 881 | tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) 882 | 883 | if FLAGS.use_tpu: 884 | # Warning: According to tpu_estimator.py Prediction on TPU is an 885 | # experimental feature and hence not supported here 886 | raise ValueError("Prediction in TPU not supported") 887 | 888 | predict_drop_remainder = True if FLAGS.use_tpu else False 889 | predict_input_fn = file_based_input_fn_builder( 890 | input_file=predict_file, 891 | seq_length=FLAGS.max_seq_length, 892 | is_training=False, 893 | drop_remainder=predict_drop_remainder) 894 | 895 | result = estimator.predict(input_fn=predict_input_fn) 896 | 897 | output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") 898 | with tf.gfile.GFile(output_predict_file, "w") as writer: 899 | tf.logging.info("***** Predict results *****") 900 | for prediction in result: 901 | output_line = "\t".join( 902 | str(class_probability) for class_probability in prediction) + "\n" 903 | writer.write(output_line) 904 | 905 | 906 | if __name__ == "__main__": 907 | flags.mark_flag_as_required("data_dir") 908 | flags.mark_flag_as_required("task_name") 909 | flags.mark_flag_as_required("vocab_file") 910 | flags.mark_flag_as_required("bert_config_file") 911 | flags.mark_flag_as_required("output_dir") 912 | tf.app.run() 913 | -------------------------------------------------------------------------------- /run_lm_predict.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
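The `test_results.tsv` written by the predict branch of run_classifier.py above contains one tab-separated row of class probabilities per input example. A small sketch (path is illustrative, label order as in `get_labels()` for MNLI/XNLI above) of turning those rows back into label predictions:

import csv

label_list = ["contradiction", "entailment", "neutral"]
with open("/tmp/output/test_results.tsv") as f:
  for row in csv.reader(f, delimiter="\t"):
    probs = [float(p) for p in row]
    print(label_list[probs.index(max(probs))])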
15 | """BERT language model predict.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import json 23 | import modeling 24 | import tokenization 25 | import numpy as np 26 | import tensorflow as tf 27 | 28 | flags = tf.flags 29 | FLAGS = flags.FLAGS 30 | 31 | flags.DEFINE_integer("max_predictions_per_seq", 20, 32 | "In this task, it also refers to maximum number of masked tokens per word.") 33 | 34 | flags.DEFINE_string( 35 | "bert_config_file", None, 36 | "The config json file corresponding to the pre-trained BERT model. " 37 | "This specifies the model architecture.") 38 | 39 | flags.DEFINE_string( 40 | "input_file", None, 41 | "The config json file corresponding to the pre-trained BERT model. " 42 | "This specifies the model architecture.") 43 | 44 | flags.DEFINE_string( 45 | "output_dir", None, 46 | "The output directory where the model checkpoints will be written.") 47 | 48 | flags.DEFINE_string("vocab_file", None, 49 | "The vocabulary file that the BERT model was trained on.") 50 | 51 | ## Other parameters 52 | 53 | flags.DEFINE_string( 54 | "init_checkpoint", None, 55 | "Initial checkpoint (usually from a pre-trained BERT model).") 56 | 57 | flags.DEFINE_bool( 58 | "do_lower_case", True, 59 | "Whether to lower case the input text. Should be True for uncased " 60 | "models and False for cased models.") 61 | 62 | flags.DEFINE_integer( 63 | "max_seq_length", 128, 64 | "The maximum total input sequence length after WordPiece tokenization. " 65 | "Sequences longer than this will be truncated, and sequences shorter " 66 | "than this will be padded.") 67 | 68 | flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") 69 | 70 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 71 | 72 | tf.flags.DEFINE_string( 73 | "tpu_name", None, 74 | "The Cloud TPU to use for training. This should be either the name " 75 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 76 | "url.") 77 | 78 | tf.flags.DEFINE_string( 79 | "tpu_zone", None, 80 | "[Optional] GCE zone where the Cloud TPU is located in. If not " 81 | "specified, we will attempt to automatically detect the GCE project from " 82 | "metadata.") 83 | 84 | tf.flags.DEFINE_string( 85 | "gcp_project", None, 86 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 87 | "specified, we will attempt to automatically detect the GCE project from " 88 | "metadata.") 89 | 90 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 91 | 92 | flags.DEFINE_integer( 93 | "num_tpu_cores", 8, 94 | "Only used if `use_tpu` is True. 
Total number of TPU cores to use.") 95 | 96 | 97 | class InputExample(object): 98 | def __init__(self, unique_id, text): 99 | self.unique_id = unique_id 100 | self.text = text 101 | 102 | 103 | def read_examples(input_file): 104 | """Read a list of `InputExample`s from an input file.""" 105 | examples = [] 106 | unique_id = 0 107 | with tf.gfile.GFile(input_file, "r") as reader: 108 | while True: 109 | line = tokenization.convert_to_unicode(reader.readline()) 110 | if not line: 111 | break 112 | line = line.strip() 113 | unique_id += 1 114 | examples.append( 115 | InputExample(unique_id, line)) 116 | unique_id += 1 117 | return examples 118 | 119 | 120 | def model_fn_builder(bert_config, init_checkpoint, use_tpu, 121 | use_one_hot_embeddings): 122 | """Returns `model_fn` closure for TPUEstimator.""" 123 | 124 | def model_fn(features, mode, params): # pylint: disable=unused-argument 125 | """The `model_fn` for TPUEstimator.""" 126 | 127 | tf.logging.info("*** Features ***") 128 | for name in sorted(features.keys()): 129 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 130 | 131 | input_ids = features["input_ids"] 132 | input_mask = features["input_mask"] 133 | segment_ids = features["segment_ids"] 134 | masked_lm_positions = features["masked_lm_positions"] 135 | masked_lm_ids = features["masked_lm_ids"] 136 | 137 | 138 | model = modeling.BertModel( 139 | config=bert_config, 140 | is_training=False, 141 | input_ids=input_ids, 142 | input_mask=input_mask, 143 | token_type_ids=segment_ids, 144 | use_one_hot_embeddings=use_one_hot_embeddings) 145 | 146 | masked_lm_example_loss = get_masked_lm_output( 147 | bert_config, model.get_sequence_output(), model.get_embedding_table(), 148 | masked_lm_positions, masked_lm_ids) 149 | 150 | tvars = tf.trainable_variables() 151 | initialized_variable_names = {} 152 | scaffold_fn = None 153 | if init_checkpoint: 154 | (assignment_map, initialized_variable_names 155 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 156 | if use_tpu: 157 | 158 | def tpu_scaffold(): 159 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 160 | return tf.train.Scaffold() 161 | 162 | scaffold_fn = tpu_scaffold 163 | else: 164 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 165 | 166 | tf.logging.info("**** Trainable Variables ****") 167 | for var in tvars: 168 | init_string = "" 169 | if var.name in initialized_variable_names: 170 | init_string = ", *INIT_FROM_CKPT*" 171 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 172 | init_string) 173 | 174 | output_spec = None 175 | if mode == tf.estimator.ModeKeys.PREDICT: 176 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 177 | mode=mode, predictions=masked_lm_example_loss, scaffold_fn=scaffold_fn) # output the per-position loss (score) of each masked word 178 | return output_spec 179 | 180 | return model_fn 181 | 182 | 183 | 184 | 185 | def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, 186 | label_ids): 187 | """Get loss and log probs for the masked LM.""" 188 | input_tensor = gather_indexes(input_tensor, positions) 189 | 190 | with tf.variable_scope("cls/predictions"): 191 | # We apply one more non-linear transformation before the output layer. 192 | # This matrix is not used after pre-training.
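For the `read_examples` function above, the prediction input is plain text with one sentence per line; for instance (reusing the sample path shipped under data/lm/, with illustrative contents):

examples = read_examples("data/lm/test.en.tsv")
for ex in examples:
  print(ex.unique_id, ex.text)  # e.g.  1  there is a book on the desk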
193 | with tf.variable_scope("transform"): 194 | input_tensor = tf.layers.dense( 195 | input_tensor, 196 | units=bert_config.hidden_size, 197 | activation=modeling.get_activation(bert_config.hidden_act), 198 | kernel_initializer=modeling.create_initializer( 199 | bert_config.initializer_range)) 200 | input_tensor = modeling.layer_norm(input_tensor) 201 | 202 | # The output weights are the same as the input embeddings, but there is 203 | # an output-only bias for each token. 204 | output_bias = tf.get_variable( 205 | "output_bias", 206 | shape=[bert_config.vocab_size], 207 | initializer=tf.zeros_initializer()) 208 | logits = tf.matmul(input_tensor, output_weights, transpose_b=True) 209 | logits = tf.nn.bias_add(logits, output_bias) 210 | log_probs = tf.nn.log_softmax(logits, axis=-1) 211 | 212 | label_ids = tf.reshape(label_ids, [-1]) 213 | 214 | one_hot_labels = tf.one_hot( 215 | label_ids, depth=bert_config.vocab_size, dtype=tf.float32) 216 | per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) 217 | loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]]) 218 | # TODO: dynamic gather from per_example_loss 219 | return loss 220 | 221 | 222 | 223 | def gather_indexes(sequence_tensor, positions): 224 | """Gathers the vectors at the specific positions over a minibatch.""" 225 | sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) 226 | batch_size = sequence_shape[0] 227 | seq_length = sequence_shape[1] 228 | width = sequence_shape[2] 229 | 230 | flat_offsets = tf.reshape( 231 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) 232 | flat_positions = tf.reshape(positions + flat_offsets, [-1]) 233 | flat_sequence_tensor = tf.reshape(sequence_tensor, 234 | [batch_size * seq_length, width]) 235 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions) 236 | return output_tensor 237 | 238 | 239 | def input_fn_builder(features, seq_length, max_predictions_per_seq): 240 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 241 | 242 | all_input_ids = [] 243 | all_input_mask = [] 244 | all_segment_ids = [] 245 | all_masked_lm_positions = [] 246 | all_masked_lm_ids = [] 247 | 248 | for feature in features: 249 | all_input_ids.append(feature.input_ids) 250 | all_input_mask.append(feature.input_mask) 251 | all_segment_ids.append(feature.segment_ids) 252 | all_masked_lm_positions.append(feature.masked_lm_positions) 253 | all_masked_lm_ids.append(feature.masked_lm_ids) 254 | 255 | def input_fn(params): 256 | """The actual input function.""" 257 | batch_size = params["batch_size"] 258 | num_examples = len(features) 259 | 260 | # This is for demo purposes and does NOT scale to large data sets. We do 261 | # not use Dataset.from_generator() because that uses tf.py_func which is 262 | # not TPU compatible. The right way to load data is with TFRecordReader. 
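A quick NumPy check (not part of this file) of the index arithmetic in `gather_indexes` above, for a toy batch of 2 sequences of length 4:

import numpy as np

positions = np.array([[1, 3], [0, 2]])            # masked positions per sequence
flat_offsets = np.arange(2).reshape(-1, 1) * 4    # [[0], [4]]
flat_positions = (positions + flat_offsets).reshape(-1)
print(flat_positions)                             # [1 3 4 6]
# i.e. row 0 takes its tokens 1 and 3; row 1, offset by seq_length, takes 0 and 2.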
263 | d = tf.data.Dataset.from_tensor_slices({ 264 | "input_ids": 265 | tf.constant( 266 | all_input_ids, shape=[num_examples, seq_length], 267 | dtype=tf.int32), 268 | "input_mask": 269 | tf.constant( 270 | all_input_mask, 271 | shape=[num_examples, seq_length], 272 | dtype=tf.int32), 273 | "segment_ids": 274 | tf.constant( 275 | all_segment_ids, 276 | shape=[num_examples, seq_length], 277 | dtype=tf.int32), 278 | "masked_lm_positions": 279 | tf.constant( 280 | all_masked_lm_positions, 281 | shape=[num_examples, max_predictions_per_seq], 282 | dtype=tf.int32), 283 | "masked_lm_ids": 284 | tf.constant( 285 | all_masked_lm_ids, 286 | shape=[num_examples, max_predictions_per_seq], 287 | dtype=tf.int32) 288 | }) 289 | 290 | d = d.batch(batch_size=batch_size, drop_remainder=False) 291 | return d 292 | 293 | return input_fn 294 | 295 | 296 | 297 | # This function is not used by this file but is still used by the Colab and 298 | # people who depend on it. 299 | def convert_examples_to_features(examples, max_seq_length, tokenizer): 300 | """Convert a set of `InputExample`s to a list of `InputFeatures`.""" 301 | 302 | all_features = [] 303 | all_tokens = [] 304 | 305 | for (ex_index, example) in enumerate(examples): 306 | if ex_index % 10000 == 0: 307 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 308 | 309 | features, tokens = convert_single_example(ex_index, example, 310 | max_seq_length, tokenizer) 311 | all_features.extend(features) 312 | all_tokens.extend(tokens) 313 | 314 | return all_features, all_tokens 315 | 316 | tokenizer = tokenization.FullTokenizer( 317 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 318 | MASKED_TOKEN = "[MASK]" 319 | MASKED_ID = tokenizer.convert_tokens_to_ids([MASKED_TOKEN])[0] 320 | 321 | 322 | def create_masked_lm_prediction(input_ids, mask_position, mask_count=1): 323 | new_input_ids = list(input_ids) 324 | masked_lm_labels = [] 325 | masked_lm_positions = list(range(mask_position, mask_position + mask_count)) 326 | for i in masked_lm_positions: 327 | new_input_ids[i] = MASKED_ID 328 | masked_lm_labels.append(input_ids[i]) 329 | return new_input_ids, masked_lm_positions, masked_lm_labels 330 | 331 | 332 | class InputFeatures(object): 333 | """A single set of features of data.""" 334 | 335 | def __init__(self, input_ids, segment_ids, input_mask, masked_lm_positions, 336 | masked_lm_ids): 337 | self.input_ids = input_ids, 338 | self.segment_ids = segment_ids, 339 | self.input_mask = input_mask, 340 | self.masked_lm_positions = masked_lm_positions, 341 | self.masked_lm_ids = masked_lm_ids, 342 | 343 | 344 | def convert_single_example(ex_index, example, max_seq_length, 345 | tokenizer): 346 | """Converts a single `InputExample` into a single `InputFeatures`.""" 347 | tokens = tokenizer.tokenize(example.text) 348 | 349 | # Account for [CLS] and [SEP] with "- 2" 350 | if len(tokens) > max_seq_length - 2: 351 | tokens = tokens[0:(max_seq_length - 2)] 352 | 353 | input_tokens = [] 354 | segment_ids = [] 355 | input_tokens.append("[CLS]") 356 | segment_ids.append(0) 357 | for token in tokens: 358 | input_tokens.append(token) 359 | segment_ids.append(0) 360 | input_tokens.append("[SEP]") 361 | segment_ids.append(0) 362 | 363 | input_ids = tokenizer.convert_tokens_to_ids(input_tokens) 364 | 365 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 366 | # tokens are attended to. 367 | input_mask = [1] * len(input_ids) 368 | 369 | # Zero-pad up to the sequence length. 
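  # Toy walkthrough of create_masked_lm_prediction above (token ids are illustrative):
  #   input_ids = [id("[CLS]"), id("he"), id("swims"), id("[SEP]")]
  #   mask_position=1, mask_count=1  ->
  #     new_input_ids       = [id("[CLS]"), MASKED_ID, id("swims"), id("[SEP]")]
  #     masked_lm_positions = [1]
  #     masked_lm_labels    = [id("he")]
  # create_sequential_mask below repeats this for every word, bundling a word's
  # "##" subtokens into a single mask_count.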
370 | while len(input_ids) < max_seq_length: 371 | input_ids.append(0) 372 | input_mask.append(0) 373 | segment_ids.append(0) 374 | 375 | assert len(input_ids) == max_seq_length 376 | assert len(input_mask) == max_seq_length 377 | assert len(segment_ids) == max_seq_length 378 | 379 | if ex_index < 5: 380 | tf.logging.info("*** Example ***") 381 | tf.logging.info("id: %s" % (example.unique_id)) 382 | tf.logging.info("tokens: %s" % " ".join( 383 | [tokenization.printable_text(x) for x in input_tokens])) 384 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 385 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 386 | tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 387 | 388 | features = create_sequential_mask(input_tokens, input_ids, input_mask, segment_ids, 389 | FLAGS.max_predictions_per_seq) 390 | return features, input_tokens 391 | 392 | 393 | def is_subtoken(x): 394 | return x.startswith("##") 395 | 396 | def create_sequential_mask(input_tokens, input_ids, input_mask, segment_ids, 397 | max_predictions_per_seq): 398 | """Mask each token/word sequentially.""" 399 | features = [] 400 | i = 1 401 | while i < len(input_tokens) - 1: 402 | mask_count = 1 403 | while is_subtoken(input_tokens[i+mask_count]): 404 | mask_count += 1 405 | 406 | input_ids_new, masked_lm_positions, masked_lm_labels = create_masked_lm_prediction(input_ids, i, mask_count) 407 | while len(masked_lm_positions) < max_predictions_per_seq: 408 | masked_lm_positions.append(0) 409 | masked_lm_labels.append(0) 410 | 411 | feature = InputFeatures( 412 | input_ids=input_ids_new, 413 | input_mask=input_mask, 414 | segment_ids=segment_ids, 415 | masked_lm_positions=masked_lm_positions, 416 | masked_lm_ids=masked_lm_labels) 417 | features.append(feature) 418 | i += mask_count 419 | return features 420 | 421 | 422 | def parse_result(result, all_tokens, output_file=None): 423 | with tf.gfile.GFile(output_file, "w") as writer: 424 | tf.logging.info("***** Predict results *****") 425 | i = 0 426 | sentences = [] 427 | for word_loss in result: 428 | # start of a sentence 429 | if all_tokens[i] == "[CLS]": 430 | sentence = {} 431 | tokens = [] 432 | sentence_loss = 0.0 433 | word_count_per_sent = 0 434 | i += 1 435 | sen=[] 436 | # add token 437 | tokens.append({"token": tokenization.printable_text(all_tokens[i]), 438 | "prob": float(np.exp(-word_loss[0])) }) 439 | sentence_loss += word_loss[0] 440 | word_count_per_sent += 1 441 | i += 1 442 | 443 | token_count_per_word = 0 444 | while is_subtoken(all_tokens[i]): 445 | token_count_per_word += 1 446 | tokens.append({"token": tokenization.printable_text(all_tokens[i]), 447 | "prob": float(np.exp(-word_loss[token_count_per_word]))}) 448 | sentence_loss += word_loss[token_count_per_word] 449 | i += 1 450 | 451 | # end of a sentence 452 | if all_tokens[i] == "[SEP]": 453 | sentence["tokens"] = tokens  # per-word token probabilities 454 | sentence["ppl"] = float(np.exp(sentence_loss / word_count_per_sent))  # perplexity of the sentence 455 | sentences.append(sentence) 456 | i += 1 457 | 458 | if output_file is not None: 459 | tf.logging.info("Saving results to %s" % output_file) 460 | writer.write(json.dumps(sentences, indent=2, ensure_ascii=False)) 461 | for sen in sentences: 462 | print("".join([sen["tokens"][i]["token"] for i in range(len(sen["tokens"]))])) 463 | print("ppl:", sen["ppl"]) 464 | 465 | def main(_): 466 | tf.logging.set_verbosity(tf.logging.INFO) 467 | 468 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 469 | 470 | if
FLAGS.max_seq_length > bert_config.max_position_embeddings: 471 | raise ValueError( 472 | "Cannot use sequence length %d because the BERT model " 473 | "was only trained up to sequence length %d" % 474 | (FLAGS.max_seq_length, bert_config.max_position_embeddings)) 475 | 476 | tf.gfile.MakeDirs(FLAGS.output_dir) 477 | 478 | tpu_cluster_resolver = None 479 | if FLAGS.use_tpu and FLAGS.tpu_name: 480 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( 481 | FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) 482 | 483 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 484 | run_config = tf.contrib.tpu.RunConfig( 485 | cluster=tpu_cluster_resolver, 486 | master=FLAGS.master, 487 | model_dir=FLAGS.output_dir, 488 | tpu_config=tf.contrib.tpu.TPUConfig( 489 | num_shards=FLAGS.num_tpu_cores, 490 | per_host_input_for_training=is_per_host)) 491 | 492 | 493 | model_fn = model_fn_builder( 494 | bert_config=bert_config, 495 | init_checkpoint=FLAGS.init_checkpoint, 496 | use_tpu=FLAGS.use_tpu, 497 | use_one_hot_embeddings=FLAGS.use_tpu) 498 | 499 | # If TPU is not available, this will fall back to normal Estimator on CPU 500 | # or GPU. 501 | estimator = tf.contrib.tpu.TPUEstimator( 502 | use_tpu=FLAGS.use_tpu, 503 | model_fn=model_fn, 504 | config=run_config, 505 | predict_batch_size=FLAGS.predict_batch_size) 506 | 507 | 508 | predict_examples = read_examples(FLAGS.input_file) 509 | features, all_tokens = convert_examples_to_features(predict_examples, 510 | FLAGS.max_seq_length, tokenizer) 511 | 512 | tf.logging.info("***** Running prediction*****") 513 | tf.logging.info(" Num examples = %d", len(predict_examples)) 514 | tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) 515 | 516 | if FLAGS.use_tpu: 517 | # Warning: According to tpu_estimator.py Prediction on TPU is an 518 | # experimental feature and hence not supported here 519 | raise ValueError("Prediction in TPU not supported") 520 | 521 | predict_input_fn = input_fn_builder( 522 | features=features, 523 | seq_length=FLAGS.max_seq_length, 524 | max_predictions_per_seq=FLAGS.max_predictions_per_seq) 525 | 526 | result = estimator.predict(input_fn=predict_input_fn) 527 | output_predict_file = os.path.join(FLAGS.output_dir, "test_results.json") 528 | parse_result(result, all_tokens, output_predict_file) 529 | 530 | if __name__ == "__main__": 531 | tf.app.run() 532 | -------------------------------------------------------------------------------- /run_pretraining.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
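The `test_results.json` that `parse_result` writes above is a list with one entry per input sentence; roughly the following shape, with illustrative values:

example_output = [
    {
        "tokens": [
            {"token": "there", "prob": 0.83},  # P(token | rest of sentence)
            {"token": "is", "prob": 0.91},
        ],
        "ppl": 3.47,                           # per-sentence perplexity
    },
]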
15 | """Run masked LM/next sentence masked_lm pre-training for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import modeling 23 | import optimization 24 | import tensorflow as tf 25 | 26 | flags = tf.flags 27 | 28 | FLAGS = flags.FLAGS 29 | 30 | ## Required parameters 31 | flags.DEFINE_string( 32 | "bert_config_file", None, 33 | "The config json file corresponding to the pre-trained BERT model. " 34 | "This specifies the model architecture.") 35 | 36 | flags.DEFINE_string( 37 | "input_file", None, 38 | "Input TF example files (can be a glob or comma separated).") 39 | 40 | flags.DEFINE_string( 41 | "output_dir", None, 42 | "The output directory where the model checkpoints will be written.") 43 | 44 | ## Other parameters 45 | flags.DEFINE_string( 46 | "init_checkpoint", None, 47 | "Initial checkpoint (usually from a pre-trained BERT model).") 48 | 49 | flags.DEFINE_integer( 50 | "max_seq_length", 128, 51 | "The maximum total input sequence length after WordPiece tokenization. " 52 | "Sequences longer than this will be truncated, and sequences shorter " 53 | "than this will be padded. Must match data generation.") 54 | 55 | flags.DEFINE_integer( 56 | "max_predictions_per_seq", 20, 57 | "Maximum number of masked LM predictions per sequence. " 58 | "Must match data generation.") 59 | 60 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 61 | 62 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 63 | 64 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 65 | 66 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 67 | 68 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 69 | 70 | flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") 71 | 72 | flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") 73 | 74 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 75 | "How often to save the model checkpoint.") 76 | 77 | flags.DEFINE_integer("iterations_per_loop", 1000, 78 | "How many steps to make in each estimator call.") 79 | 80 | flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") 81 | 82 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 83 | 84 | tf.flags.DEFINE_string( 85 | "tpu_name", None, 86 | "The Cloud TPU to use for training. This should be either the name " 87 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 88 | "url.") 89 | 90 | tf.flags.DEFINE_string( 91 | "tpu_zone", None, 92 | "[Optional] GCE zone where the Cloud TPU is located in. If not " 93 | "specified, we will attempt to automatically detect the GCE project from " 94 | "metadata.") 95 | 96 | tf.flags.DEFINE_string( 97 | "gcp_project", None, 98 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 99 | "specified, we will attempt to automatically detect the GCE project from " 100 | "metadata.") 101 | 102 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 103 | 104 | flags.DEFINE_integer( 105 | "num_tpu_cores", 8, 106 | "Only used if `use_tpu` is True. 
Total number of TPU cores to use.") 107 | 108 | 109 | def model_fn_builder(bert_config, init_checkpoint, learning_rate, 110 | num_train_steps, num_warmup_steps, use_tpu, 111 | use_one_hot_embeddings): 112 | """Returns `model_fn` closure for TPUEstimator.""" 113 | 114 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 115 | """The `model_fn` for TPUEstimator.""" 116 | 117 | tf.logging.info("*** Features ***") 118 | for name in sorted(features.keys()): 119 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 120 | 121 | input_ids = features["input_ids"] 122 | input_mask = features["input_mask"] 123 | segment_ids = features["segment_ids"] 124 | masked_lm_positions = features["masked_lm_positions"] 125 | masked_lm_ids = features["masked_lm_ids"] 126 | masked_lm_weights = features["masked_lm_weights"] 127 | next_sentence_labels = features["next_sentence_labels"] 128 | 129 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 130 | 131 | model = modeling.BertModel( 132 | config=bert_config, 133 | is_training=is_training, 134 | input_ids=input_ids, 135 | input_mask=input_mask, 136 | token_type_ids=segment_ids, 137 | use_one_hot_embeddings=use_one_hot_embeddings) 138 | 139 | (masked_lm_loss, 140 | masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( 141 | bert_config, model.get_sequence_output(), model.get_embedding_table(), 142 | masked_lm_positions, masked_lm_ids, masked_lm_weights) 143 | 144 | (next_sentence_loss, next_sentence_example_loss, 145 | next_sentence_log_probs) = get_next_sentence_output( 146 | bert_config, model.get_pooled_output(), next_sentence_labels) 147 | 148 | total_loss = masked_lm_loss + next_sentence_loss 149 | 150 | tvars = tf.trainable_variables() 151 | 152 | initialized_variable_names = {} 153 | scaffold_fn = None 154 | if init_checkpoint: 155 | (assignment_map, initialized_variable_names 156 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 157 | if use_tpu: 158 | 159 | def tpu_scaffold(): 160 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 161 | return tf.train.Scaffold() 162 | 163 | scaffold_fn = tpu_scaffold 164 | else: 165 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 166 | 167 | tf.logging.info("**** Trainable Variables ****") 168 | for var in tvars: 169 | init_string = "" 170 | if var.name in initialized_variable_names: 171 | init_string = ", *INIT_FROM_CKPT*" 172 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 173 | init_string) 174 | 175 | output_spec = None 176 | if mode == tf.estimator.ModeKeys.TRAIN: 177 | train_op = optimization.create_optimizer( 178 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 179 | 180 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 181 | mode=mode, 182 | loss=total_loss, 183 | train_op=train_op, 184 | scaffold_fn=scaffold_fn) 185 | elif mode == tf.estimator.ModeKeys.EVAL: 186 | 187 | def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, 188 | masked_lm_weights, next_sentence_example_loss, 189 | next_sentence_log_probs, next_sentence_labels): 190 | """Computes the loss and accuracy of the model.""" 191 | masked_lm_log_probs = tf.reshape(masked_lm_log_probs, 192 | [-1, masked_lm_log_probs.shape[-1]]) 193 | masked_lm_predictions = tf.argmax( 194 | masked_lm_log_probs, axis=-1, output_type=tf.int32) 195 | masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) 196 | masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) 197 | masked_lm_weights = 
tf.reshape(masked_lm_weights, [-1]) 198 | masked_lm_accuracy = tf.metrics.accuracy( 199 | labels=masked_lm_ids, 200 | predictions=masked_lm_predictions, 201 | weights=masked_lm_weights) 202 | masked_lm_mean_loss = tf.metrics.mean( 203 | values=masked_lm_example_loss, weights=masked_lm_weights) 204 | 205 | next_sentence_log_probs = tf.reshape( 206 | next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) 207 | next_sentence_predictions = tf.argmax( 208 | next_sentence_log_probs, axis=-1, output_type=tf.int32) 209 | next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) 210 | next_sentence_accuracy = tf.metrics.accuracy( 211 | labels=next_sentence_labels, predictions=next_sentence_predictions) 212 | next_sentence_mean_loss = tf.metrics.mean( 213 | values=next_sentence_example_loss) 214 | 215 | return { 216 | "masked_lm_accuracy": masked_lm_accuracy, 217 | "masked_lm_loss": masked_lm_mean_loss, 218 | "next_sentence_accuracy": next_sentence_accuracy, 219 | "next_sentence_loss": next_sentence_mean_loss, 220 | } 221 | 222 | eval_metrics = (metric_fn, [ 223 | masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, 224 | masked_lm_weights, next_sentence_example_loss, 225 | next_sentence_log_probs, next_sentence_labels 226 | ]) 227 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 228 | mode=mode, 229 | loss=total_loss, 230 | eval_metrics=eval_metrics, 231 | scaffold_fn=scaffold_fn) 232 | else: 233 | raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) 234 | 235 | return output_spec 236 | 237 | return model_fn 238 | 239 | 240 | def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, 241 | label_ids, label_weights): 242 | """Get loss and log probs for the masked LM.""" 243 | input_tensor = gather_indexes(input_tensor, positions) 244 | 245 | with tf.variable_scope("cls/predictions"): 246 | # We apply one more non-linear transformation before the output layer. 247 | # This matrix is not used after pre-training. 248 | with tf.variable_scope("transform"): 249 | input_tensor = tf.layers.dense( 250 | input_tensor, 251 | units=bert_config.hidden_size, 252 | activation=modeling.get_activation(bert_config.hidden_act), 253 | kernel_initializer=modeling.create_initializer( 254 | bert_config.initializer_range)) 255 | input_tensor = modeling.layer_norm(input_tensor) 256 | 257 | # The output weights are the same as the input embeddings, but there is 258 | # an output-only bias for each token. 259 | output_bias = tf.get_variable( 260 | "output_bias", 261 | shape=[bert_config.vocab_size], 262 | initializer=tf.zeros_initializer()) 263 | logits = tf.matmul(input_tensor, output_weights, transpose_b=True) 264 | logits = tf.nn.bias_add(logits, output_bias) 265 | log_probs = tf.nn.log_softmax(logits, axis=-1) 266 | 267 | label_ids = tf.reshape(label_ids, [-1]) 268 | label_weights = tf.reshape(label_weights, [-1]) 269 | 270 | one_hot_labels = tf.one_hot( 271 | label_ids, depth=bert_config.vocab_size, dtype=tf.float32) 272 | 273 | # The `positions` tensor might be zero-padded (if the sequence is too 274 | # short to have the maximum number of predictions). The `label_weights` 275 | # tensor has a value of 1.0 for every real prediction and 0.0 for the 276 | # padding predictions. 
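    # For example, with max_predictions_per_seq = 20 but only 17 real masked
    # positions in a sequence, the last 3 weights are 0.0: those slots add
    # nothing to the numerator below and the denominator sums to 17.0, so the
    # loss is the mean cross-entropy over real predictions only. The 1e-5 term
    # simply guards against division by zero when a batch contains no real
    # predictions.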
277 | per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) 278 | numerator = tf.reduce_sum(label_weights * per_example_loss) 279 | denominator = tf.reduce_sum(label_weights) + 1e-5 280 | loss = numerator / denominator 281 | 282 | return (loss, per_example_loss, log_probs) 283 | 284 | 285 | def get_next_sentence_output(bert_config, input_tensor, labels): 286 | """Get loss and log probs for the next sentence prediction.""" 287 | 288 | # Simple binary classification. Note that 0 is "next sentence" and 1 is 289 | # "random sentence". This weight matrix is not used after pre-training. 290 | with tf.variable_scope("cls/seq_relationship"): 291 | output_weights = tf.get_variable( 292 | "output_weights", 293 | shape=[2, bert_config.hidden_size], 294 | initializer=modeling.create_initializer(bert_config.initializer_range)) 295 | output_bias = tf.get_variable( 296 | "output_bias", shape=[2], initializer=tf.zeros_initializer()) 297 | 298 | logits = tf.matmul(input_tensor, output_weights, transpose_b=True) 299 | logits = tf.nn.bias_add(logits, output_bias) 300 | log_probs = tf.nn.log_softmax(logits, axis=-1) 301 | labels = tf.reshape(labels, [-1]) 302 | one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) 303 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 304 | loss = tf.reduce_mean(per_example_loss) 305 | return (loss, per_example_loss, log_probs) 306 | 307 | 308 | def gather_indexes(sequence_tensor, positions): 309 | """Gathers the vectors at the specific positions over a minibatch.""" 310 | sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) 311 | batch_size = sequence_shape[0] 312 | seq_length = sequence_shape[1] 313 | width = sequence_shape[2] 314 | 315 | flat_offsets = tf.reshape( 316 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) 317 | flat_positions = tf.reshape(positions + flat_offsets, [-1]) 318 | flat_sequence_tensor = tf.reshape(sequence_tensor, 319 | [batch_size * seq_length, width]) 320 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions) 321 | return output_tensor 322 | 323 | 324 | def input_fn_builder(input_files, 325 | max_seq_length, 326 | max_predictions_per_seq, 327 | is_training, 328 | num_cpu_threads=4): 329 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 330 | 331 | def input_fn(params): 332 | """The actual input function.""" 333 | batch_size = params["batch_size"] 334 | 335 | name_to_features = { 336 | "input_ids": 337 | tf.FixedLenFeature([max_seq_length], tf.int64), 338 | "input_mask": 339 | tf.FixedLenFeature([max_seq_length], tf.int64), 340 | "segment_ids": 341 | tf.FixedLenFeature([max_seq_length], tf.int64), 342 | "masked_lm_positions": 343 | tf.FixedLenFeature([max_predictions_per_seq], tf.int64), 344 | "masked_lm_ids": 345 | tf.FixedLenFeature([max_predictions_per_seq], tf.int64), 346 | "masked_lm_weights": 347 | tf.FixedLenFeature([max_predictions_per_seq], tf.float32), 348 | "next_sentence_labels": 349 | tf.FixedLenFeature([1], tf.int64), 350 | } 351 | 352 | # For training, we want a lot of parallel reading and shuffling. 353 | # For eval, we want no shuffling and parallel reading doesn't matter. 354 | if is_training: 355 | d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) 356 | d = d.repeat() 357 | d = d.shuffle(buffer_size=len(input_files)) 358 | 359 | # `cycle_length` is the number of parallel files that get read. 
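      # With the default num_cpu_threads=4 and, say, ten input shards, four
      # files are read in parallel and their records interleaved. Because
      # `sloppy` is True during training, the interleave emits records from
      # whichever reader is ready first, adding non-deterministic shuffling on
      # top of the two explicit shuffle() calls.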
360 | cycle_length = min(num_cpu_threads, len(input_files)) 361 | 362 | # `sloppy` mode means that the interleaving is not exact. This adds 363 | # even more randomness to the training pipeline. 364 | d = d.apply( 365 | tf.contrib.data.parallel_interleave( 366 | tf.data.TFRecordDataset, 367 | sloppy=is_training, 368 | cycle_length=cycle_length)) 369 | d = d.shuffle(buffer_size=100) 370 | else: 371 | d = tf.data.TFRecordDataset(input_files) 372 | # Since we evaluate for a fixed number of steps, we don't want to encounter 373 | # out-of-range exceptions. 374 | d = d.repeat() 375 | 376 | # We must `drop_remainder` on training because the TPU requires fixed 377 | # size dimensions. For eval, we assume we are evaluating on the CPU or GPU 378 | # and we *don't* want to drop the remainder, otherwise we won't cover 379 | # every sample. 380 | d = d.apply( 381 | tf.contrib.data.map_and_batch( 382 | lambda record: _decode_record(record, name_to_features), 383 | batch_size=batch_size, 384 | num_parallel_batches=num_cpu_threads, 385 | drop_remainder=True)) 386 | return d 387 | 388 | return input_fn 389 | 390 | 391 | def _decode_record(record, name_to_features): 392 | """Decodes a record to a TensorFlow example.""" 393 | example = tf.parse_single_example(record, name_to_features) 394 | 395 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 396 | # So cast all int64 to int32. 397 | for name in list(example.keys()): 398 | t = example[name] 399 | if t.dtype == tf.int64: 400 | t = tf.to_int32(t) 401 | example[name] = t 402 | 403 | return example 404 | 405 | 406 | def main(_): 407 | tf.logging.set_verbosity(tf.logging.INFO) 408 | 409 | if not FLAGS.do_train and not FLAGS.do_eval: 410 | raise ValueError("At least one of `do_train` or `do_eval` must be True.") 411 | 412 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 413 | 414 | tf.gfile.MakeDirs(FLAGS.output_dir) 415 | 416 | input_files = [] 417 | for input_pattern in FLAGS.input_file.split(","): 418 | input_files.extend(tf.gfile.Glob(input_pattern)) 419 | 420 | tf.logging.info("*** Input Files ***") 421 | for input_file in input_files: 422 | tf.logging.info(" %s" % input_file) 423 | 424 | tpu_cluster_resolver = None 425 | if FLAGS.use_tpu and FLAGS.tpu_name: 426 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( 427 | FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) 428 | 429 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 430 | run_config = tf.contrib.tpu.RunConfig( 431 | cluster=tpu_cluster_resolver, 432 | master=FLAGS.master, 433 | model_dir=FLAGS.output_dir, 434 | save_checkpoints_steps=FLAGS.save_checkpoints_steps, 435 | tpu_config=tf.contrib.tpu.TPUConfig( 436 | iterations_per_loop=FLAGS.iterations_per_loop, 437 | num_shards=FLAGS.num_tpu_cores, 438 | per_host_input_for_training=is_per_host)) 439 | 440 | model_fn = model_fn_builder( 441 | bert_config=bert_config, 442 | init_checkpoint=FLAGS.init_checkpoint, 443 | learning_rate=FLAGS.learning_rate, 444 | num_train_steps=FLAGS.num_train_steps, 445 | num_warmup_steps=FLAGS.num_warmup_steps, 446 | use_tpu=FLAGS.use_tpu, 447 | use_one_hot_embeddings=FLAGS.use_tpu) 448 | 449 | # If TPU is not available, this will fall back to normal Estimator on CPU 450 | # or GPU.
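  # TPUEstimator passes `train_batch_size` or `eval_batch_size` to `input_fn`
  # through params["batch_size"] (depending on the mode), which is why
  # `input_fn_builder` reads the batch size from `params` rather than from a
  # flag.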
451 | estimator = tf.contrib.tpu.TPUEstimator( 452 | use_tpu=FLAGS.use_tpu, 453 | model_fn=model_fn, 454 | config=run_config, 455 | train_batch_size=FLAGS.train_batch_size, 456 | eval_batch_size=FLAGS.eval_batch_size) 457 | 458 | if FLAGS.do_train: 459 | tf.logging.info("***** Running training *****") 460 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 461 | train_input_fn = input_fn_builder( 462 | input_files=input_files, 463 | max_seq_length=FLAGS.max_seq_length, 464 | max_predictions_per_seq=FLAGS.max_predictions_per_seq, 465 | is_training=True) 466 | estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) 467 | 468 | if FLAGS.do_eval: 469 | tf.logging.info("***** Running evaluation *****") 470 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 471 | 472 | eval_input_fn = input_fn_builder( 473 | input_files=input_files, 474 | max_seq_length=FLAGS.max_seq_length, 475 | max_predictions_per_seq=FLAGS.max_predictions_per_seq, 476 | is_training=False) 477 | 478 | result = estimator.evaluate( 479 | input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) 480 | 481 | output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") 482 | with tf.gfile.GFile(output_eval_file, "w") as writer: 483 | tf.logging.info("***** Eval results *****") 484 | for key in sorted(result.keys()): 485 | tf.logging.info(" %s = %s", key, str(result[key])) 486 | writer.write("%s = %s\n" % (key, str(result[key]))) 487 | 488 | 489 | if __name__ == "__main__": 490 | flags.mark_flag_as_required("input_file") 491 | flags.mark_flag_as_required("bert_config_file") 492 | flags.mark_flag_as_required("output_dir") 493 | tf.app.run() 494 | -------------------------------------------------------------------------------- /sample_text.txt: -------------------------------------------------------------------------------- 1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 2 | Text should be one-sentence-per-line, with empty lines between documents. 3 | This sample text is public domain and was randomly selected from Project Guttenberg. 4 | 5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. 8 | "Cass" Beard had risen early that morning, but not with a view to discovery. 9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. 10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. 11 | This was nearly opposite. 12 | Mr. Cassius crossed the highway, and stopped suddenly. 13 | Something glittered in the nearest red pool before him. 14 | Gold, surely! 
15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. 16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass." 17 | Like most of his fellow gold-seekers, Cass was superstitious. 18 | 19 | The fountain of classic wisdom, Hypatia herself. 20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. 21 | From my youth I felt in me a soul above the matter-entangled herd. 22 | She revealed to me the glorious fact, that I am a spark of Divinity itself. 23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. 24 | There is a philosophic pleasure in opening one's treasures to the modest young. 25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. 26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; 27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. 28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. 29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; 30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. 31 | At last they reached the quay at the opposite end of the street; 32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. 33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. 
34 | -------------------------------------------------------------------------------- /tmp/utils.py: -------------------------------------------------------------------------------- 1 | """Filters LM prediction results, keeping only sentences with perplexity <= 10.""" 2 | import json 3 | 4 | # Each entry of test_results.json has a "tokens" list (each item carrying a 5 | # "token" string) and a sentence-level perplexity under "ppl". 6 | with open("./lm_output/test_results.json") as f: 7 | results = json.load(f) 8 | 9 | # Keep only the fluent sentences, i.e. those whose perplexity is at most 10. 10 | sentences = {} 11 | for sen in results: 12 | s = "".join(item["token"] for item in sen["tokens"]) 13 | if sen["ppl"] <= 10: 14 | sentences[s] = sen["ppl"] 15 | 16 | print(len(sentences)) -------------------------------------------------------------------------------- /tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | import tensorflow as tf 25 | 26 | 27 | def convert_to_unicode(text): 28 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 29 | if six.PY3: 30 | if isinstance(text, str): 31 | return text 32 | elif isinstance(text, bytes): 33 | return text.decode("utf-8", "ignore") 34 | else: 35 | raise ValueError("Unsupported string type: %s" % (type(text))) 36 | elif six.PY2: 37 | if isinstance(text, str): 38 | return text.decode("utf-8", "ignore") 39 | elif isinstance(text, unicode): 40 | return text 41 | else: 42 | raise ValueError("Unsupported string type: %s" % (type(text))) 43 | else: 44 | raise ValueError("Not running on Python 2 or Python 3?") 45 | 46 | 47 | def printable_text(text): 48 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 49 | 50 | # These functions want `str` for both Python 2 and Python 3, but in one case 51 | # it's a Unicode string and in the other it's a byte string.
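  # For example, printable_text(u"\u535A") returns the Unicode str "博" under
  # Python 3, while under Python 2 the same input is encoded back to a UTF-8
  # byte string so it can be passed safely to print or tf.logging.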
52 | if six.PY3: 53 | if isinstance(text, str): 54 | return text 55 | elif isinstance(text, bytes): 56 | return text.decode("utf-8", "ignore") 57 | else: 58 | raise ValueError("Unsupported string type: %s" % (type(text))) 59 | elif six.PY2: 60 | if isinstance(text, str): 61 | return text 62 | elif isinstance(text, unicode): 63 | return text.encode("utf-8") 64 | else: 65 | raise ValueError("Unsupported string type: %s" % (type(text))) 66 | else: 67 | raise ValueError("Not running on Python 2 or Python 3?") 68 | 69 | 70 | def load_vocab(vocab_file): 71 | """Loads a vocabulary file into a dictionary.""" 72 | vocab = collections.OrderedDict() 73 | index = 0 74 | with tf.gfile.GFile(vocab_file, "r") as reader: 75 | while True: 76 | token = convert_to_unicode(reader.readline()) 77 | if not token: 78 | break 79 | token = token.strip() 80 | vocab[token] = index 81 | index += 1 82 | return vocab 83 | 84 | 85 | def convert_by_vocab(vocab, items): 86 | """Converts a sequence of [tokens|ids] using the vocab.""" 87 | output = [] 88 | for item in items: 89 | output.append(vocab[item]) 90 | return output 91 | 92 | 93 | def convert_tokens_to_ids(vocab, tokens): 94 | return convert_by_vocab(vocab, tokens) 95 | 96 | 97 | def convert_ids_to_tokens(inv_vocab, ids): 98 | return convert_by_vocab(inv_vocab, ids) 99 | 100 | 101 | def whitespace_tokenize(text): 102 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 103 | text = text.strip() 104 | if not text: 105 | return [] 106 | tokens = text.split() 107 | return tokens 108 | 109 | 110 | class FullTokenizer(object): 111 | """Runs end-to-end tokenization.""" 112 | 113 | def __init__(self, vocab_file, do_lower_case=True): 114 | self.vocab = load_vocab(vocab_file) 115 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 116 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 117 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 118 | 119 | def tokenize(self, text): 120 | split_tokens = [] 121 | for token in self.basic_tokenizer.tokenize(text): 122 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 123 | split_tokens.append(sub_token) 124 | 125 | return split_tokens 126 | 127 | def convert_tokens_to_ids(self, tokens): 128 | return convert_by_vocab(self.vocab, tokens) 129 | 130 | def convert_ids_to_tokens(self, ids): 131 | return convert_by_vocab(self.inv_vocab, ids) 132 | 133 | 134 | class BasicTokenizer(object): 135 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 136 | 137 | def __init__(self, do_lower_case=True): 138 | """Constructs a BasicTokenizer. 139 | 140 | Args: 141 | do_lower_case: Whether to lower case the input. 142 | """ 143 | self.do_lower_case = do_lower_case 144 | 145 | def tokenize(self, text): 146 | """Tokenizes a piece of text.""" 147 | text = convert_to_unicode(text) 148 | text = self._clean_text(text) 149 | 150 | # This was added on November 1st, 2018 for the multilingual and Chinese 151 | # models. This is also applied to the English models now, but it doesn't 152 | # matter since the English models were not trained on any Chinese data 153 | # and generally don't have any Chinese data in them (there are Chinese 154 | # characters in the vocabulary because Wikipedia does have some Chinese 155 | # words in the English Wikipedia).
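    # As a result each CJK ideograph becomes its own token: for example,
    # u"ah\u535A\u63A8zz" has spaces added around each ideograph here, so the
    # whitespace split below yields ["ah", u"\u535A", u"\u63A8", "zz"] -- the
    # behaviour exercised by test_chinese in tokenization_test.py.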
156 | text = self._tokenize_chinese_chars(text) 157 | 158 | orig_tokens = whitespace_tokenize(text) 159 | split_tokens = [] 160 | for token in orig_tokens: 161 | if self.do_lower_case: 162 | token = token.lower() 163 | token = self._run_strip_accents(token) 164 | split_tokens.extend(self._run_split_on_punc(token)) 165 | 166 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 167 | return output_tokens 168 | 169 | def _run_strip_accents(self, text): 170 | """Strips accents from a piece of text.""" 171 | text = unicodedata.normalize("NFD", text) 172 | output = [] 173 | for char in text: 174 | cat = unicodedata.category(char) 175 | if cat == "Mn": 176 | continue 177 | output.append(char) 178 | return "".join(output) 179 | 180 | def _run_split_on_punc(self, text): 181 | """Splits punctuation on a piece of text.""" 182 | chars = list(text) 183 | i = 0 184 | start_new_word = True 185 | output = [] 186 | while i < len(chars): 187 | char = chars[i] 188 | if _is_punctuation(char): 189 | output.append([char]) 190 | start_new_word = True 191 | else: 192 | if start_new_word: 193 | output.append([]) 194 | start_new_word = False 195 | output[-1].append(char) 196 | i += 1 197 | 198 | return ["".join(x) for x in output] 199 | 200 | def _tokenize_chinese_chars(self, text): 201 | """Adds whitespace around any CJK character.""" 202 | output = [] 203 | for char in text: 204 | cp = ord(char) 205 | if self._is_chinese_char(cp): 206 | output.append(" ") 207 | output.append(char) 208 | output.append(" ") 209 | else: 210 | output.append(char) 211 | return "".join(output) 212 | 213 | def _is_chinese_char(self, cp): 214 | """Checks whether CP is the codepoint of a CJK character.""" 215 | # This defines a "chinese character" as anything in the CJK Unicode block: 216 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 217 | # 218 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 219 | # despite its name. The modern Korean Hangul alphabet is a different block, 220 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 221 | # space-separated words, so they are not treated specially and handled 222 | # like all of the other languages. 223 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 224 | (cp >= 0x3400 and cp <= 0x4DBF) or # 225 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 226 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 227 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 228 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 229 | (cp >= 0xF900 and cp <= 0xFAFF) or # 230 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 231 | return True 232 | 233 | return False 234 | 235 | def _clean_text(self, text): 236 | """Performs invalid character removal and whitespace cleanup on text.""" 237 | output = [] 238 | for char in text: 239 | cp = ord(char) 240 | if cp == 0 or cp == 0xfffd or _is_control(char): 241 | continue 242 | if _is_whitespace(char): 243 | output.append(" ") 244 | else: 245 | output.append(char) 246 | return "".join(output) 247 | 248 | 249 | class WordpieceTokenizer(object): 250 | """Runs WordPiece tokenization.""" 251 | 252 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): 253 | self.vocab = vocab 254 | self.unk_token = unk_token 255 | self.max_input_chars_per_word = max_input_chars_per_word 256 | 257 | def tokenize(self, text): 258 | """Tokenizes a piece of text into its word pieces. 259 | 260 | This uses a greedy longest-match-first algorithm to perform tokenization 261 | using the given vocabulary.
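    A token that cannot be fully decomposed into in-vocabulary pieces (or that
    is longer than max_input_chars_per_word characters) is mapped to the
    `unk_token` rather than being partially split.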
262 | 263 | For example: 264 | input = "unaffable" 265 | output = ["un", "##aff", "##able"] 266 | 267 | Args: 268 | text: A single token or whitespace separated tokens. This should have 269 | already been passed through `BasicTokenizer`. 270 | 271 | Returns: 272 | A list of wordpiece tokens. 273 | """ 274 | 275 | text = convert_to_unicode(text) 276 | 277 | output_tokens = [] 278 | for token in whitespace_tokenize(text): 279 | chars = list(token) 280 | if len(chars) > self.max_input_chars_per_word: 281 | output_tokens.append(self.unk_token) 282 | continue 283 | 284 | is_bad = False 285 | start = 0 286 | sub_tokens = [] 287 | while start < len(chars): 288 | end = len(chars) 289 | cur_substr = None 290 | while start < end: 291 | substr = "".join(chars[start:end]) 292 | if start > 0: 293 | substr = "##" + substr 294 | if substr in self.vocab: 295 | cur_substr = substr 296 | break 297 | end -= 1 298 | if cur_substr is None: 299 | is_bad = True 300 | break 301 | sub_tokens.append(cur_substr) 302 | start = end 303 | 304 | if is_bad: 305 | output_tokens.append(self.unk_token) 306 | else: 307 | output_tokens.extend(sub_tokens) 308 | return output_tokens 309 | 310 | 311 | def _is_whitespace(char): 312 | """Checks whether `char` is a whitespace character.""" 313 | # \t, \n, and \r are technically control characters but we treat them 314 | # as whitespace since they are generally considered as such. 315 | if char == " " or char == "\t" or char == "\n" or char == "\r": 316 | return True 317 | cat = unicodedata.category(char) 318 | if cat == "Zs": 319 | return True 320 | return False 321 | 322 | 323 | def _is_control(char): 324 | """Checks whether `char` is a control character.""" 325 | # These are technically control characters but we count them as whitespace 326 | # characters. 327 | if char == "\t" or char == "\n" or char == "\r": 328 | return False 329 | cat = unicodedata.category(char) 330 | if cat.startswith("C"): 331 | return True 332 | return False 333 | 334 | 335 | def _is_punctuation(char): 336 | """Checks whether `char` is a punctuation character.""" 337 | cp = ord(char) 338 | # We treat all non-letter/number ASCII as punctuation. 339 | # Characters such as "^", "$", and "`" are not in the Unicode 340 | # Punctuation class but we treat them as punctuation anyways, for 341 | # consistency. 342 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 343 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 344 | return True 345 | cat = unicodedata.category(char) 346 | if cat.startswith("P"): 347 | return True 348 | return False 349 | -------------------------------------------------------------------------------- /tokenization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
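# A minimal usage sketch of the module under test; the vocab path here is
# hypothetical, any file with one token per line will do:
#
#   tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt",
#                                          do_lower_case=True)
#   tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
#   # e.g. ["un", "##want", "##ed", ",", "runn", "##ing"] with the vocab used
#   # in test_full_tokenizer below
#   ids = tokenizer.convert_tokens_to_ids(tokens)  # e.g. [7, 4, 5, 10, 8, 9]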
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import os 20 | import tempfile 21 | 22 | import tokenization 23 | import tensorflow as tf 24 | 25 | 26 | class TokenizationTest(tf.test.TestCase): 27 | 28 | def test_full_tokenizer(self): 29 | vocab_tokens = [ 30 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 31 | "##ing", "," 32 | ] 33 | with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: 34 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 35 | 36 | vocab_file = vocab_writer.name 37 | 38 | tokenizer = tokenization.FullTokenizer(vocab_file) 39 | os.unlink(vocab_file) 40 | 41 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") 42 | self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 43 | 44 | self.assertAllEqual( 45 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 46 | 47 | def test_chinese(self): 48 | tokenizer = tokenization.BasicTokenizer() 49 | 50 | self.assertAllEqual( 51 | tokenizer.tokenize(u"ah\u535A\u63A8zz"), 52 | [u"ah", u"\u535A", u"\u63A8", u"zz"]) 53 | 54 | def test_basic_tokenizer_lower(self): 55 | tokenizer = tokenization.BasicTokenizer(do_lower_case=True) 56 | 57 | self.assertAllEqual( 58 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 59 | ["hello", "!", "how", "are", "you", "?"]) 60 | self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 61 | 62 | def test_basic_tokenizer_no_lower(self): 63 | tokenizer = tokenization.BasicTokenizer(do_lower_case=False) 64 | 65 | self.assertAllEqual( 66 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 67 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 68 | 69 | def test_wordpiece_tokenizer(self): 70 | vocab_tokens = [ 71 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 72 | "##ing" 73 | ] 74 | 75 | vocab = {} 76 | for (i, token) in enumerate(vocab_tokens): 77 | vocab[token] = i 78 | tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) 79 | 80 | self.assertAllEqual(tokenizer.tokenize(""), []) 81 | 82 | self.assertAllEqual( 83 | tokenizer.tokenize("unwanted running"), 84 | ["un", "##want", "##ed", "runn", "##ing"]) 85 | 86 | self.assertAllEqual( 87 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) 88 | 89 | def test_convert_tokens_to_ids(self): 90 | vocab_tokens = [ 91 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 92 | "##ing" 93 | ] 94 | 95 | vocab = {} 96 | for (i, token) in enumerate(vocab_tokens): 97 | vocab[token] = i 98 | 99 | self.assertAllEqual( 100 | tokenization.convert_tokens_to_ids( 101 | vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) 102 | 103 | def test_is_whitespace(self): 104 | self.assertTrue(tokenization._is_whitespace(u" ")) 105 | self.assertTrue(tokenization._is_whitespace(u"\t")) 106 | self.assertTrue(tokenization._is_whitespace(u"\r")) 107 | self.assertTrue(tokenization._is_whitespace(u"\n")) 108 | self.assertTrue(tokenization._is_whitespace(u"\u00A0")) 109 | 110 | self.assertFalse(tokenization._is_whitespace(u"A")) 111 | self.assertFalse(tokenization._is_whitespace(u"-")) 112 | 113 | def test_is_control(self): 114 | self.assertTrue(tokenization._is_control(u"\u0005")) 115 | 116 | self.assertFalse(tokenization._is_control(u"A")) 117 | self.assertFalse(tokenization._is_control(u" ")) 118 | self.assertFalse(tokenization._is_control(u"\t")) 119 | self.assertFalse(tokenization._is_control(u"\r")) 120 | 121 | def 
test_is_punctuation(self): 122 | self.assertTrue(tokenization._is_punctuation(u"-")) 123 | self.assertTrue(tokenization._is_punctuation(u"$")) 124 | self.assertTrue(tokenization._is_punctuation(u"`")) 125 | self.assertTrue(tokenization._is_punctuation(u".")) 126 | 127 | self.assertFalse(tokenization._is_punctuation(u"A")) 128 | self.assertFalse(tokenization._is_punctuation(u" ")) 129 | 130 | 131 | if __name__ == "__main__": 132 | tf.test.main() 133 | --------------------------------------------------------------------------------