├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── __init__.py ├── albert_tsim ├── README.md ├── __init__.py ├── albert_config │ ├── albert_config_base.json │ ├── albert_config_base_google_fast.json │ ├── albert_config_large.json │ ├── albert_config_small_google.json │ ├── albert_config_tiny.json │ ├── albert_config_tiny_google.json │ ├── albert_config_tiny_google_fast.json │ ├── albert_config_xlarge.json │ ├── albert_config_xxlarge.json │ ├── bert_config.json │ └── vocab.txt ├── args.py ├── bert_utils.py ├── create_pretrain_data.sh ├── create_pretraining_data.py ├── create_pretraining_data_google.py ├── data │ └── news_zh_1.txt ├── freezeGraph.py ├── freeze_graph.py ├── lamb_optimizer_google.py ├── lcqmc │ ├── dev.txt │ ├── test.txt │ └── train.txt ├── modeling.py ├── modeling_google.py ├── modeling_google_fast.py ├── optimization.py ├── optimization_finetuning.py ├── optimization_google.py ├── predict.py ├── resources │ ├── add_data_removing_dropout.jpg │ ├── albert_configuration.jpg │ ├── albert_large_zh_parameters.jpg │ ├── albert_performance.jpg │ ├── albert_tiny_compare_s.jpg │ ├── albert_tiny_compare_s_old.jpg │ ├── create_pretraining_data_roberta.py │ ├── crmc2018_compare_s.jpg │ ├── shell_scripts │ │ └── create_pretrain_data_batch_webtext.sh │ ├── state_of_the_art.jpg │ └── xlarge_loss.jpg ├── run_classifier.py ├── run_classifier_lcqmc.sh ├── run_classifier_sp_google.py ├── run_pretraining.py ├── run_pretraining_google.py ├── run_pretraining_google_fast.py ├── similarity.py ├── similarity_bert.py ├── test_changes.py ├── tokenization.py └── tokenization_google.py ├── bert_related_resources.md ├── bert_tsim ├── README.md ├── __init__.py ├── args.py ├── data │ ├── dev.csv │ └── train.csv ├── extract_feature.py ├── graph.py ├── modeling.py ├── optimization.py ├── requirements.txt ├── similarity.py └── tokenization.py ├── create_pretraining_data.py ├── data ├── test.csv ├── train.csv └── val.csv ├── export.py ├── export.sh ├── extract_features.py ├── modeling.py ├── modeling_test.py ├── multilingual.md ├── optimization.py ├── optimization_test.py ├── predict.sh ├── predicting_movie_reviews_with_bert_on_tf_hub.ipynb ├── requirements.txt ├── run_classifier.py ├── run_classifier_pb.py ├── run_classifier_predict_online.py ├── run_classifier_with_tfhub.py ├── run_pretraining.py ├── run_regression.py ├── run_squad.py ├── sample_text.txt ├── tokenization.py ├── tokenization_test.py └── train.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | BERT needs to maintain permanent compatibility with the pre-trained model files, 4 | so we do not plan to make any major changes to this library (other than what was 5 | promised in the README). However, we can accept small patches related to 6 | re-factoring and documentation. To submit contributions, there are just a few 7 | small guidelines you need to follow. 8 | 9 | ## Contributor License Agreement 10 | 11 | Contributions to this project must be accompanied by a Contributor License 12 | Agreement. You (or your employer) retain the copyright to your contribution; 13 | this simply gives us permission to use and redistribute your contributions as 14 | part of the project. Head over to https://cla.developers.google.com/ to see 15 | your current agreements on file or to sign a new one. 16 | 17 | You generally only need to submit a CLA once, so if you've already submitted one 18 | (even if it was for a different project), you probably don't need to do it 19 | again. 20 | 21 | ## Code reviews 22 | 23 | All submissions, including submissions by project members, require review. We 24 | use GitHub pull requests for this purpose. Consult 25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 26 | information on using pull requests. 27 | 28 | ## Community Guidelines 29 | 30 | This project follows 31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License.
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /albert_tsim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sliderSun/bert/c9e16d652f85398fbb6ca6aea72f11f82166c672/albert_tsim/__init__.py -------------------------------------------------------------------------------- /albert_tsim/albert_config/albert_config_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 768, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 3072 , 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 12, 12 | "num_hidden_layers": 12, 13 | 14 | "pooler_fc_size": 768, 15 | "pooler_num_attention_heads": 12, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_tsim/albert_config/albert_config_base_google_fast.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "embedding_size": 128, 6 | "hidden_size": 768, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 3072, 9 | "max_position_embeddings": 512, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 12, 12 | "num_hidden_groups": 12, 13 | "net_structure_type": 0, 14 | "gap_size": 0, 15 | "num_memory_blocks": 0, 16 | "inner_group_num": 1, 17 | "down_scale_factor": 1, 18 | "type_vocab_size": 2, 19 | "vocab_size": 21128 20 | } -------------------------------------------------------------------------------- /albert_tsim/albert_config/albert_config_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | 
"hidden_dropout_prob": 0.0, 6 | "hidden_size": 1024, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 4096, 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 16, 12 | "num_hidden_layers": 24, 13 | 14 | "pooler_fc_size": 768, 15 | "pooler_num_attention_heads": 12, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_tsim/albert_config/albert_config_small_google.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.0, 5 | "embedding_size": 128, 6 | "hidden_size": 384, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 1536, 9 | "max_position_embeddings": 512, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 6, 12 | "num_hidden_groups": 1, 13 | "net_structure_type": 0, 14 | "gap_size": 0, 15 | "num_memory_blocks": 0, 16 | "inner_group_num": 1, 17 | "down_scale_factor": 1, 18 | "type_vocab_size": 2, 19 | "vocab_size": 21128 20 | } -------------------------------------------------------------------------------- /albert_tsim/albert_config/albert_config_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 312, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 1248 , 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 12, 12 | "num_hidden_layers": 4, 13 | 14 | "pooler_fc_size": 768, 15 | "pooler_num_attention_heads": 12, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_tsim/albert_config/albert_config_tiny_google.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.0, 5 | "embedding_size": 128, 6 | "hidden_size": 312, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 1248, 9 | "max_position_embeddings": 512, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 4, 12 | "num_hidden_groups": 1, 13 | "net_structure_type": 0, 14 | "gap_size": 0, 15 | "num_memory_blocks": 0, 16 | "inner_group_num": 1, 17 | "down_scale_factor": 1, 18 | "type_vocab_size": 2, 19 | "vocab_size": 21128 20 | } 21 | -------------------------------------------------------------------------------- /albert_tsim/albert_config/albert_config_tiny_google_fast.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "embedding_size": 128, 6 | "hidden_size": 336, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 1344, 9 | "max_position_embeddings": 512, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 4, 12 | "num_hidden_groups": 12, 13 | "net_structure_type": 0, 14 | "gap_size": 0, 15 | "num_memory_blocks": 0, 16 | "inner_group_num": 
1, 17 | "down_scale_factor": 1, 18 | "type_vocab_size": 2, 19 | "vocab_size": 21128 20 | } -------------------------------------------------------------------------------- /albert_tsim/albert_config/albert_config_xlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 2048, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 8192, 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 32, 12 | "num_hidden_layers": 24, 13 | 14 | "pooler_fc_size": 1024, 15 | "pooler_num_attention_heads": 64, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"postln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_tsim/albert_config/albert_config_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 4096, 7 | "embedding_size": 128, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 16384, 10 | "max_position_embeddings": 512, 11 | "num_attention_heads": 64, 12 | "num_hidden_layers": 12, 13 | 14 | "pooler_fc_size": 1024, 15 | "pooler_num_attention_heads": 64, 16 | "pooler_num_fc_layers": 3, 17 | "pooler_size_per_head": 128, 18 | "pooler_type": "first_token_transform", 19 | "type_vocab_size": 2, 20 | "vocab_size": 21128, 21 | "ln_type":"preln" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /albert_tsim/albert_config/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.0, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.0, 6 | "hidden_size": 768, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 3072, 9 | "max_position_embeddings": 512, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 12, 12 | "pooler_fc_size": 768, 13 | "pooler_num_attention_heads": 12, 14 | "pooler_num_fc_layers": 3, 15 | "pooler_size_per_head": 128, 16 | "pooler_type": "first_token_transform", 17 | "type_vocab_size": 2, 18 | "vocab_size": 21128 19 | } 20 | -------------------------------------------------------------------------------- /albert_tsim/args.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | 4 | tf.logging.set_verbosity(tf.logging.INFO) 5 | 6 | file_path = os.path.dirname(__file__) 7 | 8 | 9 | #模型目录 10 | model_dir = os.path.join(file_path, 'albert_lcqmc_checkpoints/') 11 | 12 | #config文件 13 | config_name = os.path.join(file_path, 'albert_config/albert_config_tiny.json') 14 | #ckpt文件名称 15 | ckpt_name = os.path.join(model_dir, 'model.ckpt') 16 | #输出文件目录 17 | output_dir = os.path.join(file_path, 'albert_lcqmc_checkpoints/') 18 | #vocab文件目录 19 | vocab_file = os.path.join(file_path, 'albert_config/vocab.txt') 20 | #数据目录 21 | data_dir = os.path.join(file_path, 'data/') 22 | 23 | num_train_epochs = 10 24 | batch_size = 128 25 | learning_rate = 0.00005 26 | 27 | # gpu使用率 28 | gpu_memory_fraction = 0.8 29 | 30 | # 默认取倒数第二层的输出值作为句向量 31 | layer_indexes = [-2] 32 | 33 
| # maximum sequence length; for single-sentence inputs, consider lowering this value 34 | max_seq_len = 128 35 | 36 | # graph file name 37 | graph_file = os.path.join(file_path, 'albert_lcqmc_checkpoints/graph') -------------------------------------------------------------------------------- /albert_tsim/bert_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import collections 6 | import copy 7 | import json 8 | import math 9 | import re 10 | import six 11 | import tensorflow as tf 12 | 13 | def get_shape_list(tensor, expected_rank=None, name=None): 14 | """Returns a list of the shape of tensor, preferring static dimensions. 15 | 16 | Args: 17 | tensor: A tf.Tensor object to find the shape of. 18 | expected_rank: (optional) int. The expected rank of `tensor`. If this is 19 | specified and the `tensor` has a different rank, an exception will be 20 | thrown. 21 | name: Optional name of the tensor for the error message. 22 | 23 | Returns: 24 | A list of dimensions of the shape of tensor. All static dimensions will 25 | be returned as python integers, and dynamic dimensions will be returned 26 | as tf.Tensor scalars. 27 | """ 28 | if name is None: 29 | name = tensor.name 30 | 31 | if expected_rank is not None: 32 | assert_rank(tensor, expected_rank, name) 33 | 34 | shape = tensor.shape.as_list() 35 | 36 | non_static_indexes = [] 37 | for (index, dim) in enumerate(shape): 38 | if dim is None: 39 | non_static_indexes.append(index) 40 | 41 | if not non_static_indexes: 42 | return shape 43 | 44 | dyn_shape = tf.shape(tensor) 45 | for index in non_static_indexes: 46 | shape[index] = dyn_shape[index] 47 | return shape 48 | 49 | def reshape_to_matrix(input_tensor): 50 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" 51 | ndims = input_tensor.shape.ndims 52 | if ndims < 2: 53 | raise ValueError("Input tensor must have at least rank 2. Shape = %s" % 54 | (input_tensor.shape)) 55 | if ndims == 2: 56 | return input_tensor 57 | 58 | width = input_tensor.shape[-1] 59 | output_tensor = tf.reshape(input_tensor, [-1, width]) 60 | return output_tensor 61 | 62 | def reshape_from_matrix(output_tensor, orig_shape_list): 63 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 64 | if len(orig_shape_list) == 2: 65 | return output_tensor 66 | 67 | output_shape = get_shape_list(output_tensor) 68 | 69 | orig_dims = orig_shape_list[0:-1] 70 | width = output_shape[-1] 71 | 72 | return tf.reshape(output_tensor, orig_dims + [width]) 73 | 74 | def assert_rank(tensor, expected_rank, name=None): 75 | """Raises an exception if the tensor rank is not of the expected rank. 76 | 77 | Args: 78 | tensor: A tf.Tensor to check the rank of. 79 | expected_rank: Python integer or list of integers, expected rank. 80 | name: Optional name of the tensor for the error message. 81 | 82 | Raises: 83 | ValueError: If the expected shape doesn't match the actual shape.
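Example (illustrative sketch; `embeddings` here stands for any hypothetical [batch_size, seq_length, width] tensor): assert_rank(embeddings, 3) passes silently, assert_rank(embeddings, [2, 3]) accepts either listed rank, and assert_rank(embeddings, 2) raises the ValueError described above.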
84 | """ 85 | if name is None: 86 | name = tensor.name 87 | 88 | expected_rank_dict = {} 89 | if isinstance(expected_rank, six.integer_types): 90 | expected_rank_dict[expected_rank] = True 91 | else: 92 | for x in expected_rank: 93 | expected_rank_dict[x] = True 94 | 95 | actual_rank = tensor.shape.ndims 96 | if actual_rank not in expected_rank_dict: 97 | scope_name = tf.get_variable_scope().name 98 | raise ValueError( 99 | "For the tensor `%s` in scope `%s`, the actual rank " 100 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 101 | (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) 102 | 103 | def gather_indexes(sequence_tensor, positions): 104 | """Gathers the vectors at the specific positions over a minibatch.""" 105 | sequence_shape = get_shape_list(sequence_tensor, expected_rank=3) 106 | batch_size = sequence_shape[0] 107 | seq_length = sequence_shape[1] 108 | width = sequence_shape[2] 109 | 110 | flat_offsets = tf.reshape( 111 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) 112 | flat_positions = tf.reshape(positions + flat_offsets, [-1]) 113 | flat_sequence_tensor = tf.reshape(sequence_tensor, 114 | [batch_size * seq_length, width]) 115 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions) 116 | return output_tensor 117 | 118 | # add sequence mask for: 119 | # 1. random shuffle lm modeling---xlnet with random shuffled input 120 | # 2. left2right and right2left language modeling 121 | # 3. conditional generation 122 | def generate_seq2seq_mask(attention_mask, mask_sequence, seq_type, **kargs): 123 | if seq_type == 'seq2seq': 124 | if mask_sequence is not None: 125 | seq_shape = get_shape_list(mask_sequence, expected_rank=2) 126 | seq_len = seq_shape[1] 127 | ones = tf.ones((1, seq_len, seq_len)) 128 | a_mask = tf.matrix_band_part(ones, -1, 0) 129 | s_ex12 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 2) 130 | s_ex13 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 3) 131 | a_mask = (1 - s_ex13) * (1 - s_ex12) + s_ex13 * a_mask 132 | # generate mask of batch x seq_len x seq_len 133 | a_mask = tf.reshape(a_mask, (-1, seq_len, seq_len)) 134 | out_mask = attention_mask * a_mask 135 | else: 136 | ones = tf.ones_like(attention_mask[:1]) 137 | mask = (tf.matrix_band_part(ones, -1, 0)) 138 | out_mask = attention_mask * mask 139 | else: 140 | out_mask = attention_mask 141 | 142 | return out_mask 143 | 144 | -------------------------------------------------------------------------------- /albert_tsim/create_pretrain_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | BERT_BASE_DIR=./albert_config 4 | python3 create_pretraining_data.py --do_whole_word_mask=True --input_file=data/news_zh_1.txt \ 5 | --output_file=data/tf_news_2016_zh_raw_news2016zh_1.tfrecord --vocab_file=$BERT_BASE_DIR/vocab.txt --do_lower_case=True \ 6 | --max_seq_length=512 --max_predictions_per_seq=51 --masked_lm_prob=0.10 -------------------------------------------------------------------------------- /albert_tsim/data/news_zh_1.txt: -------------------------------------------------------------------------------- 1 | 最后的南京老城该往何处去 城市化时代呼唤文化自觉 2 | 【概要】80后学者姚远出版《城市的自觉》一书 姚远出版《城市的自觉》 作者简介姚远,政治学博士,1981年出生于南京,1999年从金陵中学毕业后考入北京大学国际关系学院,负笈燕园十二载,获政治学博士学位。 3 | 现任教于南京大学政府管理学院。 4 | 在关系古都北京、南京等历史文化名城存废的历史关头,他锲而不舍地为抢救中华文明奔走呐喊。 5 | 2010年,他被中国文物保护基金会评为“中国文化遗产保护年度十大杰出人物”,当时的获奖评语是:一支?土耳其诗人纳齐姆·希克梅特曾深情地说:“人的一生有两样东西不会忘记,那就是母亲的面孔和城市的面貌。 6 | ”然而,前不久南京再次发生颜料坊地块市级文保单位两进建筑被毁的事件。 7 | 
故宫博物院院长、原国家文物局局长单霁翔近日在宁直言,南京城南再遭损毁令他心痛。 8 | 南京老城“路在何方”? 9 | 2010年被中国文物保护基金会评为“中国文化遗产保护年度十大杰出人物”的80后学者、南京大学姚远老师所著的《城市的自觉》近日正式出版。 10 | 书中探索古城保护与复兴的建设性路径,值得南京的决策者们在颜料坊事件后再次深思。 11 | 江南时报记者黄勇疑问:城市化,是否迷失了文化自觉“目睹一座座古建筑的消失,行走在古城的废墟,想到梁思成说过的‘拆掉北京的一座城楼,就像割掉我的一块肉;扒掉北京的一段城墙,就像扒掉我的一层皮’,真是感同身受,我流泪了。 12 | ”这是姚远最让记者为之动容的一句话,也是《城市的自觉》一书中的“魂”。 13 | 包括南京在内,中国大多数城市正处于大拆除的时代,成片的历史街区在“旧城改造”的大旗下被不断夷为平地。 14 | 有专家称,这场“休克疗法式”的“改造”,对中华文脉的影响之深、之巨、之不可逆,堪称中国城市史上“三千年未有之大变局”。 15 | 《城市的自觉》正是在这种背景下,由北京大学出版社于近日出版的。 16 | 书中,姚远以情理交融的文字,辅之以背景、南京古城珍贵的最后影像,如实记录了在北京梁思成故居和宣南、东四八条、钟鼓楼等历史街区,南京颜料坊、南捕厅、门东、门西等历史街区的最后时刻,为阻挡推土机而屡败屡战的历程。 17 | 同时,又理性剖析了与存续城市记忆密切相关的文化自觉、物权保护、民生改善、公众参与等议题,探索古城保护与复兴的建设性路径。 18 | 为何要保老城? 19 | 很多人认为陈旧的老街区、老房子应该为摩天大楼让位,造高速路、摩天楼是现代化,“保护老古董”是抱残守缺,姚远却不是这种看法:“一些决策者并不知城市遗产保护恰恰是‘后工业’、‘后现代’的思想,比前者的理念差不多领先了一个世纪。 20 | ” 在他眼里,南京这座千年古城曾是“活”着的,老城里有最纯正的方言、最鲜活的民俗、最地道的小吃,简直是一座巨大的民俗博物馆。 21 | “你可以在同老者的交谈中,听到一个个家族或老宅的兴衰故事。 22 | 这里的城与人,就是一本厚重的大书,它们用最生动的语言向你讲述不一样的‘城南旧事’。 23 | ”面对许多古城不断遭到大拆大建、拆真建假、拆旧建新的厄运,姚远痛心地说,“我们的城市化,是否迷失了自我认同,是否失去了文化自觉的能力? 24 | 在城市化的文化自觉重建之前,我们还将继续付出多少代价? 25 | ”现状:老城南仅剩不到1平方公里南京城曾有十九个别称,如秦淮、白下、建邺、江宁等,建城史更是长达两千五百年。 26 | 但如今,除去明城墙以及一些重点文物以及七零八落的民国建筑之外,这个城市跟中国其他的城市看上去并无太多区别,鳞次栉比的高楼大厦,车水马龙的宽阔街道,川流不息的红男绿女……持续多年的旧城改造,已经让南京老城日益失去古朴的历史风貌。 27 | 秦淮河畔的老城南,是南京文化的发源地,是南京的根。 28 | 在2006年前,尽管南京诸多的“殿、庙、塔、桥”已在兵火和变乱中消失,但秦淮河畔的老城南依然保存了文物丰富、风貌完整的历史街区。 29 | 然而,2006年,南京风云突起,突击对颜料坊、安品街等历史街区实施“危旧房改造”,拆毁大量文物建筑。 30 | 2009年又是一轮“危改”,大大的“拆”字,再次涂上了门东、门西、南捕厅等多片老街区。 31 | 2010年至今,南京先后出台了《南京市历史文化名城保护条例》《南京历史文化名城保护规划》《南京老城南历史城区保护规划与城市设计》,以法规的高度,回应了社会各界的诉求,明确要求对老城的整体保护。 32 | 姚远和其他学者联名提出的建议,有40处被采纳进了最后的《条例》中。 33 | 姚远告诉江南时报记者,南京的传统旧城区——老城南仅剩不到1平方公里,尚不及50平方公里老城总面积的2%,整体保护势在必行。 34 | 但他并不认为整体保护意味着“冻结不动”,而是强调古民居、古街巷和宏伟的古建筑一样重要,它们是古都特有的城市肌理,低矮的民居衬托高大的城阙,形成轮廓丰富的城市格局。 35 | 如果消灭了它们,名胜古迹就变成无法交融联络的“孤岛”,古都的整体风貌则无从谈起。 36 | “对于金陵古城濒危的最后这点种子,实行‘整体保护’已经没有任何讨价还价的余地。 37 | ”《城市的自觉》一书中,姚远的声音振聋发聩。 38 | 方案:探索保护与整治的最大合力可惜的是,在专家学者与推土机的拉锯战中,前者基本还是处于下风的,即便是中央领导的几次批示,旧城改造的推土机依然我行我素,将一面面古墙碾在轮下。 39 | 颜料坊、牛市、门东等被“肢解”的老城南片区,如今多已竖起或正在建设房地产开发、商业项目。 40 | 2002年8月,姚远在南京颜料坊开始了古城保护的第一次拍摄。 41 | 如今牛市64号-颜料坊49号这座百年清代建筑却再遭破坏。 42 | 单霁翔近日在南大演讲中也表示,颜料坊再遭损毁令人心痛。 43 | “我不认同南京老城南成片拆除,搬迁当地住户的改造方式。 44 | 简单地认为它的居住形式落后了,这种态度是消极的,没有给予作为代表地域特色的传统建筑的居住形式有尊严的呵护。 45 | ”《城市的自觉》一书中也多次提及南京老城不能“只见物,不见人”。 46 | 姚远强调,南京历史文化名城的保护,离不开对传统社区的活态保护。 47 | 老城南有丰富的民俗和古老的街区,是唇齿相依的一个整体。 48 | 拆去了老宅,迁走了居民,文化自然就成了无源之水、无本之木。 49 | “国际上的成功经验表明,保护从来不是发展、民生、现代化的反义词。 50 | ”姚远建议,老城区的整治,可以在政府的指导和协助下,以居民为主体,通过社区互助的“自我修缮”的方式来实施,将“旧城区改建”从拆迁模式下的行政关系转变为修缮模式下的民事关系,最大限度地调动各方面的积极性,形成保护与整治的最大合力。 51 | 措施:用行动让法律“站起来”经历了两次保卫战,姚远对于文物保护方面的法律条文早已如数家珍。 52 | 在他看来,“法治”和“参与”这两个关键词尤为重要。 53 | 姚远认为,政府的很多失误是因为政策制定的封闭性,推土机开到门口时才告知公众。 54 | 公民参与,就要求行政更加透明、公开。 55 | “几次保护后制定的政策或者法律法规,也很重要。 56 | 因为未来只要有人参与去触动,政策或者法律法规就能‘站起来’,变成一套强有力的程序,约束政府行为。 57 | ”“这些年古城保护的每一点进步,都离不开广泛的公众参与,都凝结着社会各界共同的努力。 58 | ”姚远认为,在北京、南京等许多古城,一批志愿者、社会人士和民间团体,在古城命运的危急关头,已经显示出日益崛起的公众参与的巨大力量。 59 | “关键要有人能够站出来。 60 | 第一个人站出来,就会有第二个人跟上,专家和媒体也会介入,事情就能在公开博弈中得到较为合理的解决。 61 | 我国目前民间的文保力量正在逐渐成长,公民参与将成为构建良性社会机制的重要力量。 62 | ”姚远强调。 63 | 单霁翔对文化遗产保护中的公众参与也做出了高度评价。 64 | 他在《城市的自觉》的序中写道:“保护文化遗产绝不仅仅是各级政府和文物工作者的专利,只有广大民众真心地、持久地参与文化遗产保护,文化遗产才能得到最可靠的保障。 65 | 以姚远博士为代表的一批志愿者和社会人士,在我国文化遗产保护事业中已经显示出不可低估、无可替代的力量。 66 | 67 | 不是每一块石头,都能叫珠宝 68 | 对于很多人来说,矿石是长成这样的石头: 上图:铁矿石 上图:石 上图:煤矿石 上图:锡矿石如你所想象的那样,很多矿石都是又黑又丑,即使在野外遇到,也不会多看一眼的那种石头。 69 | 当然,也不是所有矿石都这么丑。 70 | 我们再看看下面这些矿石: 上图:赤铜 上图:钼铅矿 上图:方硼石 上图:自然硫 上图:云母这些矿石,能否让你感慨大自然的造化神奇?小伙伴们可能会想,这些漂亮的矿石,打磨以后就是漂亮的宝石啊,为什么我们不把他们加工成珠宝呢?这个是个好问题。 71 | 人类自古以来就没有停止过对美好事物的追求,凡漂亮的东西都可能被人们看上,成为制作饰品原料。 72 | 珠宝就是大自然赐予的美好的东西中的一种。 73 | 珠宝如果不美就不能成为珠宝,这种美或表现为绚丽的颜色,或表现为透明而洁净。 74 | 
物以稀为贵,鸽血红级别的红宝石、矢车菊蓝级别的蓝宝石,每克拉价值上万美元,而某些颇美丽又可耐久的宝石(如白水晶),由于产量较多,开采较容易,其价格一直较低。 75 | so,大家能明白了吧,不是每一块石头都能成为珠宝。 76 | 如果拥有珠宝,请务必珍惜。 77 | 目前1000+人已关注加入我们您看此文用· 秒,转发只需1秒呦~ 78 | 79 | 北京市黄埔同学会接待“踏寻中山足迹学习之旅”台湾参访团 80 | 光明网讯(通讯员苏民军记者任生心)日前,由台湾中国统一联盟桃竹分会成员组成的“踏寻中山足迹学习之旅”参访团一行21人来到北京参观访问。 81 | 在北京市黄埔同学会的精心安排下,在京期间,参访团拜谒了中山先生衣冠冢,参观了卢沟桥、抗战纪念馆、抗战名将纪念馆和宋庆龄故居等;“踏寻中山足迹学习之旅”参访团还将赴南京中山堂等地参访。 82 | 在抗战纪念馆,参访团成员们认真聆听讲解员的介绍,仔细观看每张图片资料,回顾国共两党团结抗战的往事,缅怀那些为民族独立而壮烈牺牲的英雄。 83 | 而后,参访团一行来到位于京西香山深处的孙中山先生衣冠冢拜谒,参访团团长李尚贤(台湾中国统一联盟总会第一副主席兼秘书长)发表了简短的感言后,全体成员在孙中山雕像前三鞠躬,向孙中山先生致敬,缅怀孙中山先生以“三民主义”为宗旨的革命的一生。 84 | 随后,参访团一行又来到2009年建成的北京香麓园抗战名将纪念馆,瞻仰了佟麟阁将军墓,他们还参观了宋庆龄故居。 85 | 86 | 鼎丰(08056.HK)向客户借出5000万人币 月息1.75厘 为期一年 87 | 鼎丰集团控股(08056.HK)+0.030(+1.345%)公布,同意将一笔5000万元人民币的款项委托予贷款银行,以供转借予客户,贷款期为十二个月,月息1.75厘。 88 | (报价延迟最少十五分钟。 89 | 90 | 在青岛不买房,居然能拥有这么多东西! 91 | 这段时间青岛房价扶摇直上闹得人心惶惶这不,青岛房市,又在国庆节火了一把 国庆5天内16城启动楼市限购一时之间楼市风云大转纵观9月份青岛一手房均价怎么也有一万三四了看完十三哥默默地回去工作了 按照一套房子100平米计算购买一套房子大概需要130万在青岛,买一套房子怎么也得需要130万如果这些钱不买房能在全世界各地买什么呢? 92 | 今天,小编就带大家(bai)感(ri)受(meng)一下在西班牙能买3.4个村庄 一位英国人,名叫尼尔·克里斯蒂,在西班牙农村西北部一个田园地区买下了一处村庄(阿鲁纳达),只花费了4.5万欧元(约合35.6万人民币)。 93 | 简直便宜到吐血,这点钱要是在青岛的豪宅区,恐怕厕所都买不了。 94 | 如果选的地方靠近旅游景区,稍微装修一下,变成一个度假村……妥妥的壕啊,画面太美,不敢想象……在爱尔兰差不多能买个小岛 Inishdooney岛,位于北爱尔兰西北部,售价14万英镑(约合139万人民币)。 95 | 约38万平方米的无人居住地有淡水池塘、天然溶洞和鹅卵石海滩,美翻了有木有! 96 | 一个小岛的钱,和青岛一个水泥格子的价格差不多。 97 | 不要拦着最懂妹,我要去爱尔兰做岛主! 98 | 在巴厘岛能买2座别墅 巴厘岛,蓝天、碧水、白云,美的像梦一样,而你知道吗,这座世界著名旅游岛一个小镇的别墅只要10.7万美元,也就是不到70万人民币,青岛买房那点钱都够买两栋别墅了。 99 | 在巴厘岛拥有两座别墅是什么概念? 100 | 发完文章小编就去买机票! 101 | 在美国能买1驾小飞机 美国塞斯纳C172R型,最大航程可达1270公里,飞机上具备GPS导航定位系统、自动驾驶、盲降设备等,价格大概在17万美元左右,也就是104万人民币。 102 | 在青岛买房的钱妥妥的够买一架飞机了。 103 | 直接移民去西班牙 一个以阳光和沙滩吸引着无数游客的国家,有着激情的足球和斗牛文化、独特的海鲜美食、发达的时装行业、热情火辣的西班牙女郎...... 直接去西班牙? 104 | 你以为我在搞笑? 105 | 西班牙有个买房移民的政策,在西班牙的指定区域购买当地售价在170万人民币以上的房产就可以办理多次往返签证了,然后你待够10年,就可以入西班牙国籍了。 106 | 买一大堆LV手袋 十三哥相信很多女孩应该都很喜欢LV手袋。 107 | 这款极具魅力的CHAIN LOUISE手袋价格为2.04万人民币。 108 | 随随便便买一堆! 109 | 带着爱人环游世界 微博上那对香港80后小夫妻历时308天花费16万人民币走遍了37国,你们还记得吗? 110 | 按照他们的行程,你几乎就能去环游世界了。 111 | 什么也不用想,痛痛快快环游地球一圈! 112 | 在澳大利亚当农场主 五卧室、三浴室的大房子,还有德尼利昆镇附近一块27英亩的农场。 113 | 只需要美元价格14.4万美元(≈96万人民币),是不是惊呆了! 114 | 哦,对了,澳大利亚还提供住房贷款业务哟! 115 | 十三哥要挣钱去澳大利亚买牧场! 116 | 在莫斯科买下1座别墅 莫斯科市中心双卧室、双浴室的豪华大别墅,你觉得多少钱? 117 | 千万别吃惊,美元价格在15.2万美元左右(≈100.1万人民币)。 118 | 虽然在这个城市生活总会有各种各样的压力我们必须十分努力才能看起来毫不费力但是我们永远保持一颗向上的心不气馁,好好加油! 119 | [海尔地产世纪公馆]新都心2期升级新品9月底推出 海尔地产世纪公馆二期规划8栋高层住宅,预计9月底推出,认筹中,交2.5万享99折优惠,预计均价17000-18000元/平。 120 | 户型面积区间89-162平,主力120-140平品质改善产品。 121 | 125-126平为套三,142-162平为套四。 122 | 海尔地产世纪公馆一户一价,以上价格仅供参考,所有在售户型价格以售楼处公布为准。 123 | 咨询电话:400-099-0099 转 27724[金隅和府]3大商圈环绕地铁房18000元 金隅和府一户一价,以下价格仅供参考,所有在售户型价格以售楼处公布为准。 124 | 金隅和府预计9月20日加推6#楼(24F)楼王,3个单元,1梯2户,户型面积为90平套二,122平、138平套三,团购交1万团购金、10万认筹金可以享受97折优惠,预计均价18000-26000元/平。 125 | 金隅和府位于镇江路12号,近邻山东路、延吉路、东西快速路等三横三纵交通网、未来享地铁M5之便利;CBD商圈、香港路商圈、台东商圈3大商圈环绕,居住生活便利。 126 | 127 | 直播拐点来临:未来直播APP开发还有哪些趋势? 
128 | 趋势一:巨头收割直播价值,依赖巨头扶持的直播平台存活几率更高尽管一线垂直领域已经被巨头的直播平台占领,但创业者依然还有机会。 129 | 未来在泛娱乐社交、游戏、美妆电商等核心领域必然会有几家直播平台具有突出优势,而这些具备突出优势的直播平台很可能会被BAT入股收购或者收编,因此如果能够获得巨头的资本输血与流量扶持,往往存活的几率会更大。 130 | 趋势二:直播平台从争抢网红到争抢明星资源明星+粉丝经济+直播平台,很可能会衍生出新型的整合营销方式。 131 | 即怎样通过可购买价值的内容设定,运营好与粉丝之间的感情沟通,让粉丝群体进行持续性参与并进行情感消费投入,直播平台与明星组合叠加的人气效应与非理性消费的频次也非常契合品牌商的需求。 132 | 因此,直播的未来趋势将从争抢网红资源到争抢明星资源。 133 | 这是直播平台孕育粉丝经济进而带来新型的情感消费与商业模式的要走的一条必要的路径。 134 | 而未来可能会有越来越多的品牌商更愿意尝试这种直播互动带来的品牌曝光机会与商业变现模式。 135 | 趋势三:从泛娱乐明星网红直播转入到二级垂直细分市场的专业直播泛娱乐直播内容属性上由于其单一、无聊的直播内容无法构成平台的核心竞争力,直播平台未来大趋势是从泛娱乐直播转入到内涵直播。 136 | 目前部分视频直播平台已针对财经、育儿、时尚、体育、美食等垂直领域的自频道开放直播权限,内容的差异化与垂直化可以为直播平台带来新的商业模式,平台也可以通过优质的直播内容,产生付费、会员、打赏以及直播购物等盈利模式。 137 | 因为目前缺乏真正有价值的直播,多数直播平台在内容供给侧是存在问题的,网红要提升自身与粉丝之间的黏性,显然需要差异化的内容,而从目前的欧美网红与直播内容的发展规律来看,更健康、更有价值与内涵的直播内容成为未来的发展趋势之一。 138 | 趋势四:网红孵化器批量生产网红 将走向专业化由于在网红包装、传播、变现等方面具备专业的运营能力,网红孵化器未来须具备 “经纪人+代运营+供应链+网红星探”等多重角色,向专业网红群聚捆绑者向提供专业化的服务与垂直领域专家型、特长型、个性型网红培养者与发现者这一定位转型。 139 | 借助在用户洞察、网红运营、电商管理方面的精良团队,需要打通粉丝营销和电商运营,并将网红、粉丝,平台、内容,品牌、供应链,进行有效链接及整合。 140 | 趋势五:C端直播洗牌 B端企业直播崛起带动专业的商务直播需求目前,各种企业的商务发布会、沙龙、座谈、讲座、渠道大会、教育培训等方面直播需求强烈,在企业进行移动视频直播的需求推动下,它们开始寻求低成本、快速的搭建属于自己的高清视频直播平台的模式,而企业搭建视频直播平台需要专业的技术能力的服务商来应对这种需求。 141 | 用户可以通过微信直接观看企业直播参与互动,让直播突破空间场地的限制,某种程度也代表直播产业链的一个接入的发展方向。 142 | 趋势六:解决直播用户体验与新媒体营销,移动直播服务商将迎来新的机会直播行业进入了各行各业均可参与,并将直播作为企业服务工具的直播+时代,而玩转直播+,从技术、营销、服务、内容,进而可以衍生出更多的直播服务盈利。 143 | 而对于解决直播体验背后的移动直播服务商,也将迎来新的机会。 144 | 趋势七:直播或成为企业的标配,可能为企业带来更多转化率当直播火爆起来的时候,人们要关注的不仅仅是行业能火爆多久,它的商业模式是否成熟,在洗牌节点来临与巨头羽翼覆盖下,自身还有没有机会,创业者与企业都应该从中寻找自己的机会与跨界领域的嫁接。 145 | 它不仅仅是内容和流量的变现工具,更应该是一种营销与商业理念的转变。 146 | 不久前,马化腾向青年创业者建议,要关注两个产业跨界的部分,因为将新技术用在两个产业跨界部分往往最有可能诞生创新的机会。 147 | 而企业营销如果能从垂直细分领域的切入并借助直播技术与趋势为已所用,往往也能获得新的机会,尽管任何基于行业趋势的预测都意味着不确定性,但抓住不确定性的机会,才能最终在新一轮风口下,把握企业转型与商业、营销模式创新的机会,迎来属于自己的时代。 148 | 欢迎互联网创业者加入杭州互联网创业QQ群:157936473直接加QQ或pc上点击加群项目开发咨询:0571-28030088 149 | 150 | 邓伟根北美硅谷行“捎回”一个MBA授课点 151 | 南都讯记者郭伟豪通讯员伍新宇6月7日至16日,佛山市委常委、南海区委书记、佛山高新区党工委书记兼管委会主任邓伟根率领由南海区和佛山高新区相关人员组成的经贸洽谈和友好交流代表团,对新加坡、美国和加拿大进行友好访问。 152 | 由于新加坡裕廊、美国硅谷与有“加拿大高科技之都”美誉的万锦市均以发达的高科技产业著名,皆是所在国的硅谷,邓伟根更称此行为“三谷”之行。 153 | 在新加坡,邓伟根一行与新加坡淡马锡控股公司相关负责人就双方进一步深化合作进行了深入的探讨。 154 | 交流中,新加坡国立大学(N U S)商学院杨贤院长表示有意在南海设立N U S的海外M B A授课点,双方拟于6月下旬就有关意向在南海签订合作协议。 155 | 6月9日,邓伟根一行前往硅谷拜会了硅谷美华科技商会(S V C A C A )和华美半导体协会(C A SPA )。 156 | SV C A C A和CA SPA将通过其广泛的会员和在硅谷等地的影响力,为佛高区、南高区在硅谷进行宣传推介,并积极把有意拓展中国市场的高科技项目推荐到南高区。 157 | 代表团一行还到访了南海区政府与万锦市政府联合举办了“南海区与万锦市经贸交流会”。 158 | 2012年12月,万锦市市长薛家平先生率团访问南海后,万锦市议会正式通过了为当地一道路命名“南海街”的议案,并于2013年9月举行道路命名仪式。 159 | 在本次交流中,邓伟根提议未来也在南海选址命名一条“万锦路”,此举也立即得到薛家平市长的认同。 160 | 对于“三谷”之行,邓伟根表示,南海将利用现有的南海乡亲和关系密切的协会等有利资源,计划在“三谷”建立南海和佛高区的海外联络处,学习和吸收海外高科技之都的先进经验,努力将已定位为“中国制造金谷”的佛高区南海核心园打造成为下一个“硅谷”,并争取早日实现佛高区挺进全国国家高新区20强的目标。 161 | 162 | 内地高中生将通篇学习《道德经》 163 | 摘要国内第一套自主研发的高中传统文化通识教材预计将于今年9月出版,四册分别为《论语》《孟子》《大学·中庸》和《道德经》。 164 | 2016年高考改革方案中,全国25个省高考要统一命题,并且增加分数后的语文考试,正在研究增加“中华优秀传统文化”之相关内容。 165 | 《道德经》成为高中传统文化教材。 166 | 法制晚报讯(记者 李文姬 )今天上午,记者从“十二五”教育部规划课题《传统文化与中小学生人格培养研究》总课题组了解到,国内第一套自主研发的高中传统文化通识教材预计将于今年9月出版,四册分别为《论语》《孟子》《大学·中庸》和《道德经》。 167 | 至此,课题组已完成了幼儿园、小学、初中、高中各阶段标准化传统文化教材的研发工作,高中国学教材将在各地开展成规模的教材试用工作。 168 | 中国国学文化艺术中心秘书长张健表示,目前各地高考改革的几个信号均指向国学,但考什么、怎么考又是一个难题。 169 | 专家建议,不应以文言文字词解释等传统形式考查,应关注考生如何消化吸收传统文化中的哲学素养和思想韬略。 170 | 教材各年级国学内容全覆盖据 “十二五”教育部规划课题《传统文化与中小学生人格培养研究》总课题组介绍,高中传统文化通识系列教材作为“十一五”、“十二五”两个阶段十年课题研究的重要成果之一,由中国国学文化艺术中心承担资源整合和编著。 171 | 去年,教育部印发了《完善中华优秀传统文化教育指导纲要》,要求在课程建设和课程标准修订中强化中华优秀传统文化内容。 172 | 在中小学德育、语文、历史等课程标准修订中,增加中华优秀传统文化的比重。 173 | 课题组秘书长张健表示,幼儿园、小学、初中、高中各阶段标准化传统文化教材的均已研发完成,明确提出以“青少年完美人格”为传统文化教育目标,教材知识相互关联,自成体系,并通过高中教材实现最终教学评价。 174 | 这是“十一五”“十二五”两个阶段十年课题研究的重要成果之一。 175 | 
今年5月份之前,《高等教育传统文化教材》(12册)《全国行政领导干部国学教材》(10册)两套教材也将研发完毕。 176 | 内容高中教材含《论语》《道德经》此次即将出版的高中阶段传统文化通识教材共有4册,供高中一、二年级使用。 177 | 高一学习《论语》《孟子》,高二学习《大学·中庸》和《道德经》。 178 | 其中《道德经》为原文全本讲解,另外三册则是按主题归类讲解。 179 | 如《大学·中庸》一册,分为“慎独”“齐家”“格物致知”“中和”“为政”等章节。 180 | 据课题组专家介绍,这4册书并非孤立的高中教材,而是《中华优秀传统文化教育全国中小学实验教材》的高中部分。 181 | 全套教材包含小学、初中和高中三个阶段,经专家组反复研讨、论证,制定了“儒学养正、兵学相佑、道法自然、文化浸润”的课程结构,各阶段教学内容和深度循序渐进、系统科学。 182 | 事实上,小学高年级段已开始涉及《论语》《孟子》等儒学典籍,但仅以诵读和简单理解为主,到高中阶段,学生可在已有基础上更为深刻地领悟儒道经典的思想内涵,以达到融会贯通的程度。 183 | 此外,每一章节在讲解儒道核心精神的同时,还为学生提供了大量中西文化比较等拓展阅读素材。 184 | 针对公众关注的一个话题,即传统文化有望成为高考的新考点,课题组表示目前在研发高中传统文化教材的同时,就已开展了另一个重点子课题研究,即传统文化教学评价与考试模式研究。 185 | 张健强调高考改革的几个信号均指向国学,例如北京、上海等地公布的高考改革方案中,英语降分后其所降分数分给了语文,而且还更进一步明确指出了就是将分数转移给所增加的“传统文化考试内容”部分。 186 | 又如今年清华北大自主招生均招收国学特长生。 187 | 此外,近期公布的2016年高考改革方案中,全国25个省高考要统一命题,并且增加分数后的语文考试,正在研究增加“中华优秀传统文化”之相关内容。 188 | 张健表示,传统文化成为高考的又一创新考点指日可待,但考什么、怎么考又是一个重大难题。 189 | 由于相关子课题研究还没有结束,课题组非行政机构只承担建议义务。 190 | 张健坦言,能否在高考语文中出现一个新的形式——政论或申论形式的传统文化论述题,这一方向应该是研究和创新的改革方向之一。 191 | 若2016年传统文化进入高考,最大的问题是很多高中生没有接触过传统文化课程,不具备相关知识储备和素养,国学文化是通过长期熏陶和涵养才能显现的,不是靠一朝一夕突击补课就能拥有的。 192 | 193 | 悬灸技术培训专家教你艾灸降血糖,为爸妈收好了! 194 | 近年来随着我国经济条件的改善和人们生活水平的提高,我国糖尿病的患病率也在逐年上升。 195 | 悬灸技术培训的创始人艾灸专家刘全军先生对糖尿病深有研究,接下来,学一学他是怎么用艾灸降血压的吧! 196 | 中医认为,糖尿病是气血、阴阳失调等多种原因引起的一种慢性疾病。 197 | 虽然分为上消、中消、下消,但是无论何种糖尿病 ,治疗的原则都是荣养阴液,清热润燥。 198 | 艾灸对控制血糖效果不错。 199 | 艾灸功效:调升元阳降血糖艾灸可以修复受损胰岛细胞,激活再生,逐步实现胰岛素的自给自足。 200 | 服药一天比一天少,身体一天比一天好,彻底摆脱终生服药! 201 | 还可以双向调节血糖,使血糖老老实实地锁定在正常的恒定值范围。 202 | 也可以改善组织供氧,对微血管病变导致的视物不清、眼底出血等视网膜病变及早期肾病病变及早期肾病病变有明显治疗与改善作用,改善病人消瘦无力、免疫力低下、低蛋白质血证及伤口不愈等现象。 203 | 艾灸取穴糖尿病艾灸过的穴位有,承浆中脘足三里关元曲骨三阴交、期门太冲下脘天枢气海膈俞膻中、胃俞,这么多穴位可根据患者当时的症状进行选取。 204 | 选取后艾灸,每10天为一个疗程,疗程间休息3-5天后继续第二轮的治疗,三个疗程基本可见到理想疗效。 205 | 这几个穴位都是具有补充人体元阳功能的大穴和调节脏腑功能的腧穴,从根上调节人体的元阳使阴阳达到新的平衡,五脏六腑尤其是肺、脾肾的功能恢复正常,糖尿病自然也就不药而愈了。 206 | 艾灸可以有效控制糖尿病 ,这在很多资料都有报导。 207 | 艾灸使病人的营养能得到有效的吸收和利用,从而提高人体的自身免疫功能和抗病防病能力,防止了系列并发症的发生,真正做到综合治疗,标本兼治。 208 | 艾灸对于常见病是具有广泛的适应性的。 209 | 希望大家把艾灸推广出去,让艾灸这个疗法能够更完善,造福更多的人。 210 | 211 | 熟食放在垃圾旁无照窝点被取缔 212 | 本报讯(记者李涛)又黑又脏的墙面、随意堆放的加工原料、处处弥漫的刺鼻味道。 213 | 昨天上午,东小口镇政府与城管、食药、公安等部门开展联合执法行动时,依法取缔了一个位于昌平区东小口镇半截塔村的非法熟食加工窝点。 214 | 昨天上午,执法人员对东小口镇半截塔村进行环境整治时,一家挂着“久久鸭”招牌的小店的店主显得有点紧张,还“顺手”把通向后院的门关上了。 215 | 执法人员觉得有些蹊跷,便要求到后院进行检查。 216 | 一进院子,执法人员就发现大量的熟食加工原料被随意摆放在地上,旁边就堆放着垃圾。 217 | 院内煤炉上的一口锅内正煮着的食物,发出刺鼻的味道。 218 | 执法队员介绍,在炉子一旁的笸箩里盛着制作好的熟食制品,但却没有任何遮盖,一阵风起,煤灰混着尘土就落在上面。 219 | 执法队员说:“走进院旁的小屋内,地上和墙上满是油污,脏乎乎的冰柜上堆放着一袋一袋的半成品,一个个用来盛放熟食制品的笸箩摞在生锈的铁架子上。 220 | ”随后,执法人员仔细查找,没有发现任何消毒设施,调查得知从事加工的人员也没有取得加工熟食应需的健康证。 221 | 执法人员随后对店主进行询问,当执法人员要求出示营业执照及卫生许可证时,店主嘟囔了半天才坦白自己不具备任何手续。 222 | 执法人员当即对该非法生产窝点进行了取缔,对现场工作人员进行了宣传与教育,并依法没收了加工工具及食品。 -------------------------------------------------------------------------------- /albert_tsim/freezeGraph.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019/12/12 10:36 3 | # @Author : Magic 4 | # @Email : hanjunm@haier.com 5 | 6 | ''' 7 | BERT模型文件 ckpt转pb 工具 8 | ''' 9 | 10 | # import contextlib 11 | import codecs 12 | import json 13 | import os 14 | 15 | import modeling 16 | import tensorflow as tf 17 | import argparse 18 | 19 | 20 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 21 | labels, num_labels, use_one_hot_embeddings): 22 | """Creates a classification model.""" 23 | model = modeling.BertModel( 24 | config=bert_config, 25 | is_training=is_training, 26 | input_ids=input_ids, 27 | input_mask=input_mask, 28 | token_type_ids=segment_ids, 29 | use_one_hot_embeddings=use_one_hot_embeddings) 30 | 31 | # In the demo, we are doing a simple classification task on the entire 32 | 
# segment. 33 | # 34 | # If you want to use the token-level output, use model.get_sequence_output() 35 | # instead. 36 | output_layer = model.get_pooled_output() 37 | 38 | hidden_size = output_layer.shape[-1].value 39 | 40 | output_weights = tf.get_variable( 41 | "output_weights", [num_labels, hidden_size], 42 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 43 | 44 | output_bias = tf.get_variable( 45 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 46 | 47 | with tf.variable_scope("loss"): 48 | ln_type = bert_config.ln_type 49 | if ln_type == 'preln': # add by brightmart, 10-06. if it is preln, we need an additional layer: layer normalization as suggested in paper "ON LAYER NORMALIZATION IN THE TRANSFORMER ARCHITECTURE" 50 | print("ln_type is preln. add LN layer.") 51 | output_layer = layer_norm(output_layer) 52 | else: 53 | print("ln_type is postln or other, do nothing.") 54 | 55 | if is_training: 56 | # I.e., 0.1 dropout 57 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 58 | 59 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 60 | logits = tf.nn.bias_add(logits, output_bias) 61 | probabilities = tf.nn.softmax(logits, axis=-1) 62 | 63 | return probabilities 64 | 65 | 66 | def layer_norm(input_tensor, name=None): 67 | """Run layer normalization on the last dimension of the tensor.""" 68 | return tf.contrib.layers.layer_norm( 69 | inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) 70 | 71 | 72 | def init_predict_var(path): 73 | with open(os.path.join(path, 'label.json'), 'r', encoding='utf-8') as f: 74 | label2id = json.load(f) 75 | id2label = {value: key for key, value in label2id.items()} 76 | num_labels = len(label2id) 77 | print('num_labels:%d' % num_labels) 78 | return num_labels, label2id, id2label 79 | 80 | 81 | def optimize_class_model(args): 82 | """ 83 | 加载中文分类模型 84 | :param args: 85 | :param num_labels: 86 | :param logger: 87 | :return: 88 | """ 89 | tf.logging.set_verbosity(tf.logging.INFO) 90 | 91 | try: 92 | # 如果PB文件已经存在则,返回PB文件的路径,否则将模型转化为PB文件,并且返回存储PB文件的路径 93 | tmp_dir = args.model_dir 94 | 95 | pb_file = os.path.join(tmp_dir, 'albert.pb') 96 | if os.path.exists(pb_file): 97 | print('pb_file exists', pb_file) 98 | return pb_file 99 | 100 | num_labels, label2id, id2label = init_predict_var(tmp_dir) 101 | 102 | graph = tf.Graph() 103 | with graph.as_default(): 104 | with tf.Session() as sess: 105 | input_ids = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_ids') 106 | input_mask = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_mask') 107 | segment_ids = tf.placeholder(tf.int32, (None, args.max_seq_len), 'segment_ids') 108 | bert_config = modeling.BertConfig.from_json_file(os.path.join(args.bert_model_dir, 'albert_config_tiny.json')) 109 | 110 | probabilities = create_model( 111 | bert_config=bert_config, 112 | is_training=False, 113 | input_ids=input_ids, 114 | input_mask=input_mask, 115 | segment_ids=segment_ids, 116 | labels=None, 117 | num_labels=num_labels, 118 | use_one_hot_embeddings=False 119 | ) 120 | 121 | saver = tf.train.Saver() 122 | latest_checkpoint = tf.train.latest_checkpoint(args.model_dir) 123 | tf.logging.info('loading...
%s ' % latest_checkpoint) 124 | saver.restore(sess, latest_checkpoint) 125 | tf.logging.info('freeze...') 126 | from tensorflow.python.framework import graph_util 127 | tmp_g = graph_util.convert_variables_to_constants(sess, graph.as_graph_def(), [probabilities.op.name]) 128 | tf.logging.info('predict cut finished !!!') 129 | 130 | # 存储二进制模型到文件中 131 | tf.logging.info('write graph to a tmp file: %s' % pb_file) 132 | with tf.gfile.GFile(pb_file, 'wb') as f: 133 | f.write(tmp_g.SerializeToString()) 134 | return pb_file 135 | except Exception as e: 136 | tf.logging.error('fail to optimize the graph! %s' % e, exc_info=True) 137 | 138 | 139 | 140 | if __name__ == '__main__': 141 | parser = argparse.ArgumentParser(description='Trans ckpt file to .pb file') 142 | 143 | args = parser.parse_args() 144 | args.bert_model_dir = 'albert_tiny_zh' 145 | args.model_dir = 'albert_lcqmc_checkpoints_base' 146 | args.max_seq_len = 32 147 | 148 | optimize_class_model(args) -------------------------------------------------------------------------------- /albert_tsim/freeze_graph.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.framework import graph_util 3 | 4 | 5 | def freeze_graph(output_graph): 6 | ''' 7 | :param input_checkpoint: 8 | :param output_graph: PB模型保存路径 9 | :return: 10 | ''' 11 | # checkpoint = tf.train.get_checkpoint_state(model_folder) #检查目录下ckpt文件状态是否可用 12 | # input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 13 | from tensorflow.python import pywrap_tensorflow 14 | 15 | reader = pywrap_tensorflow.NewCheckpointReader("F:\python_work\github\\albert_zh\\albert_lcqmc_checkpoints_base\model.ckpt-33000") 16 | var_to_shape_map = reader.get_variable_to_shape_map() 17 | for key in var_to_shape_map: 18 | print("tensor_name: ", key) 19 | # # 指定输出的节点名称,该节点名称必须是原模型中存在的节点 20 | # output_node_names = "accuracy/temp_sim,output/distance" 21 | # input_checkpoint = "F:\python_work\siamese-lstm-network\deep-siamese-text-similarity\\atec_runs\\1553238291\checkpoints\model-170000.meta" 22 | # model_path = 'F:\python_work\siamese-lstm-network\deep-siamese-text-similarity\\atec_runs\\1553238291\checkpoints\model-170000' # 数据路径 23 | # 24 | # saver = tf.train.import_meta_graph(input_checkpoint, clear_devices=False) 25 | # graph = tf.get_default_graph() # 获得默认的图 26 | # input_graph_def = graph.as_graph_def() # 返回一个序列化的图代表当前的图 27 | # with tf.Session() as sess: 28 | # saver.restore(sess, model_path) # 恢复图并得到数据 29 | # output_graph_def = graph_util.convert_variables_to_constants( # 模型持久化,将变量值固定 30 | # sess=sess, 31 | # input_graph_def=input_graph_def, # 等于:sess.graph_def 32 | # output_node_names=output_node_names.split(",")) # 如果有多个输出节点,以逗号隔开 33 | # 34 | # with tf.gfile.GFile(output_graph, "wb") as f: # 保存模型 35 | # f.write(output_graph_def.SerializeToString()) # 序列化输出 36 | # print("%d ops in the final graph." % len(output_graph_def.node)) # 得到当前图有几个操作节点 37 | 38 | 39 | freeze_graph("./model.pb") 40 | -------------------------------------------------------------------------------- /albert_tsim/lamb_optimizer_google.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Lint as: python2, python3 17 | """Functions and classes related to optimization (weight updates).""" 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import re 24 | import six 25 | import tensorflow as tf 26 | 27 | # pylint: disable=g-direct-tensorflow-import 28 | from tensorflow.python.ops import array_ops 29 | from tensorflow.python.ops import linalg_ops 30 | from tensorflow.python.ops import math_ops 31 | # pylint: enable=g-direct-tensorflow-import 32 | 33 | 34 | class LAMBOptimizer(tf.train.Optimizer): 35 | """LAMB (Layer-wise Adaptive Moments optimizer for Batch training).""" 36 | # A new optimizer that includes correct L2 weight decay, adaptive 37 | # element-wise updating, and layer-wise justification. The LAMB optimizer 38 | # was proposed by Yang You, Jing Li, Jonathan Hseu, Xiaodan Song, 39 | # James Demmel, and Cho-Jui Hsieh in a paper titled as Reducing BERT 40 | # Pre-Training Time from 3 Days to 76 Minutes (arxiv.org/abs/1904.00962) 41 | 42 | def __init__(self, 43 | learning_rate, 44 | weight_decay_rate=0.0, 45 | beta_1=0.9, 46 | beta_2=0.999, 47 | epsilon=1e-6, 48 | exclude_from_weight_decay=None, 49 | exclude_from_layer_adaptation=None, 50 | name="LAMBOptimizer"): 51 | """Constructs a LAMBOptimizer.""" 52 | super(LAMBOptimizer, self).__init__(False, name) 53 | 54 | self.learning_rate = learning_rate 55 | self.weight_decay_rate = weight_decay_rate 56 | self.beta_1 = beta_1 57 | self.beta_2 = beta_2 58 | self.epsilon = epsilon 59 | self.exclude_from_weight_decay = exclude_from_weight_decay 60 | # exclude_from_layer_adaptation is set to exclude_from_weight_decay if the 61 | # arg is None. 62 | # TODO(jingli): validate if exclude_from_layer_adaptation is necessary. 63 | if exclude_from_layer_adaptation: 64 | self.exclude_from_layer_adaptation = exclude_from_layer_adaptation 65 | else: 66 | self.exclude_from_layer_adaptation = exclude_from_weight_decay 67 | 68 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 69 | """See base class.""" 70 | assignments = [] 71 | for (grad, param) in grads_and_vars: 72 | if grad is None or param is None: 73 | continue 74 | 75 | param_name = self._get_variable_name(param.name) 76 | 77 | m = tf.get_variable( 78 | name=six.ensure_str(param_name) + "/adam_m", 79 | shape=param.shape.as_list(), 80 | dtype=tf.float32, 81 | trainable=False, 82 | initializer=tf.zeros_initializer()) 83 | v = tf.get_variable( 84 | name=six.ensure_str(param_name) + "/adam_v", 85 | shape=param.shape.as_list(), 86 | dtype=tf.float32, 87 | trainable=False, 88 | initializer=tf.zeros_initializer()) 89 | 90 | # Standard Adam update. 
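# A sketch of the math the next few lines implement (standard Adam moment updates, without bias correction, matching this reference implementation): next_m = beta_1*m + (1 - beta_1)*grad, next_v = beta_2*v + (1 - beta_2)*grad^2, and update = next_m / (sqrt(next_v) + epsilon). LAMB then optionally adds decoupled weight decay (update += weight_decay_rate * param) and rescales by the layer-wise trust ratio ||param|| / ||update|| before applying the learning rate.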
91 |       next_m = (
92 |           tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
93 |       next_v = (
94 |           tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
95 |                                                     tf.square(grad)))
96 | 
97 |       update = next_m / (tf.sqrt(next_v) + self.epsilon)
98 | 
99 |       # Just adding the square of the weights to the loss function is *not*
100 |       # the correct way of using L2 regularization/weight decay with Adam,
101 |       # since that will interact with the m and v parameters in strange ways.
102 |       #
103 |       # Instead we want to decay the weights in a manner that doesn't interact
104 |       # with the m/v parameters. This is equivalent to adding the square
105 |       # of the weights to the loss with plain (non-momentum) SGD.
106 |       if self._do_use_weight_decay(param_name):
107 |         update += self.weight_decay_rate * param
108 | 
109 |       ratio = 1.0
110 |       if self._do_layer_adaptation(param_name):
111 |         w_norm = linalg_ops.norm(param, ord=2)
112 |         g_norm = linalg_ops.norm(update, ord=2)
113 |         ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
114 |             math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0)
115 | 
116 |       update_with_lr = ratio * self.learning_rate * update
117 | 
118 |       next_param = param - update_with_lr
119 | 
120 |       assignments.extend(
121 |           [param.assign(next_param),
122 |            m.assign(next_m),
123 |            v.assign(next_v)])
124 |     return tf.group(*assignments, name=name)
125 | 
126 |   def _do_use_weight_decay(self, param_name):
127 |     """Whether to use L2 weight decay for `param_name`."""
128 |     if not self.weight_decay_rate:
129 |       return False
130 |     if self.exclude_from_weight_decay:
131 |       for r in self.exclude_from_weight_decay:
132 |         if re.search(r, param_name) is not None:
133 |           return False
134 |     return True
135 | 
136 |   def _do_layer_adaptation(self, param_name):
137 |     """Whether to do layer-wise learning rate adaptation for `param_name`."""
138 |     if self.exclude_from_layer_adaptation:
139 |       for r in self.exclude_from_layer_adaptation:
140 |         if re.search(r, param_name) is not None:
141 |           return False
142 |     return True
143 | 
144 |   def _get_variable_name(self, param_name):
145 |     """Get the variable name from the tensor name."""
146 |     m = re.match("^(.*):\\d+$", six.ensure_str(param_name))
147 |     if m is not None:
148 |       param_name = m.group(1)
149 |     return param_name
--------------------------------------------------------------------------------
/albert_tsim/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import re
22 | import tensorflow as tf
23 | 
24 | 
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 |   """Creates an optimizer training op."""
27 |   global_step = tf.train.get_or_create_global_step()
28 | 
29 |   learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 | 
31 |   # Implements linear decay of the learning rate.
32 |   learning_rate = tf.train.polynomial_decay(
33 |       learning_rate,
34 |       global_step,
35 |       num_train_steps,
36 |       end_learning_rate=0.0,
37 |       power=1.0,
38 |       cycle=False)
39 | 
40 |   # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 |   # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 |   if num_warmup_steps:
43 |     global_steps_int = tf.cast(global_step, tf.int32)
44 |     warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 | 
46 |     global_steps_float = tf.cast(global_steps_int, tf.float32)
47 |     warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 | 
49 |     warmup_percent_done = global_steps_float / warmup_steps_float
50 |     warmup_learning_rate = init_lr * warmup_percent_done
51 | 
52 |     is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 |     learning_rate = (
54 |         (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 | 
56 |   # It is recommended that you use this optimizer for fine-tuning, since this
57 |   # is how the model was trained (note that the Adam m/v variables are NOT
58 |   # loaded from init_checkpoint.)
59 |   optimizer = LAMBOptimizer(
60 |       learning_rate=learning_rate,
61 |       weight_decay_rate=0.01,
62 |       beta_1=0.9,
63 |       beta_2=0.999,
64 |       epsilon=1e-6,
65 |       exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 | 
67 |   if use_tpu:
68 |     optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 | 
70 |   tvars = tf.trainable_variables()
71 |   grads = tf.gradients(loss, tvars)
72 | 
73 |   # This is how the model was pre-trained.
74 |   (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 | 
76 |   train_op = optimizer.apply_gradients(
77 |       zip(grads, tvars), global_step=global_step)
78 | 
79 |   # Normally the global step update is done inside of `apply_gradients`.
80 |   # However, the `LAMBOptimizer` used here doesn't do this. But if you use
81 |   # a different optimizer, you should probably take this line out.
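  # Concretely: `apply_gradients` above receives `global_step`, but the custom
  # optimizers in this file never increment it, so without the explicit
  # `assign` below the step counter (and with it the warmup/decay schedule)
  # would stay frozen at zero.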
82 |   new_global_step = global_step + 1
83 |   train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 |   return train_op
85 | 
86 | 
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 |   """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 | 
90 |   def __init__(self,
91 |                learning_rate,
92 |                weight_decay_rate=0.0,
93 |                beta_1=0.9,
94 |                beta_2=0.999,
95 |                epsilon=1e-6,
96 |                exclude_from_weight_decay=None,
97 |                name="AdamWeightDecayOptimizer"):
98 |     """Constructs an AdamWeightDecayOptimizer."""
99 |     super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 | 
101 |     self.learning_rate = learning_rate
102 |     self.weight_decay_rate = weight_decay_rate
103 |     self.beta_1 = beta_1
104 |     self.beta_2 = beta_2
105 |     self.epsilon = epsilon
106 |     self.exclude_from_weight_decay = exclude_from_weight_decay
107 | 
108 |   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 |     """See base class."""
110 |     assignments = []
111 |     for (grad, param) in grads_and_vars:
112 |       if grad is None or param is None:
113 |         continue
114 | 
115 |       param_name = self._get_variable_name(param.name)
116 | 
117 |       m = tf.get_variable(
118 |           name=param_name + "/adam_m",
119 |           shape=param.shape.as_list(),
120 |           dtype=tf.float32,
121 |           trainable=False,
122 |           initializer=tf.zeros_initializer())
123 |       v = tf.get_variable(
124 |           name=param_name + "/adam_v",
125 |           shape=param.shape.as_list(),
126 |           dtype=tf.float32,
127 |           trainable=False,
128 |           initializer=tf.zeros_initializer())
129 | 
130 |       # Standard Adam update.
131 |       next_m = (
132 |           tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 |       next_v = (
134 |           tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 |                                                     tf.square(grad)))
136 | 
137 |       update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 | 
139 |       # Just adding the square of the weights to the loss function is *not*
140 |       # the correct way of using L2 regularization/weight decay with Adam,
141 |       # since that will interact with the m and v parameters in strange ways.
142 |       #
143 |       # Instead we want to decay the weights in a manner that doesn't interact
144 |       # with the m/v parameters. This is equivalent to adding the square
145 |       # of the weights to the loss with plain (non-momentum) SGD.
146 |       if self._do_use_weight_decay(param_name):
147 |         update += self.weight_decay_rate * param
148 | 
149 |       update_with_lr = self.learning_rate * update
150 | 
151 |       next_param = param - update_with_lr
152 | 
153 |       assignments.extend(
154 |           [param.assign(next_param),
155 |            m.assign(next_m),
156 |            v.assign(next_v)])
157 |     return tf.group(*assignments, name=name)
158 | 
159 |   def _do_use_weight_decay(self, param_name):
160 |     """Whether to use L2 weight decay for `param_name`."""
161 |     if not self.weight_decay_rate:
162 |       return False
163 |     if self.exclude_from_weight_decay:
164 |       for r in self.exclude_from_weight_decay:
165 |         if re.search(r, param_name) is not None:
166 |           return False
167 |     return True
168 | 
169 |   def _get_variable_name(self, param_name):
170 |     """Get the variable name from the tensor name."""
171 |     m = re.match("^(.*):\\d+$", param_name)
172 |     if m is not None:
173 |       param_name = m.group(1)
174 |     return param_name
175 | 
176 | 
177 | #
178 | class LAMBOptimizer(tf.train.Optimizer):
179 |   """
180 |   LAMB optimizer.
181 |   https://github.com/ymcui/LAMB_Optimizer_TF
182 |   # IMPORTANT NOTE
183 |   - This is NOT an official implementation.
184 |   - The LAMB optimizer changed between arXiv v1 and v3.
185 |   - We implement the v3 version (the latest version as of June 2019).
186 |   - Our implementation is based on `AdamWeightDecayOptimizer` in BERT (provided by Google).
187 | 
188 |   # References
189 |   - Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. https://arxiv.org/abs/1904.00962v3
190 |   - BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. https://arxiv.org/abs/1810.04805
191 |   # Parameters
192 |   - There is nothing special, just the same as `AdamWeightDecayOptimizer`.
193 |   """
194 | 
195 |   def __init__(self,
196 |                learning_rate,
197 |                weight_decay_rate=0.01,
198 |                beta_1=0.9,
199 |                beta_2=0.999,
200 |                epsilon=1e-6,
201 |                exclude_from_weight_decay=None,
202 |                name="LAMBOptimizer"):
203 |     """Constructs a LAMBOptimizer."""
204 |     super(LAMBOptimizer, self).__init__(False, name)
205 | 
206 |     self.learning_rate = learning_rate
207 |     self.weight_decay_rate = weight_decay_rate
208 |     self.beta_1 = beta_1
209 |     self.beta_2 = beta_2
210 |     self.epsilon = epsilon
211 |     self.exclude_from_weight_decay = exclude_from_weight_decay
212 | 
213 |   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
214 |     """See base class."""
215 |     assignments = []
216 |     for (grad, param) in grads_and_vars:
217 |       if grad is None or param is None:
218 |         continue
219 | 
220 |       param_name = self._get_variable_name(param.name)
221 | 
222 |       m = tf.get_variable(
223 |           name=param_name + "/lamb_m",
224 |           shape=param.shape.as_list(),
225 |           dtype=tf.float32,
226 |           trainable=False,
227 |           initializer=tf.zeros_initializer())
228 |       v = tf.get_variable(
229 |           name=param_name + "/lamb_v",
230 |           shape=param.shape.as_list(),
231 |           dtype=tf.float32,
232 |           trainable=False,
233 |           initializer=tf.zeros_initializer())
234 | 
235 |       # Standard Adam update.
236 |       next_m = (
237 |           tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
238 |       next_v = (
239 |           tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
240 |                                                     tf.square(grad)))
241 | 
242 |       update = next_m / (tf.sqrt(next_v) + self.epsilon)
243 | 
244 |       # Just adding the square of the weights to the loss function is *not*
245 |       # the correct way of using L2 regularization/weight decay with Adam,
246 |       # since that will interact with the m and v parameters in strange ways.
247 |       #
248 |       # Instead we want to decay the weights in a manner that doesn't interact
249 |       # with the m/v parameters. This is equivalent to adding the square
250 |       # of the weights to the loss with plain (non-momentum) SGD.
251 |       if self._do_use_weight_decay(param_name):
252 |         update += self.weight_decay_rate * param
253 | 
254 |       ############## BELOW ARE THE SPECIFIC PARTS FOR LAMB ##############
255 | 
256 |       # Note: Here are two choices for the scaling function \phi(z)
257 |       #   minmax:   \phi(z) = min(max(z, \gamma_l), \gamma_u)
258 |       #   identity: \phi(z) = z
259 |       # The authors do not mention what \gamma_l and \gamma_u are.
260 |       # UPDATE: after we asked, the authors provided the code below.
261 | # ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where( 262 | # math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0) 263 | 264 | r1 = tf.sqrt(tf.reduce_sum(tf.square(param))) 265 | r2 = tf.sqrt(tf.reduce_sum(tf.square(update))) 266 | 267 | r = tf.where(tf.greater(r1, 0.0), 268 | tf.where(tf.greater(r2, 0.0), 269 | r1 / r2, 270 | 1.0), 271 | 1.0) 272 | 273 | eta = self.learning_rate * r 274 | 275 | update_with_lr = eta * update 276 | 277 | next_param = param - update_with_lr 278 | 279 | assignments.extend( 280 | [param.assign(next_param), 281 | m.assign(next_m), 282 | v.assign(next_v)]) 283 | return tf.group(*assignments, name=name) 284 | 285 | def _do_use_weight_decay(self, param_name): 286 | """Whether to use L2 weight decay for `param_name`.""" 287 | if not self.weight_decay_rate: 288 | return False 289 | if self.exclude_from_weight_decay: 290 | for r in self.exclude_from_weight_decay: 291 | if re.search(r, param_name) is not None: 292 | return False 293 | return True 294 | 295 | def _get_variable_name(self, param_name): 296 | """Get the variable name from the tensor name.""" 297 | m = re.match("^(.*):\\d+$", param_name) 298 | if m is not None: 299 | param_name = m.group(1) 300 | return param_name -------------------------------------------------------------------------------- /albert_tsim/optimization_finetuning.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 
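  # As a worked example (with hypothetical values): for init_lr = 5e-5 and
  # num_warmup_steps = 1000, the learning rate climbs linearly from 0 to 5e-5
  # over the first 1000 steps (an increment of 5e-8 per step) and only then
  # follows the polynomial decay configured above.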
42 |   if num_warmup_steps:
43 |     global_steps_int = tf.cast(global_step, tf.int32)
44 |     warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 | 
46 |     global_steps_float = tf.cast(global_steps_int, tf.float32)
47 |     warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 | 
49 |     warmup_percent_done = global_steps_float / warmup_steps_float
50 |     warmup_learning_rate = init_lr * warmup_percent_done
51 | 
52 |     is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 |     learning_rate = (
54 |         (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 | 
56 |   # It is recommended that you use this optimizer for fine-tuning, since this
57 |   # is how the model was trained (note that the Adam m/v variables are NOT
58 |   # loaded from init_checkpoint.)
59 |   optimizer = AdamWeightDecayOptimizer(
60 |       learning_rate=learning_rate,
61 |       weight_decay_rate=0.01,
62 |       beta_1=0.9,
63 |       beta_2=0.999,  # 0.98 is only used for pre-training; it must be 0.999 for fine-tuning.
64 |       epsilon=1e-6,
65 |       exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 | 
67 |   if use_tpu:
68 |     optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 | 
70 |   tvars = tf.trainable_variables()
71 |   grads = tf.gradients(loss, tvars)
72 | 
73 |   # This is how the model was pre-trained.
74 |   (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 | 
76 |   train_op = optimizer.apply_gradients(
77 |       zip(grads, tvars), global_step=global_step)
78 | 
79 |   # Normally the global step update is done inside of `apply_gradients`.
80 |   # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
81 |   # a different optimizer, you should probably take this line out.
82 |   new_global_step = global_step + 1
83 |   train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 |   return train_op
85 | 
86 | 
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 |   """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 | 
90 |   def __init__(self,
91 |                learning_rate,
92 |                weight_decay_rate=0.0,
93 |                beta_1=0.9,
94 |                beta_2=0.999,
95 |                epsilon=1e-6,
96 |                exclude_from_weight_decay=None,
97 |                name="AdamWeightDecayOptimizer"):
98 |     """Constructs an AdamWeightDecayOptimizer."""
99 |     super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 | 
101 |     self.learning_rate = learning_rate
102 |     self.weight_decay_rate = weight_decay_rate
103 |     self.beta_1 = beta_1
104 |     self.beta_2 = beta_2
105 |     self.epsilon = epsilon
106 |     self.exclude_from_weight_decay = exclude_from_weight_decay
107 | 
108 |   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 |     """See base class."""
110 |     assignments = []
111 |     for (grad, param) in grads_and_vars:
112 |       if grad is None or param is None:
113 |         continue
114 | 
115 |       param_name = self._get_variable_name(param.name)
116 | 
117 |       m = tf.get_variable(
118 |           name=param_name + "/adam_m",
119 |           shape=param.shape.as_list(),
120 |           dtype=tf.float32,
121 |           trainable=False,
122 |           initializer=tf.zeros_initializer())
123 |       v = tf.get_variable(
124 |           name=param_name + "/adam_v",
125 |           shape=param.shape.as_list(),
126 |           dtype=tf.float32,
127 |           trainable=False,
128 |           initializer=tf.zeros_initializer())
129 | 
130 |       # Standard Adam update.
131 |       next_m = (
132 |           tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 |       next_v = (
134 |           tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 |                                                     tf.square(grad)))
136 | 
137 |       update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 | 
139 |       # Just adding the square of the weights to the loss function is *not*
140 |       # the correct way of using L2 regularization/weight decay with Adam,
141 |       # since that will interact with the m and v parameters in strange ways.
142 |       #
143 |       # Instead we want to decay the weights in a manner that doesn't interact
144 |       # with the m/v parameters. This is equivalent to adding the square
145 |       # of the weights to the loss with plain (non-momentum) SGD.
146 |       if self._do_use_weight_decay(param_name):
147 |         update += self.weight_decay_rate * param
148 | 
149 |       update_with_lr = self.learning_rate * update
150 | 
151 |       next_param = param - update_with_lr
152 | 
153 |       assignments.extend(
154 |           [param.assign(next_param),
155 |            m.assign(next_m),
156 |            v.assign(next_v)])
157 |     return tf.group(*assignments, name=name)
158 | 
159 |   def _do_use_weight_decay(self, param_name):
160 |     """Whether to use L2 weight decay for `param_name`."""
161 |     if not self.weight_decay_rate:
162 |       return False
163 |     if self.exclude_from_weight_decay:
164 |       for r in self.exclude_from_weight_decay:
165 |         if re.search(r, param_name) is not None:
166 |           return False
167 |     return True
168 | 
169 |   def _get_variable_name(self, param_name):
170 |     """Get the variable name from the tensor name."""
171 |     m = re.match("^(.*):\\d+$", param_name)
172 |     if m is not None:
173 |       param_name = m.group(1)
174 |     return param_name
175 | 
--------------------------------------------------------------------------------
/albert_tsim/optimization_google.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019 The Google Research Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # Lint as: python2, python3
17 | """Functions and classes related to optimization (weight updates)."""
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import re
24 | 
25 | import six
26 | from six.moves import zip
27 | import tensorflow as tf
28 | 
29 | import lamb_optimizer_google as lamb_optimizer
30 | 
31 | 
32 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
33 |                      optimizer="adamw", poly_power=1.0, start_warmup_step=0):
34 |   """Creates an optimizer training op."""
35 |   global_step = tf.train.get_or_create_global_step()
36 | 
37 |   learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
38 | 
39 |   # Implements linear decay of the learning rate.
40 |   learning_rate = tf.train.polynomial_decay(
41 |       learning_rate,
42 |       global_step,
43 |       num_train_steps,
44 |       end_learning_rate=0.0,
45 |       power=poly_power,
46 |       cycle=False)
47 | 
48 |   # Implements linear warmup. I.e., if global_step - start_warmup_step <
49 |   # num_warmup_steps, the learning rate will be
50 |   # `(global_step - start_warmup_step)/num_warmup_steps * init_lr`.
51 |   if num_warmup_steps:
52 |     tf.logging.info("++++++ warmup starts at step " + str(start_warmup_step)
53 |                     + ", for " + str(num_warmup_steps) + " steps ++++++")
54 |     global_steps_int = tf.cast(global_step, tf.int32)
55 |     start_warm_int = tf.constant(start_warmup_step, dtype=tf.int32)
56 |     global_steps_int = global_steps_int - start_warm_int
57 |     warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
58 | 
59 |     global_steps_float = tf.cast(global_steps_int, tf.float32)
60 |     warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
61 | 
62 |     warmup_percent_done = global_steps_float / warmup_steps_float
63 |     warmup_learning_rate = init_lr * warmup_percent_done
64 | 
65 |     is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
66 |     learning_rate = (
67 |         (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
68 | 
69 |   # It is OK to use this optimizer for fine-tuning, since this
70 |   # is how the model was trained (note that the Adam m/v variables are NOT
71 |   # loaded from init_checkpoint.)
72 |   # It is OK to use AdamW for fine-tuning even if the model was trained with LAMB.
73 |   # As reported in the public BERT GitHub repo, the learning rate for SQuAD 1.1
74 |   # fine-tuning is 3e-5, 4e-5 or 5e-5. For LAMB, users can use 3e-4, 4e-4, or
75 |   # 5e-4 with a batch size of 64 for fine-tuning.
76 |   if optimizer == "adamw":
77 |     tf.logging.info("using adamw")
78 |     optimizer = AdamWeightDecayOptimizer(
79 |         learning_rate=learning_rate,
80 |         weight_decay_rate=0.01,
81 |         beta_1=0.9,
82 |         beta_2=0.999,
83 |         epsilon=1e-6,
84 |         exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
85 |   elif optimizer == "lamb":
86 |     tf.logging.info("using lamb")
87 |     optimizer = lamb_optimizer.LAMBOptimizer(
88 |         learning_rate=learning_rate,
89 |         weight_decay_rate=0.01,
90 |         beta_1=0.9,
91 |         beta_2=0.999,
92 |         epsilon=1e-6,
93 |         exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
94 |   else:
95 |     raise ValueError("Unsupported optimizer: %s" % optimizer)
96 | 
97 |   if use_tpu:
98 |     optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
99 | 
100 |   tvars = tf.trainable_variables()
101 |   grads = tf.gradients(loss, tvars)
102 | 
103 |   # This is how the model was pre-trained.
104 |   (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
105 | 
106 |   train_op = optimizer.apply_gradients(
107 |       list(zip(grads, tvars)), global_step=global_step)
108 | 
109 |   # Normally the global step update is done inside of `apply_gradients`.
110 |   # However, neither `AdamWeightDecayOptimizer` nor `LAMBOptimizer` does this.
111 |   # But if you use a different optimizer, you should probably take this line
112 |   # out.
113 |   new_global_step = global_step + 1
114 |   train_op = tf.group(train_op, [global_step.assign(new_global_step)])
115 |   return train_op
116 | 
117 | 
118 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
119 |   """A basic Adam optimizer that includes "correct" L2 weight decay."""
120 | 
121 |   def __init__(self,
122 |                learning_rate,
123 |                weight_decay_rate=0.0,
124 |                beta_1=0.9,
125 |                beta_2=0.999,
126 |                epsilon=1e-6,
127 |                exclude_from_weight_decay=None,
128 |                name="AdamWeightDecayOptimizer"):
129 |     """Constructs an AdamWeightDecayOptimizer."""
130 |     super(AdamWeightDecayOptimizer, self).__init__(False, name)
131 | 
132 |     self.learning_rate = learning_rate
133 |     self.weight_decay_rate = weight_decay_rate
134 |     self.beta_1 = beta_1
135 |     self.beta_2 = beta_2
136 |     self.epsilon = epsilon
137 |     self.exclude_from_weight_decay = exclude_from_weight_decay
138 | 
139 |   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
140 |     """See base class."""
141 |     assignments = []
142 |     for (grad, param) in grads_and_vars:
143 |       if grad is None or param is None:
144 |         continue
145 | 
146 |       param_name = self._get_variable_name(param.name)
147 | 
148 |       m = tf.get_variable(
149 |           name=six.ensure_str(param_name) + "/adam_m",
150 |           shape=param.shape.as_list(),
151 |           dtype=tf.float32,
152 |           trainable=False,
153 |           initializer=tf.zeros_initializer())
154 |       v = tf.get_variable(
155 |           name=six.ensure_str(param_name) + "/adam_v",
156 |           shape=param.shape.as_list(),
157 |           dtype=tf.float32,
158 |           trainable=False,
159 |           initializer=tf.zeros_initializer())
160 | 
161 |       # Standard Adam update.
162 |       next_m = (
163 |           tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
164 |       next_v = (
165 |           tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
166 |                                                     tf.square(grad)))
167 | 
168 |       update = next_m / (tf.sqrt(next_v) + self.epsilon)
169 | 
170 |       # Just adding the square of the weights to the loss function is *not*
171 |       # the correct way of using L2 regularization/weight decay with Adam,
172 |       # since that will interact with the m and v parameters in strange ways.
173 |       #
174 |       # Instead we want to decay the weights in a manner that doesn't interact
175 |       # with the m/v parameters. This is equivalent to adding the square
176 |       # of the weights to the loss with plain (non-momentum) SGD.
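      # Written out, the decoupled update performed below is
      #   theta_{t+1} = theta_t - lr * (m_t / (sqrt(v_t) + eps) + wd * theta_t),
      # i.e. the decay term bypasses the Adam moment estimates entirely
      # (cf. Loshchilov & Hutter, "Decoupled Weight Decay Regularization").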
177 |       if self._do_use_weight_decay(param_name):
178 |         update += self.weight_decay_rate * param
179 | 
180 |       update_with_lr = self.learning_rate * update
181 | 
182 |       next_param = param - update_with_lr
183 | 
184 |       assignments.extend(
185 |           [param.assign(next_param),
186 |            m.assign(next_m),
187 |            v.assign(next_v)])
188 |     return tf.group(*assignments, name=name)
189 | 
190 |   def _do_use_weight_decay(self, param_name):
191 |     """Whether to use L2 weight decay for `param_name`."""
192 |     if not self.weight_decay_rate:
193 |       return False
194 |     if self.exclude_from_weight_decay:
195 |       for r in self.exclude_from_weight_decay:
196 |         if re.search(r, param_name) is not None:
197 |           return False
198 |     return True
199 | 
200 |   def _get_variable_name(self, param_name):
201 |     """Get the variable name from the tensor name."""
202 |     m = re.match("^(.*):\\d+$", six.ensure_str(param_name))
203 |     if m is not None:
204 |       param_name = m.group(1)
205 |     return param_name
206 | 
--------------------------------------------------------------------------------
/albert_tsim/predict.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on @Time: 2019/12/27 14:06
3 | @Author: sliderSun
4 | @FileName: predict.py
5 | """
6 | import tensorflow as tf
7 | 
8 | import args
9 | import tokenization
10 | from run_classifier import convert_examples_to_features
11 | from similarity import SimProcessor
12 | 
13 | sess = tf.Session()
14 | with tf.gfile.GFile(
15 |         'F:\python_work\github\\albert_zh\\albert_lcqmc_checkpoints_base\\albert.pb',
16 |         'rb') as f:  # load the frozen model
17 |     graph_def = tf.GraphDef()
18 |     graph_def.ParseFromString(f.read())
19 |     sess.graph.as_default()
20 |     tf.import_graph_def(graph_def, name='')  # import the computation graph
21 | # An explicit initialization step is required.
22 | sess.run(tf.global_variables_initializer())
23 | input_mask = sess.graph.get_operation_by_name("input_mask").outputs[0]
24 | input_ids = sess.graph.get_operation_by_name("input_ids").outputs[0]
25 | segment_ids = sess.graph.get_operation_by_name("segment_ids").outputs[0]
26 | loss_softmax = sess.graph.get_operation_by_name("loss/Softmax").outputs[0]
27 | tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
28 | for tensor_name in tensor_name_list:
29 |     print(tensor_name)
30 | 
31 | processor = SimProcessor()
32 | while True:
33 |     sentence1 = input('sentence1: ')
34 |     sentence2 = input('sentence2: ')
35 |     tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True)
36 |     predict_examples = processor.get_sentence_examples([(sentence1, sentence2)])
37 |     features = convert_examples_to_features(predict_examples, processor.get_labels(), args.max_seq_len,
38 |                                             tokenizer)
39 |     a = {
40 |         'input_ids': [f.input_ids for f in features],
41 |         'input_mask': [f.input_mask for f in features],
42 |         'segment_ids': [f.segment_ids for f in features],
43 |         'label_ids': [f.label_id for f in features]
44 |     }
45 |     sim = sess.run([loss_softmax], {input_mask: a["input_mask"], input_ids: a["input_ids"],
46 |                                     segment_ids: a["segment_ids"]})
47 |     for i in sim:
48 |         print(i)
49 | 
--------------------------------------------------------------------------------
/albert_tsim/resources/add_data_removing_dropout.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sliderSun/bert/c9e16d652f85398fbb6ca6aea72f11f82166c672/albert_tsim/resources/add_data_removing_dropout.jpg
--------------------------------------------------------------------------------
/albert_tsim/resources/albert_configuration.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sliderSun/bert/c9e16d652f85398fbb6ca6aea72f11f82166c672/albert_tsim/resources/albert_configuration.jpg -------------------------------------------------------------------------------- /albert_tsim/resources/albert_large_zh_parameters.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sliderSun/bert/c9e16d652f85398fbb6ca6aea72f11f82166c672/albert_tsim/resources/albert_large_zh_parameters.jpg -------------------------------------------------------------------------------- /albert_tsim/resources/albert_performance.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sliderSun/bert/c9e16d652f85398fbb6ca6aea72f11f82166c672/albert_tsim/resources/albert_performance.jpg -------------------------------------------------------------------------------- /albert_tsim/resources/albert_tiny_compare_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sliderSun/bert/c9e16d652f85398fbb6ca6aea72f11f82166c672/albert_tsim/resources/albert_tiny_compare_s.jpg -------------------------------------------------------------------------------- /albert_tsim/resources/albert_tiny_compare_s_old.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sliderSun/bert/c9e16d652f85398fbb6ca6aea72f11f82166c672/albert_tsim/resources/albert_tiny_compare_s_old.jpg -------------------------------------------------------------------------------- /albert_tsim/resources/crmc2018_compare_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sliderSun/bert/c9e16d652f85398fbb6ca6aea72f11f82166c672/albert_tsim/resources/crmc2018_compare_s.jpg -------------------------------------------------------------------------------- /albert_tsim/resources/shell_scripts/create_pretrain_data_batch_webtext.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo $1,$2 3 | 4 | BERT_BASE_DIR=./bert_config 5 | for((i=$1;i<=$2;i++)); 6 | do 7 | python3 create_pretraining_data.py --do_whole_word_mask=True --input_file=gs://raw_text/web_text_zh_raw/web_text_zh_$i.txt \ 8 | --output_file=gs://albert_zh/tf_records/tf_web_text_zh_$i.tfrecord --vocab_file=$BERT_BASE_DIR/vocab.txt --do_lower_case=True \ 9 | --max_seq_length=512 --max_predictions_per_seq=76 --masked_lm_prob=0.15 10 | done 11 | -------------------------------------------------------------------------------- /albert_tsim/resources/state_of_the_art.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sliderSun/bert/c9e16d652f85398fbb6ca6aea72f11f82166c672/albert_tsim/resources/state_of_the_art.jpg -------------------------------------------------------------------------------- /albert_tsim/resources/xlarge_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sliderSun/bert/c9e16d652f85398fbb6ca6aea72f11f82166c672/albert_tsim/resources/xlarge_loss.jpg -------------------------------------------------------------------------------- /albert_tsim/run_classifier_lcqmc.sh: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # @Author: bo.shi, https://github.com/chineseGLUE/chineseGLUE
3 | # @Date:   2019-11-04 09:56:36
4 | # @Last Modified by:   bright
5 | # @Last Modified time: 2019-11-10 09:00:00
6 | 
7 | TASK_NAME="lcqmc"
8 | MODEL_NAME="albert_tiny_zh"
9 | CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
10 | 
11 | export CUDA_VISIBLE_DEVICES="0"
12 | export ALBERT_CONFIG_DIR=$CURRENT_DIR/albert_config
13 | export ALBERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model
14 | export ALBERT_TINY_DIR=$ALBERT_PRETRAINED_MODELS_DIR/$MODEL_NAME
15 | #mkdir chineseGLUEdatasets
16 | export GLUE_DATA_DIR=$CURRENT_DIR/chineseGLUEdatasets
17 | 
18 | # download and unzip dataset
19 | if [ ! -d $GLUE_DATA_DIR ]; then
20 |   mkdir -p $GLUE_DATA_DIR
21 |   echo "mkdir $GLUE_DATA_DIR"
22 | fi
23 | cd $GLUE_DATA_DIR
24 | if [ ! -d $TASK_NAME ]; then
25 |   mkdir $TASK_NAME
26 |   echo "mkdir $GLUE_DATA_DIR/$TASK_NAME"
27 | fi
28 | cd $TASK_NAME
29 | echo "Please try again if the data is not downloaded successfully."
30 | wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/train.txt
31 | wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/dev.txt
32 | wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/test.txt
33 | echo "Finished downloading the dataset."
34 | 
35 | # download model
36 | if [ ! -d $ALBERT_TINY_DIR ]; then
37 |   mkdir -p $ALBERT_TINY_DIR
38 |   echo "mkdir $ALBERT_TINY_DIR"
39 | fi
40 | cd $ALBERT_TINY_DIR
41 | if [ ! -f "albert_config_tiny.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "albert_model.ckpt.index" ] || [ ! -f "albert_model.ckpt.meta" ] || [ ! -f "albert_model.ckpt.data-00000-of-00001" ]; then
42 |   rm *
43 |   wget https://storage.googleapis.com/albert_zh/albert_tiny_489k.zip
44 |   unzip albert_tiny_489k.zip
45 |   rm albert_tiny_489k.zip
46 | else
47 |   echo "model exists"
48 | fi
49 | echo "Finished downloading the model."
50 | 
51 | # run task
52 | cd $CURRENT_DIR
53 | echo "Start running..."
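# Fine-tune and evaluate on LCQMC. Assuming run_classifier.py behaves like the
# upstream BERT script it is based on, checkpoints and an eval_results.txt
# summary are written into the --output_dir passed below.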
54 | python run_classifier.py \
55 |   --task_name=$TASK_NAME \
56 |   --do_train=true \
57 |   --do_eval=true \
58 |   --data_dir=$GLUE_DATA_DIR/$TASK_NAME \
59 |   --vocab_file=$ALBERT_CONFIG_DIR/vocab.txt \
60 |   --bert_config_file=$ALBERT_CONFIG_DIR/albert_config_tiny.json \
61 |   --init_checkpoint=$ALBERT_TINY_DIR/albert_model.ckpt \
62 |   --max_seq_length=128 \
63 |   --train_batch_size=64 \
64 |   --learning_rate=1e-4 \
65 |   --num_train_epochs=5.0 \
66 |   --output_dir=$CURRENT_DIR/${TASK_NAME}_output/
--------------------------------------------------------------------------------
/albert_tsim/similarity.py:
--------------------------------------------------------------------------------
1 | """
2 | An example of text-similarity prediction. It can be run directly to make predictions.
3 | Based on the project: https://github.com/chdd/bert-utils
4 | 
5 | """
6 | 
7 | 
8 | import tensorflow as tf
9 | import args
10 | import tokenization
11 | import modeling
12 | from run_classifier import InputFeatures, InputExample, DataProcessor, create_model, convert_examples_to_features
13 | 
14 | 
15 | # os.environ['CUDA_VISIBLE_DEVICES'] = '1'
16 | 
17 | 
18 | class SimProcessor(DataProcessor):
19 |     def get_sentence_examples(self, questions):
20 |         examples = []
21 |         for index, data in enumerate(questions):
22 |             guid = 'test-%d' % index
23 |             text_a = tokenization.convert_to_unicode(str(data[0]))
24 |             text_b = tokenization.convert_to_unicode(str(data[1]))
25 |             label = str(0)
26 |             examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
27 |         return examples
28 | 
29 |     def get_labels(self):
30 |         return ['0', '1']
31 | 
32 | 
33 | """
34 | Model class: loads the checkpoint and initializes the model.
35 | """
36 | class BertSim:
37 |     def __init__(self, batch_size=args.batch_size):
38 |         self.mode = None
39 |         self.max_seq_length = args.max_seq_len
40 |         self.tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True)
41 |         self.batch_size = batch_size
42 |         self.estimator = None
43 |         self.processor = SimProcessor()
44 |         tf.logging.set_verbosity(tf.logging.INFO)
45 | 
46 | 
47 | 
48 |     # Load the estimator and build the model.
49 |     def start_model(self):
50 |         self.estimator = self.get_estimator()
51 | 
52 | 
53 |     def model_fn_builder(self, bert_config, num_labels, init_checkpoint, learning_rate,
54 |                          num_train_steps, num_warmup_steps,
55 |                          use_one_hot_embeddings):
56 |         """Returns `model_fn` closure for TPUEstimator."""
57 | 
58 |         def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
59 |             from tensorflow.python.estimator.model_fn import EstimatorSpec
60 | 
61 |             tf.logging.info("*** Features ***")
62 |             for name in sorted(features.keys()):
63 |                 tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
64 | 
65 |             input_ids = features["input_ids"]
66 |             input_mask = features["input_mask"]
67 |             segment_ids = features["segment_ids"]
68 |             label_ids = features["label_ids"]
69 | 
70 |             is_training = (mode == tf.estimator.ModeKeys.TRAIN)
71 | 
72 |             (total_loss, per_example_loss, logits, probabilities) = create_model(
73 |                 bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
74 |                 num_labels, use_one_hot_embeddings)
75 | 
76 |             tvars = tf.trainable_variables()
77 |             initialized_variable_names = {}
78 | 
79 |             if init_checkpoint:
80 |                 (assignment_map, initialized_variable_names) \
81 |                     = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
82 |                 tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
83 | 
84 |             tf.logging.info("**** Trainable Variables ****")
85 |             for var in tvars:
86 |                 init_string = ""
87 |                 if var.name in initialized_variable_names:
88 |                     init_string = ", *INIT_FROM_CKPT*"
89 |                 tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
90 |                                 init_string)
91 |             output_spec = EstimatorSpec(mode=mode, predictions=probabilities)
92 | 
93 |             return output_spec
94 | 
95 |         return model_fn
96 | 
97 |     def get_estimator(self):
98 | 
99 |         from tensorflow.python.estimator.estimator import Estimator
100 |         from tensorflow.python.estimator.run_config import RunConfig
101 | 
102 |         bert_config = modeling.BertConfig.from_json_file(args.config_name)
103 |         label_list = self.processor.get_labels()
104 |         if self.mode == tf.estimator.ModeKeys.TRAIN:
105 |             init_checkpoint = args.ckpt_name
106 |         else:
107 |             init_checkpoint = args.output_dir
108 | 
109 |         model_fn = self.model_fn_builder(
110 |             bert_config=bert_config,
111 |             num_labels=len(label_list),
112 |             init_checkpoint=init_checkpoint,
113 |             learning_rate=args.learning_rate,
114 |             num_train_steps=None,
115 |             num_warmup_steps=None,
116 |             use_one_hot_embeddings=False)
117 | 
118 |         config = tf.ConfigProto()
119 |         config.gpu_options.allow_growth = True
120 |         config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
121 |         config.log_device_placement = False
122 | 
123 |         return Estimator(model_fn=model_fn, config=RunConfig(session_config=config), model_dir=args.output_dir,
124 |                          params={'batch_size': self.batch_size})
125 | 
126 |     def predict_sentences(self, sentences):
127 |         results = self.estimator.predict(input_fn=input_fn_builder(self, sentences), yield_single_examples=False)
128 |         # Print the prediction results.
129 |         for i in results:
130 |             print(i)
131 | 
132 |     def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
133 |         """Truncates a sequence pair in place to the maximum length."""
134 | 
135 |         # This is a simple heuristic which will always truncate the longer sequence
136 |         # one token at a time. This makes more sense than truncating an equal percent
137 |         # of tokens from each, since if one sequence is very short then each token
138 |         # that's truncated likely contains more information than a longer sequence.
139 |         while True:
140 |             total_length = len(tokens_a) + len(tokens_b)
141 |             if total_length <= max_length:
142 |                 break
143 |             if len(tokens_a) > len(tokens_b):
144 |                 tokens_a.pop()
145 |             else:
146 |                 tokens_b.pop()
147 | 
148 |     def convert_single_example(self, ex_index, example, label_list, max_seq_length, tokenizer):
149 |         """Converts a single `InputExample` into a single `InputFeatures`."""
150 |         label_map = {}
151 |         for (i, label) in enumerate(label_list):
152 |             label_map[label] = i
153 | 
154 |         tokens_a = tokenizer.tokenize(example.text_a)
155 |         tokens_b = None
156 |         if example.text_b:
157 |             tokens_b = tokenizer.tokenize(example.text_b)
158 | 
159 |         if tokens_b:
160 |             # Modifies `tokens_a` and `tokens_b` in place so that the total
161 |             # length is less than the specified length.
162 |             # Account for [CLS], [SEP], [SEP] with "- 3"
163 |             self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
164 |         else:
165 |             # Account for [CLS] and [SEP] with "- 2"
166 |             if len(tokens_a) > max_seq_length - 2:
167 |                 tokens_a = tokens_a[0:(max_seq_length - 2)]
168 | 
169 |         # The convention in BERT is:
170 |         # (a) For sequence pairs:
171 |         #     tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
172 |         #     type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
173 |         # (b) For single sequences:
174 |         #     tokens:   [CLS] the dog is hairy . [SEP]
175 |         #     type_ids: 0     0   0   0  0     0 0
176 |         #
177 |         # Where "type_ids" are used to indicate whether this is the first
178 |         # sequence or the second sequence. The embedding vectors for `type=0` and
179 |         # `type=1` were learned during pre-training and are added to the wordpiece
180 |         # embedding vector (and position vector). This is not *strictly* necessary
181 |         # since the [SEP] token unambiguously separates the sequences, but it makes
182 |         # it easier for the model to learn the concept of sequences.
183 |         #
184 |         # For classification tasks, the first vector (corresponding to [CLS]) is
185 |         # used as the "sentence vector". Note that this only makes sense because
186 |         # the entire model is fine-tuned.
187 |         tokens = []
188 |         segment_ids = []
189 |         tokens.append("[CLS]")
190 |         segment_ids.append(0)
191 |         for token in tokens_a:
192 |             tokens.append(token)
193 |             segment_ids.append(0)
194 |         tokens.append("[SEP]")
195 |         segment_ids.append(0)
196 | 
197 |         if tokens_b:
198 |             for token in tokens_b:
199 |                 tokens.append(token)
200 |                 segment_ids.append(1)
201 |             tokens.append("[SEP]")
202 |             segment_ids.append(1)
203 | 
204 |         input_ids = tokenizer.convert_tokens_to_ids(tokens)
205 | 
206 |         # The mask has 1 for real tokens and 0 for padding tokens. Only real
207 |         # tokens are attended to.
208 |         input_mask = [1] * len(input_ids)
209 | 
210 |         # Zero-pad up to the sequence length.
211 |         while len(input_ids) < max_seq_length:
212 |             input_ids.append(0)
213 |             input_mask.append(0)
214 |             segment_ids.append(0)
215 | 
216 |         assert len(input_ids) == max_seq_length
217 |         assert len(input_mask) == max_seq_length
218 |         assert len(segment_ids) == max_seq_length
219 | 
220 |         label_id = label_map[example.label]
221 |         if ex_index < 5:
222 |             tf.logging.info("*** Example ***")
223 |             tf.logging.info("guid: %s" % (example.guid))
224 |             tf.logging.info("tokens: %s" % " ".join(
225 |                 [tokenization.printable_text(x) for x in tokens]))
226 |             tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
227 |             tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
228 |             tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
229 |             tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
230 | 
231 |         feature = InputFeatures(
232 |             input_ids=input_ids,
233 |             input_mask=input_mask,
234 |             segment_ids=segment_ids,
235 |             label_id=label_id)
236 |         return feature
237 | 
238 | 
239 | 
240 | 
241 | def input_fn_builder(bertSim, sentences):
242 |     def predict_input_fn():
243 |         return (tf.data.Dataset.from_generator(
244 |             generate_from_input,
245 |             output_types={
246 |                 'input_ids': tf.int32,
247 |                 'input_mask': tf.int32,
248 |                 'segment_ids': tf.int32,
249 |                 'label_ids': tf.int32},
250 |             output_shapes={
251 |                 'input_ids': (None, bertSim.max_seq_length),
252 |                 'input_mask': (None, bertSim.max_seq_length),
253 |                 'segment_ids': (None, bertSim.max_seq_length),
254 |                 'label_ids': (1,)}).prefetch(10))
255 | 
256 |     def generate_from_input():
257 |         processor = bertSim.processor
258 |         predict_examples = processor.get_sentence_examples(sentences)
259 |         features = convert_examples_to_features(predict_examples, processor.get_labels(), args.max_seq_len,
260 |                                                 bertSim.tokenizer)
261 |         yield {
262 |             'input_ids': [f.input_ids for f in features],
263 |             'input_mask': [f.input_mask for f in features],
264 |             'segment_ids': [f.segment_ids for f in features],
265 |             'label_ids': [f.label_id for f in features]
266 |         }
267 | 
268 |     return predict_input_fn
269 | 
270 | 
271 | if __name__ == '__main__':
272 |     sim = BertSim()
273 |     sim.start_model()
274 |     sim.predict_sentences([("我喜欢妈妈做的汤", "妈妈做的汤我很喜欢喝")])
275 | 
--------------------------------------------------------------------------------
/albert_tsim/test_changes.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import tensorflow as tf
3 | from modeling import embedding_lookup_factorized, transformer_model
4 | import os
5 | 
6 | """
7 | Test the main improvements of ALBERT over BERT:
8 | factorized embedding parameterization, cross-layer parameter sharing, and inter-sentence coherence (SOP).
9 | """
10 | batch_size = 2048
11 | sequence_length = 512
12 | vocab_size = 30000
13 | hidden_size = 1024
14 | num_attention_heads = int(hidden_size / 64)
15 | 
16 | def get_total_parameters():
17 |     """
18 |     get the total number of parameters of a graph
19 |     :return:
20 |     """
21 |     total_parameters = 0
22 |     for variable in tf.trainable_variables():
23 |         # shape is an array of tf.Dimension
24 |         shape = variable.get_shape()
25 |         # print(shape)
26 |         # print(len(shape))
27 |         variable_parameters = 1
28 |         for dim in shape:
29 |             # print(dim)
30 |             variable_parameters *= dim.value
31 |         # print(variable_parameters)
32 |         total_parameters += variable_parameters
33 |     return total_parameters
34 | 
35 | def test_factorized_embedding():
36 |     """
37 |     test of factorized embedding parameterization
38 |     :return:
39 |     """
40 |     input_ids = tf.zeros((batch_size, sequence_length), dtype=tf.int32)
41 |     output, embedding_table, embedding_table_2 = embedding_lookup_factorized(input_ids, vocab_size, hidden_size)
42 |     print("output:", output)
43 | 
44 | def test_share_parameters():
45 |     """
46 |     test of sharing parameters across all layers: how many parameters remain after sharing parameters across the transformer layers.
47 |     :return:
48 |     """
49 |     def total_parameters_transformer(share_parameter_across_layers):
50 |         input_tensor = tf.zeros((batch_size, sequence_length, hidden_size), dtype=tf.float32)
51 |         print("transformer_model. input:", input_tensor)
52 |         transformer_result = transformer_model(input_tensor, hidden_size=hidden_size, num_attention_heads=num_attention_heads, share_parameter_across_layers=share_parameter_across_layers)
53 |         print("transformer_result:", transformer_result)
54 |         total_parameters = get_total_parameters()
55 |         print('total_parameters:', total_parameters)
56 | 
57 |     share_parameter_across_layers = False
58 |     total_parameters_transformer(share_parameter_across_layers)  # total parameters, not shared: 125,976,576 (about 126 million)
59 | 
60 |     tf.reset_default_graph()  # Clears the default graph stack and resets the global default graph
61 |     share_parameter_across_layers = True
62 |     total_parameters_transformer(share_parameter_across_layers)  # total parameters, shared: 10,498,048 (about 10.5 million)
63 | 
64 | def test_sentence_order_prediction():
65 |     """
66 |     sentence order prediction.
67 | 
68 |     check the create_instances_from_document_albert method in create_pretraining_data.py
69 | 
70 |     :return:
71 |     """
72 |     # Make the shell script executable.
73 |     os.system("chmod +x create_pretrain_data.sh")
74 | 
75 |     os.system("./create_pretrain_data.sh")
76 | 
77 | 
78 | # 1. test of factorized embedding parameterization
79 | #test_factorized_embedding()
80 | 
81 | # 2. test of sharing parameters across all layers: how many parameters remain after sharing.
82 | # before sharing parameters: 125,976,576; after sharing parameters: 10,498,048
83 | #test_share_parameters()
84 | 
85 | # 3. test of sentence order prediction (SOP)
86 | test_sentence_order_prediction()
87 | 
88 | 
--------------------------------------------------------------------------------
/bert_tsim/README.md:
--------------------------------------------------------------------------------
1 | # bert-tsim
2 | 
3 | This project further simplifies Google's open-source [BERT](https://github.com/google-research/bert) code to make it easy to generate sentence vectors and do text classification.
4 | 
5 | 1. Download the Chinese BERT model
6 | 
7 | Download link 1 [`BERT-Base, Chinese`]: https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
8 | 
9 | Download link 2 [`BERT-base, Chinese (Whole Word Masking)`]: https://drive.google.com/open?id=1RoTQsXp2hkQ1gSRVylRIJfQxJUgkfJMW
10 | 
11 | 2. Put the downloaded model in the current directory
12 | 
13 | 3. Sentence vector generation
14 | 
15 | Generating sentence vectors requires no fine-tuning; the pre-trained model is enough. See the `main` method of `extract_feature.py`, and note that the argument must be a list.
16 | 
17 | The first call has to load the graph and writes a new graph file under `output_dir`, so it is slow; subsequent calls are fast.
18 | ```
19 | from bert.extract_feature import BertVector
20 | bv = BertVector()
21 | bv.encode(['毛主席发奖金'])
22 | ```
23 | 
24 | 4. Text classification
25 | 
26 | Text classification requires fine-tuning. First put the prepared data under the `data` directory; the training set must be named `train.csv`, the validation set `dev.csv`, and the test set `test.csv`.
27 | The `set_mode` method must be called first; see the `main` method of `similarity.py`.
28 | 
29 | Training:
30 | ```
31 | from similarity import BertSim
32 | import tensorflow as tf
33 | 
34 | bs = BertSim()
35 | bs.set_mode(tf.estimator.ModeKeys.TRAIN)
36 | bs.train()
37 | ```
38 | 
39 | Evaluation:
40 | ```
41 | from similarity import BertSim
42 | import tensorflow as tf
43 | 
44 | bs = BertSim()
45 | bs.set_mode(tf.estimator.ModeKeys.EVAL)
46 | bs.eval()
47 | ```
48 | 
49 | Testing:
50 | ```
51 | from similarity import BertSim
52 | import tensorflow as tf
53 | 
54 | bs = BertSim()
55 | bs.set_mode(tf.estimator.ModeKeys.PREDICT)
56 | bs.test()
57 | ```
58 | 
--------------------------------------------------------------------------------
/bert_tsim/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
--------------------------------------------------------------------------------
/bert_tsim/args.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tensorflow as tf
3 | 
4 | tf.logging.set_verbosity(tf.logging.INFO)
5 | 
6 | file_path = os.path.dirname(__file__)
7 | 
8 | model_dir = os.path.join(file_path, 'BERT-wwm/')
9 | config_name = os.path.join(model_dir, 'bert_config.json')
10 | ckpt_name = os.path.join(model_dir, 'bert_model.ckpt')
11 | output_dir = os.path.join(model_dir, '../output/')
12 | vocab_file = os.path.join(model_dir, 'vocab.txt')
13 | data_dir = os.path.join(model_dir, '../data/')
14 | 
15 | num_train_epochs = 10
16 | batch_size = 128
17 | learning_rate = 0.00005
18 | 
19 | # Fraction of GPU memory to use.
20 | gpu_memory_fraction = 0.8
21 | 
22 | # By default, take the output of the second-to-last layer as the sentence vector.
23 | layer_indexes = [-2]
24 | 
25 | # Maximum sequence length; for single sentences it is recommended to lower this value.
26 | max_seq_len = 32
27 | 
--------------------------------------------------------------------------------
/bert_tsim/graph.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import json
3 | import logging
4 | from termcolor import colored
5 | import modeling
6 | import args
7 | import tensorflow as tf
8 | import os
9 | 
10 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
11 | 
12 | 
13 | def set_logger(context, verbose=False):
14 |     logger = logging.getLogger(context)
15 |     logger.setLevel(logging.DEBUG if verbose else logging.INFO)
16 |     formatter = logging.Formatter(
17 |         '%(levelname)-.1s:' + context + ':[%(filename).5s:%(funcName).3s:%(lineno)3d]:%(message)s', datefmt=
18 |         '%m-%d %H:%M:%S')
19 |     console_handler = logging.StreamHandler()
20 |     console_handler.setLevel(logging.DEBUG if verbose else logging.INFO)
21 |     console_handler.setFormatter(formatter)
22 |     logger.handlers = []
23 |     logger.addHandler(console_handler)
24 |     return logger
25 | 
26 | 
27 | def optimize_graph(logger=None, verbose=False):
28 |     if not logger:
29 |         logger = set_logger(colored('BERT_VEC', 'yellow'), verbose)
30 |     try:
31 |         # we don't need GPU for optimizing the graph
32 |         from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference
33 |         tf.gfile.MakeDirs(args.output_dir)
34 | 
35 |         config_fp = args.config_name
36 |         logger.info('model config: %s' % config_fp)
37 | 
38 |         # Load the BERT config file.
39 |         with tf.gfile.GFile(config_fp, 'r') as f:
40 |             bert_config = modeling.BertConfig.from_dict(json.load(f))
41 | 
42 |         logger.info('build graph...')
43 |         # input placeholders, not sure if they are friendly to XLA
44 |         input_ids = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_ids')
45 |         input_mask = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_mask')
46 |         input_type_ids = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_type_ids')
47 | 
48 |         jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
49 | 
50 |         with jit_scope():
51 |             input_tensors = [input_ids, input_mask, input_type_ids]
52 | 
53 |             model = modeling.BertModel(
54 |                 config=bert_config,
55 |                 is_training=False,
56 |                 input_ids=input_ids,
57 |                 input_mask=input_mask,
58 |                 token_type_ids=input_type_ids,
59 |                 use_one_hot_embeddings=False)
60 | 
61 |             # Get all trainable variables.
62 |             tvars = tf.trainable_variables()
63 | 
64 |             init_checkpoint = args.ckpt_name
65 |             (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars,
66 |                                                                                                        init_checkpoint)
67 | 
68 |             tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
69 | 
70 |             # Pool over the selected encoder layers.
71 |             with tf.variable_scope("pooling"):
72 |                 # If only one layer is requested, just take that layer's output.
73 |                 if len(args.layer_indexes) == 1:
74 |                     encoder_layer = model.all_encoder_layers[args.layer_indexes[0]]
75 |                 else:
76 |                     # Otherwise iterate over the requested layers, collect their outputs and concatenate them; shape: 768 * number of layers.
77 |                     all_layers = [model.all_encoder_layers[l] for l in args.layer_indexes]
78 |                     encoder_layer = tf.concat(all_layers, -1)
79 | 
80 |                 mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
81 |                 masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
82 |                         tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
83 | 
84 |                 input_mask = tf.cast(input_mask, tf.float32)
85 |                 # The sentence vector is produced below by masked mean pooling: token vectors are weighted by input_mask and averaged over the non-padding positions.
86 |                 pooled = masked_reduce_mean(encoder_layer, input_mask)
87 |                 pooled = tf.identity(pooled, 'final_encodes')
88 | 
89 |         output_tensors = [pooled]
90 |         tmp_g = tf.get_default_graph().as_graph_def()
91 | 
92 |         # allow_soft_placement: automatically choose the device to run on.
93 |         config = tf.ConfigProto(allow_soft_placement=True)
94 |         with tf.Session(config=config) as sess:
95 |             logger.info('load parameters from checkpoint...')
96 |             sess.run(tf.global_variables_initializer())
97 |             logger.info('freeze...')
98 |             tmp_g = tf.graph_util.convert_variables_to_constants(sess, tmp_g, [n.name[:-2] for n in output_tensors])
99 |             dtypes = [n.dtype for n in input_tensors]
100 |             logger.info('optimize...')
101 |             tmp_g = optimize_for_inference(
102 |                 tmp_g,
103 |                 [n.name[:-2] for n in input_tensors],
104 |                 [n.name[:-2] for n in output_tensors],
105 |                 [dtype.as_datatype_enum for dtype in dtypes],
106 |                 False)
107 |             tmp_file = tempfile.NamedTemporaryFile('w', delete=False, dir=args.output_dir).name
108 |             logger.info('write graph to a tmp file: %s' % tmp_file)
109 |             with tf.gfile.GFile(tmp_file, 'wb') as f:
110 |                 f.write(tmp_g.SerializeToString())
111 |             return tmp_file
112 |     except Exception as e:
113 |         logger.error('fail to optimize the graph!')
114 |         logger.error(e)
115 | 
--------------------------------------------------------------------------------
/bert_tsim/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import re
22 | import tensorflow as tf
23 | 
24 | 
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 |   """Creates an optimizer training op."""
27 |   global_step = tf.train.get_or_create_global_step()
28 | 
29 |   learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 | 
31 |   # Implements linear decay of the learning rate.
32 |   learning_rate = tf.train.polynomial_decay(
33 |       learning_rate,
34 |       global_step,
35 |       num_train_steps,
36 |       end_learning_rate=0.0,
37 |       power=1.0,
38 |       cycle=False)
39 | 
40 |   # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | new_global_step = global_step + 1 80 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 81 | return train_op 82 | 83 | 84 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 85 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 86 | 87 | def __init__(self, 88 | learning_rate, 89 | weight_decay_rate=0.0, 90 | beta_1=0.9, 91 | beta_2=0.999, 92 | epsilon=1e-6, 93 | exclude_from_weight_decay=None, 94 | name="AdamWeightDecayOptimizer"): 95 | """Constructs an AdamWeightDecayOptimizer.""" 96 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 97 | 98 | self.learning_rate = learning_rate 99 | self.weight_decay_rate = weight_decay_rate 100 | self.beta_1 = beta_1 101 | self.beta_2 = beta_2 102 | self.epsilon = epsilon 103 | self.exclude_from_weight_decay = exclude_from_weight_decay 104 | 105 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 106 | """See base class.""" 107 | assignments = [] 108 | for (grad, param) in grads_and_vars: 109 | if grad is None or param is None: 110 | continue 111 | 112 | param_name = self._get_variable_name(param.name) 113 | 114 | m = tf.get_variable( 115 | name=param_name + "/adam_m", 116 | shape=param.shape.as_list(), 117 | dtype=tf.float32, 118 | trainable=False, 119 | initializer=tf.zeros_initializer()) 120 | v = tf.get_variable( 121 | name=param_name + "/adam_v", 122 | shape=param.shape.as_list(), 123 | dtype=tf.float32, 124 | trainable=False, 125 | initializer=tf.zeros_initializer()) 126 | 127 | # Standard Adam update.
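# In symbols, the moving averages below compute m_t = beta_1 * m_{t-1} + (1 - beta_1) * g_t and v_t = beta_2 * v_{t-1} + (1 - beta_2) * g_t^2, and the step is m_t / (sqrt(v_t) + epsilon). # Note: as in the original BERT release, Adam's bias-correction terms are omitted.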
128 | next_m = ( 129 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 130 | next_v = ( 131 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 132 | tf.square(grad))) 133 | 134 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 135 | 136 | # Just adding the square of the weights to the loss function is *not* 137 | # the correct way of using L2 regularization/weight decay with Adam, 138 | # since that will interact with the m and v parameters in strange ways. 139 | # 140 | # Instead we want to decay the weights in a manner that doesn't interact 141 | # with the m/v parameters. This is equivalent to adding the square 142 | # of the weights to the loss with plain (non-momentum) SGD. 143 | if self._do_use_weight_decay(param_name): 144 | update += self.weight_decay_rate * param 145 | 146 | update_with_lr = self.learning_rate * update 147 | 148 | next_param = param - update_with_lr 149 | 150 | assignments.extend( 151 | [param.assign(next_param), 152 | m.assign(next_m), 153 | v.assign(next_v)]) 154 | return tf.group(*assignments, name=name) 155 | 156 | def _do_use_weight_decay(self, param_name): 157 | """Whether to use L2 weight decay for `param_name`.""" 158 | if not self.weight_decay_rate: 159 | return False 160 | if self.exclude_from_weight_decay: 161 | for r in self.exclude_from_weight_decay: 162 | if re.search(r, param_name) is not None: 163 | return False 164 | return True 165 | 166 | def _get_variable_name(self, param_name): 167 | """Get the variable name from the tensor name.""" 168 | m = re.match("^(.*):\\d+$", param_name) 169 | if m is not None: 170 | param_name = m.group(1) 171 | return param_name 172 | -------------------------------------------------------------------------------- /bert_tsim/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow >= 1.11.0 # CPU Version of TensorFlow. 2 | # tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow. 3 | -------------------------------------------------------------------------------- /bert_tsim/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | import tensorflow as tf 25 | 26 | 27 | def convert_to_unicode(text): 28 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 29 | if six.PY3: 30 | if isinstance(text, str): 31 | return text 32 | elif isinstance(text, bytes): 33 | return text.decode("utf-8", "ignore") 34 | else: 35 | raise ValueError("Unsupported string type: %s" % (type(text))) 36 | elif six.PY2: 37 | if isinstance(text, str): 38 | return text.decode("utf-8", "ignore") 39 | elif isinstance(text, unicode): 40 | return text 41 | else: 42 | raise ValueError("Unsupported string type: %s" % (type(text))) 43 | else: 44 | raise ValueError("Not running on Python2 or Python 3?") 45 | 46 | 47 | def printable_text(text): 48 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 49 | 50 | # These functions want `str` for both Python2 and Python3, but in one case 51 | # it's a Unicode string and in the other it's a byte string. 52 | if six.PY3: 53 | if isinstance(text, str): 54 | return text 55 | elif isinstance(text, bytes): 56 | return text.decode("utf-8", "ignore") 57 | else: 58 | raise ValueError("Unsupported string type: %s" % (type(text))) 59 | elif six.PY2: 60 | if isinstance(text, str): 61 | return text 62 | elif isinstance(text, unicode): 63 | return text.encode("utf-8") 64 | else: 65 | raise ValueError("Unsupported string type: %s" % (type(text))) 66 | else: 67 | raise ValueError("Not running on Python2 or Python 3?") 68 | 69 | 70 | def load_vocab(vocab_file): 71 | """Loads a vocabulary file into a dictionary.""" 72 | vocab = collections.OrderedDict() 73 | index = 0 74 | with tf.gfile.GFile(vocab_file, "r") as reader: 75 | while True: 76 | token = convert_to_unicode(reader.readline()) 77 | if not token: 78 | break 79 | token = token.strip() 80 | vocab[token] = index 81 | index += 1 82 | return vocab 83 | 84 | 85 | def convert_by_vocab(vocab, items): 86 | """Converts a sequence of [tokens|ids] using the vocab.""" 87 | output = [] 88 | for item in items: 89 | output.append(vocab[item]) 90 | return output 91 | 92 | 93 | def convert_tokens_to_ids(vocab, tokens): 94 | return convert_by_vocab(vocab, tokens) 95 | 96 | 97 | def convert_ids_to_tokens(inv_vocab, ids): 98 | return convert_by_vocab(inv_vocab, ids) 99 | 100 | 101 | def whitespace_tokenize(text): 102 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 103 | text = text.strip() 104 | if not text: 105 | return [] 106 | tokens = text.split() 107 | return tokens 108 | 109 | 110 | class FullTokenizer(object): 111 | """Runs end-to-end tokenziation.""" 112 | 113 | def __init__(self, vocab_file, do_lower_case=True): 114 | self.vocab = load_vocab(vocab_file) 115 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 116 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 117 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 118 | 119 | def tokenize(self, text): 120 | split_tokens = [] 121 | for token in self.basic_tokenizer.tokenize(text): 122 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 123 | split_tokens.append(sub_token) 124 | 125 | return split_tokens 126 | 127 | def convert_tokens_to_ids(self, tokens): 128 | return convert_by_vocab(self.vocab, tokens) 129 | 130 | def convert_ids_to_tokens(self, ids): 131 | return 
132 | 133 | 134 | class BasicTokenizer(object): 135 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 136 | 137 | def __init__(self, do_lower_case=True): 138 | """Constructs a BasicTokenizer. 139 | 140 | Args: 141 | do_lower_case: Whether to lower case the input. 142 | """ 143 | self.do_lower_case = do_lower_case 144 | 145 | def tokenize(self, text): 146 | """Tokenizes a piece of text.""" 147 | text = convert_to_unicode(text) 148 | text = self._clean_text(text) 149 | 150 | # This was added on November 1st, 2018 for the multilingual and Chinese 151 | # models. This is also applied to the English models now, but it doesn't 152 | # matter since the English models were not trained on any Chinese data 153 | # and generally don't have any Chinese data in them (there are Chinese 154 | # characters in the vocabulary because the English Wikipedia does contain 155 | # some Chinese words). 156 | text = self._tokenize_chinese_chars(text) 157 | 158 | orig_tokens = whitespace_tokenize(text) 159 | split_tokens = [] 160 | for token in orig_tokens: 161 | if self.do_lower_case: 162 | token = token.lower() 163 | token = self._run_strip_accents(token) 164 | split_tokens.extend(self._run_split_on_punc(token)) 165 | 166 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 167 | return output_tokens 168 | 169 | def _run_strip_accents(self, text): 170 | """Strips accents from a piece of text.""" 171 | text = unicodedata.normalize("NFD", text) 172 | output = [] 173 | for char in text: 174 | cat = unicodedata.category(char) 175 | if cat == "Mn": 176 | continue 177 | output.append(char) 178 | return "".join(output) 179 | 180 | def _run_split_on_punc(self, text): 181 | """Splits punctuation on a piece of text.""" 182 | chars = list(text) 183 | i = 0 184 | start_new_word = True 185 | output = [] 186 | while i < len(chars): 187 | char = chars[i] 188 | if _is_punctuation(char): 189 | output.append([char]) 190 | start_new_word = True 191 | else: 192 | if start_new_word: 193 | output.append([]) 194 | start_new_word = False 195 | output[-1].append(char) 196 | i += 1 197 | 198 | return ["".join(x) for x in output] 199 | 200 | def _tokenize_chinese_chars(self, text): 201 | """Adds whitespace around any CJK character.""" 202 | output = [] 203 | for char in text: 204 | cp = ord(char) 205 | if self._is_chinese_char(cp): 206 | output.append(" ") 207 | output.append(char) 208 | output.append(" ") 209 | else: 210 | output.append(char) 211 | return "".join(output) 212 | 213 | def _is_chinese_char(self, cp): 214 | """Checks whether CP is the codepoint of a CJK character.""" 215 | # This defines a "chinese character" as anything in the CJK Unicode block: 216 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 217 | # 218 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 219 | # despite its name. The modern Korean Hangul alphabet is a different block, 220 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 221 | # space-separated words, so they are not treated specially and are handled 222 | # like all of the other languages.
223 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 224 | (cp >= 0x3400 and cp <= 0x4DBF) or # 225 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 226 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 227 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 228 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 229 | (cp >= 0xF900 and cp <= 0xFAFF) or # 230 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 231 | return True 232 | 233 | return False 234 | 235 | def _clean_text(self, text): 236 | """Performs invalid character removal and whitespace cleanup on text.""" 237 | output = [] 238 | for char in text: 239 | cp = ord(char) 240 | if cp == 0 or cp == 0xfffd or _is_control(char): 241 | continue 242 | if _is_whitespace(char): 243 | output.append(" ") 244 | else: 245 | output.append(char) 246 | return "".join(output) 247 | 248 | 249 | class WordpieceTokenizer(object): 250 | """Runs WordPiece tokenization.""" 251 | 252 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): 253 | self.vocab = vocab 254 | self.unk_token = unk_token 255 | self.max_input_chars_per_word = max_input_chars_per_word 256 | 257 | def tokenize(self, text): 258 | """Tokenizes a piece of text into its word pieces. 259 | 260 | This uses a greedy longest-match-first algorithm to perform tokenization 261 | using the given vocabulary. 262 | 263 | For example: 264 | input = "unaffable" 265 | output = ["un", "##aff", "##able"] 266 | 267 | Args: 268 | text: A single token or whitespace separated tokens. This should have 269 | already been passed through `BasicTokenizer`. 270 | 271 | Returns: 272 | A list of wordpiece tokens. 273 | """ 274 | 275 | text = convert_to_unicode(text) 276 | 277 | output_tokens = [] 278 | for token in whitespace_tokenize(text): 279 | chars = list(token) 280 | if len(chars) > self.max_input_chars_per_word: 281 | output_tokens.append(self.unk_token) 282 | continue 283 | 284 | is_bad = False 285 | start = 0 286 | sub_tokens = [] 287 | while start < len(chars): 288 | end = len(chars) 289 | cur_substr = None 290 | while start < end: 291 | substr = "".join(chars[start:end]) 292 | if start > 0: 293 | substr = "##" + substr 294 | if substr in self.vocab: 295 | cur_substr = substr 296 | break 297 | end -= 1 298 | if cur_substr is None: 299 | is_bad = True 300 | break 301 | sub_tokens.append(cur_substr) 302 | start = end 303 | 304 | if is_bad: 305 | output_tokens.append(self.unk_token) 306 | else: 307 | output_tokens.extend(sub_tokens) 308 | return output_tokens 309 | 310 | 311 | def _is_whitespace(char): 312 | """Checks whether `chars` is a whitespace character.""" 313 | # \t, \n, and \r are technically control characters but we treat them 314 | # as whitespace since they are generally considered as such. 315 | if char == " " or char == "\t" or char == "\n" or char == "\r": 316 | return True 317 | cat = unicodedata.category(char) 318 | if cat == "Zs": 319 | return True 320 | return False 321 | 322 | 323 | def _is_control(char): 324 | """Checks whether `chars` is a control character.""" 325 | # These are technically control characters but we count them as whitespace 326 | # characters. 327 | if char == "\t" or char == "\n" or char == "\r": 328 | return False 329 | cat = unicodedata.category(char) 330 | if cat.startswith("C"): 331 | return True 332 | return False 333 | 334 | 335 | def _is_punctuation(char): 336 | """Checks whether `chars` is a punctuation character.""" 337 | cp = ord(char) 338 | # We treat all non-letter/number ASCII as punctuation.
339 | # Characters such as "^", "$", and "`" are not in the Unicode 340 | # Punctuation class but we treat them as punctuation anyways, for 341 | # consistency. 342 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 343 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 344 | return True 345 | cat = unicodedata.category(char) 346 | if cat.startswith("P"): 347 | return True 348 | return False 349 | -------------------------------------------------------------------------------- /data/test.csv: -------------------------------------------------------------------------------- 1 | 80064 花呗的钱能期现吗 花呗能分几期还 0 2 | 80065 怎么看到花呗用没用 用花呗付款后 怎么还 0 3 | 80066 借呗 昨天晚上放款 现在都还没到 花呗借款钱还没到 0 4 | 80067 我退货了,款也退回到花呗了,为什么还要还款。款也还了怎么没有退钱到我的帐户 好像我买东西了都扣钱了滴,为什么花呗还要这么多钱来还款 0 5 | 80068 我用了借呗会影响房贷吗 频繁使用借呗会影响房贷吗 1 6 | 80069 我刚刚用花呗是不是下个月一次性付 这个账单是用花呗付的吗 0 7 | 80070 花呗临时额度有多少 花呗一个月涨多少额度 0 8 | 80071 如何取消蚂蚁借呗软件 如何把蚂蚁借呗可借的额度取消了 0 9 | 80072 我的还是不能开通借呗 我的蚂蚁借呗怎么一直都没开通 0 10 | 80073 我什么时候在借呗借钱了 我想帮他把蚂蚁借呗欠款还了 0 11 | 80074 使用花呗分期付款的东西退货 花呗分期付款,有一件衣服退回去了 0 12 | 80075 换个手机怎么没有花呗跟借呗了 为什么我没有 花呗和借呗 0 13 | 80076 登陆另一账号使用花呗 我支付宝登陆账号了为啥给我显示登录以前手机号使用花呗 0 14 | 80077 花呗为什么不能缴电费了 怎么花呗交电费交不了了 1 15 | 80078 为什么绑定了银行卡不能用花呗 为什么点花呗的时候一直说绑定银行卡 0 16 | 80079 我刚刚开通花呗不是说满***减***吗 我开通花呗是不是不提额是不用还吧 0 17 | -------------------------------------------------------------------------------- /data/train.csv: -------------------------------------------------------------------------------- 1 | 1 怎么更改花呗手机号码 我的花呗是以前的手机号码,怎么更改成现在的支付宝的号码手机号 1 2 | 2 也开不了花呗,就这样了?完事了 真的嘛?就是花呗付款 0 3 | 3 花呗冻结以后还能开通吗 我的条件可以开通花呗借款吗 0 4 | 4 如何得知关闭借呗 想永久关闭借呗 0 5 | 5 花呗扫码付钱 二维码扫描可以用花呗吗 0 6 | 6 花呗逾期后不能分期吗 我这个 逾期后还完了 最低还款 后 能分期吗 0 7 | 7 花呗分期清空 花呗分期查询 0 8 | 8 借呗逾期短信通知 如何购买花呗短信通知 0 9 | 9 借呗即将到期要还的账单还能分期吗 借呗要分期还,是吗 0 10 | 10 花呗为什么不能支付手机交易 花呗透支了为什么不可以继续用了 0 11 | 11 在吗,双***有临时花呗额度吗 花呗临时额度到时间怎么办 0 12 | 12 我什么时候开通了花呗 查我什么时候用过,花呗 0 13 | 13 花呗每月还了最低还款后还会有万分之五的利息吗 花呗每个月最低还款怎么算 0 14 | 14 我想用蚂蚁借呗怎么用不了 蚂蚁借呗设置了一次性还款,现在想分期还款,怎么操作 0 15 | 15 花呗也不能用了 花呗没用过 0 16 | 16 花呗的安全没有验证成功 花呗安全验证没通过怎么回事 1 17 | 17 我开通不了借呗 我要申请借呗 0 18 | 18 借呗还款了,额度未恢复 借呗还款后额度没有恢复还显示借款 1 19 | 19 就是我花呗忘记还款了。逾期一天。有事吗 花呗账单在到期当天还款是逾期吗 0 20 | 20 我问你我借呗怎么要还钱 怎么一次性还蚂蚁借呗 0 21 | 21 是否有花呗就不可以借呗 不实用花呗就没有借呗吗 0 22 | 22 闲鱼可以使用花呗吗 花呗可以在闲鱼上面交易吗 1 23 | 23 花呗可以买手机吗 花呗逾期一天可以么 0 24 | 24 借呗每月还款时间 借呗多少天还款 0 25 | 25 花呗更改绑定银行卡 如何更换花呗绑定银行卡 1 26 | 26 人脸验证开通花呗 花呗怎么验证 0 27 | 27 借呗可以提前还第一期吗 蚂蚁借呗借款可以提前一次性还清吗 0 28 | 28 花呗付款成功为何美团显示支付超时 美团订单没显示付款成功但是花呗显示付了三遍订单 0 29 | 29 花呗我已经还了,现在要还借呗 我上个月已经还过的一笔,现在退回来的钱怎么还直接退回花呗 0 30 | 30 国外账户可以开通借呗吗 借呗逾期还能开通吗 0 31 | 31 但是我银行卡支付的为什么退回花呗 我这件衣服钱款为什么退回花呗 0 32 | 32 花呗***期免息 蚂蚁花呗***期免息什么意思 1 33 | 33 已经还了,借呗还能用吗 借呗逾期后还能借款吗 0 34 | 34 开通花呗不用的话会有费用吗 开通花呗不用,会不会产生费用 1 35 | 35 花呗怎样邀请好友 邀请好友开通借吧怎样邀请的 0 36 | 36 我没有这个蚂蚁借呗 我的蚂蚁借呗用不了了怎么回事?没有逾期 0 37 | 37 我昨天欠借呗多少钱 我的借呗欠了多少钱 0 38 | 38 为什么我不能开通花呗 开通花呗提示安全不通过 0 39 | 39 借呗提额输入密码了怎么还没提额 借呗提额还需要输入支付密码吗 0 40 | 40 我没给自己用花呗充话费,怎么自动充 这几天用花呗交了三次话费都没充上 怎么回事 0 41 | 41 花呗付款后,扣款成功为什么显示未支付 支付宝花呗扣款,商家显示未扣款 0 42 | 42 我蚂蚁借呗放款没有到账 借呗还款为什么没有到帐 1 43 | 43 上个月都用花呗,这个月也还款了,怎么还不能提额?是不是每个月都要用了一定的额度,才能提额 这个钱是***月的,退款之前我已经还清花呗了,现在不知怎么才能使用,现在买东西,花呗都是在我额度***里面扣的 0 44 | 44 开通花呗收款后,符合什么条件的商家或买家才可以使用花呗交易 我是商家,我已经申请了开通蚂蚁花呗和信用卡收款,为什么还是不可以 0 45 | 45 信用度多少才能用花呗 蚂蚁花呗多少积分才可以用 0 46 | 46 刚消费的,扣卡里的钱,没扣花呗的钱,怎么回事 我银行里有钱,怎么证明是花呗付的款 0 47 | 47 如何在花呗删除账单记录 花呗为什么说删除交易记录 0 48 | 48 花呗怎么付款不鸟了 帮忙看一下我花呗怎么用不了 1 49 | 49 为什么我的花呗一直没调 为什么我的花呗没有分期的功能 0 50 | 50 花呗临时额度可以分期还吗 临时额度和实际额度可以按一个账单一起分期还款吗 0 51 | 51 为啥我花呗叫话费都交不了 花呗暂时不能交话费 还是以后都不可以了 0 52 | 52 能用花呗收款收顾客的钱吗 私企可以花呗收款吗 0 53 | 53 蚂蚁借呗可以延长分期 可以晚***天还蚂蚁借呗吗 0 54 | 54 没有借呗怎么开通借呗 有没有什么办法可以自己开通蚂蚁借呗 0 55 | 55 
商家二维码怎么开通花呗支付宝 怎么说不支持花呗开通商户 0 56 | 56 她的花呗已经逾期了 我的花呗逾期了 我现在一次性还完了 0 57 | 57 在支付宝找不到花呗 可我在支付宝花呗里没看到进账 0 58 | 58 支付宝花呗分期还款怎么提前还清 蚂蚁花呗分期付款可以全部提前还款吗 1 59 | 59 注销了一个花呗账号,新账号是否还能再开通花呗了 把以前的帐号注销,现在这个能开通花呗吗 1 60 | 60 花呗账单还没出,想提前还了怎么还 提前还花呗部分 0 61 | 61 被封的借呗什么时候恢复 求求你把我的借呗恢复 0 62 | 62 为何蚂蚁借呗不能使用了 我的借呗***年能有***年怎么不能用了 1 63 | 63 帮我开通蚂蚁花呗零时额度 里可以看到我花呗的临时额度 0 64 | 64 用了这么久的支付宝也不给开通借呗,哼 我用支付宝三四年了也不见给我开通借呗 1 65 | 65 花呗体验额度咋老这那 花呗体验额度最多是多少 0 66 | 66 借呗还款额度限制 还借呗余额限额 0 67 | 67 蚂蚁借呗的额度为什么会下降 为什么借呗额度被降低了,没有不良记录 1 68 | 68 花呗最低还款和花呗分期 花呗分期支持最低还款嘛 0 69 | 69 花呗开通年级 开通花呗添加银行卡 0 70 | 70 我想开通信用卡和花呗的收款码 我的蚂蚁花呗为什么不能开通吗 0 71 | 71 我花呗额度能不能给我调高点 为什么我的花呗的额度不升 0 72 | 72 花呗之前绑定的手机号码现在没用了,怎么解绑 花呗绑定的是另一个号码,蜜码和帐号忘记了,手机号码也注消了,怎么办 0 73 | 73 花呗线下还款是什么 花呗主动还款操作 0 74 | 74 我***月份花呗消费是***元,怎么要我还***元 我原本这个月总的花呗需要还***元,我还了一次***,又还了一个***,剩余的应该是***,为何现在的是***,应还额度为何莫名的多了*** 1 75 | 75 用花呗刷卡可以吗 能用花呗扫一扫付款吗 0 76 | 76 花呗分期卖家手续费 花呗付款如何分期 0 77 | 77 花呗自动余额宝还款 缴纳电费 花呗自动扣款了,然后余额宝也扣款了 0 78 | 78 我的 借呗 在哪儿 借呗的qq 0 79 | 79 我的花呗没有绑定银行卡怎么还款 我还没有绑定银行卡,我花了花呗的钱,如果要还款的话要绑定银行卡才可以是吧 0 80 | 80 为什么双十二没有花呗零时额度 双***花呗有木有临时额度 0 81 | 81 借呗有晚一天还有没有逾期呀 借呗有逾期么 0 82 | 82 双***零时花呗额度 我要申请花呗临时额度 0 83 | 83 付款方式添加花呗怎么添加 怎样退出花呗付费方式 0 84 | 84 使用花呗支付会有短信提醒吗 ***月***日的***元花呗没有支付提醒 0 85 | 85 借呗能提款不 借呗可以提款装修房子吗 0 86 | 86 我的花呗怎么不能用这个月还没有给我额度 花呗还款之后怎么额度没有了 0 87 | 87 付款金额多少可以使用花呗 花呗提额***到多少 0 88 | 88 怎么花呗不能支付 花呗付款不了怎么回事 1 89 | 89 花呗到还款日期还不了会怎么样 花呗现在为什么还不能还款 0 90 | 90 为什么我的收钱码打不开花呗收钱 我今天申请了花呗收款己通过,为什么不能使用花呗收款 0 91 | 91 我满足开通借呗的条件吗 我想咨询,为什么不能满足借呗条件 0 92 | 92 怎么感觉蚂蚁花呗钱越还欠的越多 我的花呗这怎么会欠你们这么多钱,请给我解释一下 0 93 | 93 借呗可以一次结清吗 借呗可以一年后再还吗 0 94 | 94 办理花呗分期后可以提前还清账单吗?利息高吗 提前还花呗要利息 0 95 | 95 花呗有退运费的 花呗的运费险退回哪 0 96 | 96 我前天 用了花呗,今天能够还款不,怎么还款 我用了蚂蚁花呗***元钱,不知道怎么还 0 97 | 97 找不到花呗 我的淘宝花呗怎么找不到 0 98 | 98 我绑定的银行卡还不上借呗 为啥我的花呗不能绑银行卡 0 99 | 99 我的花呗收不了款 我是厂家,人家用花呗付款,就是付***了,付不了 0 100 | 100 花呗支付可以使用购物津贴吗 使用购物津贴的费用可以用花呗吗 1 101 | 101 花呗还款有利息没 花呗还钱涨利息吗 0 102 | 102 为什么每个人的借呗日息不一样 为什么借呗利息有的高有的底 1 103 | 103 花呗服务怎么还款 我想还款,还花呗的钱 0 104 | 104 有的人不用花呗有***万的额度 花呗额度用完了 0 105 | 105 借呗还款余额有限额 蚂蚁借呗还款受到限额限制怎么办 0 106 | 106 借呗还款后是否可以恢复额度 借呗还款后额度怎么恢复不了 0 107 | 107 花呗不分期是不是叫没有手续费 花呗分期后全部结清有手续费吗 0 108 | 108 怎么邀请好友开通借呗 怎么邀请自己开通借呗 0 109 | 109 怎么开通,花呗收款 怎么注册可以花呗收钱的 1 110 | 110 我的花呗怎么用这用这不让用了 我花呗为什么不能用了 1 111 | 111 蚂蚁借呗分期付款能恢复额度吗 如何恢复借呗本身的额度 0 112 | 112 蚂蚁花呗还款期数可以更改吗 我之前设置的花呗分期还款现在可以更改吗 1 113 | 113 分期后花呗有多少额度 花呗可用额度 0 114 | 114 花呗里边的钱怎么办 退到花呗额度怎么办 0 115 | 115 花呗系统繁忙 是什么意思 点进花呗为什么是系统繁忙 1 116 | 116 支付宝如何用花呗预定酒店 我用花呗付酒店押金 0 117 | 117 信用卡和花呗支付是会 花呗和信用卡支付,也需要付款方复核 0 118 | 118 花呗交的电费为什么没有到账 我用花呗交电费,显示缴费成功,但没有到账,怎么回事 1 119 | 119 为什么借呗能开通花呗用不了 符合条件却不能开通花呗 借呗 0 120 | 120 我借呗能不能开通 如何开启借呗 1 121 | 121 不小心开通了花呗便利店周卡 如何取消 花呗分期 如何取消订单 0 122 | 122 花呗临时额度还了 临时花呗额度到期自动取消吗 0 123 | 123 蚂蚁花呗一天有使用次数限制吗 使用花呗付款功能每天有金额限制吗 0 124 | 124 花呗自动还款需要手续费ma 花呗自动还款还要收手续费吗 1 125 | 125 余额无法还花呗。绑定不了银行卡,如果还款 我的银行卡为什么不能还款给花呗 0 126 | 126 花呗可以买电动车吗 淘宝可以看花呗额度吗 0 127 | 127 借呗的自动还款怎样取消 借呗哪里取消自动还款 0 128 | 128 余额宝还借呗是还先存的还是还后存的 余额宝放三万,存半年,借呗会出吗 0 129 | 129 蚂蚁借呗,我的还款日不是今天 借呗还款日是怎嚒算 0 130 | 130 可以恢复删除的花呗账单吗 花呗咋删除 0 131 | 131 我是不是还是应该通过花呗还,也不能通过个人还吧 我花呗逾期了。不是故意逾期的 0 132 | 132 我点开看了花呗下月应还显示是*** 花呗为什么显示下月还钱为零 0 133 | 133 花呗退款后为什么确认收货还是原来的价钱 退款花呗后为什么金额还是不变 1 134 | 134 借呗怎么操作主动还款 借呗还款怎么只有六个月的 0 135 | 135 借呗这次出现点我提额度永久的吗 怎么我的借呗没有额度了 0 136 | 136 花呗怎么不能用话费扣款 为什么花呗线下不能付款 0 137 | 137 同一张身份证两个支付宝账号都会有花呗吗 一个手机绑定多个账号的同时可以有多个花呗和借呗吗 0 138 | 138 我的花呗能重新评估吗 我想花呗升个几万 0 139 | 139 为什么我提前还不了花呗 为什么我现在花呗额度是负的,我想一次性还完却还不了 0 140 | 140 现在花呗不能用了是吧 麻烦不小,借呗搞得心情不好,现在花呗又不能支付 0 141 | 141 为什么我账户用不了花呗 要了我这个花呗,用不了什么情况,都差不多半年了 1 142 | 142 借呗提前还可以吗 如何提前还每个月借呗的钱 0 143 | 143 所以我今天用的花呗 就是下个月再还了 花呗是这个月花钱,下个月还款吗 1 144 | 144 
花呗最后还款期能修改吗 我的花呗还款后可以恢复吗 0 -------------------------------------------------------------------------------- /data/val.csv: -------------------------------------------------------------------------------- 1 | 89040 开通了,还用了蚂蚁花呗买东西 为什么我用花呗买了东西后显示我不符合开通花呗的条件 0 2 | 89041 问怎么取消这个花呗 想取消花呗怎么取消 1 3 | 89042 花呗红包是什么 花呗有什么问题吗 0 4 | 89043 我根本没开通花呗。为什么我的退款会到花呗 为什么我的蚂蚁借呗怎么没有 0 5 | 89044 花呗可以订房间么 滴滴打的可以用花呗那 0 6 | 89045 借呗可以提前还下月账单吗 借呗,借出来后可以一次性还清吗 0 7 | 89046 花呗逾期还清之后什么时候能再使用 逾期了 花呗什么时候可以使用 0 8 | 89047 我想买个电视。花呗不够 我想卖洗衣机花呗钱不够怎么办 1 9 | 89048 借呗额度怎样才能冻结了 为啥我的借呗冻结了 0 10 | 89049 消费时花呗密码 花呗支付宝密码 0 11 | 89050 花呗,可以绑定银行卡吗 花呗还款绑定银行卡可以吗 0 12 | 89051 为什么花呗让我登录qq 什么我的花呗开始不了 0 13 | 89052 我想用这个号开通花呗 你发个开通花呗链接我看能用不 0 14 | 89053 我没法使用花呗淘宝购物 淘宝购物不能用花呗 1 15 | 89054 花呗冻结怎么了解 花呗冻结还会解冻吗 1 16 | 89055 淘宝贷款和借呗便宜 淘宝跟借呗 0 -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | 7 | import tensorflow as tf 8 | from tqdm import tqdm 9 | 10 | import modeling 11 | import tokenization 12 | from run_classifier import flags, FLAGS, InputExample, DataProcessor, create_model 13 | 14 | flags.DEFINE_string( 15 | "model_dir", None, 16 | "The trained model dir. Should contain the .ckpt checkpoint files " 17 | "for the task.") 18 | 19 | flags.DEFINE_string( 20 | "serving_model_save_path", None, 21 | "The directory where the exported SavedModel (.pb and variables files) " 22 | "will be written for serving.") 23 | 24 | 25 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 26 | num_train_steps, num_warmup_steps, use_tpu, 27 | use_one_hot_embeddings): 28 | """Returns `model_fn` closure for TPUEstimator.""" 29 | 30 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 31 | """The `model_fn` for TPUEstimator.""" 32 | 33 | # tf.logging.info("*** Features ***") 34 | # for name in sorted(features.keys()): 35 | # tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 36 | 37 | input_ids = features["input_ids"] 38 | input_mask = features["input_mask"] 39 | segment_ids = features["segment_ids"] 40 | label_ids = features["label_ids"] 41 | 42 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 43 | 44 | (total_loss, per_example_loss, logits, probabilities) = create_model( 45 | bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, 46 | num_labels, use_one_hot_embeddings) 47 | 48 | output_spec = tf.estimator.EstimatorSpec( 49 | mode=tf.estimator.ModeKeys.PREDICT, 50 | predictions=probabilities 51 | ) 52 | return output_spec 53 | 54 | return model_fn 55 | 56 | 57 | def serving_input_receiver_fn(): 58 | input_ids = tf.placeholder(dtype=tf.int64, shape=[None, FLAGS.max_seq_length], name='input_ids') 59 | input_mask = tf.placeholder(dtype=tf.int64, shape=[None, FLAGS.max_seq_length], name='input_mask') 60 | segment_ids = tf.placeholder(dtype=tf.int64, shape=[None, FLAGS.max_seq_length], name='segment_ids') 61 | label_ids = tf.placeholder(dtype=tf.int64, shape=[None, ], name='unique_ids') 62 | 63 | receive_tensors = {'input_ids': input_ids, 'input_mask': input_mask, 'segment_ids': segment_ids, 64 | 'label_ids': label_ids} 65 | features = {'input_ids': input_ids, 'input_mask': input_mask, 'segment_ids': segment_ids, "label_ids": label_ids} 66 | return tf.estimator.export.ServingInputReceiver(features, receive_tensors)
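# Note: at serving time the client feeds int64 tensors named 'input_ids', 'input_mask', 'segment_ids', and 'label_ids' (the labels are dummies at inference); the receiver above passes them through unchanged as model features.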
67 | 68 | 69 | class MyProcessor(DataProcessor): 70 | 71 | def get_test_examples(self, data_dir): 72 | return self.create_examples( 73 | self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") 74 | 75 | def get_train_examples(self, data_dir): 76 | """See base class.""" 77 | return self.create_examples( 78 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 79 | 80 | def get_dev_examples(self, data_dir): 81 | """See base class.""" 82 | return self.create_examples( 83 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 84 | 85 | def get_pred_examples(self, data_dir): 86 | return self.create_examples( 87 | self._read_tsv(os.path.join(data_dir, "pred.tsv")), "pred") 88 | 89 | def get_labels(self): 90 | """See base class.""" 91 | return ["-1", "0", "1"] 92 | 93 | def create_examples(self, lines, set_type, file_base=True): 94 | """Creates examples for the training and dev sets; each line is label + \t + text.""" 95 | examples = [] 96 | for (i, line) in tqdm(enumerate(lines)): 97 | 98 | if file_base: 99 | if i == 0: 100 | continue 101 | 102 | guid = "%s-%s" % (set_type, i) 103 | text = tokenization.convert_to_unicode(line[1]) 104 | if set_type == "test" or set_type == "pred": 105 | label = "0" 106 | else: 107 | label = tokenization.convert_to_unicode(line[0]) 108 | examples.append( 109 | InputExample(guid=guid, text_a=text, label=label)) 110 | return examples 111 | 112 | 113 | def main(_): 114 | tf.logging.set_verbosity(tf.logging.INFO) 115 | 116 | processors = { 117 | "setiment": MyProcessor, 118 | } 119 | 120 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 121 | 122 | if FLAGS.max_seq_length > bert_config.max_position_embeddings: 123 | raise ValueError( 124 | "Cannot use sequence length %d because the BERT model " 125 | "was only trained up to sequence length %d" % 126 | (FLAGS.max_seq_length, bert_config.max_position_embeddings)) 127 | 128 | task_name = FLAGS.task_name.lower() 129 | 130 | if task_name not in processors: 131 | raise ValueError("Task not found: %s" % task_name) 132 | 133 | processor = processors[task_name]() 134 | 135 | label_list = processor.get_labels() 136 | 137 | run_config = tf.contrib.tpu.RunConfig(model_dir=FLAGS.model_dir) 138 | 139 | num_train_steps = None 140 | num_warmup_steps = None 141 | 142 | model_fn = model_fn_builder( 143 | bert_config=bert_config, 144 | num_labels=len(label_list), 145 | init_checkpoint=FLAGS.init_checkpoint, 146 | learning_rate=FLAGS.learning_rate, 147 | num_train_steps=num_train_steps, 148 | num_warmup_steps=num_warmup_steps, 149 | use_tpu=FLAGS.use_tpu, 150 | use_one_hot_embeddings=FLAGS.use_tpu) 151 | 152 | estimator = tf.contrib.tpu.TPUEstimator(use_tpu=FLAGS.use_tpu, 153 | model_fn=model_fn, 154 | config=run_config, 155 | predict_batch_size=FLAGS.predict_batch_size, 156 | export_to_tpu=False) 157 | 158 | estimator.export_savedmodel(FLAGS.serving_model_save_path, serving_input_receiver_fn) 159 | 160 | 161 | if __name__ == "__main__": 162 | flags.mark_flag_as_required("model_dir") 163 | flags.mark_flag_as_required("serving_model_save_path") 164 | flags.mark_flag_as_required("data_dir") 165 | flags.mark_flag_as_required("task_name") 166 | flags.mark_flag_as_required("vocab_file") 167 | flags.mark_flag_as_required("bert_config_file") 168 | tf.app.run() 169 | -------------------------------------------------------------------------------- /export.sh:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export BERT_BASE_DIR=bert_model/models/chinese_L-12_H-768_A-12 3 | export GLUE_DIR=Bert/data 4 | export MODEL_DIR=Bert/output 5 | export MODEL_PB_DIR=Bert/api/ 6 | 7 | python export.py \ 8 | --task_name=setiment \ 9 | --do_predict=true \ 10 | --data_dir=$GLUE_DIR/ \ 11 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 12 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 13 | --model_dir=$MODEL_DIR/ \ 14 | --serving_model_save_path=$MODEL_PB_DIR 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /modeling_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import collections 20 | import json 21 | import random 22 | import re 23 | 24 | import modeling 25 | import six 26 | import tensorflow as tf 27 | 28 | 29 | class BertModelTest(tf.test.TestCase): 30 | 31 | class BertModelTester(object): 32 | 33 | def __init__(self, 34 | parent, 35 | batch_size=13, 36 | seq_length=7, 37 | is_training=True, 38 | use_input_mask=True, 39 | use_token_type_ids=True, 40 | vocab_size=99, 41 | hidden_size=32, 42 | num_hidden_layers=5, 43 | num_attention_heads=4, 44 | intermediate_size=37, 45 | hidden_act="gelu", 46 | hidden_dropout_prob=0.1, 47 | attention_probs_dropout_prob=0.1, 48 | max_position_embeddings=512, 49 | type_vocab_size=16, 50 | initializer_range=0.02, 51 | scope=None): 52 | self.parent = parent 53 | self.batch_size = batch_size 54 | self.seq_length = seq_length 55 | self.is_training = is_training 56 | self.use_input_mask = use_input_mask 57 | self.use_token_type_ids = use_token_type_ids 58 | self.vocab_size = vocab_size 59 | self.hidden_size = hidden_size 60 | self.num_hidden_layers = num_hidden_layers 61 | self.num_attention_heads = num_attention_heads 62 | self.intermediate_size = intermediate_size 63 | self.hidden_act = hidden_act 64 | self.hidden_dropout_prob = hidden_dropout_prob 65 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 66 | self.max_position_embeddings = max_position_embeddings 67 | self.type_vocab_size = type_vocab_size 68 | self.initializer_range = initializer_range 69 | self.scope = scope 70 | 71 | def create_model(self): 72 | input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], 73 | self.vocab_size) 74 | 75 | input_mask = None 76 | if self.use_input_mask: 77 | input_mask = BertModelTest.ids_tensor( 78 | [self.batch_size, self.seq_length], vocab_size=2) 79 | 80 | token_type_ids = None 81 | if self.use_token_type_ids: 82 | token_type_ids = BertModelTest.ids_tensor( 83 | [self.batch_size, self.seq_length], self.type_vocab_size) 84 | 85 | config = 
modeling.BertConfig( 86 | vocab_size=self.vocab_size, 87 | hidden_size=self.hidden_size, 88 | num_hidden_layers=self.num_hidden_layers, 89 | num_attention_heads=self.num_attention_heads, 90 | intermediate_size=self.intermediate_size, 91 | hidden_act=self.hidden_act, 92 | hidden_dropout_prob=self.hidden_dropout_prob, 93 | attention_probs_dropout_prob=self.attention_probs_dropout_prob, 94 | max_position_embeddings=self.max_position_embeddings, 95 | type_vocab_size=self.type_vocab_size, 96 | initializer_range=self.initializer_range) 97 | 98 | model = modeling.BertModel( 99 | config=config, 100 | is_training=self.is_training, 101 | input_ids=input_ids, 102 | input_mask=input_mask, 103 | token_type_ids=token_type_ids, 104 | scope=self.scope) 105 | 106 | outputs = { 107 | "embedding_output": model.get_embedding_output(), 108 | "sequence_output": model.get_sequence_output(), 109 | "pooled_output": model.get_pooled_output(), 110 | "all_encoder_layers": model.get_all_encoder_layers(), 111 | } 112 | return outputs 113 | 114 | def check_output(self, result): 115 | self.parent.assertAllEqual( 116 | result["embedding_output"].shape, 117 | [self.batch_size, self.seq_length, self.hidden_size]) 118 | 119 | self.parent.assertAllEqual( 120 | result["sequence_output"].shape, 121 | [self.batch_size, self.seq_length, self.hidden_size]) 122 | 123 | self.parent.assertAllEqual(result["pooled_output"].shape, 124 | [self.batch_size, self.hidden_size]) 125 | 126 | def test_default(self): 127 | self.run_tester(BertModelTest.BertModelTester(self)) 128 | 129 | def test_config_to_json_string(self): 130 | config = modeling.BertConfig(vocab_size=99, hidden_size=37) 131 | obj = json.loads(config.to_json_string()) 132 | self.assertEqual(obj["vocab_size"], 99) 133 | self.assertEqual(obj["hidden_size"], 37) 134 | 135 | def run_tester(self, tester): 136 | with self.test_session() as sess: 137 | ops = tester.create_model() 138 | init_op = tf.group(tf.global_variables_initializer(), 139 | tf.local_variables_initializer()) 140 | sess.run(init_op) 141 | output_result = sess.run(ops) 142 | tester.check_output(output_result) 143 | 144 | self.assert_all_tensors_reachable(sess, [init_op, ops]) 145 | 146 | @classmethod 147 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None): 148 | """Creates a random int32 tensor of the shape within the vocab size.""" 149 | if rng is None: 150 | rng = random.Random() 151 | 152 | total_dims = 1 153 | for dim in shape: 154 | total_dims *= dim 155 | 156 | values = [] 157 | for _ in range(total_dims): 158 | values.append(rng.randint(0, vocab_size - 1)) 159 | 160 | return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name) 161 | 162 | def assert_all_tensors_reachable(self, sess, outputs): 163 | """Checks that all the tensors in the graph are reachable from outputs.""" 164 | graph = sess.graph 165 | 166 | ignore_strings = [ 167 | "^.*/assert_less_equal/.*$", 168 | "^.*/dilation_rate$", 169 | "^.*/Tensordot/concat$", 170 | "^.*/Tensordot/concat/axis$", 171 | "^testing/.*$", 172 | ] 173 | 174 | ignore_regexes = [re.compile(x) for x in ignore_strings] 175 | 176 | unreachable = self.get_unreachable_ops(graph, outputs) 177 | filtered_unreachable = [] 178 | for x in unreachable: 179 | do_ignore = False 180 | for r in ignore_regexes: 181 | m = r.match(x.name) 182 | if m is not None: 183 | do_ignore = True 184 | if do_ignore: 185 | continue 186 | filtered_unreachable.append(x) 187 | unreachable = filtered_unreachable 188 | 189 | self.assertEqual( 190 | len(unreachable), 0, "The following ops 
are unreachable: %s" % 191 | (" ".join([x.name for x in unreachable]))) 192 | 193 | @classmethod 194 | def get_unreachable_ops(cls, graph, outputs): 195 | """Finds all of the tensors in graph that are unreachable from outputs.""" 196 | outputs = cls.flatten_recursive(outputs) 197 | output_to_op = collections.defaultdict(list) 198 | op_to_all = collections.defaultdict(list) 199 | assign_out_to_in = collections.defaultdict(list) 200 | 201 | for op in graph.get_operations(): 202 | for x in op.inputs: 203 | op_to_all[op.name].append(x.name) 204 | for y in op.outputs: 205 | output_to_op[y.name].append(op.name) 206 | op_to_all[op.name].append(y.name) 207 | if str(op.type) == "Assign": 208 | for y in op.outputs: 209 | for x in op.inputs: 210 | assign_out_to_in[y.name].append(x.name) 211 | 212 | assign_groups = collections.defaultdict(list) 213 | for out_name in assign_out_to_in.keys(): 214 | name_group = assign_out_to_in[out_name] 215 | for n1 in name_group: 216 | assign_groups[n1].append(out_name) 217 | for n2 in name_group: 218 | if n1 != n2: 219 | assign_groups[n1].append(n2) 220 | 221 | seen_tensors = {} 222 | stack = [x.name for x in outputs] 223 | while stack: 224 | name = stack.pop() 225 | if name in seen_tensors: 226 | continue 227 | seen_tensors[name] = True 228 | 229 | if name in output_to_op: 230 | for op_name in output_to_op[name]: 231 | if op_name in op_to_all: 232 | for input_name in op_to_all[op_name]: 233 | if input_name not in stack: 234 | stack.append(input_name) 235 | 236 | expanded_names = [] 237 | if name in assign_groups: 238 | for assign_name in assign_groups[name]: 239 | expanded_names.append(assign_name) 240 | 241 | for expanded_name in expanded_names: 242 | if expanded_name not in stack: 243 | stack.append(expanded_name) 244 | 245 | unreachable_ops = [] 246 | for op in graph.get_operations(): 247 | is_unreachable = False 248 | all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs] 249 | for name in all_names: 250 | if name not in seen_tensors: 251 | is_unreachable = True 252 | if is_unreachable: 253 | unreachable_ops.append(op) 254 | return unreachable_ops 255 | 256 | @classmethod 257 | def flatten_recursive(cls, item): 258 | """Flattens (potentially nested) a tuple/dictionary/list to a list.""" 259 | output = [] 260 | if isinstance(item, list): 261 | output.extend(item) 262 | elif isinstance(item, tuple): 263 | output.extend(list(item)) 264 | elif isinstance(item, dict): 265 | for (_, v) in six.iteritems(item): 266 | output.append(v) 267 | else: 268 | return [item] 269 | 270 | flat_output = [] 271 | for x in output: 272 | flat_output.extend(cls.flatten_recursive(x)) 273 | return flat_output 274 | 275 | 276 | if __name__ == "__main__": 277 | tf.test.main() 278 | -------------------------------------------------------------------------------- /multilingual.md: -------------------------------------------------------------------------------- 1 | ## Models 2 | 3 | There are two multilingual models currently available. 
We do not plan to release 4 | more single-language models, but we may release `BERT-Large` versions of these 5 | two in the future: 6 | 7 | * **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**: 8 | 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters 9 | * **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**: 10 | 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters 11 | * **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**: 12 | Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M 13 | parameters 14 | 15 | **The `Multilingual Cased (New)` model also fixes normalization issues in many 16 | languages, so it is recommended in languages with non-Latin alphabets (and is 17 | often better for most languages with Latin alphabets). When using this model, 18 | make sure to pass `--do_lower_case=false` to `run_pretraining.py` and other 19 | scripts.** 20 | 21 | See the [list of languages](#list-of-languages) that the Multilingual model 22 | supports. The Multilingual model does include Chinese (and English), but if your 23 | fine-tuning data is Chinese-only, then the Chinese model will likely produce 24 | better results. 25 | 26 | ## Results 27 | 28 | To evaluate these systems, we use the 29 | [XNLI dataset](https://github.com/facebookresearch/XNLI), which is a 30 | version of [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) where the 31 | dev and test sets have been translated (by humans) into 15 languages. Note that 32 | the training set was *machine* translated (we used the translations provided by 33 | XNLI, not Google NMT). For clarity, we only report on 6 languages below: 34 | 35 | 36 | 37 | | System | English | Chinese | Spanish | German | Arabic | Urdu | 38 | | --------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- | 39 | | XNLI Baseline - Translate Train | 73.7 | 67.0 | 68.8 | 66.5 | 65.8 | 56.6 | 40 | | XNLI Baseline - Translate Test | 73.7 | 68.3 | 70.7 | 68.7 | 66.8 | 59.3 | 41 | | BERT - Translate Train Cased | **81.9** | **76.6** | **77.8** | **75.9** | **70.7** | 61.6 | 42 | | BERT - Translate Train Uncased | 81.4 | 74.2 | 77.3 | 75.2 | 70.5 | 61.7 | 43 | | BERT - Translate Test Uncased | 81.4 | 70.1 | 74.9 | 74.4 | 70.4 | **62.1** | 44 | | BERT - Zero Shot Uncased | 81.4 | 63.8 | 74.3 | 70.5 | 62.1 | 58.3 | 45 | 46 | 47 | 48 | The first two rows are baselines from the XNLI paper and the last three rows are 49 | our results with BERT. 50 | 51 | **Translate Train** means that the MultiNLI training set was machine translated 52 | from English into the foreign language. So training and evaluation were both 53 | done in the foreign language. Unfortunately, training was done on 54 | machine-translated data, so it is impossible to quantify how much of the lower 55 | accuracy (compared to English) is due to the quality of the machine translation 56 | vs. the quality of the pre-trained model. 57 | 58 | **Translate Test** means that the XNLI test set was machine translated from the 59 | foreign language into English. So training and evaluation were both done on 60 | English. However, test evaluation was done on machine-translated English, so the 61 | accuracy depends on the quality of the machine translation system.
62 | 63 | **Zero Shot** means that the Multilingual BERT system was fine-tuned on English 64 | MultiNLI, and then evaluated on the foreign language XNLI test. In this case, 65 | machine translation was not involved at all in either the pre-training or 66 | fine-tuning. 67 | 68 | Note that the English result is worse than the 84.2 MultiNLI baseline because 69 | this training used Multilingual BERT rather than English-only BERT. This implies 70 | that for high-resource languages, the Multilingual model is somewhat worse than 71 | a single-language model. However, it is not feasible for us to train and 72 | maintain dozens of single-language models. Therefore, if your goal is to maximize 73 | performance with a language other than English or Chinese, you might find it 74 | beneficial to run pre-training for additional steps starting from our 75 | Multilingual model on data from your language of interest. 76 | 77 | Here is a comparison of training Chinese models with the Multilingual 78 | `BERT-Base` and Chinese-only `BERT-Base`: 79 | 80 | System | Chinese 81 | ----------------------- | ------- 82 | XNLI Baseline | 67.0 83 | BERT Multilingual Model | 74.2 84 | BERT Chinese-only Model | 77.2 85 | 86 | Similar to English, the single-language model does 3% better than the 87 | Multilingual model. 88 | 89 | ## Fine-tuning Example 90 | 91 | The multilingual model does **not** require any special consideration or API 92 | changes. We did update the implementation of `BasicTokenizer` in 93 | `tokenization.py` to support Chinese character tokenization, so please update if 94 | you forked it. However, we did not change the tokenization API. 95 | 96 | To test the new models, we did modify `run_classifier.py` to add support for the 97 | [XNLI dataset](https://github.com/facebookresearch/XNLI). This is a 15-language 98 | version of MultiNLI where the dev/test sets have been human-translated, and the 99 | training set has been machine-translated. 100 | 101 | To run the fine-tuning code, please download the 102 | [XNLI dev/test set](https://s3.amazonaws.com/xnli/XNLI-1.0.zip) and the 103 | [XNLI machine-translated training set](https://s3.amazonaws.com/xnli/XNLI-MT-1.0.zip) 104 | and then unpack both .zip files into some directory `$XNLI_DIR`. 105 | 106 | To run fine-tuning on XNLI, note that the language is hard-coded into `run_classifier.py` 107 | (Chinese by default), so please modify `XnliProcessor` if you want to run on 108 | another language. 109 | 110 | This is a large dataset, so training will take a few hours on a GPU 111 | (or about 30 minutes on a Cloud TPU). To run an experiment quickly for 112 | debugging, just set `num_train_epochs` to a small value like `0.1`.
113 | 114 | ```shell 115 | export BERT_BASE_DIR=/path/to/bert/chinese_L-12_H-768_A-12 # or multilingual_L-12_H-768_A-12 116 | export XNLI_DIR=/path/to/xnli 117 | 118 | python run_classifier.py \ 119 | --task_name=XNLI \ 120 | --do_train=true \ 121 | --do_eval=true \ 122 | --data_dir=$XNLI_DIR \ 123 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 124 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 125 | --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ 126 | --max_seq_length=128 \ 127 | --train_batch_size=32 \ 128 | --learning_rate=5e-5 \ 129 | --num_train_epochs=2.0 \ 130 | --output_dir=/tmp/xnli_output/ 131 | ``` 132 | 133 | With the Chinese-only model, the results should look something like this: 134 | 135 | ``` 136 | ***** Eval results ***** 137 | eval_accuracy = 0.774116 138 | eval_loss = 0.83554 139 | global_step = 24543 140 | loss = 0.74603 141 | ``` 142 | 143 | ## Details 144 | 145 | ### Data Source and Sampling 146 | 147 | The languages chosen were the 148 | [top 100 languages with the largest Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias). 149 | The entire Wikipedia dump for each language (excluding user and talk pages) was 150 | taken as the training data for each language. 151 | 152 | However, the size of the Wikipedia for a given language varies greatly, and 153 | therefore low-resource languages may be "under-represented" in terms of the 154 | neural network model (under the assumption that languages are "competing" for 155 | limited model capacity to some extent). At the same time, we also don't want 156 | to overfit the model by performing thousands of epochs over a tiny Wikipedia 157 | for a particular language. 158 | 159 | To balance these two factors, we performed exponentially smoothed weighting of 160 | the data during pre-training data creation (and WordPiece vocab creation). In 161 | other words, let's say that the probability of a language is *P(L)*, e.g., 162 | *P(English) = 0.21* means that after concatenating all of the Wikipedias 163 | together, 21% of our data is English. We exponentiate each probability by some 164 | factor *S* and then re-normalize, and sample from that distribution. In our case 165 | we use *S=0.7*. So, high-resource languages like English will be under-sampled, 166 | and low-resource languages like Icelandic will be over-sampled. E.g., in the 167 | original distribution English would be sampled 1000x more than Icelandic, but 168 | after smoothing it's only sampled 100x more. 169 | 170 | ### Tokenization 171 | 172 | For tokenization, we use a 110k shared WordPiece vocabulary. The word counts are 173 | weighted the same way as the data, so low-resource languages are upweighted by 174 | some factor. We intentionally do *not* use any marker to denote the input 175 | language (so that zero-shot training can work). 176 | 177 | Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace 178 | characters, we add spaces around every character in the 179 | [CJK Unicode range](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_\(Unicode_block\)) 180 | before applying WordPiece. This means that Chinese is effectively 181 | character-tokenized. Note that the CJK Unicode block only includes 182 | Chinese-origin characters and does *not* include Hangul Korean or 183 | Katakana/Hiragana Japanese, which are tokenized with whitespace+WordPiece like 184 | all other languages.
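To make this concrete, here is a minimal sketch of that pre-WordPiece spacing step. It mirrors what `BasicTokenizer._tokenize_chinese_chars` in `tokenization.py` does, but the helper name `add_cjk_spaces` is illustrative and, for brevity, it only checks the main CJK Unified Ideographs block rather than the full set of extension ranges:

```python
def add_cjk_spaces(text):
    """Surround each CJK ideograph with spaces so that plain whitespace
    tokenization then yields one token per Chinese character."""
    output = []
    for char in text:
        cp = ord(char)
        # Main CJK Unified Ideographs block only; the real tokenizer also
        # checks the extension blocks (0x3400-0x4DBF, 0x20000-0x2A6DF, etc.).
        if 0x4E00 <= cp <= 0x9FFF:
            output.append(" " + char + " ")
        else:
            output.append(char)
    return "".join(output)

print(add_cjk_spaces("BERT处理中文").split())
# -> ['BERT', '处', '理', '中', '文']
```

Hangul and Kana fall outside this range, so Korean and Japanese text passes through unchanged and is handled by whitespace+WordPiece as described above.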
185 | 186 | For all other languages, we apply the 187 | [same recipe as English](https://github.com/google-research/bert#tokenization): 188 | (a) lower casing+accent removal, (b) punctuation splitting, (c) whitespace 189 | tokenization. We understand that accent markers have substantial meaning in some 190 | languages, but felt that the benefits of reducing the effective vocabulary make 191 | up for this. Generally the strong contextual models of BERT should make up for 192 | any ambiguity introduced by stripping accent markers. 193 | 194 | ### List of Languages 195 | 196 | The multilingual model supports the following languages. These languages were 197 | chosen because they are the top 100 languages with the largest Wikipedias: 198 | 199 | * Afrikaans 200 | * Albanian 201 | * Arabic 202 | * Aragonese 203 | * Armenian 204 | * Asturian 205 | * Azerbaijani 206 | * Bashkir 207 | * Basque 208 | * Bavarian 209 | * Belarusian 210 | * Bengali 211 | * Bishnupriya Manipuri 212 | * Bosnian 213 | * Breton 214 | * Bulgarian 215 | * Burmese 216 | * Catalan 217 | * Cebuano 218 | * Chechen 219 | * Chinese (Simplified) 220 | * Chinese (Traditional) 221 | * Chuvash 222 | * Croatian 223 | * Czech 224 | * Danish 225 | * Dutch 226 | * English 227 | * Estonian 228 | * Finnish 229 | * French 230 | * Galician 231 | * Georgian 232 | * German 233 | * Greek 234 | * Gujarati 235 | * Haitian 236 | * Hebrew 237 | * Hindi 238 | * Hungarian 239 | * Icelandic 240 | * Ido 241 | * Indonesian 242 | * Irish 243 | * Italian 244 | * Japanese 245 | * Javanese 246 | * Kannada 247 | * Kazakh 248 | * Kirghiz 249 | * Korean 250 | * Latin 251 | * Latvian 252 | * Lithuanian 253 | * Lombard 254 | * Low Saxon 255 | * Luxembourgish 256 | * Macedonian 257 | * Malagasy 258 | * Malay 259 | * Malayalam 260 | * Marathi 261 | * Minangkabau 262 | * Nepali 263 | * Newar 264 | * Norwegian (Bokmal) 265 | * Norwegian (Nynorsk) 266 | * Occitan 267 | * Persian (Farsi) 268 | * Piedmontese 269 | * Polish 270 | * Portuguese 271 | * Punjabi 272 | * Romanian 273 | * Russian 274 | * Scots 275 | * Serbian 276 | * Serbo-Croatian 277 | * Sicilian 278 | * Slovak 279 | * Slovenian 280 | * South Azerbaijani 281 | * Spanish 282 | * Sundanese 283 | * Swahili 284 | * Swedish 285 | * Tagalog 286 | * Tajik 287 | * Tamil 288 | * Tatar 289 | * Telugu 290 | * Turkish 291 | * Ukrainian 292 | * Urdu 293 | * Uzbek 294 | * Vietnamese 295 | * Volapük 296 | * Waray-Waray 297 | * Welsh 298 | * West Frisian 299 | * Western Punjabi 300 | * Yoruba 301 | 302 | The **Multilingual Cased (New)** release contains additionally **Thai** and 303 | **Mongolian**, which were not included in the original release. 304 | -------------------------------------------------------------------------------- /optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 
82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs an AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 131 | next_m = ( 132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = ( 134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 135 | tf.square(grad))) 136 | 137 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 138 | 139 | # Just adding the square of the weights to the loss function is *not* 140 | # the correct way of using L2 regularization/weight decay with Adam, 141 | # since that will interact with the m and v parameters in strange ways. 142 | # 143 | # Instead we want to decay the weights in a manner that doesn't interact 144 | # with the m/v parameters. This is equivalent to adding the square 145 | # of the weights to the loss with plain (non-momentum) SGD. 146 | if self._do_use_weight_decay(param_name): 147 | update += self.weight_decay_rate * param 148 | 149 | update_with_lr = self.learning_rate * update 150 | 151 | next_param = param - update_with_lr 152 | 153 | assignments.extend( 154 | [param.assign(next_param), 155 | m.assign(next_m), 156 | v.assign(next_v)]) 157 | return tf.group(*assignments, name=name) 158 | 159 | def _do_use_weight_decay(self, param_name): 160 | """Whether to use L2 weight decay for `param_name`.""" 161 | if not self.weight_decay_rate: 162 | return False 163 | if self.exclude_from_weight_decay: 164 | for r in self.exclude_from_weight_decay: 165 | if re.search(r, param_name) is not None: 166 | return False 167 | return True 168 | 169 | def _get_variable_name(self, param_name): 170 | """Get the variable name from the tensor name.""" 171 | m = re.match("^(.*):\\d+$", param_name) 172 | if m is not None: 173 | param_name = m.group(1) 174 | return param_name 175 | -------------------------------------------------------------------------------- /optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 | 
19 | import optimization
20 | import tensorflow as tf
21 | 
22 | 
23 | class OptimizationTest(tf.test.TestCase):
24 | 
25 |   def test_adam(self):
26 |     with self.test_session() as sess:
27 |       w = tf.get_variable(
28 |           "w",
29 |           shape=[3],
30 |           initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
31 |       x = tf.constant([0.4, 0.2, -0.5])
32 |       loss = tf.reduce_mean(tf.square(x - w))
33 |       tvars = tf.trainable_variables()
34 |       grads = tf.gradients(loss, tvars)
35 |       global_step = tf.train.get_or_create_global_step()
36 |       optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
37 |       train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
38 |       init_op = tf.group(tf.global_variables_initializer(),
39 |                          tf.local_variables_initializer())
40 |       sess.run(init_op)
41 |       for _ in range(100):
42 |         sess.run(train_op)
43 |       w_np = sess.run(w)
44 |       self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
45 | 
46 | 
47 | if __name__ == "__main__":
48 |   tf.test.main()
49 | 
--------------------------------------------------------------------------------
/predict.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export BERT_BASE_DIR=bert_model/models/chinese_L-12_H-768_A-12
3 | export GLUE_DIR=Bert/data
4 | export OUTPUT_DIR=Bert/output
5 | 
6 | python run_classifier.py \
7 |   --task_name=similarity \
8 |   --do_predict=true \
9 |   --data_dir=$GLUE_DIR/ \
10 |   --vocab_file=$BERT_BASE_DIR/vocab.txt \
11 |   --bert_config_file=$BERT_BASE_DIR/bert_config.json \
12 |   --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
13 |   --max_seq_length=128 \
14 |   --output_dir=$OUTPUT_DIR
15 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow >= 1.11.0   # CPU version of TensorFlow.
2 | # tensorflow-gpu >= 1.11.0  # GPU version of TensorFlow.
3 | 
--------------------------------------------------------------------------------
/run_classifier_with_tfhub.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
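# (Example invocation, with a hypothetical-but-typical module handle; any BERT
# TF-Hub module exposing the "tokens" and "tokenization_info" signatures
# should work:
#
#   python run_classifier_with_tfhub.py \
#     --task_name=mrpc \
#     --data_dir=$GLUE_DIR/MRPC \
#     --output_dir=$OUTPUT_DIR \
#     --do_train=true --do_eval=true \
#     --bert_hub_module_handle=https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1
#
# The task_name/data_dir/output_dir/do_train/do_eval flags are registered by
# the `import run_classifier` below.)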
15 | """BERT finetuning runner with TF-Hub.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import optimization 23 | import run_classifier 24 | import tokenization 25 | import tensorflow as tf 26 | import tensorflow_hub as hub 27 | 28 | flags = tf.flags 29 | 30 | FLAGS = flags.FLAGS 31 | 32 | flags.DEFINE_string( 33 | "bert_hub_module_handle", None, 34 | "Handle for the BERT TF-Hub module.") 35 | 36 | 37 | def create_model(is_training, input_ids, input_mask, segment_ids, labels, 38 | num_labels, bert_hub_module_handle): 39 | """Creates a classification model.""" 40 | tags = set() 41 | if is_training: 42 | tags.add("train") 43 | bert_module = hub.Module(bert_hub_module_handle, tags=tags, trainable=True) 44 | bert_inputs = dict( 45 | input_ids=input_ids, 46 | input_mask=input_mask, 47 | segment_ids=segment_ids) 48 | bert_outputs = bert_module( 49 | inputs=bert_inputs, 50 | signature="tokens", 51 | as_dict=True) 52 | 53 | # In the demo, we are doing a simple classification task on the entire 54 | # segment. 55 | # 56 | # If you want to use the token-level output, use 57 | # bert_outputs["sequence_output"] instead. 58 | output_layer = bert_outputs["pooled_output"] 59 | 60 | hidden_size = output_layer.shape[-1].value 61 | 62 | output_weights = tf.get_variable( 63 | "output_weights", [num_labels, hidden_size], 64 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 65 | 66 | output_bias = tf.get_variable( 67 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 68 | 69 | with tf.variable_scope("loss"): 70 | if is_training: 71 | # I.e., 0.1 dropout 72 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 73 | 74 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 75 | logits = tf.nn.bias_add(logits, output_bias) 76 | probabilities = tf.nn.softmax(logits, axis=-1) 77 | log_probs = tf.nn.log_softmax(logits, axis=-1) 78 | 79 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 80 | 81 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 82 | loss = tf.reduce_mean(per_example_loss) 83 | 84 | return (loss, per_example_loss, logits, probabilities) 85 | 86 | 87 | def model_fn_builder(num_labels, learning_rate, num_train_steps, 88 | num_warmup_steps, use_tpu, bert_hub_module_handle): 89 | """Returns `model_fn` closure for TPUEstimator.""" 90 | 91 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 92 | """The `model_fn` for TPUEstimator.""" 93 | 94 | tf.logging.info("*** Features ***") 95 | for name in sorted(features.keys()): 96 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 97 | 98 | input_ids = features["input_ids"] 99 | input_mask = features["input_mask"] 100 | segment_ids = features["segment_ids"] 101 | label_ids = features["label_ids"] 102 | 103 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 104 | 105 | (total_loss, per_example_loss, logits, probabilities) = create_model( 106 | is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, 107 | bert_hub_module_handle) 108 | 109 | output_spec = None 110 | if mode == tf.estimator.ModeKeys.TRAIN: 111 | train_op = optimization.create_optimizer( 112 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 113 | 114 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 115 | mode=mode, 116 | loss=total_loss, 117 | train_op=train_op) 118 | elif mode == tf.estimator.ModeKeys.EVAL: 
119 | 
120 |       def metric_fn(per_example_loss, label_ids, logits):
121 |         predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
122 |         accuracy = tf.metrics.accuracy(label_ids, predictions)
123 |         loss = tf.metrics.mean(per_example_loss)
124 |         return {
125 |             "eval_accuracy": accuracy,
126 |             "eval_loss": loss,
127 |         }
128 | 
129 |       eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
130 |       output_spec = tf.contrib.tpu.TPUEstimatorSpec(
131 |           mode=mode,
132 |           loss=total_loss,
133 |           eval_metrics=eval_metrics)
134 |     elif mode == tf.estimator.ModeKeys.PREDICT:
135 |       output_spec = tf.contrib.tpu.TPUEstimatorSpec(
136 |           mode=mode, predictions={"probabilities": probabilities})
137 |     else:
138 |       raise ValueError(
139 |           "Only TRAIN, EVAL and PREDICT modes are supported: %s" % (mode))
140 | 
141 |     return output_spec
142 | 
143 |   return model_fn
144 | 
145 | 
146 | def create_tokenizer_from_hub_module(bert_hub_module_handle):
147 |   """Get the vocab file and casing info from the Hub module."""
148 |   with tf.Graph().as_default():
149 |     bert_module = hub.Module(bert_hub_module_handle)
150 |     tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
151 |     with tf.Session() as sess:
152 |       vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
153 |                                             tokenization_info["do_lower_case"]])
154 |   return tokenization.FullTokenizer(
155 |       vocab_file=vocab_file, do_lower_case=do_lower_case)
156 | 
157 | 
158 | def main(_):
159 |   tf.logging.set_verbosity(tf.logging.INFO)
160 | 
161 |   processors = {
162 |       "cola": run_classifier.ColaProcessor,
163 |       "mnli": run_classifier.MnliProcessor,
164 |       "mrpc": run_classifier.MrpcProcessor,
165 |   }
166 | 
167 |   if not FLAGS.do_train and not FLAGS.do_eval:
168 |     raise ValueError("At least one of `do_train` or `do_eval` must be True.")
169 | 
170 |   tf.gfile.MakeDirs(FLAGS.output_dir)
171 | 
172 |   task_name = FLAGS.task_name.lower()
173 | 
174 |   if task_name not in processors:
175 |     raise ValueError("Task not found: %s" % (task_name))
176 | 
177 |   processor = processors[task_name]()
178 | 
179 |   label_list = processor.get_labels()
180 | 
181 |   tokenizer = create_tokenizer_from_hub_module(FLAGS.bert_hub_module_handle)
182 | 
183 |   tpu_cluster_resolver = None
184 |   if FLAGS.use_tpu and FLAGS.tpu_name:
185 |     tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
186 |         FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
187 | 
188 |   is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
189 |   run_config = tf.contrib.tpu.RunConfig(
190 |       cluster=tpu_cluster_resolver,
191 |       master=FLAGS.master,
192 |       model_dir=FLAGS.output_dir,
193 |       save_checkpoints_steps=FLAGS.save_checkpoints_steps,
194 |       tpu_config=tf.contrib.tpu.TPUConfig(
195 |           iterations_per_loop=FLAGS.iterations_per_loop,
196 |           num_shards=FLAGS.num_tpu_cores,
197 |           per_host_input_for_training=is_per_host))
198 | 
199 |   train_examples = None
200 |   num_train_steps = None
201 |   num_warmup_steps = None
202 |   if FLAGS.do_train:
203 |     train_examples = processor.get_train_examples(FLAGS.data_dir)
204 |     num_train_steps = int(
205 |         len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
206 |     num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
207 | 
208 |   model_fn = model_fn_builder(
209 |       num_labels=len(label_list),
210 |       learning_rate=FLAGS.learning_rate,
211 |       num_train_steps=num_train_steps,
212 |       num_warmup_steps=num_warmup_steps,
213 |       use_tpu=FLAGS.use_tpu,
214 |       bert_hub_module_handle=FLAGS.bert_hub_module_handle)
215 | 
216 |   # If TPU is not available, this will fall back to normal Estimator on CPU
217 |   # or GPU.
218 |   estimator = tf.contrib.tpu.TPUEstimator(
219 |       use_tpu=FLAGS.use_tpu,
220 |       model_fn=model_fn,
221 |       config=run_config,
222 |       train_batch_size=FLAGS.train_batch_size,
223 |       eval_batch_size=FLAGS.eval_batch_size,
224 |       predict_batch_size=FLAGS.predict_batch_size)
225 | 
226 |   if FLAGS.do_train:
227 |     train_features = run_classifier.convert_examples_to_features(
228 |         train_examples, label_list, FLAGS.max_seq_length, tokenizer)
229 |     tf.logging.info("***** Running training *****")
230 |     tf.logging.info("  Num examples = %d", len(train_examples))
231 |     tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
232 |     tf.logging.info("  Num steps = %d", num_train_steps)
233 |     train_input_fn = run_classifier.input_fn_builder(
234 |         features=train_features,
235 |         seq_length=FLAGS.max_seq_length,
236 |         is_training=True,
237 |         drop_remainder=True)
238 |     estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
239 | 
240 |   if FLAGS.do_eval:
241 |     eval_examples = processor.get_dev_examples(FLAGS.data_dir)
242 |     eval_features = run_classifier.convert_examples_to_features(
243 |         eval_examples, label_list, FLAGS.max_seq_length, tokenizer)
244 | 
245 |     tf.logging.info("***** Running evaluation *****")
246 |     tf.logging.info("  Num examples = %d", len(eval_examples))
247 |     tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
248 | 
249 |     # This tells the estimator to run through the entire set.
250 |     eval_steps = None
251 |     # However, if running eval on the TPU, you will need to specify the
252 |     # number of steps.
253 |     if FLAGS.use_tpu:
254 |       # Eval will be slightly WRONG on the TPU because it will truncate
255 |       # the last batch.
256 |       eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
257 | 
258 |     eval_drop_remainder = True if FLAGS.use_tpu else False
259 |     eval_input_fn = run_classifier.input_fn_builder(
260 |         features=eval_features,
261 |         seq_length=FLAGS.max_seq_length,
262 |         is_training=False,
263 |         drop_remainder=eval_drop_remainder)
264 | 
265 |     result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
266 | 
267 |     output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
268 |     with tf.gfile.GFile(output_eval_file, "w") as writer:
269 |       tf.logging.info("***** Eval results *****")
270 |       for key in sorted(result.keys()):
271 |         tf.logging.info("  %s = %s", key, str(result[key]))
272 |         writer.write("%s = %s\n" % (key, str(result[key])))
273 | 
274 |   if FLAGS.do_predict:
275 |     predict_examples = processor.get_test_examples(FLAGS.data_dir)
276 |     if FLAGS.use_tpu:
277 |       # Discard batch remainder if running on TPU
278 |       n = len(predict_examples)
279 |       predict_examples = predict_examples[:(n - n % FLAGS.predict_batch_size)]
280 | 
281 |     predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
282 |     run_classifier.file_based_convert_examples_to_features(
283 |         predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
284 |         predict_file)
285 | 
286 |     tf.logging.info("***** Running prediction *****")
287 |     tf.logging.info("  Num examples = %d", len(predict_examples))
288 |     tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
289 | 
290 |     predict_input_fn = run_classifier.file_based_input_fn_builder(
291 |         input_file=predict_file,
292 |         seq_length=FLAGS.max_seq_length,
293 |         is_training=False,
294 |         drop_remainder=FLAGS.use_tpu)
295 | 
296 |     result = estimator.predict(input_fn=predict_input_fn)
297 | 
298 |     output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
"test_results.tsv") 299 | with tf.gfile.GFile(output_predict_file, "w") as writer: 300 | tf.logging.info("***** Predict results *****") 301 | for prediction in result: 302 | probabilities = prediction["probabilities"] 303 | output_line = "\t".join( 304 | str(class_probability) 305 | for class_probability in probabilities) + "\n" 306 | writer.write(output_line) 307 | 308 | 309 | if __name__ == "__main__": 310 | flags.mark_flag_as_required("data_dir") 311 | flags.mark_flag_as_required("task_name") 312 | flags.mark_flag_as_required("bert_hub_module_handle") 313 | flags.mark_flag_as_required("output_dir") 314 | tf.app.run() 315 | -------------------------------------------------------------------------------- /sample_text.txt: -------------------------------------------------------------------------------- 1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 2 | Text should be one-sentence-per-line, with empty lines between documents. 3 | This sample text is public domain and was randomly selected from Project Guttenberg. 4 | 5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. 8 | "Cass" Beard had risen early that morning, but not with a view to discovery. 9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. 10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. 11 | This was nearly opposite. 12 | Mr. Cassius crossed the highway, and stopped suddenly. 13 | Something glittered in the nearest red pool before him. 14 | Gold, surely! 15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. 16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass." 17 | Like most of his fellow gold-seekers, Cass was superstitious. 18 | 19 | The fountain of classic wisdom, Hypatia herself. 20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. 21 | From my youth I felt in me a soul above the matter-entangled herd. 22 | She revealed to me the glorious fact, that I am a spark of Divinity itself. 23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. 24 | There is a philosophic pleasure in opening one's treasures to the modest young. 
25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
31 | At last they reached the quay at the opposite end of the street;
32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
34 | 
--------------------------------------------------------------------------------
/tokenization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import collections
22 | import re
23 | import unicodedata
24 | import six
25 | import tensorflow as tf
26 | 
27 | 
28 | def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
29 |   """Checks whether the casing config is consistent with the checkpoint name."""
30 | 
31 |   # The casing has to be passed in by the user and there is no explicit check
32 |   # as to whether it matches the checkpoint. The casing information probably
33 |   # should have been stored in the bert_config.json file, but it's not, so
34 |   # we have to heuristically detect it to validate.
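  #
  # For example, an `init_checkpoint` under "uncased_L-12_H-768_A-12/" implies
  # `do_lower_case=True`, while one under "cased_L-12_H-768_A-12/" implies
  # `do_lower_case=False`; a mismatch triggers the ValueError below.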
35 | 
36 |   if not init_checkpoint:
37 |     return
38 | 
39 |   m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
40 |   if m is None:
41 |     return
42 | 
43 |   model_name = m.group(1)
44 | 
45 |   lower_models = [
46 |       "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
47 |       "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
48 |   ]
49 | 
50 |   cased_models = [
51 |       "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
52 |       "multi_cased_L-12_H-768_A-12"
53 |   ]
54 | 
55 |   is_bad_config = False
56 |   if model_name in lower_models and not do_lower_case:
57 |     is_bad_config = True
58 |     actual_flag = "False"
59 |     case_name = "lowercased"
60 |     opposite_flag = "True"
61 | 
62 |   if model_name in cased_models and do_lower_case:
63 |     is_bad_config = True
64 |     actual_flag = "True"
65 |     case_name = "cased"
66 |     opposite_flag = "False"
67 | 
68 |   if is_bad_config:
69 |     raise ValueError(
70 |         "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
71 |         "However, `%s` seems to be a %s model, so you "
72 |         "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
73 |         "how the model was pre-trained. If this error is wrong, please "
74 |         "just comment out this check." % (actual_flag, init_checkpoint,
75 |                                           model_name, case_name, opposite_flag))
76 | 
77 | 
78 | def convert_to_unicode(text):
79 |   """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
80 |   if six.PY3:
81 |     if isinstance(text, str):
82 |       return text
83 |     elif isinstance(text, bytes):
84 |       return text.decode("utf-8", "ignore")
85 |     else:
86 |       raise ValueError("Unsupported string type: %s" % (type(text)))
87 |   elif six.PY2:
88 |     if isinstance(text, str):
89 |       return text.decode("utf-8", "ignore")
90 |     elif isinstance(text, unicode):
91 |       return text
92 |     else:
93 |       raise ValueError("Unsupported string type: %s" % (type(text)))
94 |   else:
95 |     raise ValueError("Not running on Python 2 or Python 3?")
96 | 
97 | 
98 | def printable_text(text):
99 |   """Returns text encoded in a way suitable for print or `tf.logging`."""
100 | 
101 |   # These functions want `str` for both Python 2 and Python 3, but in one case
102 |   # it's a Unicode string and in the other it's a byte string.
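  # (Concretely: under Python 3, `bytes` input is decoded to `str`; under
  # Python 2, `unicode` input is encoded to a UTF-8 byte string.)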
103 |   if six.PY3:
104 |     if isinstance(text, str):
105 |       return text
106 |     elif isinstance(text, bytes):
107 |       return text.decode("utf-8", "ignore")
108 |     else:
109 |       raise ValueError("Unsupported string type: %s" % (type(text)))
110 |   elif six.PY2:
111 |     if isinstance(text, str):
112 |       return text
113 |     elif isinstance(text, unicode):
114 |       return text.encode("utf-8")
115 |     else:
116 |       raise ValueError("Unsupported string type: %s" % (type(text)))
117 |   else:
118 |     raise ValueError("Not running on Python 2 or Python 3?")
119 | 
120 | 
121 | def load_vocab(vocab_file):
122 |   """Loads a vocabulary file into a dictionary."""
123 |   vocab = collections.OrderedDict()
124 |   index = 0
125 |   with tf.gfile.GFile(vocab_file, "r") as reader:
126 |     while True:
127 |       token = convert_to_unicode(reader.readline())
128 |       if not token:
129 |         break
130 |       token = token.strip()
131 |       vocab[token] = index
132 |       index += 1
133 |   return vocab
134 | 
135 | 
136 | def convert_by_vocab(vocab, items):
137 |   """Converts a sequence of [tokens|ids] using the vocab."""
138 |   output = []
139 |   for item in items:
140 |     output.append(vocab[item])
141 |   return output
142 | 
143 | 
144 | def convert_tokens_to_ids(vocab, tokens):
145 |   return convert_by_vocab(vocab, tokens)
146 | 
147 | 
148 | def convert_ids_to_tokens(inv_vocab, ids):
149 |   return convert_by_vocab(inv_vocab, ids)
150 | 
151 | 
152 | def whitespace_tokenize(text):
153 |   """Runs basic whitespace cleaning and splitting on a piece of text."""
154 |   text = text.strip()
155 |   if not text:
156 |     return []
157 |   tokens = text.split()
158 |   return tokens
159 | 
160 | 
161 | class FullTokenizer(object):
162 |   """Runs end-to-end tokenization."""
163 | 
164 |   def __init__(self, vocab_file, do_lower_case=True):
165 |     self.vocab = load_vocab(vocab_file)
166 |     self.inv_vocab = {v: k for k, v in self.vocab.items()}
167 |     self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
168 |     self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
169 | 
170 |   def tokenize(self, text):
171 |     split_tokens = []
172 |     for token in self.basic_tokenizer.tokenize(text):
173 |       for sub_token in self.wordpiece_tokenizer.tokenize(token):
174 |         split_tokens.append(sub_token)
175 | 
176 |     return split_tokens
177 | 
178 |   def convert_tokens_to_ids(self, tokens):
179 |     return convert_by_vocab(self.vocab, tokens)
180 | 
181 |   def convert_ids_to_tokens(self, ids):
182 |     return convert_by_vocab(self.inv_vocab, ids)
183 | 
184 | 
185 | class BasicTokenizer(object):
186 |   """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
187 | 
188 |   def __init__(self, do_lower_case=True):
189 |     """Constructs a BasicTokenizer.
190 | 
191 |     Args:
192 |       do_lower_case: Whether to lower case the input.
193 |     """
194 |     self.do_lower_case = do_lower_case
195 | 
196 |   def tokenize(self, text):
197 |     """Tokenizes a piece of text."""
198 |     text = convert_to_unicode(text)
199 |     text = self._clean_text(text)
200 | 
201 |     # This was added on November 1st, 2018 for the multilingual and Chinese
202 |     # models. This is also applied to the English models now, but it doesn't
203 |     # matter since the English models were not trained on any Chinese data
204 |     # and generally don't have any Chinese data in them (there are Chinese
205 |     # characters in the vocabulary because Wikipedia does have some Chinese
206 |     # words in the English Wikipedia).
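    # E.g., u"ah\u535A\u63A8zz" ends up tokenized as
    # [u"ah", u"\u535A", u"\u63A8", u"zz"] (see `test_chinese` in
    # tokenization_test.py).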
207 |     text = self._tokenize_chinese_chars(text)
208 | 
209 |     orig_tokens = whitespace_tokenize(text)
210 |     split_tokens = []
211 |     for token in orig_tokens:
212 |       if self.do_lower_case:
213 |         token = token.lower()
214 |         token = self._run_strip_accents(token)
215 |       split_tokens.extend(self._run_split_on_punc(token))
216 | 
217 |     output_tokens = whitespace_tokenize(" ".join(split_tokens))
218 |     return output_tokens
219 | 
220 |   def _run_strip_accents(self, text):
221 |     """Strips accents from a piece of text."""
222 |     text = unicodedata.normalize("NFD", text)
223 |     output = []
224 |     for char in text:
225 |       cat = unicodedata.category(char)
226 |       if cat == "Mn":
227 |         continue
228 |       output.append(char)
229 |     return "".join(output)
230 | 
231 |   def _run_split_on_punc(self, text):
232 |     """Splits punctuation on a piece of text."""
233 |     chars = list(text)
234 |     i = 0
235 |     start_new_word = True
236 |     output = []
237 |     while i < len(chars):
238 |       char = chars[i]
239 |       if _is_punctuation(char):
240 |         output.append([char])
241 |         start_new_word = True
242 |       else:
243 |         if start_new_word:
244 |           output.append([])
245 |         start_new_word = False
246 |         output[-1].append(char)
247 |       i += 1
248 | 
249 |     return ["".join(x) for x in output]
250 | 
251 |   def _tokenize_chinese_chars(self, text):
252 |     """Adds whitespace around any CJK character."""
253 |     output = []
254 |     for char in text:
255 |       cp = ord(char)
256 |       if self._is_chinese_char(cp):
257 |         output.append(" ")
258 |         output.append(char)
259 |         output.append(" ")
260 |       else:
261 |         output.append(char)
262 |     return "".join(output)
263 | 
264 |   def _is_chinese_char(self, cp):
265 |     """Checks whether CP is the codepoint of a CJK character."""
266 |     # This defines a "chinese character" as anything in the CJK Unicode block:
267 |     #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
268 |     #
269 |     # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
270 |     # despite its name. The modern Korean Hangul alphabet is a different block,
271 |     # as is Japanese Hiragana and Katakana. Those alphabets are used to write
272 |     # space-separated words, so they are not treated specially and are handled
273 |     # like all of the other languages.
274 |     if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
275 |         (cp >= 0x3400 and cp <= 0x4DBF) or  #
276 |         (cp >= 0x20000 and cp <= 0x2A6DF) or  #
277 |         (cp >= 0x2A700 and cp <= 0x2B73F) or  #
278 |         (cp >= 0x2B740 and cp <= 0x2B81F) or  #
279 |         (cp >= 0x2B820 and cp <= 0x2CEAF) or
280 |         (cp >= 0xF900 and cp <= 0xFAFF) or  #
281 |         (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
282 |       return True
283 | 
284 |     return False
285 | 
286 |   def _clean_text(self, text):
287 |     """Performs invalid character removal and whitespace cleanup on text."""
288 |     output = []
289 |     for char in text:
290 |       cp = ord(char)
291 |       if cp == 0 or cp == 0xfffd or _is_control(char):
292 |         continue
293 |       if _is_whitespace(char):
294 |         output.append(" ")
295 |       else:
296 |         output.append(char)
297 |     return "".join(output)
298 | 
299 | 
300 | class WordpieceTokenizer(object):
301 |   """Runs WordPiece tokenization."""
302 | 
303 |   def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
304 |     self.vocab = vocab
305 |     self.unk_token = unk_token
306 |     self.max_input_chars_per_word = max_input_chars_per_word
307 | 
308 |   def tokenize(self, text):
309 |     """Tokenizes a piece of text into its word pieces.
310 | 
311 |     This uses a greedy longest-match-first algorithm to perform tokenization
312 |     using the given vocabulary.
313 | 
314 |     For example:
315 |       input = "unaffable"
316 |       output = ["un", "##aff", "##able"]
317 | 
318 |     Args:
319 |       text: A single token or whitespace separated tokens. This should have
320 |         already been passed through `BasicTokenizer`.
321 | 
322 |     Returns:
323 |       A list of wordpiece tokens.
324 |     """
325 | 
326 |     text = convert_to_unicode(text)
327 | 
328 |     output_tokens = []
329 |     for token in whitespace_tokenize(text):
330 |       chars = list(token)
331 |       if len(chars) > self.max_input_chars_per_word:
332 |         output_tokens.append(self.unk_token)
333 |         continue
334 | 
335 |       is_bad = False
336 |       start = 0
337 |       sub_tokens = []
338 |       while start < len(chars):
339 |         end = len(chars)
340 |         cur_substr = None
341 |         while start < end:
342 |           substr = "".join(chars[start:end])
343 |           if start > 0:
344 |             substr = "##" + substr
345 |           if substr in self.vocab:
346 |             cur_substr = substr
347 |             break
348 |           end -= 1
349 |         if cur_substr is None:
350 |           is_bad = True
351 |           break
352 |         sub_tokens.append(cur_substr)
353 |         start = end
354 | 
355 |       if is_bad:
356 |         output_tokens.append(self.unk_token)
357 |       else:
358 |         output_tokens.extend(sub_tokens)
359 |     return output_tokens
360 | 
361 | 
362 | def _is_whitespace(char):
363 |   """Checks whether `char` is a whitespace character."""
364 |   # \t, \n, and \r are technically control characters but we treat them
365 |   # as whitespace since they are generally considered as such.
366 |   if char == " " or char == "\t" or char == "\n" or char == "\r":
367 |     return True
368 |   cat = unicodedata.category(char)
369 |   if cat == "Zs":
370 |     return True
371 |   return False
372 | 
373 | 
374 | def _is_control(char):
375 |   """Checks whether `char` is a control character."""
376 |   # These are technically control characters but we count them as whitespace
377 |   # characters.
378 |   if char == "\t" or char == "\n" or char == "\r":
379 |     return False
380 |   cat = unicodedata.category(char)
381 |   if cat in ("Cc", "Cf"):
382 |     return True
383 |   return False
384 | 
385 | 
386 | def _is_punctuation(char):
387 |   """Checks whether `char` is a punctuation character."""
388 |   cp = ord(char)
389 |   # We treat all non-letter/number ASCII as punctuation.
390 |   # Characters such as "^", "$", and "`" are not in the Unicode
391 |   # Punctuation class but we treat them as punctuation anyways, for
392 |   # consistency.
393 |   if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
394 |       (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
395 |     return True
396 |   cat = unicodedata.category(char)
397 |   if cat.startswith("P"):
398 |     return True
399 |   return False
400 | 
--------------------------------------------------------------------------------
/tokenization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 | 
19 | import os
20 | import tempfile
21 | import tokenization
22 | import six
23 | import tensorflow as tf
24 | 
25 | 
26 | class TokenizationTest(tf.test.TestCase):
27 | 
28 |   def test_full_tokenizer(self):
29 |     vocab_tokens = [
30 |         "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
31 |         "##ing", ","
32 |     ]
33 |     with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
34 |       if six.PY2:
35 |         vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
36 |       else:
37 |         vocab_writer.write("".join(
38 |             [x + "\n" for x in vocab_tokens]).encode("utf-8"))
39 | 
40 |       vocab_file = vocab_writer.name
41 | 
42 |     tokenizer = tokenization.FullTokenizer(vocab_file)
43 |     os.unlink(vocab_file)
44 | 
45 |     tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
46 |     self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
47 | 
48 |     self.assertAllEqual(
49 |         tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
50 | 
51 |   def test_chinese(self):
52 |     tokenizer = tokenization.BasicTokenizer()
53 | 
54 |     self.assertAllEqual(
55 |         tokenizer.tokenize(u"ah\u535A\u63A8zz"),
56 |         [u"ah", u"\u535A", u"\u63A8", u"zz"])
57 | 
58 |   def test_basic_tokenizer_lower(self):
59 |     tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
60 | 
61 |     self.assertAllEqual(
62 |         tokenizer.tokenize(u" \tHeLLo!how  \n   Are yoU?  "),
63 |         ["hello", "!", "how", "are", "you", "?"])
64 |     self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
65 | 
66 |   def test_basic_tokenizer_no_lower(self):
67 |     tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
68 | 
69 |     self.assertAllEqual(
70 |         tokenizer.tokenize(u" \tHeLLo!how  \n   Are yoU?  "),
"), 71 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 72 | 73 | def test_wordpiece_tokenizer(self): 74 | vocab_tokens = [ 75 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 76 | "##ing" 77 | ] 78 | 79 | vocab = {} 80 | for (i, token) in enumerate(vocab_tokens): 81 | vocab[token] = i 82 | tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) 83 | 84 | self.assertAllEqual(tokenizer.tokenize(""), []) 85 | 86 | self.assertAllEqual( 87 | tokenizer.tokenize("unwanted running"), 88 | ["un", "##want", "##ed", "runn", "##ing"]) 89 | 90 | self.assertAllEqual( 91 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) 92 | 93 | def test_convert_tokens_to_ids(self): 94 | vocab_tokens = [ 95 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 96 | "##ing" 97 | ] 98 | 99 | vocab = {} 100 | for (i, token) in enumerate(vocab_tokens): 101 | vocab[token] = i 102 | 103 | self.assertAllEqual( 104 | tokenization.convert_tokens_to_ids( 105 | vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) 106 | 107 | def test_is_whitespace(self): 108 | self.assertTrue(tokenization._is_whitespace(u" ")) 109 | self.assertTrue(tokenization._is_whitespace(u"\t")) 110 | self.assertTrue(tokenization._is_whitespace(u"\r")) 111 | self.assertTrue(tokenization._is_whitespace(u"\n")) 112 | self.assertTrue(tokenization._is_whitespace(u"\u00A0")) 113 | 114 | self.assertFalse(tokenization._is_whitespace(u"A")) 115 | self.assertFalse(tokenization._is_whitespace(u"-")) 116 | 117 | def test_is_control(self): 118 | self.assertTrue(tokenization._is_control(u"\u0005")) 119 | 120 | self.assertFalse(tokenization._is_control(u"A")) 121 | self.assertFalse(tokenization._is_control(u" ")) 122 | self.assertFalse(tokenization._is_control(u"\t")) 123 | self.assertFalse(tokenization._is_control(u"\r")) 124 | self.assertFalse(tokenization._is_control(u"\U0001F4A9")) 125 | 126 | def test_is_punctuation(self): 127 | self.assertTrue(tokenization._is_punctuation(u"-")) 128 | self.assertTrue(tokenization._is_punctuation(u"$")) 129 | self.assertTrue(tokenization._is_punctuation(u"`")) 130 | self.assertTrue(tokenization._is_punctuation(u".")) 131 | 132 | self.assertFalse(tokenization._is_punctuation(u"A")) 133 | self.assertFalse(tokenization._is_punctuation(u" ")) 134 | 135 | 136 | if __name__ == "__main__": 137 | tf.test.main() 138 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export BERT_BASE_DIR=bert_model/models/chinese_L-12_H-768_A-12 3 | export GLUE_DIR=Bert/data 4 | export OUTPUT_DIR=Bert/output 5 | 6 | python classifier.py \ 7 | --task_name=similarity \ 8 | --do_train=true \ 9 | --do_eval=true \ 10 | --data_dir=$GLUE_DIR/ \ 11 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 12 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 13 | --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ 14 | --max_seq_length=128 \ 15 | --train_batch_size=4 \ 16 | --learning_rate=2e-5 \ 17 | --num_train_epochs=5.0 \ 18 | --output_dir=$OUTPUT_DIR/ --------------------------------------------------------------------------------