├── .gitignore
├── README.md
├── step_0
│   └── bert
│       ├── .gitignore
│       ├── CONTRIBUTING.md
│       ├── LICENSE
│       ├── README.md
│       ├── __init__.py
│       ├── create_pretraining_data.py
│       ├── extract_features.py
│       ├── modeling.py
│       ├── modeling_test.py
│       ├── multilingual.md
│       ├── optimization.py
│       ├── optimization_test.py
│       ├── predicting_movie_reviews_with_bert_on_tf_hub.ipynb
│       ├── requirements.txt
│       ├── run_classifier.py
│       ├── run_classifier_with_tfhub.py
│       ├── run_pretraining.py
│       ├── run_squad.py
│       ├── sample_text.txt
│       ├── tokenization.py
│       └── tokenization_test.py
├── step_1
│   └── bert
│       ├── .gitignore
│       ├── CONTRIBUTING.md
│       ├── LICENSE
│       ├── README.md
│       ├── __init__.py
│       ├── create_pretraining_data.py
│       ├── extract_features.py
│       ├── modeling.py
│       ├── modeling_test.py
│       ├── multilingual.md
│       ├── optimization.py
│       ├── optimization_test.py
│       ├── predicting_movie_reviews_with_bert_on_tf_hub.ipynb
│       ├── requirements.txt
│       ├── run_classifier.py
│       ├── run_classifier_with_tfhub.py
│       ├── run_pretraining.py
│       ├── run_squad.py
│       ├── sample_text.txt
│       ├── tokenization.py
│       └── tokenization_test.py
├── step_2
│   └── bert
│       ├── .gitignore
│       ├── CONTRIBUTING.md
│       ├── LICENSE
│       ├── README.md
│       ├── __init__.py
│       ├── create_pretraining_data.py
│       ├── extract_features.py
│       ├── modeling.py
│       ├── modeling_test.py
│       ├── multilingual.md
│       ├── optimization.py
│       ├── optimization_test.py
│       ├── predicting_movie_reviews_with_bert_on_tf_hub.ipynb
│       ├── requirements.txt
│       ├── run_classifier.py
│       ├── run_classifier_with_tfhub.py
│       ├── run_pretraining.py
│       ├── run_squad.py
│       ├── sample_text.txt
│       ├── tokenization.py
│       └── tokenization_test.py
├── step_3
│   ├── Model_Training_and_Analysis.ipynb
│   └── bert
│       ├── .gitignore
│       ├── CONTRIBUTING.md
│       ├── LICENSE
│       ├── README.md
│       ├── __init__.py
│       ├── create_pretraining_data.py
│       ├── extract_features.py
│       ├── modeling.py
│       ├── modeling_test.py
│       ├── multilingual.md
│       ├── optimization.py
│       ├── optimization_test.py
│       ├── predicting_movie_reviews_with_bert_on_tf_hub.ipynb
│       ├── requirements.txt
│       ├── run_classifier.py
│       ├── run_classifier_with_tfhub.py
│       ├── run_pretraining.py
│       ├── run_squad.py
│       ├── sample_text.txt
│       ├── tokenization.py
│       └── tokenization_test.py
├── step_4
│   ├── Model_Training_and_Analysis.ipynb
│   ├── bert
│   │   ├── .gitignore
│   │   ├── CONTRIBUTING.md
│   │   ├── LICENSE
│   │   ├── Pipfile
│   │   ├── Pipfile.lock
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── asset
│   │   │   └── vocab.txt
│   │   ├── config
│   │   │   ├── __init__.py
│   │   │   ├── development.py
│   │   │   └── production.py
│   │   ├── create_pretraining_data.py
│   │   ├── extract_features.py
│   │   ├── modeling.py
│   │   ├── modeling_test.py
│   │   ├── multilingual.md
│   │   ├── optimization.py
│   │   ├── optimization_test.py
│   │   ├── predicting_movie_reviews_with_bert_on_tf_hub.ipynb
│   │   ├── requirements.txt
│   │   ├── run_app.py
│   │   ├── run_classifier.py
│   │   ├── run_classifier_with_tfhub.py
│   │   ├── run_pretraining.py
│   │   ├── run_squad.py
│   │   ├── sample_text.txt
│   │   ├── tokenization.py
│   │   └── tokenization_test.py
│   ├── copy_model_to_k8s.sh
│   ├── deploy_model.sh
│   ├── make_cluster.sh
│   ├── model-chart.yml
│   └── prepare_bert_client.sh
└── step_5
    ├── Model_Training_and_Analysis.ipynb
    ├── bert
    │   ├── .gitignore
    │   ├── CONTRIBUTING.md
    │   ├── Dockerfile
    │   ├── LICENSE
    │   ├── Pipfile
    │   ├── Pipfile.lock
    │   ├── README.md
    │   ├── __init__.py
    │   ├── asset
    │   │   └── vocab.txt
    │   ├── config
    │   │   ├── __init__.py
    │   │   ├── development.py
    │   │   └── production.py
    │   ├── create_pretraining_data.py
    │   ├── extract_features.py
    │   ├── modeling.py
    │   ├── modeling_test.py
    │   ├── multilingual.md
    │   ├── optimization.py
    │   ├── optimization_test.py
    │   ├── predicting_movie_reviews_with_bert_on_tf_hub.ipynb
    │   ├── requirements.txt
    │   ├── run_app.py
    │   ├── run_classifier.py
    │   ├── run_classifier_with_tfhub.py
    │   ├── run_pretraining.py
    │   ├── run_squad.py
    │   ├── sample_text.txt
    │   ├── tokenization.py
    │   └── tokenization_test.py
    ├── client-chart.yml
    ├── copy_model_to_k8s.sh
    ├── deploy_app.sh
    ├── deploy_model.sh
    ├── make_client_container.sh
    ├── make_cluster.sh
    ├── model-chart.yml
    ├── prepare_bert_client.sh
    └── super-chart.yml
/.gitignore:
--------------------------------------------------------------------------------
1 | /venv/
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Enterprise Text Classification using BERT Tutorial
2 |
3 | ## Setup:
4 |
5 | * Download IntelliJ IDEA Community Edition (for diffing folders):
6 | * Slides
7 | * Slack
8 |
9 | ## Step 0:
10 |
11 | * Explore BERT Repo to understand steps
12 |
13 | ## Step 1: Add code to process data
14 |
15 | * Download the data and inspect its format
16 | * Modify the BERT repo for training needs (a minimal processor sketch follows this list)
17 | * Create a Google Cloud Storage bucket for saving model checkpoints
18 | * Give the account read and write access to the bucket
19 |
20 | ## Step 2: Add code to support export
21 |
22 | * Add export flags (`--do_serve`, `--export_dir`)
23 | * Write a serving_input_fn (see the sketch after this list)
24 |
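A serving_input_fn tells the exported SavedModel how to turn an incoming request into the feature tensors the model expects. Below is a minimal sketch, assuming the usual `tf.Example`-based receiver for TF 1.x Estimators; the exact code in this repo's step_2 diff may differ. `FLAGS` and `tf` are already in scope in `run_classifier.py`.

```{python}
def serving_input_fn():
  # Accept a batch of serialized tf.Example protos and parse them into the
  # same features run_classifier builds during training.
  feature_spec = {
      "input_ids": tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64),
      "input_mask": tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64),
      "segment_ids": tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64),
      "label_ids": tf.FixedLenFeature([], tf.int64),
  }
  serialized_tf_example = tf.placeholder(
      dtype=tf.string, shape=[None], name="input_example_tensor")
  receiver_tensors = {"examples": serialized_tf_example}
  features = tf.parse_example(serialized_tf_example, feature_spec)
  return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
```

This receiver is what makes the `saved_model_cli ... --input_examples 'examples=[...]'` calls in Step 3 work: the exported signature takes a single string tensor named `examples`. When exporting from a TPUEstimator you may also need `estimator._export_to_tpu = False` before calling `export_savedmodel(FLAGS.export_dir, serving_input_fn)`.
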
25 | ## Step 3: Build, Examine, Export Model
26 |
27 | * Train model
28 |
29 | ```{python}
30 | import sys
31 | !test -d bert_repo || git clone https://github.com/lapolonio/text_classification_tutorial bert_repo
32 | if 'bert_repo' not in sys.path:
33 |   sys.path += ['bert_repo/step_3/bert']
34 | ```
35 | 
36 | ```{bash}
37 |
38 | export BERT_BASE_DIR=gs://bert_model_demo/uncased_L-12_H-768_A-12
39 | export IMDB_DIR=NOT_USED
40 | export TPU_NAME=grpc://10.92.118.162:8470
41 | export OUTPUT_DIR=gs://bert_model_demo/imdb_v1/output/
42 | export EXPORT_DIR=gs://bert_model_demo/imdb_v1/export/
43 |
44 | python bert_repo/step_3/bert/run_classifier.py \
45 | --task_name=IMDB \
46 | --do_train=true \
47 | --do_predict=true \
48 | --data_dir=$IMDB_DIR \
49 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
50 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
51 | --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
52 | --max_seq_length=128 \
53 | --train_batch_size=32 \
54 | --learning_rate=2e-5 \
55 | --num_train_epochs=3.0 \
56 | --output_dir=$OUTPUT_DIR \
57 | --use_tpu=True \
58 | --tpu_name=$TPU_NAME \
59 | --do_serve=true \
60 | --export_dir=$EXPORT_DIR
61 | ```
62 | * Export built model to bucket
63 | * Test exported model
64 |
65 | ```{bash}
66 | !saved_model_cli show --dir gs://bert_model_demo/imdb_v1/export/1567569486 --tag_set serve --signature_def serving_default
67 |
68 | # Execute model to understand model input format
69 | # Batch 1
70 | !saved_model_cli run --dir gs://bert_model_demo/imdb_v1/export/1567569486 --tag_set serve --signature_def serving_default \
71 | --input_examples 'examples=[{"input_ids":np.zeros((128), dtype=int).tolist(),"input_mask":np.zeros((128), dtype=int).tolist(),"label_ids":[0],"segment_ids":np.zeros((128), dtype=int).tolist()}]'
72 |
73 | # Batch 3
74 | !saved_model_cli run --dir gs://bert_model_demo/imdb_v1/export/1567569486 --tag_set serve --signature_def serving_default \
75 | --input_examples 'examples=[{"input_ids":np.zeros((128), dtype=int).tolist(),"input_mask":np.zeros((128), dtype=int).tolist(),"label_ids":[0],"segment_ids":np.zeros((128), dtype=int).tolist()},{"input_ids":np.zeros((128), dtype=int).tolist(),"input_mask":np.zeros((128), dtype=int).tolist(),"label_ids":[0],"segment_ids":np.zeros((128), dtype=int).tolist()},{"input_ids":np.zeros((128), dtype=int).tolist(),"input_mask":np.zeros((128), dtype=int).tolist(),"label_ids":[0],"segment_ids":np.zeros((128), dtype=int).tolist()}]'
76 | ```
77 |
78 | ## Step 4: Create serving image and test with Python client
79 |
80 | * Create k8s cluster
81 |   * make_cluster.sh
82 | * Copy model to persistent volume claim
83 |   * copy_model_to_k8s.sh
84 | * Create tensorflow_serving deployment with model
85 |   * model-chart.yml
86 | * Deploy to k8s
87 |   * deploy_model.sh
88 | * Test server service
89 |   * kubectl run curler -it --rm --image=pstauffer/curl --restart=Never -- sh
90 |   * curl imdb-sentiment-service:8501/v1/models/bert
91 | * Create a Python client to call the TF Serving image (a minimal gRPC client sketch appears after the code block below)
92 |   * run_app.py
93 | * Download needed assets
94 |   * prepare_bert_client.sh
95 | * Run code locally
96 |   * Login
97 |     * gcloud auth login
98 |   * Set kube context
99 |     * gcloud container clusters get-credentials bert-cluster --zone us-central1-b --project analog-subset-256304
100 |   * Port forward service
101 |     * kubectl port-forward agnews-server-5cc95c9456-fmmx5 8501:8501 8500:8500
102 |   * Run app locally
103 |
104 |
105 | ```{bash}
106 | mkdir ~/models
107 | gsutil cp -r $MODEL_LOCATION ~/models
108 | export MODEL_NAME=bert
109 | docker run -p 8500:8500 -p 8501:8501 \
110 | --mount type=bind,source=/home/leo/models,target=/models/$MODEL_NAME \
111 | -e MODEL_NAME=$MODEL_NAME --name bert_serving -t tensorflow/serving:latest &
112 | cd bert
113 | python3 -m pip install --user pipenv
114 | python3 -m pipenv install
115 | APP_CONFIG_FILE=config/development.py python3 -m pipenv run python run_app.py
116 |
117 | curl -X POST \
118 |   http://localhost:5000/ \
119 |   -H 'Content-Type: application/json' \
120 |   -d '{"sentences":["Tainted look at kibbutz life. This film is less a cultural story about a boy'\''s life in a kibbutz, but the deliberate demonization of kibbutz life in general. In the first two minutes of the movie, the milk man in charge of the cows rapes one of his calves. And it'\''s all downhill from there in terms of the characters representing typical '\''kibbutznikim'\''. Besides the two main characters, a clinically depressed woman and her young son, every one else in the kibbutz is a gross caricature of, well, evil.",
121 |   "A great story a young Aussie bloke travels to england to claim his inheritance and meets up with his mates, who are just as loveable and innocent as he is.",
122 |   "i hate the movie it was racist",
123 |   "i loved the movie it was inspiring."]}'
124 | ```
125 |
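Since `run_app.py`'s contents aren't reproduced in this README, the following is only a sketch of what such a client might look like: a gRPC call to TF Serving on port 8500 that wraps each sentence in a serialized `tf.Example`, matching the `examples` input of the exported signature. The sequence length, the `asset/vocab.txt` path, and the model name `bert` follow the earlier steps; everything else (helper names, the example sentence) is an assumption.

```{python}
import grpc
import tensorflow as tf
import tokenization  # the repo's tokenization.py
from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc

MAX_SEQ_LENGTH = 128
tokenizer = tokenization.FullTokenizer(vocab_file="asset/vocab.txt", do_lower_case=True)

def make_example(sentence):
  """Tokenize one sentence into the padded tf.Example the signature expects."""
  tokens = ["[CLS]"] + tokenizer.tokenize(sentence)[:MAX_SEQ_LENGTH - 2] + ["[SEP]"]
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  input_mask = [1] * len(input_ids)
  pad = [0] * (MAX_SEQ_LENGTH - len(input_ids))  # zero-pad to max_seq_length
  feature = {
      "input_ids": tf.train.Feature(int64_list=tf.train.Int64List(value=input_ids + pad)),
      "input_mask": tf.train.Feature(int64_list=tf.train.Int64List(value=input_mask + pad)),
      "segment_ids": tf.train.Feature(int64_list=tf.train.Int64List(value=[0] * MAX_SEQ_LENGTH)),
      "label_ids": tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),  # dummy label
  }
  return tf.train.Example(features=tf.train.Features(feature=feature))

channel = grpc.insecure_channel("localhost:8500")
stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
request = predict_pb2.PredictRequest()
request.model_spec.name = "bert"
request.model_spec.signature_name = "serving_default"
serialized = [make_example(s).SerializeToString()
              for s in ["i loved the movie it was inspiring."]]
request.inputs["examples"].CopyFrom(tf.make_tensor_proto(serialized, dtype=tf.string))
print(stub.Predict(request, 10.0))  # 10-second deadline
```
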
126 | ## Step 5: Create client image and deploy
127 |
128 | * Create container for client
129 |   * make_client_container.sh
130 | * Create k8s deployment
131 |   * client-chart.yml
132 | * Deploy to k8s cluster
133 |   * deploy_app.sh
134 | * Test deployment (see the smoke-test sketch below)
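
A hypothetical smoke test for the deployed client, mirroring the Step 4 curl call; the service name in the port-forward comment and the port are assumptions about your cluster, not values taken from the charts:

```{python}
# Forward the client service first, e.g. (service name is an assumption):
#   kubectl port-forward service/imdb-client-service 5000:5000
import requests

resp = requests.post(
    "http://localhost:5000/",
    json={"sentences": ["i loved the movie it was inspiring.",
                        "i hate the movie it was racist"]},
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # expect one sentiment prediction per input sentence
```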
135 |
136 | ## Resources
137 |
138 | ### Results on Text Classification from https://github.com/zihangdai/xlnet
139 |
140 | Model | IMDB | Yelp-2 | Yelp-5 | DBpedia | Amazon-2 | Amazon-5
141 | --- | --- | --- | --- | --- | --- | ---
142 | BERT-Large | 4.51 | 1.89 | 29.32 | 0.64 | 2.63 | 34.17
143 | XLNet-Large | **3.79** | **1.55** | **27.80** | **0.62** | **2.40** | **32.26**
144 |
145 | The above numbers are error rates (lower is better).
146 |
147 | ### Links
148 |
149 | #### BERT Model Theory, Explanation, and Research
150 |
151 | * Evolution of Representations in the Transformer
152 |
153 | * A survey of Pre-trained Language Model literature
154 |
155 | * The Illustrated BERT, ELMo, and co. (How NLP Cracked Transfer Learning)
156 |
157 | * Has BERT Been Cheating? Researchers Say it Exploits ‘Spurious Statistical Cues’
158 |
159 | #### BERT/Transformer Model Implementations
160 | 
161 | * Original BERT repo
162 | 
163 | * Multiple Transformer model architecture implementations in TensorFlow and PyTorch
164 |
165 | #### Kubernetes Resources
166 |
167 | * The Illustrated Children's Guide to Kubernetes
168 |
169 | * Kubernetes Basics
170 |
171 | * Kubernetes NodePort vs LoadBalancer vs Ingress? When should I use what?
172 |
173 | #### Machine Learning Engineer Resources
174 |
175 | * https://martinfowler.com/articles/cd4ml.html
176 |
177 | * Learn Production-Level Deep Learning from Top Practitioners:
178 |
179 | * Nuts and Bolts of Applying Deep Learning (Andrew Ng):
180 |
181 | #### Performance
182 |
183 | * Benchmarking Transformers: PyTorch and TensorFlow
184 |
185 | * Benchmarking Transformers
186 |
187 |
188 | * NVIDIA Announces TensorRT 6; Breaks 10 millisecond barrier for BERT-Large
189 |
190 | #### Tutorials
191 |
192 | * BERT Repo Classification Example:
193 |
194 | * IMDB Data:
195 |
196 | * Predicting Movie Reviews with BERT on TF Hub.ipynb:
197 |
198 | * BERT End to End (Fine-tuning + Predicting) in 5 minutes with Cloud TPU:
199 |
200 | * Saved Models Tutorial:
201 |
202 | * Use TensorFlow Serving with Kubernetes:
203 |
204 | * Example client communicating with Tensorflow Serving:
205 |
206 | * TensorFlow Serving with a variable batch size:
207 |
208 | ### Next Steps
209 |
210 | * Optimizing TensorFlow Serving performance with NVIDIA TensorRT
211 |
212 | * TFX is an end-to-end platform for deploying production ML pipelines
213 |
214 | * Kubeflow
215 |
216 | * Seldon
217 |
--------------------------------------------------------------------------------
/step_0/bert/.gitignore:
--------------------------------------------------------------------------------
1 | # Initially taken from Github's Python gitignore file
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # IPython
79 | profile_default/
80 | ipython_config.py
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 | .dmypy.json
113 | dmypy.json
114 |
115 | # Pyre type checker
116 | .pyre/
117 |
--------------------------------------------------------------------------------
/step_0/bert/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | BERT needs to maintain permanent compatibility with the pre-trained model files,
4 | so we do not plan to make any major changes to this library (other than what was
5 | promised in the README). However, we can accept small patches related to
6 | re-factoring and documentation. To submit contributions, there are just a few
7 | small guidelines you need to follow.
8 |
9 | ## Contributor License Agreement
10 |
11 | Contributions to this project must be accompanied by a Contributor License
12 | Agreement. You (or your employer) retain the copyright to your contribution;
13 | this simply gives us permission to use and redistribute your contributions as
14 | part of the project. Head over to <https://cla.developers.google.com/> to see
15 | your current agreements on file or to sign a new one.
16 |
17 | You generally only need to submit a CLA once, so if you've already submitted one
18 | (even if it was for a different project), you probably don't need to do it
19 | again.
20 |
21 | ## Code reviews
22 |
23 | All submissions, including submissions by project members, require review. We
24 | use GitHub pull requests for this purpose. Consult
25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
26 | information on using pull requests.
27 |
28 | ## Community Guidelines
29 |
30 | This project follows
31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
32 |
--------------------------------------------------------------------------------
/step_0/bert/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/step_0/bert/modeling_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import collections
20 | import json
21 | import random
22 | import re
23 |
24 | import modeling
25 | import six
26 | import tensorflow as tf
27 |
28 |
29 | class BertModelTest(tf.test.TestCase):
30 |
31 | class BertModelTester(object):
32 |
33 | def __init__(self,
34 | parent,
35 | batch_size=13,
36 | seq_length=7,
37 | is_training=True,
38 | use_input_mask=True,
39 | use_token_type_ids=True,
40 | vocab_size=99,
41 | hidden_size=32,
42 | num_hidden_layers=5,
43 | num_attention_heads=4,
44 | intermediate_size=37,
45 | hidden_act="gelu",
46 | hidden_dropout_prob=0.1,
47 | attention_probs_dropout_prob=0.1,
48 | max_position_embeddings=512,
49 | type_vocab_size=16,
50 | initializer_range=0.02,
51 | scope=None):
52 | self.parent = parent
53 | self.batch_size = batch_size
54 | self.seq_length = seq_length
55 | self.is_training = is_training
56 | self.use_input_mask = use_input_mask
57 | self.use_token_type_ids = use_token_type_ids
58 | self.vocab_size = vocab_size
59 | self.hidden_size = hidden_size
60 | self.num_hidden_layers = num_hidden_layers
61 | self.num_attention_heads = num_attention_heads
62 | self.intermediate_size = intermediate_size
63 | self.hidden_act = hidden_act
64 | self.hidden_dropout_prob = hidden_dropout_prob
65 | self.attention_probs_dropout_prob = attention_probs_dropout_prob
66 | self.max_position_embeddings = max_position_embeddings
67 | self.type_vocab_size = type_vocab_size
68 | self.initializer_range = initializer_range
69 | self.scope = scope
70 |
71 | def create_model(self):
72 | input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
73 | self.vocab_size)
74 |
75 | input_mask = None
76 | if self.use_input_mask:
77 | input_mask = BertModelTest.ids_tensor(
78 | [self.batch_size, self.seq_length], vocab_size=2)
79 |
80 | token_type_ids = None
81 | if self.use_token_type_ids:
82 | token_type_ids = BertModelTest.ids_tensor(
83 | [self.batch_size, self.seq_length], self.type_vocab_size)
84 |
85 | config = modeling.BertConfig(
86 | vocab_size=self.vocab_size,
87 | hidden_size=self.hidden_size,
88 | num_hidden_layers=self.num_hidden_layers,
89 | num_attention_heads=self.num_attention_heads,
90 | intermediate_size=self.intermediate_size,
91 | hidden_act=self.hidden_act,
92 | hidden_dropout_prob=self.hidden_dropout_prob,
93 | attention_probs_dropout_prob=self.attention_probs_dropout_prob,
94 | max_position_embeddings=self.max_position_embeddings,
95 | type_vocab_size=self.type_vocab_size,
96 | initializer_range=self.initializer_range)
97 |
98 | model = modeling.BertModel(
99 | config=config,
100 | is_training=self.is_training,
101 | input_ids=input_ids,
102 | input_mask=input_mask,
103 | token_type_ids=token_type_ids,
104 | scope=self.scope)
105 |
106 | outputs = {
107 | "embedding_output": model.get_embedding_output(),
108 | "sequence_output": model.get_sequence_output(),
109 | "pooled_output": model.get_pooled_output(),
110 | "all_encoder_layers": model.get_all_encoder_layers(),
111 | }
112 | return outputs
113 |
114 | def check_output(self, result):
115 | self.parent.assertAllEqual(
116 | result["embedding_output"].shape,
117 | [self.batch_size, self.seq_length, self.hidden_size])
118 |
119 | self.parent.assertAllEqual(
120 | result["sequence_output"].shape,
121 | [self.batch_size, self.seq_length, self.hidden_size])
122 |
123 | self.parent.assertAllEqual(result["pooled_output"].shape,
124 | [self.batch_size, self.hidden_size])
125 |
126 | def test_default(self):
127 | self.run_tester(BertModelTest.BertModelTester(self))
128 |
129 | def test_config_to_json_string(self):
130 | config = modeling.BertConfig(vocab_size=99, hidden_size=37)
131 | obj = json.loads(config.to_json_string())
132 | self.assertEqual(obj["vocab_size"], 99)
133 | self.assertEqual(obj["hidden_size"], 37)
134 |
135 | def run_tester(self, tester):
136 | with self.test_session() as sess:
137 | ops = tester.create_model()
138 | init_op = tf.group(tf.global_variables_initializer(),
139 | tf.local_variables_initializer())
140 | sess.run(init_op)
141 | output_result = sess.run(ops)
142 | tester.check_output(output_result)
143 |
144 | self.assert_all_tensors_reachable(sess, [init_op, ops])
145 |
146 | @classmethod
147 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
148 | """Creates a random int32 tensor of the shape within the vocab size."""
149 | if rng is None:
150 | rng = random.Random()
151 |
152 | total_dims = 1
153 | for dim in shape:
154 | total_dims *= dim
155 |
156 | values = []
157 | for _ in range(total_dims):
158 | values.append(rng.randint(0, vocab_size - 1))
159 |
160 | return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name)
161 |
162 | def assert_all_tensors_reachable(self, sess, outputs):
163 | """Checks that all the tensors in the graph are reachable from outputs."""
164 | graph = sess.graph
165 |
166 | ignore_strings = [
167 | "^.*/assert_less_equal/.*$",
168 | "^.*/dilation_rate$",
169 | "^.*/Tensordot/concat$",
170 | "^.*/Tensordot/concat/axis$",
171 | "^testing/.*$",
172 | ]
173 |
174 | ignore_regexes = [re.compile(x) for x in ignore_strings]
175 |
176 | unreachable = self.get_unreachable_ops(graph, outputs)
177 | filtered_unreachable = []
178 | for x in unreachable:
179 | do_ignore = False
180 | for r in ignore_regexes:
181 | m = r.match(x.name)
182 | if m is not None:
183 | do_ignore = True
184 | if do_ignore:
185 | continue
186 | filtered_unreachable.append(x)
187 | unreachable = filtered_unreachable
188 |
189 | self.assertEqual(
190 | len(unreachable), 0, "The following ops are unreachable: %s" %
191 | (" ".join([x.name for x in unreachable])))
192 |
193 | @classmethod
194 | def get_unreachable_ops(cls, graph, outputs):
195 | """Finds all of the tensors in graph that are unreachable from outputs."""
196 | outputs = cls.flatten_recursive(outputs)
197 | output_to_op = collections.defaultdict(list)
198 | op_to_all = collections.defaultdict(list)
199 | assign_out_to_in = collections.defaultdict(list)
200 |
201 | for op in graph.get_operations():
202 | for x in op.inputs:
203 | op_to_all[op.name].append(x.name)
204 | for y in op.outputs:
205 | output_to_op[y.name].append(op.name)
206 | op_to_all[op.name].append(y.name)
207 | if str(op.type) == "Assign":
208 | for y in op.outputs:
209 | for x in op.inputs:
210 | assign_out_to_in[y.name].append(x.name)
211 |
212 | assign_groups = collections.defaultdict(list)
213 | for out_name in assign_out_to_in.keys():
214 | name_group = assign_out_to_in[out_name]
215 | for n1 in name_group:
216 | assign_groups[n1].append(out_name)
217 | for n2 in name_group:
218 | if n1 != n2:
219 | assign_groups[n1].append(n2)
220 |
221 | seen_tensors = {}
222 | stack = [x.name for x in outputs]
223 | while stack:
224 | name = stack.pop()
225 | if name in seen_tensors:
226 | continue
227 | seen_tensors[name] = True
228 |
229 | if name in output_to_op:
230 | for op_name in output_to_op[name]:
231 | if op_name in op_to_all:
232 | for input_name in op_to_all[op_name]:
233 | if input_name not in stack:
234 | stack.append(input_name)
235 |
236 | expanded_names = []
237 | if name in assign_groups:
238 | for assign_name in assign_groups[name]:
239 | expanded_names.append(assign_name)
240 |
241 | for expanded_name in expanded_names:
242 | if expanded_name not in stack:
243 | stack.append(expanded_name)
244 |
245 | unreachable_ops = []
246 | for op in graph.get_operations():
247 | is_unreachable = False
248 | all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs]
249 | for name in all_names:
250 | if name not in seen_tensors:
251 | is_unreachable = True
252 | if is_unreachable:
253 | unreachable_ops.append(op)
254 | return unreachable_ops
255 |
256 | @classmethod
257 | def flatten_recursive(cls, item):
258 | """Flattens (potentially nested) a tuple/dictionary/list to a list."""
259 | output = []
260 | if isinstance(item, list):
261 | output.extend(item)
262 | elif isinstance(item, tuple):
263 | output.extend(list(item))
264 | elif isinstance(item, dict):
265 | for (_, v) in six.iteritems(item):
266 | output.append(v)
267 | else:
268 | return [item]
269 |
270 | flat_output = []
271 | for x in output:
272 | flat_output.extend(cls.flatten_recursive(x))
273 | return flat_output
274 |
275 |
276 | if __name__ == "__main__":
277 | tf.test.main()
278 |
--------------------------------------------------------------------------------
/step_0/bert/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 |
56 | # It is recommended that you use this optimizer for fine tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint.)
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | beta_1=0.9,
63 | beta_2=0.999,
64 | epsilon=1e-6,
65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 |
67 | if use_tpu:
68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 |
70 | tvars = tf.trainable_variables()
71 | grads = tf.gradients(loss, tvars)
72 |
73 | # This is how the model was pre-trained.
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 |
76 | train_op = optimizer.apply_gradients(
77 | zip(grads, tvars), global_step=global_step)
78 |
79 | # Normally the global step update is done inside of `apply_gradients`.
80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
81 | # a different optimizer, you should probably take this line out.
82 | new_global_step = global_step + 1
83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 | return train_op
85 |
86 |
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 |
90 | def __init__(self,
91 | learning_rate,
92 | weight_decay_rate=0.0,
93 | beta_1=0.9,
94 | beta_2=0.999,
95 | epsilon=1e-6,
96 | exclude_from_weight_decay=None,
97 | name="AdamWeightDecayOptimizer"):
98 | """Constructs a AdamWeightDecayOptimizer."""
99 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 |
101 | self.learning_rate = learning_rate
102 | self.weight_decay_rate = weight_decay_rate
103 | self.beta_1 = beta_1
104 | self.beta_2 = beta_2
105 | self.epsilon = epsilon
106 | self.exclude_from_weight_decay = exclude_from_weight_decay
107 |
108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 | """See base class."""
110 | assignments = []
111 | for (grad, param) in grads_and_vars:
112 | if grad is None or param is None:
113 | continue
114 |
115 | param_name = self._get_variable_name(param.name)
116 |
117 | m = tf.get_variable(
118 | name=param_name + "/adam_m",
119 | shape=param.shape.as_list(),
120 | dtype=tf.float32,
121 | trainable=False,
122 | initializer=tf.zeros_initializer())
123 | v = tf.get_variable(
124 | name=param_name + "/adam_v",
125 | shape=param.shape.as_list(),
126 | dtype=tf.float32,
127 | trainable=False,
128 | initializer=tf.zeros_initializer())
129 |
130 | # Standard Adam update.
131 | next_m = (
132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 | next_v = (
134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 | tf.square(grad)))
136 |
137 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 |
139 | # Just adding the square of the weights to the loss function is *not*
140 | # the correct way of using L2 regularization/weight decay with Adam,
141 | # since that will interact with the m and v parameters in strange ways.
142 | #
143 |       # Instead we want to decay the weights in a manner that doesn't interact
144 | # with the m/v parameters. This is equivalent to adding the square
145 | # of the weights to the loss with plain (non-momentum) SGD.
146 | if self._do_use_weight_decay(param_name):
147 | update += self.weight_decay_rate * param
148 |
149 | update_with_lr = self.learning_rate * update
150 |
151 | next_param = param - update_with_lr
152 |
153 | assignments.extend(
154 | [param.assign(next_param),
155 | m.assign(next_m),
156 | v.assign(next_v)])
157 | return tf.group(*assignments, name=name)
158 |
159 | def _do_use_weight_decay(self, param_name):
160 | """Whether to use L2 weight decay for `param_name`."""
161 | if not self.weight_decay_rate:
162 | return False
163 | if self.exclude_from_weight_decay:
164 | for r in self.exclude_from_weight_decay:
165 | if re.search(r, param_name) is not None:
166 | return False
167 | return True
168 |
169 | def _get_variable_name(self, param_name):
170 | """Get the variable name from the tensor name."""
171 | m = re.match("^(.*):\\d+$", param_name)
172 | if m is not None:
173 | param_name = m.group(1)
174 | return param_name
175 |
--------------------------------------------------------------------------------
/step_0/bert/optimization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import optimization
20 | import tensorflow as tf
21 |
22 |
23 | class OptimizationTest(tf.test.TestCase):
24 |
25 | def test_adam(self):
26 | with self.test_session() as sess:
27 | w = tf.get_variable(
28 | "w",
29 | shape=[3],
30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
31 | x = tf.constant([0.4, 0.2, -0.5])
32 | loss = tf.reduce_mean(tf.square(x - w))
33 | tvars = tf.trainable_variables()
34 | grads = tf.gradients(loss, tvars)
35 | global_step = tf.train.get_or_create_global_step()
36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
38 | init_op = tf.group(tf.global_variables_initializer(),
39 | tf.local_variables_initializer())
40 | sess.run(init_op)
41 | for _ in range(100):
42 | sess.run(train_op)
43 | w_np = sess.run(w)
44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
45 |
46 |
47 | if __name__ == "__main__":
48 | tf.test.main()
49 |
--------------------------------------------------------------------------------
/step_0/bert/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow >= 1.11.0 # CPU Version of TensorFlow.
2 | # tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow.
3 |
--------------------------------------------------------------------------------
/step_0/bert/sample_text.txt:
--------------------------------------------------------------------------------
1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
2 | Text should be one-sentence-per-line, with empty lines between documents.
3 | This sample text is public domain and was randomly selected from Project Gutenberg.
4 |
5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
8 | "Cass" Beard had risen early that morning, but not with a view to discovery.
9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
11 | This was nearly opposite.
12 | Mr. Cassius crossed the highway, and stopped suddenly.
13 | Something glittered in the nearest red pool before him.
14 | Gold, surely!
15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
17 | Like most of his fellow gold-seekers, Cass was superstitious.
18 |
19 | The fountain of classic wisdom, Hypatia herself.
20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
21 | From my youth I felt in me a soul above the matter-entangled herd.
22 | She revealed to me the glorious fact, that I am a spark of Divinity itself.
23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
24 | There is a philosophic pleasure in opening one's treasures to the modest young.
25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
31 | At last they reached the quay at the opposite end of the street;
32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
34 |
--------------------------------------------------------------------------------
/step_0/bert/tokenization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import os
20 | import tempfile
21 | import tokenization
22 | import six
23 | import tensorflow as tf
24 |
25 |
26 | class TokenizationTest(tf.test.TestCase):
27 |
28 | def test_full_tokenizer(self):
29 | vocab_tokens = [
30 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
31 | "##ing", ","
32 | ]
33 | with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
34 | if six.PY2:
35 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
36 | else:
37 | vocab_writer.write("".join(
38 | [x + "\n" for x in vocab_tokens]).encode("utf-8"))
39 |
40 | vocab_file = vocab_writer.name
41 |
42 | tokenizer = tokenization.FullTokenizer(vocab_file)
43 | os.unlink(vocab_file)
44 |
45 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
46 | self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
47 |
48 | self.assertAllEqual(
49 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
50 |
51 | def test_chinese(self):
52 | tokenizer = tokenization.BasicTokenizer()
53 |
54 | self.assertAllEqual(
55 | tokenizer.tokenize(u"ah\u535A\u63A8zz"),
56 | [u"ah", u"\u535A", u"\u63A8", u"zz"])
57 |
58 | def test_basic_tokenizer_lower(self):
59 | tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
60 |
61 | self.assertAllEqual(
62 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
63 | ["hello", "!", "how", "are", "you", "?"])
64 | self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
65 |
66 | def test_basic_tokenizer_no_lower(self):
67 | tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
68 |
69 | self.assertAllEqual(
70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
71 | ["HeLLo", "!", "how", "Are", "yoU", "?"])
72 |
73 | def test_wordpiece_tokenizer(self):
74 | vocab_tokens = [
75 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
76 | "##ing"
77 | ]
78 |
79 | vocab = {}
80 | for (i, token) in enumerate(vocab_tokens):
81 | vocab[token] = i
82 | tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
83 |
84 | self.assertAllEqual(tokenizer.tokenize(""), [])
85 |
86 | self.assertAllEqual(
87 | tokenizer.tokenize("unwanted running"),
88 | ["un", "##want", "##ed", "runn", "##ing"])
89 |
90 | self.assertAllEqual(
91 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
92 |
93 | def test_convert_tokens_to_ids(self):
94 | vocab_tokens = [
95 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
96 | "##ing"
97 | ]
98 |
99 | vocab = {}
100 | for (i, token) in enumerate(vocab_tokens):
101 | vocab[token] = i
102 |
103 | self.assertAllEqual(
104 | tokenization.convert_tokens_to_ids(
105 | vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
106 |
107 | def test_is_whitespace(self):
108 | self.assertTrue(tokenization._is_whitespace(u" "))
109 | self.assertTrue(tokenization._is_whitespace(u"\t"))
110 | self.assertTrue(tokenization._is_whitespace(u"\r"))
111 | self.assertTrue(tokenization._is_whitespace(u"\n"))
112 | self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
113 |
114 | self.assertFalse(tokenization._is_whitespace(u"A"))
115 | self.assertFalse(tokenization._is_whitespace(u"-"))
116 |
117 | def test_is_control(self):
118 | self.assertTrue(tokenization._is_control(u"\u0005"))
119 |
120 | self.assertFalse(tokenization._is_control(u"A"))
121 | self.assertFalse(tokenization._is_control(u" "))
122 | self.assertFalse(tokenization._is_control(u"\t"))
123 | self.assertFalse(tokenization._is_control(u"\r"))
124 | self.assertFalse(tokenization._is_control(u"\U0001F4A9"))
125 |
126 | def test_is_punctuation(self):
127 | self.assertTrue(tokenization._is_punctuation(u"-"))
128 | self.assertTrue(tokenization._is_punctuation(u"$"))
129 | self.assertTrue(tokenization._is_punctuation(u"`"))
130 | self.assertTrue(tokenization._is_punctuation(u"."))
131 |
132 | self.assertFalse(tokenization._is_punctuation(u"A"))
133 | self.assertFalse(tokenization._is_punctuation(u" "))
134 |
135 |
136 | if __name__ == "__main__":
137 | tf.test.main()
138 |
--------------------------------------------------------------------------------
/step_1/bert/.gitignore:
--------------------------------------------------------------------------------
1 | # Initially taken from Github's Python gitignore file
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # IPython
79 | profile_default/
80 | ipython_config.py
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 | .dmypy.json
113 | dmypy.json
114 |
115 | # Pyre type checker
116 | .pyre/
117 |
--------------------------------------------------------------------------------
/step_1/bert/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | BERT needs to maintain permanent compatibility with the pre-trained model files,
4 | so we do not plan to make any major changes to this library (other than what was
5 | promised in the README). However, we can accept small patches related to
6 | re-factoring and documentation. To submit contributions, there are just a few
7 | small guidelines you need to follow.
8 |
9 | ## Contributor License Agreement
10 |
11 | Contributions to this project must be accompanied by a Contributor License
12 | Agreement. You (or your employer) retain the copyright to your contribution;
13 | this simply gives us permission to use and redistribute your contributions as
14 | part of the project. Head over to <https://cla.developers.google.com/> to see
15 | your current agreements on file or to sign a new one.
16 |
17 | You generally only need to submit a CLA once, so if you've already submitted one
18 | (even if it was for a different project), you probably don't need to do it
19 | again.
20 |
21 | ## Code reviews
22 |
23 | All submissions, including submissions by project members, require review. We
24 | use GitHub pull requests for this purpose. Consult
25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
26 | information on using pull requests.
27 |
28 | ## Community Guidelines
29 |
30 | This project follows
31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
32 |
--------------------------------------------------------------------------------
/step_1/bert/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/step_1/bert/modeling_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import collections
20 | import json
21 | import random
22 | import re
23 |
24 | import modeling
25 | import six
26 | import tensorflow as tf
27 |
28 |
29 | class BertModelTest(tf.test.TestCase):
30 |
31 | class BertModelTester(object):
32 |
33 | def __init__(self,
34 | parent,
35 | batch_size=13,
36 | seq_length=7,
37 | is_training=True,
38 | use_input_mask=True,
39 | use_token_type_ids=True,
40 | vocab_size=99,
41 | hidden_size=32,
42 | num_hidden_layers=5,
43 | num_attention_heads=4,
44 | intermediate_size=37,
45 | hidden_act="gelu",
46 | hidden_dropout_prob=0.1,
47 | attention_probs_dropout_prob=0.1,
48 | max_position_embeddings=512,
49 | type_vocab_size=16,
50 | initializer_range=0.02,
51 | scope=None):
52 | self.parent = parent
53 | self.batch_size = batch_size
54 | self.seq_length = seq_length
55 | self.is_training = is_training
56 | self.use_input_mask = use_input_mask
57 | self.use_token_type_ids = use_token_type_ids
58 | self.vocab_size = vocab_size
59 | self.hidden_size = hidden_size
60 | self.num_hidden_layers = num_hidden_layers
61 | self.num_attention_heads = num_attention_heads
62 | self.intermediate_size = intermediate_size
63 | self.hidden_act = hidden_act
64 | self.hidden_dropout_prob = hidden_dropout_prob
65 | self.attention_probs_dropout_prob = attention_probs_dropout_prob
66 | self.max_position_embeddings = max_position_embeddings
67 | self.type_vocab_size = type_vocab_size
68 | self.initializer_range = initializer_range
69 | self.scope = scope
70 |
71 | def create_model(self):
72 | input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
73 | self.vocab_size)
74 |
75 | input_mask = None
76 | if self.use_input_mask:
77 | input_mask = BertModelTest.ids_tensor(
78 | [self.batch_size, self.seq_length], vocab_size=2)
79 |
80 | token_type_ids = None
81 | if self.use_token_type_ids:
82 | token_type_ids = BertModelTest.ids_tensor(
83 | [self.batch_size, self.seq_length], self.type_vocab_size)
84 |
85 | config = modeling.BertConfig(
86 | vocab_size=self.vocab_size,
87 | hidden_size=self.hidden_size,
88 | num_hidden_layers=self.num_hidden_layers,
89 | num_attention_heads=self.num_attention_heads,
90 | intermediate_size=self.intermediate_size,
91 | hidden_act=self.hidden_act,
92 | hidden_dropout_prob=self.hidden_dropout_prob,
93 | attention_probs_dropout_prob=self.attention_probs_dropout_prob,
94 | max_position_embeddings=self.max_position_embeddings,
95 | type_vocab_size=self.type_vocab_size,
96 | initializer_range=self.initializer_range)
97 |
98 | model = modeling.BertModel(
99 | config=config,
100 | is_training=self.is_training,
101 | input_ids=input_ids,
102 | input_mask=input_mask,
103 | token_type_ids=token_type_ids,
104 | scope=self.scope)
105 |
106 | outputs = {
107 | "embedding_output": model.get_embedding_output(),
108 | "sequence_output": model.get_sequence_output(),
109 | "pooled_output": model.get_pooled_output(),
110 | "all_encoder_layers": model.get_all_encoder_layers(),
111 | }
112 | return outputs
113 |
114 | def check_output(self, result):
115 | self.parent.assertAllEqual(
116 | result["embedding_output"].shape,
117 | [self.batch_size, self.seq_length, self.hidden_size])
118 |
119 | self.parent.assertAllEqual(
120 | result["sequence_output"].shape,
121 | [self.batch_size, self.seq_length, self.hidden_size])
122 |
123 | self.parent.assertAllEqual(result["pooled_output"].shape,
124 | [self.batch_size, self.hidden_size])
125 |
126 | def test_default(self):
127 | self.run_tester(BertModelTest.BertModelTester(self))
128 |
129 | def test_config_to_json_string(self):
130 | config = modeling.BertConfig(vocab_size=99, hidden_size=37)
131 | obj = json.loads(config.to_json_string())
132 | self.assertEqual(obj["vocab_size"], 99)
133 | self.assertEqual(obj["hidden_size"], 37)
134 |
135 | def run_tester(self, tester):
136 | with self.test_session() as sess:
137 | ops = tester.create_model()
138 | init_op = tf.group(tf.global_variables_initializer(),
139 | tf.local_variables_initializer())
140 | sess.run(init_op)
141 | output_result = sess.run(ops)
142 | tester.check_output(output_result)
143 |
144 | self.assert_all_tensors_reachable(sess, [init_op, ops])
145 |
146 | @classmethod
147 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
148 | """Creates a random int32 tensor of the shape within the vocab size."""
149 | if rng is None:
150 | rng = random.Random()
151 |
152 | total_dims = 1
153 | for dim in shape:
154 | total_dims *= dim
155 |
156 | values = []
157 | for _ in range(total_dims):
158 | values.append(rng.randint(0, vocab_size - 1))
159 |
160 | return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name)
161 |
162 | def assert_all_tensors_reachable(self, sess, outputs):
163 | """Checks that all the tensors in the graph are reachable from outputs."""
164 | graph = sess.graph
165 |
166 | ignore_strings = [
167 | "^.*/assert_less_equal/.*$",
168 | "^.*/dilation_rate$",
169 | "^.*/Tensordot/concat$",
170 | "^.*/Tensordot/concat/axis$",
171 | "^testing/.*$",
172 | ]
173 |
174 | ignore_regexes = [re.compile(x) for x in ignore_strings]
175 |
176 | unreachable = self.get_unreachable_ops(graph, outputs)
177 | filtered_unreachable = []
178 | for x in unreachable:
179 | do_ignore = False
180 | for r in ignore_regexes:
181 | m = r.match(x.name)
182 | if m is not None:
183 | do_ignore = True
184 | if do_ignore:
185 | continue
186 | filtered_unreachable.append(x)
187 | unreachable = filtered_unreachable
188 |
189 | self.assertEqual(
190 | len(unreachable), 0, "The following ops are unreachable: %s" %
191 | (" ".join([x.name for x in unreachable])))
192 |
193 | @classmethod
194 | def get_unreachable_ops(cls, graph, outputs):
195 | """Finds all of the tensors in graph that are unreachable from outputs."""
196 | outputs = cls.flatten_recursive(outputs)
197 | output_to_op = collections.defaultdict(list)
198 | op_to_all = collections.defaultdict(list)
199 | assign_out_to_in = collections.defaultdict(list)
200 |
201 | for op in graph.get_operations():
202 | for x in op.inputs:
203 | op_to_all[op.name].append(x.name)
204 | for y in op.outputs:
205 | output_to_op[y.name].append(op.name)
206 | op_to_all[op.name].append(y.name)
207 | if str(op.type) == "Assign":
208 | for y in op.outputs:
209 | for x in op.inputs:
210 | assign_out_to_in[y.name].append(x.name)
211 |
212 | assign_groups = collections.defaultdict(list)
213 | for out_name in assign_out_to_in.keys():
214 | name_group = assign_out_to_in[out_name]
215 | for n1 in name_group:
216 | assign_groups[n1].append(out_name)
217 | for n2 in name_group:
218 | if n1 != n2:
219 | assign_groups[n1].append(n2)
220 |
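# The walk below is a depth-first search backwards from the requested
# outputs over tensor names; any tensor that is never visited belongs to
# an op the outputs do not depend on, i.e. an unreachable op.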
221 | seen_tensors = {}
222 | stack = [x.name for x in outputs]
223 | while stack:
224 | name = stack.pop()
225 | if name in seen_tensors:
226 | continue
227 | seen_tensors[name] = True
228 |
229 | if name in output_to_op:
230 | for op_name in output_to_op[name]:
231 | if op_name in op_to_all:
232 | for input_name in op_to_all[op_name]:
233 | if input_name not in stack:
234 | stack.append(input_name)
235 |
236 | expanded_names = []
237 | if name in assign_groups:
238 | for assign_name in assign_groups[name]:
239 | expanded_names.append(assign_name)
240 |
241 | for expanded_name in expanded_names:
242 | if expanded_name not in stack:
243 | stack.append(expanded_name)
244 |
245 | unreachable_ops = []
246 | for op in graph.get_operations():
247 | is_unreachable = False
248 | all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs]
249 | for name in all_names:
250 | if name not in seen_tensors:
251 | is_unreachable = True
252 | if is_unreachable:
253 | unreachable_ops.append(op)
254 | return unreachable_ops
255 |
256 | @classmethod
257 | def flatten_recursive(cls, item):
258 | """Flattens (potentially nested) a tuple/dictionary/list to a list."""
259 | output = []
260 | if isinstance(item, list):
261 | output.extend(item)
262 | elif isinstance(item, tuple):
263 | output.extend(list(item))
264 | elif isinstance(item, dict):
265 | for (_, v) in six.iteritems(item):
266 | output.append(v)
267 | else:
268 | return [item]
269 |
270 | flat_output = []
271 | for x in output:
272 | flat_output.extend(cls.flatten_recursive(x))
273 | return flat_output
274 |
275 |
276 | if __name__ == "__main__":
277 | tf.test.main()
278 |
--------------------------------------------------------------------------------
/step_1/bert/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 |
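# Worked example (illustrative numbers, not from the original source):
# with init_lr=5e-5, num_train_steps=1000 and num_warmup_steps=100, the
# rate ramps linearly from 0 (2.5e-5 at step 50) until step 100, then
# follows the linear decay (4.5e-5 at step 100) down to 0.0 at step 1000.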
56 | # It is recommended that you use this optimizer for fine-tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint).
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | beta_1=0.9,
63 | beta_2=0.999,
64 | epsilon=1e-6,
65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 |
67 | if use_tpu:
68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 |
70 | tvars = tf.trainable_variables()
71 | grads = tf.gradients(loss, tvars)
72 |
73 | # This is how the model was pre-trained.
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 |
76 | train_op = optimizer.apply_gradients(
77 | zip(grads, tvars), global_step=global_step)
78 |
79 | # Normally the global step update is done inside of `apply_gradients`.
80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
81 | # a different optimizer, you should probably take this line out.
82 | new_global_step = global_step + 1
83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 | return train_op
85 |
86 |
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 |
90 | def __init__(self,
91 | learning_rate,
92 | weight_decay_rate=0.0,
93 | beta_1=0.9,
94 | beta_2=0.999,
95 | epsilon=1e-6,
96 | exclude_from_weight_decay=None,
97 | name="AdamWeightDecayOptimizer"):
98 | """Constructs a AdamWeightDecayOptimizer."""
99 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 |
101 | self.learning_rate = learning_rate
102 | self.weight_decay_rate = weight_decay_rate
103 | self.beta_1 = beta_1
104 | self.beta_2 = beta_2
105 | self.epsilon = epsilon
106 | self.exclude_from_weight_decay = exclude_from_weight_decay
107 |
108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 | """See base class."""
110 | assignments = []
111 | for (grad, param) in grads_and_vars:
112 | if grad is None or param is None:
113 | continue
114 |
115 | param_name = self._get_variable_name(param.name)
116 |
117 | m = tf.get_variable(
118 | name=param_name + "/adam_m",
119 | shape=param.shape.as_list(),
120 | dtype=tf.float32,
121 | trainable=False,
122 | initializer=tf.zeros_initializer())
123 | v = tf.get_variable(
124 | name=param_name + "/adam_v",
125 | shape=param.shape.as_list(),
126 | dtype=tf.float32,
127 | trainable=False,
128 | initializer=tf.zeros_initializer())
129 |
130 | # Standard Adam update.
131 | next_m = (
132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 | next_v = (
134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 | tf.square(grad)))
136 |
137 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 |
139 | # Just adding the square of the weights to the loss function is *not*
140 | # the correct way of using L2 regularization/weight decay with Adam,
141 | # since that will interact with the m and v parameters in strange ways.
142 | #
143 | # Instead we want to decay the weights in a manner that doesn't interact
144 | # with the m/v parameters. This is equivalent to adding the square
145 | # of the weights to the loss with plain (non-momentum) SGD.
146 | if self._do_use_weight_decay(param_name):
147 | update += self.weight_decay_rate * param
148 |
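# Net effect per parameter (decoupled weight decay, the same idea as
# AdamW in Loshchilov & Hutter, "Decoupled Weight Decay Regularization"):
#   param <- param - lr * (next_m / (sqrt(next_v) + eps) + wd * param)
# Unlike standard Adam, no bias correction is applied to m and v here.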
149 | update_with_lr = self.learning_rate * update
150 |
151 | next_param = param - update_with_lr
152 |
153 | assignments.extend(
154 | [param.assign(next_param),
155 | m.assign(next_m),
156 | v.assign(next_v)])
157 | return tf.group(*assignments, name=name)
158 |
159 | def _do_use_weight_decay(self, param_name):
160 | """Whether to use L2 weight decay for `param_name`."""
161 | if not self.weight_decay_rate:
162 | return False
163 | if self.exclude_from_weight_decay:
164 | for r in self.exclude_from_weight_decay:
165 | if re.search(r, param_name) is not None:
166 | return False
167 | return True
168 |
169 | def _get_variable_name(self, param_name):
170 | """Get the variable name from the tensor name."""
171 | m = re.match("^(.*):\\d+$", param_name)
172 | if m is not None:
173 | param_name = m.group(1)
174 | return param_name
175 |
--------------------------------------------------------------------------------
/step_1/bert/optimization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import optimization
20 | import tensorflow as tf
21 |
22 |
23 | class OptimizationTest(tf.test.TestCase):
24 |
25 | def test_adam(self):
26 | with self.test_session() as sess:
27 | w = tf.get_variable(
28 | "w",
29 | shape=[3],
30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
31 | x = tf.constant([0.4, 0.2, -0.5])
32 | loss = tf.reduce_mean(tf.square(x - w))
33 | tvars = tf.trainable_variables()
34 | grads = tf.gradients(loss, tvars)
35 | global_step = tf.train.get_or_create_global_step()
36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
38 | init_op = tf.group(tf.global_variables_initializer(),
39 | tf.local_variables_initializer())
40 | sess.run(init_op)
41 | for _ in range(100):
42 | sess.run(train_op)
43 | w_np = sess.run(w)
44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
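# The quadratic loss is minimized at w == x, so 100 update steps at
# learning rate 0.2 should land w within the 1e-2 tolerance of the
# target [0.4, 0.2, -0.5].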
45 |
46 |
47 | if __name__ == "__main__":
48 | tf.test.main()
49 |
--------------------------------------------------------------------------------
/step_1/bert/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow >= 1.11.0 # CPU version of TensorFlow.
2 | # tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow.
3 |
--------------------------------------------------------------------------------
/step_1/bert/sample_text.txt:
--------------------------------------------------------------------------------
1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
2 | Text should be one-sentence-per-line, with empty lines between documents.
3 | This sample text is public domain and was randomly selected from Project Gutenberg.
4 |
5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
8 | "Cass" Beard had risen early that morning, but not with a view to discovery.
9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
11 | This was nearly opposite.
12 | Mr. Cassius crossed the highway, and stopped suddenly.
13 | Something glittered in the nearest red pool before him.
14 | Gold, surely!
15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
17 | Like most of his fellow gold-seekers, Cass was superstitious.
18 |
19 | The fountain of classic wisdom, Hypatia herself.
20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
21 | From my youth I felt in me a soul above the matter-entangled herd.
22 | She revealed to me the glorious fact, that I am a spark of Divinity itself.
23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
24 | There is a philosophic pleasure in opening one's treasures to the modest young.
25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
31 | At last they reached the quay at the opposite end of the street;
32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
34 |
--------------------------------------------------------------------------------
/step_1/bert/tokenization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import os
20 | import tempfile
21 | import tokenization
22 | import six
23 | import tensorflow as tf
24 |
25 |
26 | class TokenizationTest(tf.test.TestCase):
27 |
28 | def test_full_tokenizer(self):
29 | vocab_tokens = [
30 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
31 | "##ing", ","
32 | ]
33 | with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
34 | if six.PY2:
35 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
36 | else:
37 | vocab_writer.write("".join(
38 | [x + "\n" for x in vocab_tokens]).encode("utf-8"))
39 |
40 | vocab_file = vocab_writer.name
41 |
42 | tokenizer = tokenization.FullTokenizer(vocab_file)
43 | os.unlink(vocab_file)
44 |
45 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
46 | self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
47 |
48 | self.assertAllEqual(
49 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
50 |
51 | def test_chinese(self):
52 | tokenizer = tokenization.BasicTokenizer()
53 |
54 | self.assertAllEqual(
55 | tokenizer.tokenize(u"ah\u535A\u63A8zz"),
56 | [u"ah", u"\u535A", u"\u63A8", u"zz"])
57 |
58 | def test_basic_tokenizer_lower(self):
59 | tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
60 |
61 | self.assertAllEqual(
62 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
63 | ["hello", "!", "how", "are", "you", "?"])
64 | self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
65 |
66 | def test_basic_tokenizer_no_lower(self):
67 | tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
68 |
69 | self.assertAllEqual(
70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
71 | ["HeLLo", "!", "how", "Are", "yoU", "?"])
72 |
73 | def test_wordpiece_tokenizer(self):
74 | vocab_tokens = [
75 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
76 | "##ing"
77 | ]
78 |
79 | vocab = {}
80 | for (i, token) in enumerate(vocab_tokens):
81 | vocab[token] = i
82 | tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
83 |
84 | self.assertAllEqual(tokenizer.tokenize(""), [])
85 |
86 | self.assertAllEqual(
87 | tokenizer.tokenize("unwanted running"),
88 | ["un", "##want", "##ed", "runn", "##ing"])
89 |
90 | self.assertAllEqual(
91 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
92 |
93 | def test_convert_tokens_to_ids(self):
94 | vocab_tokens = [
95 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
96 | "##ing"
97 | ]
98 |
99 | vocab = {}
100 | for (i, token) in enumerate(vocab_tokens):
101 | vocab[token] = i
102 |
103 | self.assertAllEqual(
104 | tokenization.convert_tokens_to_ids(
105 | vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
106 |
107 | def test_is_whitespace(self):
108 | self.assertTrue(tokenization._is_whitespace(u" "))
109 | self.assertTrue(tokenization._is_whitespace(u"\t"))
110 | self.assertTrue(tokenization._is_whitespace(u"\r"))
111 | self.assertTrue(tokenization._is_whitespace(u"\n"))
112 | self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
113 |
114 | self.assertFalse(tokenization._is_whitespace(u"A"))
115 | self.assertFalse(tokenization._is_whitespace(u"-"))
116 |
117 | def test_is_control(self):
118 | self.assertTrue(tokenization._is_control(u"\u0005"))
119 |
120 | self.assertFalse(tokenization._is_control(u"A"))
121 | self.assertFalse(tokenization._is_control(u" "))
122 | self.assertFalse(tokenization._is_control(u"\t"))
123 | self.assertFalse(tokenization._is_control(u"\r"))
124 | self.assertFalse(tokenization._is_control(u"\U0001F4A9"))
125 |
126 | def test_is_punctuation(self):
127 | self.assertTrue(tokenization._is_punctuation(u"-"))
128 | self.assertTrue(tokenization._is_punctuation(u"$"))
129 | self.assertTrue(tokenization._is_punctuation(u"`"))
130 | self.assertTrue(tokenization._is_punctuation(u"."))
131 |
132 | self.assertFalse(tokenization._is_punctuation(u"A"))
133 | self.assertFalse(tokenization._is_punctuation(u" "))
134 |
135 |
136 | if __name__ == "__main__":
137 | tf.test.main()
138 |
--------------------------------------------------------------------------------
/step_2/bert/.gitignore:
--------------------------------------------------------------------------------
1 | # Initially taken from GitHub's Python gitignore file
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # IPython
79 | profile_default/
80 | ipython_config.py
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 | .dmypy.json
113 | dmypy.json
114 |
115 | # Pyre type checker
116 | .pyre/
117 |
--------------------------------------------------------------------------------
/step_2/bert/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | BERT needs to maintain permanent compatibility with the pre-trained model files,
4 | so we do not plan to make any major changes to this library (other than what was
5 | promised in the README). However, we can accept small patches related to
6 | re-factoring and documentation. To submit contributions, there are just a few
7 | small guidelines you need to follow.
8 |
9 | ## Contributor License Agreement
10 |
11 | Contributions to this project must be accompanied by a Contributor License
12 | Agreement. You (or your employer) retain the copyright to your contribution;
13 | this simply gives us permission to use and redistribute your contributions as
14 | part of the project. Head over to <https://cla.developers.google.com/> to see
15 | your current agreements on file or to sign a new one.
16 |
17 | You generally only need to submit a CLA once, so if you've already submitted one
18 | (even if it was for a different project), you probably don't need to do it
19 | again.
20 |
21 | ## Code reviews
22 |
23 | All submissions, including submissions by project members, require review. We
24 | use GitHub pull requests for this purpose. Consult
25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
26 | information on using pull requests.
27 |
28 | ## Community Guidelines
29 |
30 | This project follows
31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
32 |
--------------------------------------------------------------------------------
/step_2/bert/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/step_2/bert/modeling_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import collections
20 | import json
21 | import random
22 | import re
23 |
24 | import modeling
25 | import six
26 | import tensorflow as tf
27 |
28 |
29 | class BertModelTest(tf.test.TestCase):
30 |
31 | class BertModelTester(object):
32 |
33 | def __init__(self,
34 | parent,
35 | batch_size=13,
36 | seq_length=7,
37 | is_training=True,
38 | use_input_mask=True,
39 | use_token_type_ids=True,
40 | vocab_size=99,
41 | hidden_size=32,
42 | num_hidden_layers=5,
43 | num_attention_heads=4,
44 | intermediate_size=37,
45 | hidden_act="gelu",
46 | hidden_dropout_prob=0.1,
47 | attention_probs_dropout_prob=0.1,
48 | max_position_embeddings=512,
49 | type_vocab_size=16,
50 | initializer_range=0.02,
51 | scope=None):
52 | self.parent = parent
53 | self.batch_size = batch_size
54 | self.seq_length = seq_length
55 | self.is_training = is_training
56 | self.use_input_mask = use_input_mask
57 | self.use_token_type_ids = use_token_type_ids
58 | self.vocab_size = vocab_size
59 | self.hidden_size = hidden_size
60 | self.num_hidden_layers = num_hidden_layers
61 | self.num_attention_heads = num_attention_heads
62 | self.intermediate_size = intermediate_size
63 | self.hidden_act = hidden_act
64 | self.hidden_dropout_prob = hidden_dropout_prob
65 | self.attention_probs_dropout_prob = attention_probs_dropout_prob
66 | self.max_position_embeddings = max_position_embeddings
67 | self.type_vocab_size = type_vocab_size
68 | self.initializer_range = initializer_range
69 | self.scope = scope
70 |
71 | def create_model(self):
72 | input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
73 | self.vocab_size)
74 |
75 | input_mask = None
76 | if self.use_input_mask:
77 | input_mask = BertModelTest.ids_tensor(
78 | [self.batch_size, self.seq_length], vocab_size=2)
79 |
80 | token_type_ids = None
81 | if self.use_token_type_ids:
82 | token_type_ids = BertModelTest.ids_tensor(
83 | [self.batch_size, self.seq_length], self.type_vocab_size)
84 |
85 | config = modeling.BertConfig(
86 | vocab_size=self.vocab_size,
87 | hidden_size=self.hidden_size,
88 | num_hidden_layers=self.num_hidden_layers,
89 | num_attention_heads=self.num_attention_heads,
90 | intermediate_size=self.intermediate_size,
91 | hidden_act=self.hidden_act,
92 | hidden_dropout_prob=self.hidden_dropout_prob,
93 | attention_probs_dropout_prob=self.attention_probs_dropout_prob,
94 | max_position_embeddings=self.max_position_embeddings,
95 | type_vocab_size=self.type_vocab_size,
96 | initializer_range=self.initializer_range)
97 |
98 | model = modeling.BertModel(
99 | config=config,
100 | is_training=self.is_training,
101 | input_ids=input_ids,
102 | input_mask=input_mask,
103 | token_type_ids=token_type_ids,
104 | scope=self.scope)
105 |
106 | outputs = {
107 | "embedding_output": model.get_embedding_output(),
108 | "sequence_output": model.get_sequence_output(),
109 | "pooled_output": model.get_pooled_output(),
110 | "all_encoder_layers": model.get_all_encoder_layers(),
111 | }
112 | return outputs
113 |
114 | def check_output(self, result):
115 | self.parent.assertAllEqual(
116 | result["embedding_output"].shape,
117 | [self.batch_size, self.seq_length, self.hidden_size])
118 |
119 | self.parent.assertAllEqual(
120 | result["sequence_output"].shape,
121 | [self.batch_size, self.seq_length, self.hidden_size])
122 |
123 | self.parent.assertAllEqual(result["pooled_output"].shape,
124 | [self.batch_size, self.hidden_size])
125 |
126 | def test_default(self):
127 | self.run_tester(BertModelTest.BertModelTester(self))
128 |
129 | def test_config_to_json_string(self):
130 | config = modeling.BertConfig(vocab_size=99, hidden_size=37)
131 | obj = json.loads(config.to_json_string())
132 | self.assertEqual(obj["vocab_size"], 99)
133 | self.assertEqual(obj["hidden_size"], 37)
134 |
135 | def run_tester(self, tester):
136 | with self.test_session() as sess:
137 | ops = tester.create_model()
138 | init_op = tf.group(tf.global_variables_initializer(),
139 | tf.local_variables_initializer())
140 | sess.run(init_op)
141 | output_result = sess.run(ops)
142 | tester.check_output(output_result)
143 |
144 | self.assert_all_tensors_reachable(sess, [init_op, ops])
145 |
146 | @classmethod
147 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
148 | """Creates a random int32 tensor of the shape within the vocab size."""
149 | if rng is None:
150 | rng = random.Random()
151 |
152 | total_dims = 1
153 | for dim in shape:
154 | total_dims *= dim
155 |
156 | values = []
157 | for _ in range(total_dims):
158 | values.append(rng.randint(0, vocab_size - 1))
159 |
160 | return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name)
161 |
162 | def assert_all_tensors_reachable(self, sess, outputs):
163 | """Checks that all the tensors in the graph are reachable from outputs."""
164 | graph = sess.graph
165 |
166 | ignore_strings = [
167 | "^.*/assert_less_equal/.*$",
168 | "^.*/dilation_rate$",
169 | "^.*/Tensordot/concat$",
170 | "^.*/Tensordot/concat/axis$",
171 | "^testing/.*$",
172 | ]
173 |
174 | ignore_regexes = [re.compile(x) for x in ignore_strings]
175 |
176 | unreachable = self.get_unreachable_ops(graph, outputs)
177 | filtered_unreachable = []
178 | for x in unreachable:
179 | do_ignore = False
180 | for r in ignore_regexes:
181 | m = r.match(x.name)
182 | if m is not None:
183 | do_ignore = True
184 | if do_ignore:
185 | continue
186 | filtered_unreachable.append(x)
187 | unreachable = filtered_unreachable
188 |
189 | self.assertEqual(
190 | len(unreachable), 0, "The following ops are unreachable: %s" %
191 | (" ".join([x.name for x in unreachable])))
192 |
193 | @classmethod
194 | def get_unreachable_ops(cls, graph, outputs):
195 | """Finds all of the tensors in graph that are unreachable from outputs."""
196 | outputs = cls.flatten_recursive(outputs)
197 | output_to_op = collections.defaultdict(list)
198 | op_to_all = collections.defaultdict(list)
199 | assign_out_to_in = collections.defaultdict(list)
200 |
201 | for op in graph.get_operations():
202 | for x in op.inputs:
203 | op_to_all[op.name].append(x.name)
204 | for y in op.outputs:
205 | output_to_op[y.name].append(op.name)
206 | op_to_all[op.name].append(y.name)
207 | if str(op.type) == "Assign":
208 | for y in op.outputs:
209 | for x in op.inputs:
210 | assign_out_to_in[y.name].append(x.name)
211 |
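# Each input of an Assign op is grouped with the op's output and with its
# sibling inputs, so once the reachability walk visits any one of them it
# expands to the rest; this ties a variable to the values assigned to it.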
212 | assign_groups = collections.defaultdict(list)
213 | for out_name in assign_out_to_in.keys():
214 | name_group = assign_out_to_in[out_name]
215 | for n1 in name_group:
216 | assign_groups[n1].append(out_name)
217 | for n2 in name_group:
218 | if n1 != n2:
219 | assign_groups[n1].append(n2)
220 |
221 | seen_tensors = {}
222 | stack = [x.name for x in outputs]
223 | while stack:
224 | name = stack.pop()
225 | if name in seen_tensors:
226 | continue
227 | seen_tensors[name] = True
228 |
229 | if name in output_to_op:
230 | for op_name in output_to_op[name]:
231 | if op_name in op_to_all:
232 | for input_name in op_to_all[op_name]:
233 | if input_name not in stack:
234 | stack.append(input_name)
235 |
236 | expanded_names = []
237 | if name in assign_groups:
238 | for assign_name in assign_groups[name]:
239 | expanded_names.append(assign_name)
240 |
241 | for expanded_name in expanded_names:
242 | if expanded_name not in stack:
243 | stack.append(expanded_name)
244 |
245 | unreachable_ops = []
246 | for op in graph.get_operations():
247 | is_unreachable = False
248 | all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs]
249 | for name in all_names:
250 | if name not in seen_tensors:
251 | is_unreachable = True
252 | if is_unreachable:
253 | unreachable_ops.append(op)
254 | return unreachable_ops
255 |
256 | @classmethod
257 | def flatten_recursive(cls, item):
258 | """Flattens (potentially nested) a tuple/dictionary/list to a list."""
259 | output = []
260 | if isinstance(item, list):
261 | output.extend(item)
262 | elif isinstance(item, tuple):
263 | output.extend(list(item))
264 | elif isinstance(item, dict):
265 | for (_, v) in six.iteritems(item):
266 | output.append(v)
267 | else:
268 | return [item]
269 |
270 | flat_output = []
271 | for x in output:
272 | flat_output.extend(cls.flatten_recursive(x))
273 | return flat_output
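# E.g. flatten_recursive({"a": [1, (2, 3)], "b": 4}) yields [1, 2, 3, 4]
# (up to dict ordering): containers are unwrapped recursively and the
# leaves are collected into one flat list.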
274 |
275 |
276 | if __name__ == "__main__":
277 | tf.test.main()
278 |
--------------------------------------------------------------------------------
/step_2/bert/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
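# is_warmup is a 0./1. float, so this is a branchless select between the
# two schedules: the warmup rate before step num_warmup_steps, and the
# polynomial-decay rate from then on.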
55 |
56 | # It is recommended that you use this optimizer for fine-tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint).
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | beta_1=0.9,
63 | beta_2=0.999,
64 | epsilon=1e-6,
65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 |
67 | if use_tpu:
68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 |
70 | tvars = tf.trainable_variables()
71 | grads = tf.gradients(loss, tvars)
72 |
73 | # This is how the model was pre-trained.
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
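# Global-norm clipping: when the combined L2 norm of all gradients
# exceeds 1.0, every gradient is rescaled by the same factor, preserving
# their relative magnitudes and directions.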
75 |
76 | train_op = optimizer.apply_gradients(
77 | zip(grads, tvars), global_step=global_step)
78 |
79 | # Normally the global step update is done inside of `apply_gradients`.
80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
81 | # a different optimizer, you should probably take this line out.
82 | new_global_step = global_step + 1
83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 | return train_op
85 |
86 |
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 |
90 | def __init__(self,
91 | learning_rate,
92 | weight_decay_rate=0.0,
93 | beta_1=0.9,
94 | beta_2=0.999,
95 | epsilon=1e-6,
96 | exclude_from_weight_decay=None,
97 | name="AdamWeightDecayOptimizer"):
98 | """Constructs a AdamWeightDecayOptimizer."""
99 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 |
101 | self.learning_rate = learning_rate
102 | self.weight_decay_rate = weight_decay_rate
103 | self.beta_1 = beta_1
104 | self.beta_2 = beta_2
105 | self.epsilon = epsilon
106 | self.exclude_from_weight_decay = exclude_from_weight_decay
107 |
108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 | """See base class."""
110 | assignments = []
111 | for (grad, param) in grads_and_vars:
112 | if grad is None or param is None:
113 | continue
114 |
115 | param_name = self._get_variable_name(param.name)
116 |
117 | m = tf.get_variable(
118 | name=param_name + "/adam_m",
119 | shape=param.shape.as_list(),
120 | dtype=tf.float32,
121 | trainable=False,
122 | initializer=tf.zeros_initializer())
123 | v = tf.get_variable(
124 | name=param_name + "/adam_v",
125 | shape=param.shape.as_list(),
126 | dtype=tf.float32,
127 | trainable=False,
128 | initializer=tf.zeros_initializer())
129 |
130 | # Standard Adam update.
131 | next_m = (
132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 | next_v = (
134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 | tf.square(grad)))
136 |
137 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 |
139 | # Just adding the square of the weights to the loss function is *not*
140 | # the correct way of using L2 regularization/weight decay with Adam,
141 | # since that will interact with the m and v parameters in strange ways.
142 | #
143 | # Instead we want to decay the weights in a manner that doesn't interact
144 | # with the m/v parameters. This is equivalent to adding the square
145 | # of the weights to the loss with plain (non-momentum) SGD.
146 | if self._do_use_weight_decay(param_name):
147 | update += self.weight_decay_rate * param
148 |
149 | update_with_lr = self.learning_rate * update
150 |
151 | next_param = param - update_with_lr
152 |
153 | assignments.extend(
154 | [param.assign(next_param),
155 | m.assign(next_m),
156 | v.assign(next_v)])
157 | return tf.group(*assignments, name=name)
158 |
159 | def _do_use_weight_decay(self, param_name):
160 | """Whether to use L2 weight decay for `param_name`."""
161 | if not self.weight_decay_rate:
162 | return False
163 | if self.exclude_from_weight_decay:
164 | for r in self.exclude_from_weight_decay:
165 | if re.search(r, param_name) is not None:
166 | return False
167 | return True
168 |
169 | def _get_variable_name(self, param_name):
170 | """Get the variable name from the tensor name."""
171 | m = re.match("^(.*):\\d+$", param_name)
172 | if m is not None:
173 | param_name = m.group(1)
174 | return param_name
175 |
--------------------------------------------------------------------------------
/step_2/bert/optimization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import optimization
20 | import tensorflow as tf
21 |
22 |
23 | class OptimizationTest(tf.test.TestCase):
24 |
25 | def test_adam(self):
26 | with self.test_session() as sess:
27 | w = tf.get_variable(
28 | "w",
29 | shape=[3],
30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
31 | x = tf.constant([0.4, 0.2, -0.5])
32 | loss = tf.reduce_mean(tf.square(x - w))
33 | tvars = tf.trainable_variables()
34 | grads = tf.gradients(loss, tvars)
35 | global_step = tf.train.get_or_create_global_step()
36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
38 | init_op = tf.group(tf.global_variables_initializer(),
39 | tf.local_variables_initializer())
40 | sess.run(init_op)
41 | for _ in range(100):
42 | sess.run(train_op)
43 | w_np = sess.run(w)
44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
45 |
46 |
47 | if __name__ == "__main__":
48 | tf.test.main()
49 |
--------------------------------------------------------------------------------
/step_2/bert/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow >= 1.11.0 # CPU version of TensorFlow.
2 | # tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow.
3 |
--------------------------------------------------------------------------------
/step_2/bert/sample_text.txt:
--------------------------------------------------------------------------------
1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
2 | Text should be one-sentence-per-line, with empty lines between documents.
3 | This sample text is public domain and was randomly selected from Project Gutenberg.
4 |
5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
8 | "Cass" Beard had risen early that morning, but not with a view to discovery.
9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
11 | This was nearly opposite.
12 | Mr. Cassius crossed the highway, and stopped suddenly.
13 | Something glittered in the nearest red pool before him.
14 | Gold, surely!
15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
17 | Like most of his fellow gold-seekers, Cass was superstitious.
18 |
19 | The fountain of classic wisdom, Hypatia herself.
20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
21 | From my youth I felt in me a soul above the matter-entangled herd.
22 | She revealed to me the glorious fact, that I am a spark of Divinity itself.
23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
24 | There is a philosophic pleasure in opening one's treasures to the modest young.
25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
31 | At last they reached the quay at the opposite end of the street;
32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
34 |
--------------------------------------------------------------------------------
/step_2/bert/tokenization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import os
20 | import tempfile
21 | import tokenization
22 | import six
23 | import tensorflow as tf
24 |
25 |
26 | class TokenizationTest(tf.test.TestCase):
27 |
28 | def test_full_tokenizer(self):
29 | vocab_tokens = [
30 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
31 | "##ing", ","
32 | ]
33 | with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
34 | if six.PY2:
35 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
36 | else:
37 | vocab_writer.write("".join(
38 | [x + "\n" for x in vocab_tokens]).encode("utf-8"))
39 |
40 | vocab_file = vocab_writer.name
41 |
42 | tokenizer = tokenization.FullTokenizer(vocab_file)
43 | os.unlink(vocab_file)
44 |
45 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
46 | self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
47 |
48 | self.assertAllEqual(
49 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
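# FullTokenizer chains BasicTokenizer (lowercasing, accent stripping and
# punctuation splitting turn "UNwant\u00E9d,running" into "unwanted", ","
# and "running") with WordpieceTokenizer, which splits each word into
# subword pieces; convert_tokens_to_ids then maps each piece to its row
# index in the vocab file.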
50 |
51 | def test_chinese(self):
52 | tokenizer = tokenization.BasicTokenizer()
53 |
54 | self.assertAllEqual(
55 | tokenizer.tokenize(u"ah\u535A\u63A8zz"),
56 | [u"ah", u"\u535A", u"\u63A8", u"zz"])
57 |
58 | def test_basic_tokenizer_lower(self):
59 | tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
60 |
61 | self.assertAllEqual(
62 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
63 | ["hello", "!", "how", "are", "you", "?"])
64 | self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
65 |
66 | def test_basic_tokenizer_no_lower(self):
67 | tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
68 |
69 | self.assertAllEqual(
70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
71 | ["HeLLo", "!", "how", "Are", "yoU", "?"])
72 |
73 | def test_wordpiece_tokenizer(self):
74 | vocab_tokens = [
75 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
76 | "##ing"
77 | ]
78 |
79 | vocab = {}
80 | for (i, token) in enumerate(vocab_tokens):
81 | vocab[token] = i
82 | tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
83 |
84 | self.assertAllEqual(tokenizer.tokenize(""), [])
85 |
86 | self.assertAllEqual(
87 | tokenizer.tokenize("unwanted running"),
88 | ["un", "##want", "##ed", "runn", "##ing"])
89 |
90 | self.assertAllEqual(
91 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
92 |
93 | def test_convert_tokens_to_ids(self):
94 | vocab_tokens = [
95 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
96 | "##ing"
97 | ]
98 |
99 | vocab = {}
100 | for (i, token) in enumerate(vocab_tokens):
101 | vocab[token] = i
102 |
103 | self.assertAllEqual(
104 | tokenization.convert_tokens_to_ids(
105 | vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
106 |
107 | def test_is_whitespace(self):
108 | self.assertTrue(tokenization._is_whitespace(u" "))
109 | self.assertTrue(tokenization._is_whitespace(u"\t"))
110 | self.assertTrue(tokenization._is_whitespace(u"\r"))
111 | self.assertTrue(tokenization._is_whitespace(u"\n"))
112 | self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
113 |
114 | self.assertFalse(tokenization._is_whitespace(u"A"))
115 | self.assertFalse(tokenization._is_whitespace(u"-"))
116 |
117 | def test_is_control(self):
118 | self.assertTrue(tokenization._is_control(u"\u0005"))
119 |
120 | self.assertFalse(tokenization._is_control(u"A"))
121 | self.assertFalse(tokenization._is_control(u" "))
122 | self.assertFalse(tokenization._is_control(u"\t"))
123 | self.assertFalse(tokenization._is_control(u"\r"))
124 | self.assertFalse(tokenization._is_control(u"\U0001F4A9"))
125 |
126 | def test_is_punctuation(self):
127 | self.assertTrue(tokenization._is_punctuation(u"-"))
128 | self.assertTrue(tokenization._is_punctuation(u"$"))
129 | self.assertTrue(tokenization._is_punctuation(u"`"))
130 | self.assertTrue(tokenization._is_punctuation(u"."))
131 |
132 | self.assertFalse(tokenization._is_punctuation(u"A"))
133 | self.assertFalse(tokenization._is_punctuation(u" "))
134 |
135 |
136 | if __name__ == "__main__":
137 | tf.test.main()
138 |
--------------------------------------------------------------------------------
/step_3/bert/.gitignore:
--------------------------------------------------------------------------------
1 | # Initially taken from GitHub's Python gitignore file
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # IPython
79 | profile_default/
80 | ipython_config.py
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 | .dmypy.json
113 | dmypy.json
114 |
115 | # Pyre type checker
116 | .pyre/
117 |
--------------------------------------------------------------------------------
/step_3/bert/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | BERT needs to maintain permanent compatibility with the pre-trained model files,
4 | so we do not plan to make any major changes to this library (other than what was
5 | promised in the README). However, we can accept small patches related to
6 | re-factoring and documentation. To submit contributions, there are just a few
7 | small guidelines you need to follow.
8 |
9 | ## Contributor License Agreement
10 |
11 | Contributions to this project must be accompanied by a Contributor License
12 | Agreement. You (or your employer) retain the copyright to your contribution;
13 | this simply gives us permission to use and redistribute your contributions as
14 | part of the project. Head over to <https://cla.developers.google.com/> to see
15 | your current agreements on file or to sign a new one.
16 |
17 | You generally only need to submit a CLA once, so if you've already submitted one
18 | (even if it was for a different project), you probably don't need to do it
19 | again.
20 |
21 | ## Code reviews
22 |
23 | All submissions, including submissions by project members, require review. We
24 | use GitHub pull requests for this purpose. Consult
25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
26 | information on using pull requests.
27 |
28 | ## Community Guidelines
29 |
30 | This project follows
31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
32 |
--------------------------------------------------------------------------------
/step_3/bert/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/step_3/bert/modeling_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import collections
20 | import json
21 | import random
22 | import re
23 |
24 | import modeling
25 | import six
26 | import tensorflow as tf
27 |
28 |
29 | class BertModelTest(tf.test.TestCase):
30 |
31 | class BertModelTester(object):
32 |
33 | def __init__(self,
34 | parent,
35 | batch_size=13,
36 | seq_length=7,
37 | is_training=True,
38 | use_input_mask=True,
39 | use_token_type_ids=True,
40 | vocab_size=99,
41 | hidden_size=32,
42 | num_hidden_layers=5,
43 | num_attention_heads=4,
44 | intermediate_size=37,
45 | hidden_act="gelu",
46 | hidden_dropout_prob=0.1,
47 | attention_probs_dropout_prob=0.1,
48 | max_position_embeddings=512,
49 | type_vocab_size=16,
50 | initializer_range=0.02,
51 | scope=None):
52 | self.parent = parent
53 | self.batch_size = batch_size
54 | self.seq_length = seq_length
55 | self.is_training = is_training
56 | self.use_input_mask = use_input_mask
57 | self.use_token_type_ids = use_token_type_ids
58 | self.vocab_size = vocab_size
59 | self.hidden_size = hidden_size
60 | self.num_hidden_layers = num_hidden_layers
61 | self.num_attention_heads = num_attention_heads
62 | self.intermediate_size = intermediate_size
63 | self.hidden_act = hidden_act
64 | self.hidden_dropout_prob = hidden_dropout_prob
65 | self.attention_probs_dropout_prob = attention_probs_dropout_prob
66 | self.max_position_embeddings = max_position_embeddings
67 | self.type_vocab_size = type_vocab_size
68 | self.initializer_range = initializer_range
69 | self.scope = scope
70 |
71 | def create_model(self):
72 | input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
73 | self.vocab_size)
74 |
75 | input_mask = None
76 | if self.use_input_mask:
77 | input_mask = BertModelTest.ids_tensor(
78 | [self.batch_size, self.seq_length], vocab_size=2)
79 |
80 | token_type_ids = None
81 | if self.use_token_type_ids:
82 | token_type_ids = BertModelTest.ids_tensor(
83 | [self.batch_size, self.seq_length], self.type_vocab_size)
84 |
85 | config = modeling.BertConfig(
86 | vocab_size=self.vocab_size,
87 | hidden_size=self.hidden_size,
88 | num_hidden_layers=self.num_hidden_layers,
89 | num_attention_heads=self.num_attention_heads,
90 | intermediate_size=self.intermediate_size,
91 | hidden_act=self.hidden_act,
92 | hidden_dropout_prob=self.hidden_dropout_prob,
93 | attention_probs_dropout_prob=self.attention_probs_dropout_prob,
94 | max_position_embeddings=self.max_position_embeddings,
95 | type_vocab_size=self.type_vocab_size,
96 | initializer_range=self.initializer_range)
97 |
98 | model = modeling.BertModel(
99 | config=config,
100 | is_training=self.is_training,
101 | input_ids=input_ids,
102 | input_mask=input_mask,
103 | token_type_ids=token_type_ids,
104 | scope=self.scope)
105 |
106 | outputs = {
107 | "embedding_output": model.get_embedding_output(),
108 | "sequence_output": model.get_sequence_output(),
109 | "pooled_output": model.get_pooled_output(),
110 | "all_encoder_layers": model.get_all_encoder_layers(),
111 | }
112 | return outputs
113 |
114 | def check_output(self, result):
115 | self.parent.assertAllEqual(
116 | result["embedding_output"].shape,
117 | [self.batch_size, self.seq_length, self.hidden_size])
118 |
119 | self.parent.assertAllEqual(
120 | result["sequence_output"].shape,
121 | [self.batch_size, self.seq_length, self.hidden_size])
122 |
123 | self.parent.assertAllEqual(result["pooled_output"].shape,
124 | [self.batch_size, self.hidden_size])
125 |
126 | def test_default(self):
127 | self.run_tester(BertModelTest.BertModelTester(self))
128 |
129 | def test_config_to_json_string(self):
130 | config = modeling.BertConfig(vocab_size=99, hidden_size=37)
131 | obj = json.loads(config.to_json_string())
132 | self.assertEqual(obj["vocab_size"], 99)
133 | self.assertEqual(obj["hidden_size"], 37)
134 |
135 | def run_tester(self, tester):
136 | with self.test_session() as sess:
137 | ops = tester.create_model()
138 | init_op = tf.group(tf.global_variables_initializer(),
139 | tf.local_variables_initializer())
140 | sess.run(init_op)
141 | output_result = sess.run(ops)
142 | tester.check_output(output_result)
143 |
144 | self.assert_all_tensors_reachable(sess, [init_op, ops])
145 |
146 | @classmethod
147 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
148 | """Creates a random int32 tensor of the shape within the vocab size."""
149 | if rng is None:
150 | rng = random.Random()
151 |
152 | total_dims = 1
153 | for dim in shape:
154 | total_dims *= dim
155 |
156 | values = []
157 | for _ in range(total_dims):
158 | values.append(rng.randint(0, vocab_size - 1))
159 |
160 | return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name)
161 |
162 | def assert_all_tensors_reachable(self, sess, outputs):
163 | """Checks that all the tensors in the graph are reachable from outputs."""
164 | graph = sess.graph
165 |
166 | ignore_strings = [
167 | "^.*/assert_less_equal/.*$",
168 | "^.*/dilation_rate$",
169 | "^.*/Tensordot/concat$",
170 | "^.*/Tensordot/concat/axis$",
171 | "^testing/.*$",
172 | ]
173 |
174 | ignore_regexes = [re.compile(x) for x in ignore_strings]
175 |
176 | unreachable = self.get_unreachable_ops(graph, outputs)
177 | filtered_unreachable = []
178 | for x in unreachable:
179 | do_ignore = False
180 | for r in ignore_regexes:
181 | m = r.match(x.name)
182 | if m is not None:
183 | do_ignore = True
184 | if do_ignore:
185 | continue
186 | filtered_unreachable.append(x)
187 | unreachable = filtered_unreachable
188 |
189 | self.assertEqual(
190 | len(unreachable), 0, "The following ops are unreachable: %s" %
191 | (" ".join([x.name for x in unreachable])))
192 |
193 | @classmethod
194 | def get_unreachable_ops(cls, graph, outputs):
195 | """Finds all of the tensors in graph that are unreachable from outputs."""
196 | outputs = cls.flatten_recursive(outputs)
197 | output_to_op = collections.defaultdict(list)
198 | op_to_all = collections.defaultdict(list)
199 | assign_out_to_in = collections.defaultdict(list)
200 |
201 | for op in graph.get_operations():
202 | for x in op.inputs:
203 | op_to_all[op.name].append(x.name)
204 | for y in op.outputs:
205 | output_to_op[y.name].append(op.name)
206 | op_to_all[op.name].append(y.name)
207 | if str(op.type) == "Assign":
208 | for y in op.outputs:
209 | for x in op.inputs:
210 | assign_out_to_in[y.name].append(x.name)
211 |
212 | assign_groups = collections.defaultdict(list)
213 | for out_name in assign_out_to_in.keys():
214 | name_group = assign_out_to_in[out_name]
215 | for n1 in name_group:
216 | assign_groups[n1].append(out_name)
217 | for n2 in name_group:
218 | if n1 != n2:
219 | assign_groups[n1].append(n2)
220 |
221 | seen_tensors = {}
222 | stack = [x.name for x in outputs]
223 | while stack:
224 | name = stack.pop()
225 | if name in seen_tensors:
226 | continue
227 | seen_tensors[name] = True
228 |
229 | if name in output_to_op:
230 | for op_name in output_to_op[name]:
231 | if op_name in op_to_all:
232 | for input_name in op_to_all[op_name]:
233 | if input_name not in stack:
234 | stack.append(input_name)
235 |
236 | expanded_names = []
237 | if name in assign_groups:
238 | for assign_name in assign_groups[name]:
239 | expanded_names.append(assign_name)
240 |
241 | for expanded_name in expanded_names:
242 | if expanded_name not in stack:
243 | stack.append(expanded_name)
244 |
245 | unreachable_ops = []
246 | for op in graph.get_operations():
247 | is_unreachable = False
248 | all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs]
249 | for name in all_names:
250 | if name not in seen_tensors:
251 | is_unreachable = True
252 | if is_unreachable:
253 | unreachable_ops.append(op)
254 | return unreachable_ops
255 |
256 | @classmethod
257 | def flatten_recursive(cls, item):
258 | """Flattens (potentially nested) a tuple/dictionary/list to a list."""
259 | output = []
260 | if isinstance(item, list):
261 | output.extend(item)
262 | elif isinstance(item, tuple):
263 | output.extend(list(item))
264 | elif isinstance(item, dict):
265 | for (_, v) in six.iteritems(item):
266 | output.append(v)
267 | else:
268 | return [item]
269 |
270 | flat_output = []
271 | for x in output:
272 | flat_output.extend(cls.flatten_recursive(x))
273 | return flat_output
274 |
275 |
276 | if __name__ == "__main__":
277 | tf.test.main()
278 |
--------------------------------------------------------------------------------
/step_3/bert/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 |
56 | # It is recommended that you use this optimizer for fine tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint.)
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | beta_1=0.9,
63 | beta_2=0.999,
64 | epsilon=1e-6,
65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 |
67 | if use_tpu:
68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 |
70 | tvars = tf.trainable_variables()
71 | grads = tf.gradients(loss, tvars)
72 |
73 | # This is how the model was pre-trained.
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 |
76 | train_op = optimizer.apply_gradients(
77 | zip(grads, tvars), global_step=global_step)
78 |
79 | # Normally the global step update is done inside of `apply_gradients`.
80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
81 | # a different optimizer, you should probably take this line out.
82 | new_global_step = global_step + 1
83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 | return train_op
85 |
86 |
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 |
90 | def __init__(self,
91 | learning_rate,
92 | weight_decay_rate=0.0,
93 | beta_1=0.9,
94 | beta_2=0.999,
95 | epsilon=1e-6,
96 | exclude_from_weight_decay=None,
97 | name="AdamWeightDecayOptimizer"):
98 | """Constructs a AdamWeightDecayOptimizer."""
99 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 |
101 | self.learning_rate = learning_rate
102 | self.weight_decay_rate = weight_decay_rate
103 | self.beta_1 = beta_1
104 | self.beta_2 = beta_2
105 | self.epsilon = epsilon
106 | self.exclude_from_weight_decay = exclude_from_weight_decay
107 |
108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 | """See base class."""
110 | assignments = []
111 | for (grad, param) in grads_and_vars:
112 | if grad is None or param is None:
113 | continue
114 |
115 | param_name = self._get_variable_name(param.name)
116 |
117 | m = tf.get_variable(
118 | name=param_name + "/adam_m",
119 | shape=param.shape.as_list(),
120 | dtype=tf.float32,
121 | trainable=False,
122 | initializer=tf.zeros_initializer())
123 | v = tf.get_variable(
124 | name=param_name + "/adam_v",
125 | shape=param.shape.as_list(),
126 | dtype=tf.float32,
127 | trainable=False,
128 | initializer=tf.zeros_initializer())
129 |
130 | # Standard Adam update.
131 | next_m = (
132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 | next_v = (
134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 | tf.square(grad)))
136 |
137 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 |
139 | # Just adding the square of the weights to the loss function is *not*
140 | # the correct way of using L2 regularization/weight decay with Adam,
141 | # since that will interact with the m and v parameters in strange ways.
142 | #
143 | # Instead we want to decay the weights in a manner that doesn't interact
144 | # with the m/v parameters. This is equivalent to adding the square
145 | # of the weights to the loss with plain (non-momentum) SGD.
146 | if self._do_use_weight_decay(param_name):
147 | update += self.weight_decay_rate * param
148 |
149 | update_with_lr = self.learning_rate * update
150 |
151 | next_param = param - update_with_lr
152 |
153 | assignments.extend(
154 | [param.assign(next_param),
155 | m.assign(next_m),
156 | v.assign(next_v)])
157 | return tf.group(*assignments, name=name)
158 |
159 | def _do_use_weight_decay(self, param_name):
160 | """Whether to use L2 weight decay for `param_name`."""
161 | if not self.weight_decay_rate:
162 | return False
163 | if self.exclude_from_weight_decay:
164 | for r in self.exclude_from_weight_decay:
165 | if re.search(r, param_name) is not None:
166 | return False
167 | return True
168 |
169 | def _get_variable_name(self, param_name):
170 | """Get the variable name from the tensor name."""
171 | m = re.match("^(.*):\\d+$", param_name)
172 | if m is not None:
173 | param_name = m.group(1)
174 | return param_name
175 |
--------------------------------------------------------------------------------
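The comments in create_optimizer above fully determine the schedule: linear warmup to init_lr over num_warmup_steps, then linear (power=1.0) polynomial decay to 0.0 at num_train_steps. As a quick cross-check, here is a minimal pure-Python sketch of the effective learning rate at a given step; it only restates the formulas in the comments and is not code from this repo:

def effective_lr(step, init_lr, num_train_steps, num_warmup_steps):
    # Linear warmup: step/num_warmup_steps * init_lr (see comment above).
    if num_warmup_steps and step < num_warmup_steps:
        return init_lr * step / num_warmup_steps
    # Linear decay to an end learning rate of 0.0 at num_train_steps.
    frac = min(step, num_train_steps) / num_train_steps
    return init_lr * (1.0 - frac)

# For example, effective_lr(50, 5e-5, 1000, 100) == 2.5e-05 (mid-warmup),
# and effective_lr(500, 5e-5, 1000, 100) == 2.5e-05 (halfway through decay).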
/step_3/bert/optimization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import optimization
20 | import tensorflow as tf
21 |
22 |
23 | class OptimizationTest(tf.test.TestCase):
24 |
25 | def test_adam(self):
26 | with self.test_session() as sess:
27 | w = tf.get_variable(
28 | "w",
29 | shape=[3],
30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
31 | x = tf.constant([0.4, 0.2, -0.5])
32 | loss = tf.reduce_mean(tf.square(x - w))
33 | tvars = tf.trainable_variables()
34 | grads = tf.gradients(loss, tvars)
35 | global_step = tf.train.get_or_create_global_step()
36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
38 | init_op = tf.group(tf.global_variables_initializer(),
39 | tf.local_variables_initializer())
40 | sess.run(init_op)
41 | for _ in range(100):
42 | sess.run(train_op)
43 | w_np = sess.run(w)
44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
45 |
46 |
47 | if __name__ == "__main__":
48 | tf.test.main()
49 |
--------------------------------------------------------------------------------
/step_3/bert/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow >= 1.11.0 # CPU Version of TensorFlow.
2 | # tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow.
3 |
--------------------------------------------------------------------------------
/step_3/bert/sample_text.txt:
--------------------------------------------------------------------------------
1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
2 | Text should be one-sentence-per-line, with empty lines between documents.
3 | This sample text is public domain and was randomly selected from Project Gutenberg.
4 |
5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
8 | "Cass" Beard had risen early that morning, but not with a view to discovery.
9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
11 | This was nearly opposite.
12 | Mr. Cassius crossed the highway, and stopped suddenly.
13 | Something glittered in the nearest red pool before him.
14 | Gold, surely!
15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
17 | Like most of his fellow gold-seekers, Cass was superstitious.
18 |
19 | The fountain of classic wisdom, Hypatia herself.
20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
21 | From my youth I felt in me a soul above the matter-entangled herd.
22 | She revealed to me the glorious fact, that I am a spark of Divinity itself.
23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
24 | There is a philosophic pleasure in opening one's treasures to the modest young.
25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
31 | At last they reached the quay at the opposite end of the street;
32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
34 |
--------------------------------------------------------------------------------
/step_3/bert/tokenization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import os
20 | import tempfile
21 | import tokenization
22 | import six
23 | import tensorflow as tf
24 |
25 |
26 | class TokenizationTest(tf.test.TestCase):
27 |
28 | def test_full_tokenizer(self):
29 | vocab_tokens = [
30 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
31 | "##ing", ","
32 | ]
33 | with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
34 | if six.PY2:
35 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
36 | else:
37 | vocab_writer.write("".join(
38 | [x + "\n" for x in vocab_tokens]).encode("utf-8"))
39 |
40 | vocab_file = vocab_writer.name
41 |
42 | tokenizer = tokenization.FullTokenizer(vocab_file)
43 | os.unlink(vocab_file)
44 |
45 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
46 | self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
47 |
48 | self.assertAllEqual(
49 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
50 |
51 | def test_chinese(self):
52 | tokenizer = tokenization.BasicTokenizer()
53 |
54 | self.assertAllEqual(
55 | tokenizer.tokenize(u"ah\u535A\u63A8zz"),
56 | [u"ah", u"\u535A", u"\u63A8", u"zz"])
57 |
58 | def test_basic_tokenizer_lower(self):
59 | tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
60 |
61 | self.assertAllEqual(
62 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
63 | ["hello", "!", "how", "are", "you", "?"])
64 | self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
65 |
66 | def test_basic_tokenizer_no_lower(self):
67 | tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
68 |
69 | self.assertAllEqual(
70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
71 | ["HeLLo", "!", "how", "Are", "yoU", "?"])
72 |
73 | def test_wordpiece_tokenizer(self):
74 | vocab_tokens = [
75 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
76 | "##ing"
77 | ]
78 |
79 | vocab = {}
80 | for (i, token) in enumerate(vocab_tokens):
81 | vocab[token] = i
82 | tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
83 |
84 | self.assertAllEqual(tokenizer.tokenize(""), [])
85 |
86 | self.assertAllEqual(
87 | tokenizer.tokenize("unwanted running"),
88 | ["un", "##want", "##ed", "runn", "##ing"])
89 |
90 | self.assertAllEqual(
91 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
92 |
93 | def test_convert_tokens_to_ids(self):
94 | vocab_tokens = [
95 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
96 | "##ing"
97 | ]
98 |
99 | vocab = {}
100 | for (i, token) in enumerate(vocab_tokens):
101 | vocab[token] = i
102 |
103 | self.assertAllEqual(
104 | tokenization.convert_tokens_to_ids(
105 | vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
106 |
107 | def test_is_whitespace(self):
108 | self.assertTrue(tokenization._is_whitespace(u" "))
109 | self.assertTrue(tokenization._is_whitespace(u"\t"))
110 | self.assertTrue(tokenization._is_whitespace(u"\r"))
111 | self.assertTrue(tokenization._is_whitespace(u"\n"))
112 | self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
113 |
114 | self.assertFalse(tokenization._is_whitespace(u"A"))
115 | self.assertFalse(tokenization._is_whitespace(u"-"))
116 |
117 | def test_is_control(self):
118 | self.assertTrue(tokenization._is_control(u"\u0005"))
119 |
120 | self.assertFalse(tokenization._is_control(u"A"))
121 | self.assertFalse(tokenization._is_control(u" "))
122 | self.assertFalse(tokenization._is_control(u"\t"))
123 | self.assertFalse(tokenization._is_control(u"\r"))
124 | self.assertFalse(tokenization._is_control(u"\U0001F4A9"))
125 |
126 | def test_is_punctuation(self):
127 | self.assertTrue(tokenization._is_punctuation(u"-"))
128 | self.assertTrue(tokenization._is_punctuation(u"$"))
129 | self.assertTrue(tokenization._is_punctuation(u"`"))
130 | self.assertTrue(tokenization._is_punctuation(u"."))
131 |
132 | self.assertFalse(tokenization._is_punctuation(u"A"))
133 | self.assertFalse(tokenization._is_punctuation(u" "))
134 |
135 |
136 | if __name__ == "__main__":
137 | tf.test.main()
138 |
--------------------------------------------------------------------------------
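The wordpiece tests above pin down greedy longest-match-first lookup: "unwanted" splits into ["un", "##want", "##ed"], while a word with any unmatched remainder collapses to "[UNK]". A minimal per-word sketch of that algorithm, assuming the same vocab dict and "##" continuation convention (the real WordpieceTokenizer in tokenization.py additionally takes an unk_token argument and a max-chars-per-word cutoff):

def wordpiece_sketch(word, vocab, unk="[UNK]"):
    # Greedy longest-match-first: take the longest prefix found in the
    # vocab, prefixing "##" onto every piece after the first.
    pieces, start = [], 0
    while start < len(word):
        end, cur = len(word), None
        while start < end:
            piece = word[start:end] if start == 0 else "##" + word[start:end]
            if piece in vocab:
                cur = piece
                break
            end -= 1
        if cur is None:
            return [unk]  # unmatched remainder: the whole word is unknown
        pieces.append(cur)
        start = end
    return pieces

# Reproduces the expectations in the tests above, e.g.
# wordpiece_sketch("unwanted", vocab)  -> ["un", "##want", "##ed"]
# wordpiece_sketch("unwantedX", vocab) -> ["[UNK]"]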
/step_4/bert/.gitignore:
--------------------------------------------------------------------------------
1 | # Initially taken from Github's Python gitignore file
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other info into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # IPython
79 | profile_default/
80 | ipython_config.py
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 | .dmypy.json
113 | dmypy.json
114 |
115 | # Pyre type checker
116 | .pyre/
117 |
--------------------------------------------------------------------------------
/step_4/bert/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | BERT needs to maintain permanent compatibility with the pre-trained model files,
4 | so we do not plan to make any major changes to this library (other than what was
5 | promised in the README). However, we can accept small patches related to
6 | re-factoring and documentation. To submit contributions, there are just a few
7 | small guidelines you need to follow.
8 |
9 | ## Contributor License Agreement
10 |
11 | Contributions to this project must be accompanied by a Contributor License
12 | Agreement. You (or your employer) retain the copyright to your contribution;
13 | this simply gives us permission to use and redistribute your contributions as
14 | part of the project. Head over to <https://cla.developers.google.com/> to see
15 | your current agreements on file or to sign a new one.
16 |
17 | You generally only need to submit a CLA once, so if you've already submitted one
18 | (even if it was for a different project), you probably don't need to do it
19 | again.
20 |
21 | ## Code reviews
22 |
23 | All submissions, including submissions by project members, require review. We
24 | use GitHub pull requests for this purpose. Consult
25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
26 | information on using pull requests.
27 |
28 | ## Community Guidelines
29 |
30 | This project follows
31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
32 |
--------------------------------------------------------------------------------
/step_4/bert/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 |
6 | [dev-packages]
7 |
8 | [packages]
9 | tensorflow-serving-api = "*"
10 | tensorflow = "*"
11 | flask = "*"
12 | pandas = "*"
13 |
14 | [requires]
15 | python_version = "3.7"
16 |
--------------------------------------------------------------------------------
/step_4/bert/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/step_4/bert/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lapolonio/text_classification_tutorial/db12fbec05621959a345dd3a557d22bb61fa48a6/step_4/bert/config/__init__.py
--------------------------------------------------------------------------------
/step_4/bert/config/development.py:
--------------------------------------------------------------------------------
1 | TF_SERVING_HOST="localhost:8500"
2 | VOCAB_ASSET="asset/vocab.txt"
3 | DEBUG=True
--------------------------------------------------------------------------------
/step_4/bert/config/production.py:
--------------------------------------------------------------------------------
1 | TF_SERVING_HOST="imdb-sentiment-service:8500"
2 | VOCAB_ASSET="asset/vocab.txt"
3 | DEBUG=False
--------------------------------------------------------------------------------
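These two modules are plain Python settings files; run_app.py (dumped below) selects between them at startup with app.config.from_envvar('APP_CONFIG_FILE'), which loads every upper-case name from the file that environment variable points to. A small sketch of wiring that up, assuming the relative paths of the two files above:

import os

# Point Flask at the development settings before run_app.py builds the app;
# swap in config/production.py when deploying.
os.environ["APP_CONFIG_FILE"] = "config/development.py"

# After app.config.from_envvar('APP_CONFIG_FILE') runs in run_app.py,
# TF_SERVING_HOST, VOCAB_ASSET and DEBUG are available on app.config.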
/step_4/bert/modeling_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import collections
20 | import json
21 | import random
22 | import re
23 |
24 | import modeling
25 | import six
26 | import tensorflow as tf
27 |
28 |
29 | class BertModelTest(tf.test.TestCase):
30 |
31 | class BertModelTester(object):
32 |
33 | def __init__(self,
34 | parent,
35 | batch_size=13,
36 | seq_length=7,
37 | is_training=True,
38 | use_input_mask=True,
39 | use_token_type_ids=True,
40 | vocab_size=99,
41 | hidden_size=32,
42 | num_hidden_layers=5,
43 | num_attention_heads=4,
44 | intermediate_size=37,
45 | hidden_act="gelu",
46 | hidden_dropout_prob=0.1,
47 | attention_probs_dropout_prob=0.1,
48 | max_position_embeddings=512,
49 | type_vocab_size=16,
50 | initializer_range=0.02,
51 | scope=None):
52 | self.parent = parent
53 | self.batch_size = batch_size
54 | self.seq_length = seq_length
55 | self.is_training = is_training
56 | self.use_input_mask = use_input_mask
57 | self.use_token_type_ids = use_token_type_ids
58 | self.vocab_size = vocab_size
59 | self.hidden_size = hidden_size
60 | self.num_hidden_layers = num_hidden_layers
61 | self.num_attention_heads = num_attention_heads
62 | self.intermediate_size = intermediate_size
63 | self.hidden_act = hidden_act
64 | self.hidden_dropout_prob = hidden_dropout_prob
65 | self.attention_probs_dropout_prob = attention_probs_dropout_prob
66 | self.max_position_embeddings = max_position_embeddings
67 | self.type_vocab_size = type_vocab_size
68 | self.initializer_range = initializer_range
69 | self.scope = scope
70 |
71 | def create_model(self):
72 | input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
73 | self.vocab_size)
74 |
75 | input_mask = None
76 | if self.use_input_mask:
77 | input_mask = BertModelTest.ids_tensor(
78 | [self.batch_size, self.seq_length], vocab_size=2)
79 |
80 | token_type_ids = None
81 | if self.use_token_type_ids:
82 | token_type_ids = BertModelTest.ids_tensor(
83 | [self.batch_size, self.seq_length], self.type_vocab_size)
84 |
85 | config = modeling.BertConfig(
86 | vocab_size=self.vocab_size,
87 | hidden_size=self.hidden_size,
88 | num_hidden_layers=self.num_hidden_layers,
89 | num_attention_heads=self.num_attention_heads,
90 | intermediate_size=self.intermediate_size,
91 | hidden_act=self.hidden_act,
92 | hidden_dropout_prob=self.hidden_dropout_prob,
93 | attention_probs_dropout_prob=self.attention_probs_dropout_prob,
94 | max_position_embeddings=self.max_position_embeddings,
95 | type_vocab_size=self.type_vocab_size,
96 | initializer_range=self.initializer_range)
97 |
98 | model = modeling.BertModel(
99 | config=config,
100 | is_training=self.is_training,
101 | input_ids=input_ids,
102 | input_mask=input_mask,
103 | token_type_ids=token_type_ids,
104 | scope=self.scope)
105 |
106 | outputs = {
107 | "embedding_output": model.get_embedding_output(),
108 | "sequence_output": model.get_sequence_output(),
109 | "pooled_output": model.get_pooled_output(),
110 | "all_encoder_layers": model.get_all_encoder_layers(),
111 | }
112 | return outputs
113 |
114 | def check_output(self, result):
115 | self.parent.assertAllEqual(
116 | result["embedding_output"].shape,
117 | [self.batch_size, self.seq_length, self.hidden_size])
118 |
119 | self.parent.assertAllEqual(
120 | result["sequence_output"].shape,
121 | [self.batch_size, self.seq_length, self.hidden_size])
122 |
123 | self.parent.assertAllEqual(result["pooled_output"].shape,
124 | [self.batch_size, self.hidden_size])
125 |
126 | def test_default(self):
127 | self.run_tester(BertModelTest.BertModelTester(self))
128 |
129 | def test_config_to_json_string(self):
130 | config = modeling.BertConfig(vocab_size=99, hidden_size=37)
131 | obj = json.loads(config.to_json_string())
132 | self.assertEqual(obj["vocab_size"], 99)
133 | self.assertEqual(obj["hidden_size"], 37)
134 |
135 | def run_tester(self, tester):
136 | with self.test_session() as sess:
137 | ops = tester.create_model()
138 | init_op = tf.group(tf.global_variables_initializer(),
139 | tf.local_variables_initializer())
140 | sess.run(init_op)
141 | output_result = sess.run(ops)
142 | tester.check_output(output_result)
143 |
144 | self.assert_all_tensors_reachable(sess, [init_op, ops])
145 |
146 | @classmethod
147 | def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
148 | """Creates a random int32 tensor of the shape within the vocab size."""
149 | if rng is None:
150 | rng = random.Random()
151 |
152 | total_dims = 1
153 | for dim in shape:
154 | total_dims *= dim
155 |
156 | values = []
157 | for _ in range(total_dims):
158 | values.append(rng.randint(0, vocab_size - 1))
159 |
160 | return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name)
161 |
162 | def assert_all_tensors_reachable(self, sess, outputs):
163 | """Checks that all the tensors in the graph are reachable from outputs."""
164 | graph = sess.graph
165 |
166 | ignore_strings = [
167 | "^.*/assert_less_equal/.*$",
168 | "^.*/dilation_rate$",
169 | "^.*/Tensordot/concat$",
170 | "^.*/Tensordot/concat/axis$",
171 | "^testing/.*$",
172 | ]
173 |
174 | ignore_regexes = [re.compile(x) for x in ignore_strings]
175 |
176 | unreachable = self.get_unreachable_ops(graph, outputs)
177 | filtered_unreachable = []
178 | for x in unreachable:
179 | do_ignore = False
180 | for r in ignore_regexes:
181 | m = r.match(x.name)
182 | if m is not None:
183 | do_ignore = True
184 | if do_ignore:
185 | continue
186 | filtered_unreachable.append(x)
187 | unreachable = filtered_unreachable
188 |
189 | self.assertEqual(
190 | len(unreachable), 0, "The following ops are unreachable: %s" %
191 | (" ".join([x.name for x in unreachable])))
192 |
193 | @classmethod
194 | def get_unreachable_ops(cls, graph, outputs):
195 | """Finds all of the tensors in graph that are unreachable from outputs."""
196 | outputs = cls.flatten_recursive(outputs)
197 | output_to_op = collections.defaultdict(list)
198 | op_to_all = collections.defaultdict(list)
199 | assign_out_to_in = collections.defaultdict(list)
200 |
201 | for op in graph.get_operations():
202 | for x in op.inputs:
203 | op_to_all[op.name].append(x.name)
204 | for y in op.outputs:
205 | output_to_op[y.name].append(op.name)
206 | op_to_all[op.name].append(y.name)
207 | if str(op.type) == "Assign":
208 | for y in op.outputs:
209 | for x in op.inputs:
210 | assign_out_to_in[y.name].append(x.name)
211 |
212 | assign_groups = collections.defaultdict(list)
213 | for out_name in assign_out_to_in.keys():
214 | name_group = assign_out_to_in[out_name]
215 | for n1 in name_group:
216 | assign_groups[n1].append(out_name)
217 | for n2 in name_group:
218 | if n1 != n2:
219 | assign_groups[n1].append(n2)
220 |
221 | seen_tensors = {}
222 | stack = [x.name for x in outputs]
223 | while stack:
224 | name = stack.pop()
225 | if name in seen_tensors:
226 | continue
227 | seen_tensors[name] = True
228 |
229 | if name in output_to_op:
230 | for op_name in output_to_op[name]:
231 | if op_name in op_to_all:
232 | for input_name in op_to_all[op_name]:
233 | if input_name not in stack:
234 | stack.append(input_name)
235 |
236 | expanded_names = []
237 | if name in assign_groups:
238 | for assign_name in assign_groups[name]:
239 | expanded_names.append(assign_name)
240 |
241 | for expanded_name in expanded_names:
242 | if expanded_name not in stack:
243 | stack.append(expanded_name)
244 |
245 | unreachable_ops = []
246 | for op in graph.get_operations():
247 | is_unreachable = False
248 | all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs]
249 | for name in all_names:
250 | if name not in seen_tensors:
251 | is_unreachable = True
252 | if is_unreachable:
253 | unreachable_ops.append(op)
254 | return unreachable_ops
255 |
256 | @classmethod
257 | def flatten_recursive(cls, item):
258 | """Flattens (potentially nested) a tuple/dictionary/list to a list."""
259 | output = []
260 | if isinstance(item, list):
261 | output.extend(item)
262 | elif isinstance(item, tuple):
263 | output.extend(list(item))
264 | elif isinstance(item, dict):
265 | for (_, v) in six.iteritems(item):
266 | output.append(v)
267 | else:
268 | return [item]
269 |
270 | flat_output = []
271 | for x in output:
272 | flat_output.extend(cls.flatten_recursive(x))
273 | return flat_output
274 |
275 |
276 | if __name__ == "__main__":
277 | tf.test.main()
278 |
--------------------------------------------------------------------------------
/step_4/bert/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 |
56 | # It is recommended that you use this optimizer for fine tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint.)
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | beta_1=0.9,
63 | beta_2=0.999,
64 | epsilon=1e-6,
65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 |
67 | if use_tpu:
68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 |
70 | tvars = tf.trainable_variables()
71 | grads = tf.gradients(loss, tvars)
72 |
73 | # This is how the model was pre-trained.
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 |
76 | train_op = optimizer.apply_gradients(
77 | zip(grads, tvars), global_step=global_step)
78 |
79 | # Normally the global step update is done inside of `apply_gradients`.
80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
81 | # a different optimizer, you should probably take this line out.
82 | new_global_step = global_step + 1
83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 | return train_op
85 |
86 |
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 |
90 | def __init__(self,
91 | learning_rate,
92 | weight_decay_rate=0.0,
93 | beta_1=0.9,
94 | beta_2=0.999,
95 | epsilon=1e-6,
96 | exclude_from_weight_decay=None,
97 | name="AdamWeightDecayOptimizer"):
98 | """Constructs a AdamWeightDecayOptimizer."""
99 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 |
101 | self.learning_rate = learning_rate
102 | self.weight_decay_rate = weight_decay_rate
103 | self.beta_1 = beta_1
104 | self.beta_2 = beta_2
105 | self.epsilon = epsilon
106 | self.exclude_from_weight_decay = exclude_from_weight_decay
107 |
108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 | """See base class."""
110 | assignments = []
111 | for (grad, param) in grads_and_vars:
112 | if grad is None or param is None:
113 | continue
114 |
115 | param_name = self._get_variable_name(param.name)
116 |
117 | m = tf.get_variable(
118 | name=param_name + "/adam_m",
119 | shape=param.shape.as_list(),
120 | dtype=tf.float32,
121 | trainable=False,
122 | initializer=tf.zeros_initializer())
123 | v = tf.get_variable(
124 | name=param_name + "/adam_v",
125 | shape=param.shape.as_list(),
126 | dtype=tf.float32,
127 | trainable=False,
128 | initializer=tf.zeros_initializer())
129 |
130 | # Standard Adam update.
131 | next_m = (
132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 | next_v = (
134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 | tf.square(grad)))
136 |
137 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 |
139 | # Just adding the square of the weights to the loss function is *not*
140 | # the correct way of using L2 regularization/weight decay with Adam,
141 | # since that will interact with the m and v parameters in strange ways.
142 | #
143 | # Instead we want to decay the weights in a manner that doesn't interact
144 | # with the m/v parameters. This is equivalent to adding the square
145 | # of the weights to the loss with plain (non-momentum) SGD.
146 | if self._do_use_weight_decay(param_name):
147 | update += self.weight_decay_rate * param
148 |
149 | update_with_lr = self.learning_rate * update
150 |
151 | next_param = param - update_with_lr
152 |
153 | assignments.extend(
154 | [param.assign(next_param),
155 | m.assign(next_m),
156 | v.assign(next_v)])
157 | return tf.group(*assignments, name=name)
158 |
159 | def _do_use_weight_decay(self, param_name):
160 | """Whether to use L2 weight decay for `param_name`."""
161 | if not self.weight_decay_rate:
162 | return False
163 | if self.exclude_from_weight_decay:
164 | for r in self.exclude_from_weight_decay:
165 | if re.search(r, param_name) is not None:
166 | return False
167 | return True
168 |
169 | def _get_variable_name(self, param_name):
170 | """Get the variable name from the tensor name."""
171 | m = re.match("^(.*):\\d+$", param_name)
172 | if m is not None:
173 | param_name = m.group(1)
174 | return param_name
175 |
--------------------------------------------------------------------------------
/step_4/bert/optimization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import optimization
20 | import tensorflow as tf
21 |
22 |
23 | class OptimizationTest(tf.test.TestCase):
24 |
25 | def test_adam(self):
26 | with self.test_session() as sess:
27 | w = tf.get_variable(
28 | "w",
29 | shape=[3],
30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
31 | x = tf.constant([0.4, 0.2, -0.5])
32 | loss = tf.reduce_mean(tf.square(x - w))
33 | tvars = tf.trainable_variables()
34 | grads = tf.gradients(loss, tvars)
35 | global_step = tf.train.get_or_create_global_step()
36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
38 | init_op = tf.group(tf.global_variables_initializer(),
39 | tf.local_variables_initializer())
40 | sess.run(init_op)
41 | for _ in range(100):
42 | sess.run(train_op)
43 | w_np = sess.run(w)
44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
45 |
46 |
47 | if __name__ == "__main__":
48 | tf.test.main()
49 |
--------------------------------------------------------------------------------
/step_4/bert/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.8.0
2 | astor==0.8.0
3 | Click==7.0
4 | Flask==1.1.1
5 | gast==0.3.1
6 | google-pasta==0.1.7
7 | grpcio==1.23.0
8 | h5py==2.10.0
9 | itsdangerous==1.1.0
10 | Jinja2==2.10.1
11 | Keras-Applications==1.0.8
12 | Keras-Preprocessing==1.1.0
13 | Markdown==3.1.1
14 | MarkupSafe==1.1.1
15 | numpy==1.17.2
16 | pandas==0.25.1
17 | protobuf==3.9.1
18 | python-dateutil==2.8.0
19 | pytz==2019.2
20 | six==1.12.0
21 | tensorboard==1.14.0
22 | tensorflow==1.14.0
23 | tensorflow-estimator==1.14.0
24 | tensorflow-serving-api==1.14.0
25 | termcolor==1.1.0
26 | Werkzeug==0.15.6
27 | wrapt==1.11.2
28 |
--------------------------------------------------------------------------------
/step_4/bert/run_app.py:
--------------------------------------------------------------------------------
1 |
2 | import grpc
3 | import tensorflow as tf
4 | import numpy as np
5 |
6 | import run_classifier as classifiers
7 | import tokenization
8 |
9 | from tensorflow_serving.apis import predict_pb2
10 | from tensorflow_serving.apis import prediction_service_pb2_grpc
11 | from tensorflow.core.framework import tensor_pb2
12 | from tensorflow.core.framework import tensor_shape_pb2
13 | from tensorflow.core.framework import types_pb2
14 |
15 | from flask import Flask
16 | from flask import request
17 | import time
18 |
19 | import pandas as pd
20 | import random
21 |
22 | app = Flask(__name__)
23 | app.config.from_envvar('APP_CONFIG_FILE')
24 |
25 | @app.route("/", methods = ['GET'])
26 | def hello():
27 | return "Hello BERT predicting IMDB! Try posting a string to this url"
28 |
29 |
30 | @app.route("/", methods = ['POST'])
31 | def predict():
32 | app.logger.info("Entering Predict Method")
33 | # MODEL PARAMS
34 | max_seq_length = 128
35 |
36 | channel = grpc.insecure_channel(app.config["TF_SERVING_HOST"])
37 | stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
38 |
39 | # Parse Description
40 | tokenizer = tokenization.FullTokenizer(
41 | vocab_file="asset/vocab.txt", do_lower_case=True)
42 | processor = classifiers.ImdbProcessor()
43 | label_list = processor.get_labels()
44 | content = request.get_json()
45 | request_id = str(random.randint(1, 9223372036854775807))
46 |
47 | data = {}
48 | data["id"] = []
49 | data["sentence"] = []
50 | for sentence in content['sentences']:
51 | data["id"].append(request_id)
52 | data["sentence"].append(sentence)
53 |
54 | inputExamples = processor._create_examples(pd.DataFrame.from_dict(data), 'test')
55 |
56 | t0 = time.time()
57 | model_input = classifiers.memory_based_convert_examples_to_features(inputExamples, label_list, max_seq_length, tokenizer)
58 | t1 = time.time()
59 | total = t1-t0
60 | app.logger.info("Convert Examples Time Taken: %d", total)
61 |
62 | app.logger.info(model_input)
63 | # Send request
64 | # See prediction_service.proto for gRPC request/response details.
65 | model_request = predict_pb2.PredictRequest()
66 | model_request.model_spec.name = 'bert'
67 | model_request.model_spec.signature_name = 'serving_default'
68 |
69 | model_request.inputs['examples'].CopyFrom(
70 | tf.contrib.util.make_tensor_proto(model_input, shape=[len(model_input)])
71 | )
72 | app.logger.info(model_request)
73 |
74 | t0 = time.time()
75 | result = stub.Predict(model_request, 60.0) # 60 secs timeout
76 | t1 = time.time()
77 | total = t1-t0
78 |
79 | app.logger.info(result)
80 | result = tf.make_ndarray(result.outputs["probabilities"])
81 | app.logger.info(result)
82 | pretty_result = "Predicted Label: " + str(np.take(processor.get_labels(), result.argmax(axis=1)))
83 | app.logger.info("Predicted Label: %s", str(np.take(processor.get_labels(), result.argmax(axis=1))))
84 | app.logger.info("Predict Time Taken: %f", total)
85 |
86 | return pretty_result
87 |
88 |
89 | if __name__ == '__main__':
90 | app.run(debug=app.config["DEBUG"])
91 |
--------------------------------------------------------------------------------
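A client-side sketch for exercising the predict() route above: the payload key matches content['sentences'] in the handler, and the host/port assume Flask's default development server. This snippet is illustrative, not part of the repo:

import requests

resp = requests.post(
    "http://localhost:5000/",
    json={"sentences": ["This movie was a complete waste of time."]},
)
# Prints the "Predicted Label: ..." string built by pretty_result above.
print(resp.text)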
/step_4/bert/sample_text.txt:
--------------------------------------------------------------------------------
1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
2 | Text should be one-sentence-per-line, with empty lines between documents.
3 | This sample text is public domain and was randomly selected from Project Gutenberg.
4 |
5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
8 | "Cass" Beard had risen early that morning, but not with a view to discovery.
9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
11 | This was nearly opposite.
12 | Mr. Cassius crossed the highway, and stopped suddenly.
13 | Something glittered in the nearest red pool before him.
14 | Gold, surely!
15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
17 | Like most of his fellow gold-seekers, Cass was superstitious.
18 |
19 | The fountain of classic wisdom, Hypatia herself.
20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
21 | From my youth I felt in me a soul above the matter-entangled herd.
22 | She revealed to me the glorious fact, that I am a spark of Divinity itself.
23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
24 | There is a philosophic pleasure in opening one's treasures to the modest young.
25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
31 | At last they reached the quay at the opposite end of the street;
32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
34 |
--------------------------------------------------------------------------------
/step_4/bert/tokenization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import os
20 | import tempfile
21 | import tokenization
22 | import six
23 | import tensorflow as tf
24 |
25 |
26 | class TokenizationTest(tf.test.TestCase):
27 |
28 | def test_full_tokenizer(self):
29 | vocab_tokens = [
30 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
31 | "##ing", ","
32 | ]
33 | with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
34 | if six.PY2:
35 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
36 | else:
37 | vocab_writer.write("".join(
38 | [x + "\n" for x in vocab_tokens]).encode("utf-8"))
39 |
40 | vocab_file = vocab_writer.name
41 |
42 | tokenizer = tokenization.FullTokenizer(vocab_file)
43 | os.unlink(vocab_file)
44 |
45 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
46 | self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
47 |
48 | self.assertAllEqual(
49 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
50 |
51 | def test_chinese(self):
52 | tokenizer = tokenization.BasicTokenizer()
53 |
54 | self.assertAllEqual(
55 | tokenizer.tokenize(u"ah\u535A\u63A8zz"),
56 | [u"ah", u"\u535A", u"\u63A8", u"zz"])
57 |
58 | def test_basic_tokenizer_lower(self):
59 | tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
60 |
61 | self.assertAllEqual(
62 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
63 | ["hello", "!", "how", "are", "you", "?"])
64 | self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
65 |
66 | def test_basic_tokenizer_no_lower(self):
67 | tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
68 |
69 | self.assertAllEqual(
70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
71 | ["HeLLo", "!", "how", "Are", "yoU", "?"])
72 |
73 | def test_wordpiece_tokenizer(self):
74 | vocab_tokens = [
75 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
76 | "##ing"
77 | ]
78 |
79 | vocab = {}
80 | for (i, token) in enumerate(vocab_tokens):
81 | vocab[token] = i
82 | tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
83 |
84 | self.assertAllEqual(tokenizer.tokenize(""), [])
85 |
86 | self.assertAllEqual(
87 | tokenizer.tokenize("unwanted running"),
88 | ["un", "##want", "##ed", "runn", "##ing"])
89 |
90 | self.assertAllEqual(
91 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
92 |
93 | def test_convert_tokens_to_ids(self):
94 | vocab_tokens = [
95 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
96 | "##ing"
97 | ]
98 |
99 | vocab = {}
100 | for (i, token) in enumerate(vocab_tokens):
101 | vocab[token] = i
102 |
103 | self.assertAllEqual(
104 | tokenization.convert_tokens_to_ids(
105 | vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
106 |
107 | def test_is_whitespace(self):
108 | self.assertTrue(tokenization._is_whitespace(u" "))
109 | self.assertTrue(tokenization._is_whitespace(u"\t"))
110 | self.assertTrue(tokenization._is_whitespace(u"\r"))
111 | self.assertTrue(tokenization._is_whitespace(u"\n"))
112 | self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
113 |
114 | self.assertFalse(tokenization._is_whitespace(u"A"))
115 | self.assertFalse(tokenization._is_whitespace(u"-"))
116 |
117 | def test_is_control(self):
118 | self.assertTrue(tokenization._is_control(u"\u0005"))
119 |
120 | self.assertFalse(tokenization._is_control(u"A"))
121 | self.assertFalse(tokenization._is_control(u" "))
122 | self.assertFalse(tokenization._is_control(u"\t"))
123 | self.assertFalse(tokenization._is_control(u"\r"))
124 | self.assertFalse(tokenization._is_control(u"\U0001F4A9"))
125 |
126 | def test_is_punctuation(self):
127 | self.assertTrue(tokenization._is_punctuation(u"-"))
128 | self.assertTrue(tokenization._is_punctuation(u"$"))
129 | self.assertTrue(tokenization._is_punctuation(u"`"))
130 | self.assertTrue(tokenization._is_punctuation(u"."))
131 |
132 | self.assertFalse(tokenization._is_punctuation(u"A"))
133 | self.assertFalse(tokenization._is_punctuation(u" "))
134 |
135 |
136 | if __name__ == "__main__":
137 | tf.test.main()
138 |
--------------------------------------------------------------------------------
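Because these suites extend tf.test.TestCase and end in tf.test.main(), each test file is directly runnable as a script. A sketch, assuming the pinned TensorFlow 1.14 environment from requirements.txt is installed:

python tokenization_test.py    # runs the TokenizationTest cases above
python optimization_test.py    # same pattern for the optimizer tests

--------------------------------------------------------------------------------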
/step_4/copy_model_to_k8s.sh:
--------------------------------------------------------------------------------
1 | # Storage for models
2 | cat <
--------------------------------------------------------------------------------
/step_5/bert/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | BERT needs to maintain permanent compatibility with the pre-trained model files,
4 | so we do not plan to make any major changes to this library (other than what was
5 | promised in the README). However, we can accept small patches related to
6 | re-factoring and documentation. To submit contributions, there are just a few
7 | small guidelines you need to follow.
8 |
9 | ## Contributor License Agreement
10 |
11 | Contributions to this project must be accompanied by a Contributor License
12 | Agreement. You (or your employer) retain the copyright to your contribution;
13 | this simply gives us permission to use and redistribute your contributions as
14 | part of the project. Head over to <https://cla.developers.google.com/> to see
15 | your current agreements on file or to sign a new one.
16 |
17 | You generally only need to submit a CLA once, so if you've already submitted one
18 | (even if it was for a different project), you probably don't need to do it
19 | again.
20 |
21 | ## Code reviews
22 |
23 | All submissions, including submissions by project members, require review. We
24 | use GitHub pull requests for this purpose. Consult
25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
26 | information on using pull requests.
27 |
28 | ## Community Guidelines
29 |
30 | This project follows
31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
32 |
--------------------------------------------------------------------------------
/step_5/bert/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 |
3 | LABEL maintainer="Leonardo Apolonio"
4 | COPY . /code
5 | WORKDIR /code
6 | RUN pip install pipenv
7 | RUN pipenv install
8 | # asset/vocab.txt is already included by the COPY above
9 | CMD ["pipenv", "run", "python", "run_app.py"]
10 |
--------------------------------------------------------------------------------
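To try the container locally before pushing it anywhere, a build-and-run sketch (the image tag is arbitrary, and config/development.py assumes a TensorFlow Serving instance on localhost:8500):

docker build -t imdb_client:dev .
docker run -p 5000:5000 \
  -e APP_CONFIG_FILE=config/development.py \
  imdb_client:dev

--------------------------------------------------------------------------------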
/step_5/bert/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 |
6 | [dev-packages]
7 |
8 | [packages]
9 | tensorflow-serving-api = "*"
10 | tensorflow = "*"
11 | flask = "*"
12 | pandas = "*"
13 |
14 | [requires]
15 | python_version = "3.7"
16 |
--------------------------------------------------------------------------------
/step_5/bert/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/step_5/bert/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lapolonio/text_classification_tutorial/db12fbec05621959a345dd3a557d22bb61fa48a6/step_5/bert/config/__init__.py
--------------------------------------------------------------------------------
/step_5/bert/config/development.py:
--------------------------------------------------------------------------------
1 | TF_SERVING_HOST="localhost:8500"
2 | VOCAB_ASSET="asset/vocab.txt"
3 | DEBUG=True
--------------------------------------------------------------------------------
/step_5/bert/config/production.py:
--------------------------------------------------------------------------------
1 | TF_SERVING_HOST="imdb-sentiment-service:8500"
2 | VOCAB_ASSET="asset/vocab.txt"
3 | DEBUG=False
--------------------------------------------------------------------------------
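run_app.py loads whichever of these two files the APP_CONFIG_FILE environment variable points at (via app.config.from_envvar), so switching between development and production is only a matter of that variable. A local run sketch using the Pipfile above:

pipenv install
APP_CONFIG_FILE=config/development.py pipenv run python run_app.py

--------------------------------------------------------------------------------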
/step_5/bert/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 |
56 | # It is recommended that you use this optimizer for fine tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint.)
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | beta_1=0.9,
63 | beta_2=0.999,
64 | epsilon=1e-6,
65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 |
67 | if use_tpu:
68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 |
70 | tvars = tf.trainable_variables()
71 | grads = tf.gradients(loss, tvars)
72 |
73 | # This is how the model was pre-trained.
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 |
76 | train_op = optimizer.apply_gradients(
77 | zip(grads, tvars), global_step=global_step)
78 |
79 | # Normally the global step update is done inside of `apply_gradients`.
80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
81 | # a different optimizer, you should probably take this line out.
82 | new_global_step = global_step + 1
83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 | return train_op
85 |
86 |
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 |
90 | def __init__(self,
91 | learning_rate,
92 | weight_decay_rate=0.0,
93 | beta_1=0.9,
94 | beta_2=0.999,
95 | epsilon=1e-6,
96 | exclude_from_weight_decay=None,
97 | name="AdamWeightDecayOptimizer"):
98 | """Constructs a AdamWeightDecayOptimizer."""
99 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 |
101 | self.learning_rate = learning_rate
102 | self.weight_decay_rate = weight_decay_rate
103 | self.beta_1 = beta_1
104 | self.beta_2 = beta_2
105 | self.epsilon = epsilon
106 | self.exclude_from_weight_decay = exclude_from_weight_decay
107 |
108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 | """See base class."""
110 | assignments = []
111 | for (grad, param) in grads_and_vars:
112 | if grad is None or param is None:
113 | continue
114 |
115 | param_name = self._get_variable_name(param.name)
116 |
117 | m = tf.get_variable(
118 | name=param_name + "/adam_m",
119 | shape=param.shape.as_list(),
120 | dtype=tf.float32,
121 | trainable=False,
122 | initializer=tf.zeros_initializer())
123 | v = tf.get_variable(
124 | name=param_name + "/adam_v",
125 | shape=param.shape.as_list(),
126 | dtype=tf.float32,
127 | trainable=False,
128 | initializer=tf.zeros_initializer())
129 |
130 | # Standard Adam update.
131 | next_m = (
132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 | next_v = (
134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 | tf.square(grad)))
136 |
137 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 |
139 | # Just adding the square of the weights to the loss function is *not*
140 | # the correct way of using L2 regularization/weight decay with Adam,
141 | # since that will interact with the m and v parameters in strange ways.
142 | #
143 | # Instead we want to decay the weights in a manner that doesn't interact
144 | # with the m/v parameters. This is equivalent to adding the square
145 | # of the weights to the loss with plain (non-momentum) SGD.
146 | if self._do_use_weight_decay(param_name):
147 | update += self.weight_decay_rate * param
148 |
149 | update_with_lr = self.learning_rate * update
150 |
151 | next_param = param - update_with_lr
152 |
153 | assignments.extend(
154 | [param.assign(next_param),
155 | m.assign(next_m),
156 | v.assign(next_v)])
157 | return tf.group(*assignments, name=name)
158 |
159 | def _do_use_weight_decay(self, param_name):
160 | """Whether to use L2 weight decay for `param_name`."""
161 | if not self.weight_decay_rate:
162 | return False
163 | if self.exclude_from_weight_decay:
164 | for r in self.exclude_from_weight_decay:
165 | if re.search(r, param_name) is not None:
166 | return False
167 | return True
168 |
169 | def _get_variable_name(self, param_name):
170 | """Get the variable name from the tensor name."""
171 | m = re.match("^(.*):\\d+$", param_name)
172 | if m is not None:
173 | param_name = m.group(1)
174 | return param_name
175 |
--------------------------------------------------------------------------------
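For orientation, a minimal sketch of how create_optimizer is wired up (the toy loss mirrors optimization_test.py below; the hyperparameter values are illustrative, not prescribed):

# Hypothetical usage sketch for create_optimizer (TF 1.x graph mode).
import tensorflow as tf
import optimization

w = tf.get_variable("w", shape=[3], initializer=tf.zeros_initializer())
loss = tf.reduce_mean(tf.square(tf.constant([0.4, 0.2, -0.5]) - w))

# Linear warmup for the first 100 steps, then linear decay to 0 over 1000 steps.
train_op = optimization.create_optimizer(
    loss, init_lr=2e-5, num_train_steps=1000, num_warmup_steps=100, use_tpu=False)

--------------------------------------------------------------------------------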
/step_5/bert/optimization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import optimization
20 | import tensorflow as tf
21 |
22 |
23 | class OptimizationTest(tf.test.TestCase):
24 |
25 | def test_adam(self):
26 | with self.test_session() as sess:
27 | w = tf.get_variable(
28 | "w",
29 | shape=[3],
30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
31 | x = tf.constant([0.4, 0.2, -0.5])
32 | loss = tf.reduce_mean(tf.square(x - w))
33 | tvars = tf.trainable_variables()
34 | grads = tf.gradients(loss, tvars)
35 | global_step = tf.train.get_or_create_global_step()
36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
38 | init_op = tf.group(tf.global_variables_initializer(),
39 | tf.local_variables_initializer())
40 | sess.run(init_op)
41 | for _ in range(100):
42 | sess.run(train_op)
43 | w_np = sess.run(w)
44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
45 |
46 |
47 | if __name__ == "__main__":
48 | tf.test.main()
49 |
--------------------------------------------------------------------------------
/step_5/bert/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.8.0
2 | astor==0.8.0
3 | Click==7.0
4 | Flask==1.1.1
5 | gast==0.3.1
6 | google-pasta==0.1.7
7 | grpcio==1.23.0
8 | h5py==2.10.0
9 | itsdangerous==1.1.0
10 | Jinja2==2.10.1
11 | Keras-Applications==1.0.8
12 | Keras-Preprocessing==1.1.0
13 | Markdown==3.1.1
14 | MarkupSafe==1.1.1
15 | numpy==1.17.2
16 | pandas==0.25.1
17 | protobuf==3.9.1
18 | python-dateutil==2.8.0
19 | pytz==2019.2
20 | six==1.12.0
21 | tensorboard==1.14.0
22 | tensorflow==1.14.0
23 | tensorflow-estimator==1.14.0
24 | tensorflow-serving-api==1.14.0
25 | termcolor==1.1.0
26 | Werkzeug==0.15.6
27 | wrapt==1.11.2
28 |
--------------------------------------------------------------------------------
/step_5/bert/run_app.py:
--------------------------------------------------------------------------------
1 |
2 | import grpc
3 | import tensorflow as tf
4 | import numpy as np
5 |
6 | import run_classifier as classifiers
7 | import tokenization
8 |
9 | from tensorflow_serving.apis import predict_pb2
10 | from tensorflow_serving.apis import prediction_service_pb2_grpc
11 | from tensorflow.core.framework import tensor_pb2
12 | from tensorflow.core.framework import tensor_shape_pb2
13 | from tensorflow.core.framework import types_pb2
14 |
15 | from flask import Flask
16 | from flask import request
17 | import time
18 |
19 | import pandas as pd
20 | import random
21 |
22 | app = Flask(__name__)
23 | app.config.from_envvar('APP_CONFIG_FILE')
24 |
25 | @app.route("/", methods=['GET'])
26 | def hello():
27 | return "Hello BERT predicting IMDB! Try posting a string to this url"
28 |
29 |
30 | @app.route("/", methods=['POST'])
31 | def predict():
32 | app.logger.info("Entering Predict Method")
33 | # MODEL PARAMS
34 | max_seq_length = 128
35 |
36 | channel = grpc.insecure_channel(app.config["TF_SERVING_HOST"])
37 | stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
38 |
39 | # Parse Description
40 | tokenizer = tokenization.FullTokenizer(
41 | vocab_file=app.config["VOCAB_ASSET"], do_lower_case=True)
42 | processor = classifiers.ImdbProcessor()
43 | label_list = processor.get_labels()
44 | content = request.get_json()
45 | request_id = str(random.randint(1, 9223372036854775807))
46 |
47 | data = {}
48 | data["id"] = []
49 | data["sentence"] = []
50 | for sentence in content['sentences']:
51 | data["id"].append(request_id)
52 | data["sentence"].append(sentence)
53 |
54 | input_examples = processor._create_examples(pd.DataFrame.from_dict(data), 'test')
55 |
56 | t0 = time.time()
57 | model_input = classifiers.memory_based_convert_examples_to_features(input_examples, label_list, max_seq_length, tokenizer)
58 | t1 = time.time()
59 | total = t1-t0
60 | app.logger.info("Convert Examples Time Taken: %d", total)
61 |
62 | app.logger.info(model_input)
63 | # Send request
64 | # See prediction_service.proto for gRPC request/response details.
65 | model_request = predict_pb2.PredictRequest()
66 | model_request.model_spec.name = 'bert'
67 | model_request.model_spec.signature_name = 'serving_default'
68 |
69 | model_request.inputs['examples'].CopyFrom(
70 | tf.contrib.util.make_tensor_proto(model_input, shape=[len(model_input)])
71 | )
72 | app.logger.info(model_request)
73 |
74 | t0 = time.time()
75 | result = stub.Predict(model_request, 60.0)  # 60 sec timeout
76 | t1 = time.time()
77 | total = t1-t0
78 |
79 | app.logger.info(result)
80 | result = tf.make_ndarray(result.outputs["probabilities"])
81 | app.logger.info(result)
82 | pretty_result = "Predicted Label: " + str(np.take(label_list, result.argmax(axis=1)))
83 | app.logger.info(pretty_result)
84 | app.logger.info("Predict Time Taken: %f", total)
85 |
86 | return pretty_result
87 |
88 |
89 | if __name__ == '__main__':
90 | app.run(host="0.0.0.0", debug=app.config["DEBUG"])  # bind all interfaces so the container port is reachable
91 |
--------------------------------------------------------------------------------
/step_5/bert/sample_text.txt:
--------------------------------------------------------------------------------
1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
2 | Text should be one-sentence-per-line, with empty lines between documents.
3 | This sample text is public domain and was randomly selected from Project Gutenberg.
4 |
5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
8 | "Cass" Beard had risen early that morning, but not with a view to discovery.
9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
11 | This was nearly opposite.
12 | Mr. Cassius crossed the highway, and stopped suddenly.
13 | Something glittered in the nearest red pool before him.
14 | Gold, surely!
15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
17 | Like most of his fellow gold-seekers, Cass was superstitious.
18 |
19 | The fountain of classic wisdom, Hypatia herself.
20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
21 | From my youth I felt in me a soul above the matter-entangled herd.
22 | She revealed to me the glorious fact, that I am a spark of Divinity itself.
23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
24 | There is a philosophic pleasure in opening one's treasures to the modest young.
25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
31 | At last they reached the quay at the opposite end of the street;
32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
34 |
--------------------------------------------------------------------------------
/step_5/bert/tokenization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import os
20 | import tempfile
21 | import tokenization
22 | import six
23 | import tensorflow as tf
24 |
25 |
26 | class TokenizationTest(tf.test.TestCase):
27 |
28 | def test_full_tokenizer(self):
29 | vocab_tokens = [
30 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
31 | "##ing", ","
32 | ]
33 | with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
34 | if six.PY2:
35 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
36 | else:
37 | vocab_writer.write("".join(
38 | [x + "\n" for x in vocab_tokens]).encode("utf-8"))
39 |
40 | vocab_file = vocab_writer.name
41 |
42 | tokenizer = tokenization.FullTokenizer(vocab_file)
43 | os.unlink(vocab_file)
44 |
45 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
46 | self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
47 |
48 | self.assertAllEqual(
49 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
50 |
51 | def test_chinese(self):
52 | tokenizer = tokenization.BasicTokenizer()
53 |
54 | self.assertAllEqual(
55 | tokenizer.tokenize(u"ah\u535A\u63A8zz"),
56 | [u"ah", u"\u535A", u"\u63A8", u"zz"])
57 |
58 | def test_basic_tokenizer_lower(self):
59 | tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
60 |
61 | self.assertAllEqual(
62 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
63 | ["hello", "!", "how", "are", "you", "?"])
64 | self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
65 |
66 | def test_basic_tokenizer_no_lower(self):
67 | tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
68 |
69 | self.assertAllEqual(
70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
71 | ["HeLLo", "!", "how", "Are", "yoU", "?"])
72 |
73 | def test_wordpiece_tokenizer(self):
74 | vocab_tokens = [
75 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
76 | "##ing"
77 | ]
78 |
79 | vocab = {}
80 | for (i, token) in enumerate(vocab_tokens):
81 | vocab[token] = i
82 | tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
83 |
84 | self.assertAllEqual(tokenizer.tokenize(""), [])
85 |
86 | self.assertAllEqual(
87 | tokenizer.tokenize("unwanted running"),
88 | ["un", "##want", "##ed", "runn", "##ing"])
89 |
90 | self.assertAllEqual(
91 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
92 |
93 | def test_convert_tokens_to_ids(self):
94 | vocab_tokens = [
95 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
96 | "##ing"
97 | ]
98 |
99 | vocab = {}
100 | for (i, token) in enumerate(vocab_tokens):
101 | vocab[token] = i
102 |
103 | self.assertAllEqual(
104 | tokenization.convert_tokens_to_ids(
105 | vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
106 |
107 | def test_is_whitespace(self):
108 | self.assertTrue(tokenization._is_whitespace(u" "))
109 | self.assertTrue(tokenization._is_whitespace(u"\t"))
110 | self.assertTrue(tokenization._is_whitespace(u"\r"))
111 | self.assertTrue(tokenization._is_whitespace(u"\n"))
112 | self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
113 |
114 | self.assertFalse(tokenization._is_whitespace(u"A"))
115 | self.assertFalse(tokenization._is_whitespace(u"-"))
116 |
117 | def test_is_control(self):
118 | self.assertTrue(tokenization._is_control(u"\u0005"))
119 |
120 | self.assertFalse(tokenization._is_control(u"A"))
121 | self.assertFalse(tokenization._is_control(u" "))
122 | self.assertFalse(tokenization._is_control(u"\t"))
123 | self.assertFalse(tokenization._is_control(u"\r"))
124 | self.assertFalse(tokenization._is_control(u"\U0001F4A9"))
125 |
126 | def test_is_punctuation(self):
127 | self.assertTrue(tokenization._is_punctuation(u"-"))
128 | self.assertTrue(tokenization._is_punctuation(u"$"))
129 | self.assertTrue(tokenization._is_punctuation(u"`"))
130 | self.assertTrue(tokenization._is_punctuation(u"."))
131 |
132 | self.assertFalse(tokenization._is_punctuation(u"A"))
133 | self.assertFalse(tokenization._is_punctuation(u" "))
134 |
135 |
136 | if __name__ == "__main__":
137 | tf.test.main()
138 |
--------------------------------------------------------------------------------
/step_5/client-chart.yml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | items:
3 | - apiVersion: v1
4 | kind: Service
5 | metadata:
6 | name: imdb-sentiment-client
7 | labels:
8 | app.kubernetes.io/name: imdb-client
9 | spec:
10 | type: LoadBalancer
11 | ports:
12 | - name: http
13 | port: 5000
14 | targetPort: 5000
15 | selector:
16 | app.kubernetes.io/name: imdb-client
17 |
18 | # APP Environment
19 | - apiVersion: extensions/v1beta1
20 | kind: Deployment
21 | metadata:
22 | name: imdb-client
23 | labels:
24 | app.kubernetes.io/name: imdb-client
25 | spec:
26 | replicas: 1
27 | template:
28 | metadata:
29 | labels:
30 | app.kubernetes.io/name: imdb-client
31 | spec:
32 | containers:
33 | - image: docker.io/lapolonio/imdb_client:v3
34 | name: imdb-client
35 | env:
36 | - name: APP_CONFIG_FILE
37 | value: "config/production.py"
38 | command: ["pipenv", "run", "python", "run_app.py"]
39 | ports:
40 | - name: http
41 | containerPort: 5000
42 | resources:
43 | requests:
44 | cpu: 200m
45 | memory: 256Mi
46 | limits:
47 | cpu: 2
48 | memory: 2Gi
49 | kind: List
50 | metadata: {}
51 |
--------------------------------------------------------------------------------
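Note that the Deployment above targets the extensions/v1beta1 API, which Kubernetes 1.16 removed; on newer clusters it would need apps/v1 plus an explicit spec.selector. Applying the chart is otherwise a standard kubectl step. A sketch, assuming a cluster is already configured (e.g. by the make_cluster.sh script):

kubectl apply -f client-chart.yml
kubectl get service imdb-sentiment-client   # the LoadBalancer's external IP serves port 5000

--------------------------------------------------------------------------------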
/step_5/copy_model_to_k8s.sh:
--------------------------------------------------------------------------------
1 | # Storage for models
2 | cat <