├── sentiment_analysis
│   ├── samodel-v3.png
│   └── utils.py
├── machine_translation
│   ├── transformer.png
│   ├── nmt
│   │   ├── __init__.py
│   │   ├── _constants.py
│   │   ├── hyperparameters.py
│   │   ├── utils.py
│   │   ├── index.rst
│   │   ├── translation.py
│   │   └── dataset.py
│   ├── hyperparameters.py
│   ├── dataprocessor.py
│   └── utils.py
├── sequence_generation
│   ├── cache_model.png
│   ├── language_model_intro.png
│   └── text_generation
│       ├── __init__.py
│       ├── model
│       │   └── __init__.py
│       ├── index.rst
│       └── sequence_sampling.py
├── natural_language_understanding
│   ├── bert.png
│   ├── qa.png
│   ├── bert-embed.png
│   ├── bert-sentence-pair.png
│   ├── bert
│   │   ├── conversion_tools
│   │   │   ├── ernie_top_layer_emb.npy
│   │   │   ├── compare_gluon_ernie.py
│   │   │   ├── infer_pytorch_gluon_parameter_name_mapping.py
│   │   │   ├── convert_pytorch_model.py
│   │   │   ├── compare_tf_gluon_model.py
│   │   │   ├── convert_tf_model.py
│   │   │   └── convert_paddle_to_gluon.py
│   │   ├── __init__.py
│   │   ├── model
│   │   │   ├── __init__.py
│   │   │   ├── qa.py
│   │   │   ├── ner.py
│   │   │   └── classification.py
│   │   ├── export
│   │   │   ├── __init__.py
│   │   │   └── export.py
│   │   ├── data
│   │   │   ├── __init__.py
│   │   │   ├── embedding.py
│   │   │   ├── transform.py
│   │   │   └── baidu_ernie_data.py
│   │   ├── utils.py
│   │   ├── ner_utils.py
│   │   ├── predict_ner.py
│   │   ├── embedding.py
│   │   ├── finetune_ner.py
│   │   └── fp16_utils.py
│   └── qa_utils.py
├── README.md
├── intent_classification_and_slot_labelling
│   ├── explain_subword_tagging.png
│   └── README.md
├── gluon_basics
│   ├── mlp_utils.py
│   └── autograd.ipynb
├── env
│   └── nlp.yml
├── word_embedding
│   ├── utils.py
│   └── model.py
└── .gitignore

/sentiment_analysis/samodel-v3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/sentiment_analysis/samodel-v3.png
--------------------------------------------------------------------------------
/machine_translation/transformer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/machine_translation/transformer.png
--------------------------------------------------------------------------------
/sequence_generation/cache_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/sequence_generation/cache_model.png
--------------------------------------------------------------------------------
/natural_language_understanding/bert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/natural_language_understanding/bert.png
--------------------------------------------------------------------------------
/natural_language_understanding/qa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/natural_language_understanding/qa.png
--------------------------------------------------------------------------------
/sequence_generation/language_model_intro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/sequence_generation/language_model_intro.png
--------------------------------------------------------------------------------
/natural_language_understanding/bert-embed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/natural_language_understanding/bert-embed.png
--------------------------------------------------------------------------------
/natural_language_understanding/bert-sentence-pair.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/natural_language_understanding/bert-sentence-pair.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # nlp-notebooks
 2 | 
 3 | ## Environment Setup
 4 | 
 5 | - install conda
 6 | - conda env create -f nlp-notebooks/env/nlp.yml
 7 | - source activate nlp
 8 | - jupyter notebook
 9 | 
--------------------------------------------------------------------------------
/intent_classification_and_slot_labelling/explain_subword_tagging.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/intent_classification_and_slot_labelling/explain_subword_tagging.png
--------------------------------------------------------------------------------
/natural_language_understanding/bert/conversion_tools/ernie_top_layer_emb.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/natural_language_understanding/bert/conversion_tools/ernie_top_layer_emb.npy
--------------------------------------------------------------------------------
/gluon_basics/mlp_utils.py:
--------------------------------------------------------------------------------
 1 | def show_fashion_mnist(images, labels):
 2 |     import d2l
 3 |     d2l.use_svg_display()
 4 |     # Here _ means that we ignore (not use) variables.
 5 |     _, figs = d2l.plt.subplots(1, len(images), figsize=(12, 12))
 6 |     for f, img, lbl in zip(figs, images, labels):
 7 |         f.imshow(img.reshape((28, 28)).asnumpy())
 8 |         f.set_title(lbl)
 9 |         f.axes.get_xaxis().set_visible(False)
10 |         f.axes.get_yaxis().set_visible(False)
11 | 
--------------------------------------------------------------------------------
/env/nlp.yml:
--------------------------------------------------------------------------------
 1 | name: nlp
 2 | channels:
 3 |   - conda-forge
 4 | dependencies:
 5 |   - python=3.6
 6 |   - pip=18.1
 7 |   - spacy
 8 |   - nltk
 9 |   - ipython
10 |   - ipykernel
11 |   - jupyter=1.0.0
12 |   - matplotlib=2.2.2
13 |   - pandas=0.23.4
14 |   - regex
15 |   - pip:
16 |     - mxnet-cu100mkl>=1.5.0b20190630
17 |     - sacremoses
18 |     - sentencepiece<0.2
19 |     - seaborn
20 |     - jieba
21 |     - d2l==0.9.2
22 |     - environment_kernels
23 |     - jupyter_contrib_nbextensions
24 |     - jupyter_nbextensions_configurator
25 |     - gluonnlp
26 | 
--------------------------------------------------------------------------------
/natural_language_understanding/bert/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | # Licensed to the Apache Software Foundation (ASF) under one
 4 | # or more contributor license agreements. See the NOTICE file
 5 | # distributed with this work for additional information
 6 | # regarding copyright ownership. The ASF licenses this file
 7 | # to you under the Apache License, Version 2.0 (the
 8 | # "License"); you may not use this file except in compliance
 9 | # with the License.
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """BERT Module.""" 22 | from . import model, data 23 | -------------------------------------------------------------------------------- /sequence_generation/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """Text Generation Module.""" 22 | from . import model 23 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """BERT model.""" 22 | from . import classification, ner, qa 23 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/export/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. 
The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """Hybrid BERT for deployment.""" 22 | from . import hybrid_bert 23 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/data/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """BERT data.""" 22 | from . import qa, classification, embedding, transform, ner 23 | -------------------------------------------------------------------------------- /machine_translation/nmt/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """NMT example.""" 22 | from . import _constants, bleu, dataset, \ 23 | gnmt, translation, utils 24 | -------------------------------------------------------------------------------- /machine_translation/nmt/_constants.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. 
See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Constants used in the NMT examples.""" 20 | import os 21 | 22 | __all__ = ['CACHE_PATH'] 23 | 24 | CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached')) 25 | -------------------------------------------------------------------------------- /natural_language_understanding/qa_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import collections 3 | import mxnet as mx 4 | import gluonnlp as nlp 5 | import bert 6 | from mxnet.gluon.model_zoo import model_store 7 | 8 | def download_qa_ckpt(): 9 | model_store._model_sha1['bert_qa'] = '7eb11865ecac2a412457a7c8312d37a1456af7fc' 10 | result = model_store.get_model_file('bert_qa', root='./temp') 11 | print('Downloaded checkpoint to {}'.format(result)) 12 | return result 13 | 14 | def predict(dataset, all_results, vocab): 15 | tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=True) 16 | transform = bert.data.qa.SQuADTransform(tokenizer, is_pad=False, is_training=False, do_lookup=False) 17 | dev_dataset = dataset.transform(transform._transform) 18 | from bert.bert_qa_evaluate import PredResult, predict 19 | all_predictions = collections.OrderedDict() 20 | for features in dev_dataset: 21 | results = all_results[features[0].example_id] 22 | 23 | prediction, nbest = predict( 24 | features=features, 25 | results=results, 26 | tokenizer=nlp.data.BERTBasicTokenizer(lower=True)) 27 | 28 | print('\nContext: %s\n'%(' '.join(features[0].doc_tokens))) 29 | question = features[0].input_ids.index('[SEP]') 30 | print('Question: %s\n'%(' '.join((features[0].input_ids[1:question])))) 31 | print('Top predictions: ') 32 | for i in range(3): 33 | print('%.2f%% \t %s'%(nbest[i][1] * 100, nbest[i][0])) 34 | print('') 35 | -------------------------------------------------------------------------------- /machine_translation/hyperparameters.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. 
See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Hyperparameters for transformer.""" 20 | 21 | import nmt 22 | 23 | # parameters for dataset 24 | src_lang = 'en' 25 | tgt_lang = 'de' 26 | src_max_len = -1 27 | tgt_max_len = -1 28 | 29 | # parameters for model 30 | num_units = 512 31 | hidden_size = 2048 32 | dropout = 0.1 33 | epsilon = 0.1 34 | num_layers = 6 35 | num_heads = 8 36 | scaled = True 37 | 38 | # parameters for training 39 | optimizer = 'adam' 40 | epochs = 3 41 | batch_size = 2700 42 | test_batch_size = 256 43 | num_accumulated = 1 44 | lr = 2 45 | warmup_steps = 1 46 | save_dir = 'transformer_en_de_u512' 47 | average_start = 1 48 | num_buckets = 20 49 | log_interval = 10 50 | bleu = '13a' 51 | 52 | #parameters for testing 53 | beam_size = 4 54 | lp_alpha = 0.6 55 | lp_k = 5 -------------------------------------------------------------------------------- /machine_translation/nmt/hyperparameters.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Hyperparameters for transformer, for past reference only.""" 20 | 21 | # parameters for dataset 22 | src_lang = 'en' 23 | tgt_lang = 'de' 24 | src_max_len = -1 25 | tgt_max_len = -1 26 | 27 | # parameters for model 28 | num_units = 512 29 | hidden_size = 2048 30 | dropout = 0.1 31 | epsilon = 0.1 32 | num_layers = 6 33 | num_heads = 8 34 | scaled = True 35 | 36 | # parameters for training 37 | optimizer = 'adam' 38 | epochs = 3 39 | batch_size = 2700 40 | test_batch_size = 256 41 | num_accumulated = 1 42 | lr = 2 43 | warmup_steps = 1 44 | save_dir = 'transformer_en_de_u512' 45 | average_start = 1 46 | num_buckets = 20 47 | log_interval = 10 48 | bleu = '13a' 49 | 50 | #parameters for testing 51 | beam_size = 4 52 | lp_alpha = 0.6 53 | lp_k = 5 54 | -------------------------------------------------------------------------------- /word_embedding/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Word Embeddings Training Utilities 20 | ===================================== 21 | 22 | """ 23 | 24 | import logging 25 | import time 26 | from contextlib import contextmanager 27 | 28 | import mxnet as mx 29 | 30 | 31 | def get_context(args): 32 | if args.gpu is None or args.gpu == '': 33 | context = [mx.cpu()] 34 | elif isinstance(args.gpu, int): 35 | context = [mx.gpu(args.gpu)] 36 | else: 37 | context = [mx.gpu(int(i)) for i in args.gpu] 38 | return context 39 | 40 | 41 | @contextmanager 42 | def print_time(task): 43 | start_time = time.time() 44 | logging.info('Starting to %s', task) 45 | yield 46 | logging.info('Finished to {} in {:.2f} seconds'.format( 47 | task, 48 | time.time() - start_time)) 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/data/embedding.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and DMLC. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """BERT embedding datasets.""" 16 | from mxnet.gluon.data import Dataset 17 | 18 | __all__ = ['BertEmbeddingDataset'] 19 | 20 | class BertEmbeddingDataset(Dataset): 21 | """Dataset for BERT Embedding 22 | 23 | Parameters 24 | ---------- 25 | sentences : List[str]. 26 | Sentences for embeddings. 27 | transform : BERTDatasetTransform, default None. 28 | transformer for BERT input format 29 | """ 30 | 31 | def __init__(self, sentences, transform=None): 32 | """Dataset for BERT Embedding 33 | 34 | Parameters 35 | ---------- 36 | sentences : List[str]. 37 | Sentences for embeddings. 38 | transform : BERTDatasetTransform, default None. 39 | transformer for BERT input format 40 | """ 41 | self.sentences = sentences 42 | self.transform = transform 43 | 44 | def __getitem__(self, idx): 45 | sentence = (self.sentences[idx], 0) 46 | if self.transform: 47 | return self.transform(sentence) 48 | else: 49 | return sentence 50 | 51 | def __len__(self): 52 | return len(self.sentences) 53 | -------------------------------------------------------------------------------- /machine_translation/nmt/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Utility functions.""" 20 | 21 | import os 22 | import logging 23 | import inspect 24 | 25 | __all__ = ['logging_config'] 26 | 27 | 28 | def logging_config(folder=None, name=None, 29 | level=logging.DEBUG, 30 | console_level=logging.INFO, 31 | no_console=False): 32 | """ Config the logging. 33 | 34 | Parameters 35 | ---------- 36 | folder : str or None 37 | name : str or None 38 | level : int 39 | console_level 40 | no_console: bool 41 | Whether to disable the console log 42 | Returns 43 | ------- 44 | folder : str 45 | Folder that the logging file will be saved into. 
46 |     """
47 |     if name is None:
48 |         name = inspect.stack()[1][1].split('.')[0]
49 |     if folder is None:
50 |         folder = os.path.join(os.getcwd(), name)
51 |     if not os.path.exists(folder):
52 |         os.makedirs(folder)
53 |     # Remove all the current handlers
54 |     for handler in logging.root.handlers:
55 |         logging.root.removeHandler(handler)
56 |     logging.root.handlers = []
57 |     logpath = os.path.join(folder, name + '.log')
58 |     print('All Logs will be saved to {}'.format(logpath))
59 |     logging.root.setLevel(level)
60 |     formatter = logging.Formatter('%(asctime)s - %(name)s - %(message)s')
61 |     logfile = logging.FileHandler(logpath)
62 |     logfile.setLevel(level)
63 |     logfile.setFormatter(formatter)
64 |     logging.root.addHandler(logfile)
65 |     if not no_console:
66 |         # Initialize the console logging
67 |         logconsole = logging.StreamHandler()
68 |         logconsole.setLevel(console_level)
69 |         logconsole.setFormatter(formatter)
70 |         logging.root.addHandler(logconsole)
71 |     return folder
72 | 
--------------------------------------------------------------------------------
/machine_translation/nmt/index.rst:
--------------------------------------------------------------------------------
 1 | Machine Translation
 2 | -------------------
 3 | 
 4 | :download:`Download scripts `
 5 | 
 6 | Google Neural Machine Translation
 7 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 8 | 
 9 | Use the following command to train the GNMT model on the IWSLT2015 dataset.
10 | 
11 | .. code-block:: console
12 | 
13 |    $ MXNET_GPU_MEM_POOL_TYPE=Round python train_gnmt.py --src_lang en --tgt_lang vi --batch_size 128 \
14 |        --optimizer adam --lr 0.001 --lr_update_factor 0.5 --beam_size 10 --bucket_scheme exp \
15 |        --num_hidden 512 --save_dir gnmt_en_vi_l2_h512_beam10 --epochs 12 --gpu 0
16 | 
17 | It achieves a test BLEU score of 26.20.
18 | 
19 | Transformers
20 | ~~~~~~~~~~~~
21 | 
22 | Use the following commands to train the Transformer model on the WMT14 dataset for English to German translation.
23 | 
24 | .. code-block:: console
25 | 
26 |    $ MXNET_GPU_MEM_POOL_TYPE=Round python train_transformer.py --dataset WMT2014BPE \
27 |        --src_lang en --tgt_lang de --batch_size 2700 \
28 |        --optimizer adam --num_accumulated 16 --lr 2.0 --warmup_steps 4000 \
29 |        --save_dir transformer_en_de_u512 --epochs 30 --gpus 0,1,2,3,4,5,6,7 --scaled \
30 |        --average_start 5 --num_buckets 20 --bucket_scheme exp --bleu 13a --log_interval 10
31 | 
32 | It achieves an official mteval-v13a BLEU score of 27.09 on newstest2014 (http://statmt.org/wmt14/test-filtered.tgz).
33 | This result is obtained by using averaged SGD over the last 5 epochs. If we use international tokenization (i.e., ``--bleu intl``),
34 | we obtain a BLEU score of 27.89. If we use ``--bleu tweaked``, we obtain a test BLEU score of 28.96.
35 | The latter result is obtained on a tweaked reference, where the tokenized reference text is put in ATAT format for historical reasons
36 | and the following preprocessing pipeline is applied:
37 | 
38 | .. code-block:: console
39 | 
40 |     mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l de
41 |     mosesdecoder/scripts/tokenizer/remove-non-printing-char.perl
42 |     mosesdecoder/scripts/tokenizer/tokenizer.perl -q -no-escape -protected mosesdecoder/scripts/tokenizer/basic-protected-patterns -l de
43 | 
44 | If we turn on ``--full``, the testing is performed on newstest2014 (http://statmt.org/wmt14/test-full.tgz).
Then, we can 45 | obtain BLEU=27.05 with ``--bleu 13a``, BLEU=27.81 with ``--bleu intl``, and BLEU=28.80 with ``--bleu tweaked`` 46 | 47 | The pre-trained model can be downloaded from http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip. 48 | 49 | For the users from China, it might be faster with this link instead: https://apache-mxnet.s3.cn-north-1.amazonaws.com.cn/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip. 50 | -------------------------------------------------------------------------------- /sequence_generation/text_generation/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """Text generation models.""" 22 | from gluonnlp.model import get_model as _get_model 23 | from .gpt import * 24 | 25 | def get_model(name, **kwargs): 26 | """Returns a pre-defined model by name. 27 | 28 | In addition to the models in GluonNLP model API, this API supports getting GPT-2 models. 29 | 30 | Parameters 31 | ---------- 32 | name : str 33 | Name of the model. 34 | dataset_name : str or None, default None 35 | The dataset name on which the pre-trained model is trained. 36 | For language model, options are 'wikitext-2'. 37 | For ELMo, Options are 'gbw' and '5bw'. 38 | 'gbw' represents 1 Billion Word Language Model Benchmark 39 | http://www.statmt.org/lm-benchmark/; 40 | '5bw' represents a dataset of 5.5B tokens consisting of 41 | Wikipedia (1.9B) and all of the monolingual news crawl data from WMT 2008-2012 (3.6B). 42 | If specified, then the returned vocabulary is extracted from 43 | the training set of the dataset. 44 | If None, then vocab is required, for specifying embedding weight size, and is directly 45 | returned. 46 | vocab : gluonnlp.Vocab or None, default None 47 | Vocabulary object to be used with the language model. 48 | Required when dataset_name is not specified. 49 | None Vocabulary object is required with the ELMo model. 50 | pretrained : bool, default False 51 | Whether to load the pre-trained weights for model. 52 | ctx : Context, default CPU 53 | The context in which to load the pre-trained weights. 54 | root : str, default '$MXNET_HOME/models' with MXNET_HOME defaults to '~/.mxnet' 55 | Location for keeping the model parameters. 
56 | 57 | Returns 58 | ------- 59 | gluon.Block, gluonnlp.Vocab, (optional) gluonnlp.Vocab 60 | """ 61 | models = {'gpt2_117m' : gpt2_117m, 62 | 'gpt2_345m' : gpt2_345m} 63 | name = name.lower() 64 | if name not in models: 65 | return _get_model(name, **kwargs) 66 | return models[name](**kwargs) 67 | -------------------------------------------------------------------------------- /sentiment_analysis/utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import sys 3 | import collections 4 | import os 5 | import sys 6 | import numpy as np 7 | import math 8 | from matplotlib import pyplot as plt 9 | from mxnet import nd, autograd, gluon, init, context, image 10 | from mxnet.gluon import nn, rnn 11 | import random 12 | import re 13 | import time 14 | import tarfile 15 | import zipfile 16 | 17 | import mxnet as mx 18 | import gluonnlp as nlp 19 | 20 | import d2l 21 | 22 | 23 | 24 | def load_data_imdb(batch_size, num_steps=500): 25 | d2l.download_imdb() 26 | train_data, test_data = d2l.read_imdb('train'), d2l.read_imdb('test') 27 | train_tokens = d2l.tokenize(train_data[0], token='word') 28 | test_tokens = d2l.tokenize(test_data[0], token='word') 29 | vocab = nlp.Vocab(nlp.data.count_tokens(itertools.chain.from_iterable(train_tokens)), min_freq=5) 30 | train_features = mx.nd.array([d2l.trim_pad(vocab[line], num_steps, vocab[vocab.unknown_token]) 31 | for line in train_tokens]) 32 | test_features = mx.nd.array([d2l.trim_pad(vocab[line], num_steps, vocab[vocab.unknown_token]) 33 | for line in test_tokens]) 34 | train_iter = d2l.load_array((train_features, train_data[1]), batch_size) 35 | test_iter = d2l.load_array((test_features, test_data[1]), batch_size, 36 | is_train=False) 37 | return train_iter, test_iter, vocab 38 | 39 | 40 | # from d2l import train_ch12 as train 41 | def train_batch_ch12(net, features, labels, loss, trainer, ctx_list): 42 | Xs, ys = d2l.split_batch(features, labels, ctx_list) 43 | with autograd.record(): 44 | pys = [net(X) for X in Xs] 45 | ls = [loss(py, y) for py, y in zip(pys, ys)] 46 | for l in ls: 47 | l.backward() 48 | trainer.step(features.shape[0]) 49 | train_loss_sum = sum([l.sum().asscalar() for l in ls]) 50 | train_acc_sum = sum(d2l.accuracy(py, y) for py, y in zip(pys, ys)) 51 | return train_loss_sum, train_acc_sum 52 | 53 | def train(net, train_iter, test_iter, loss, trainer, num_epochs, 54 | ctx_list=d2l.try_all_gpus()): 55 | num_batches, timer = len(train_iter), d2l.Timer() 56 | for epoch in range(num_epochs): 57 | # store training_loss, training_accuracy, num_examples, num_features 58 | metric = [0.0] * 4 59 | for i, (features, labels) in enumerate(train_iter): 60 | timer.start() 61 | l, acc = d2l.train_batch_ch12( 62 | net, features, labels, loss, trainer, ctx_list) 63 | metric = [a+b for a, b in zip(metric, (l, acc, labels.shape[0], labels.size))] 64 | timer.stop() 65 | if (i+1) % (num_batches // 5) == 0: 66 | print(epoch+i/num_batches, 67 | (metric[0]/metric[2], metric[1]/metric[3], None)) 68 | test_acc = d2l.evaluate_accuracy_gpus(net, test_iter) 69 | print('loss %.3f, train acc %.3f, test acc %.3f' % ( 70 | metric[0]/metric[2], metric[1]/metric[3], test_acc)) 71 | print('%.1f exampes/sec on %s' % ( 72 | metric[2]*num_epochs/timer.sum(), ctx_list)) 73 | 74 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # 
Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Utility functions for BERT.""" 20 | 21 | import logging 22 | import collections 23 | import hashlib 24 | import io 25 | 26 | import mxnet as mx 27 | import gluonnlp as nlp 28 | 29 | __all__ = ['tf_vocab_to_gluon_vocab', 'load_text_vocab'] 30 | 31 | 32 | def tf_vocab_to_gluon_vocab(tf_vocab): 33 | special_tokens = ['[UNK]', '[PAD]', '[SEP]', '[MASK]', '[CLS]'] 34 | assert all(t in tf_vocab for t in special_tokens) 35 | counter = nlp.data.count_tokens(tf_vocab.keys()) 36 | vocab = nlp.vocab.BERTVocab(counter, token_to_idx=tf_vocab) 37 | return vocab 38 | 39 | 40 | def get_hash(filename): 41 | sha1 = hashlib.sha1() 42 | with open(filename, 'rb') as f: 43 | while True: 44 | data = f.read(1048576) 45 | if not data: 46 | break 47 | sha1.update(data) 48 | return sha1.hexdigest(), str(sha1.hexdigest())[:8] 49 | 50 | 51 | def read_tf_checkpoint(path): 52 | """read tensorflow checkpoint""" 53 | from tensorflow.python import pywrap_tensorflow 54 | tensors = {} 55 | reader = pywrap_tensorflow.NewCheckpointReader(path) 56 | var_to_shape_map = reader.get_variable_to_shape_map() 57 | for key in sorted(var_to_shape_map): 58 | tensor = reader.get_tensor(key) 59 | tensors[key] = tensor 60 | return tensors 61 | 62 | def profile(curr_step, start_step, end_step, profile_name='profile.json', 63 | early_exit=True): 64 | """profile the program between [start_step, end_step).""" 65 | if curr_step == start_step: 66 | mx.nd.waitall() 67 | mx.profiler.set_config(profile_memory=False, profile_symbolic=True, 68 | profile_imperative=True, filename=profile_name, 69 | aggregate_stats=True) 70 | mx.profiler.set_state('run') 71 | elif curr_step == end_step: 72 | mx.nd.waitall() 73 | mx.profiler.set_state('stop') 74 | logging.info(mx.profiler.dumps()) 75 | mx.profiler.dump() 76 | if early_exit: 77 | exit() 78 | 79 | def load_text_vocab(vocab_file): 80 | """Loads a vocabulary file into a dictionary.""" 81 | vocab = collections.OrderedDict() 82 | index = 0 83 | with io.open(vocab_file, 'r') as reader: 84 | while True: 85 | token = reader.readline() 86 | if not token: 87 | break 88 | token = token.strip() 89 | vocab[token] = index 90 | index += 1 91 | return vocab 92 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/compare_gluon_ernie.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | import gluonnlp as nlp 6 | import argparse 7 | import os 8 | import mxnet as mx 9 | import json 10 | 11 | parser = 
argparse.ArgumentParser(description='inference compare script for ernie model in gluon', 12 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 13 | parser.add_argument('--input_file', type=str, default='input_cn.txt', 14 | help='sample input file for testing') 15 | parser.add_argument('--cased', action='store_true', 16 | help='if not set, inputs are converted to lower case') 17 | parser.add_argument('--gluon_dataset', type=str, default='baidu_ernie_uncased', 18 | help='gluon dataset name') 19 | parser.add_argument('--gluon_model', type=str, default='ernie_12_768_12', 20 | help='gluon model name') 21 | parser.add_argument('--gluon_parameter_file', type=str, default=None, 22 | help='gluon parameter file name.') 23 | parser.add_argument('--gluon_vocab_file', type=str, default=None, 24 | help='gluon vocab file corresponding to --gluon_parameter_file.') 25 | 26 | args = parser.parse_args() 27 | 28 | input_file = os.path.expanduser(args.input_file) 29 | do_lower_case = not args.cased 30 | max_length = 11 31 | if not args.gluon_dataset: 32 | with open(args.gluon_vocab_file) as f: 33 | vocab_str = json.load(f) 34 | vocab = nlp.vocab.BERTVocab.from_json(json.dumps(vocab_str)) 35 | else: 36 | vocab = None 37 | bert, vocabulary = nlp.model.get_model(args.gluon_model, 38 | dataset_name=args.gluon_dataset, 39 | vocab=vocab, 40 | pretrained=not args.gluon_parameter_file, 41 | use_pooler=False, 42 | use_decoder=False, 43 | use_classifier=False) 44 | if args.gluon_parameter_file: 45 | try: 46 | bert.cast('float16') 47 | bert.load_parameters(args.gluon_parameter_file, ignore_extra=True) 48 | bert.cast('float32') 49 | except AssertionError: 50 | bert.cast('float32') 51 | bert.load_parameters(args.gluon_parameter_file, ignore_extra=True) 52 | 53 | print(bert) 54 | tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=do_lower_case) 55 | dataset = nlp.data.TSVDataset(input_file, field_separator=nlp.data.Splitter('|||')) 56 | 57 | trans = nlp.data.BERTSentenceTransform(tokenizer, max_length) 58 | dataset = dataset.transform(trans) 59 | 60 | bert_dataloader = mx.gluon.data.DataLoader(dataset, batch_size=1, 61 | shuffle=True, last_batch='rollover') 62 | 63 | # verify the output of the first sample 64 | for i, seq in enumerate(bert_dataloader): 65 | input_ids, valid_length, type_ids = seq 66 | out = bert(input_ids, type_ids, 67 | valid_length.astype('float32')) 68 | length = valid_length.asscalar() 69 | gluon_np = out.asnumpy().squeeze(0) 70 | print(out) 71 | import numpy as np 72 | paddle_np = np.load(os.path.expanduser( 73 | 'ernie_top_layer_emb.npy')) 74 | np.testing.assert_array_almost_equal(paddle_np, gluon_np, decimal=6) 75 | break 76 | print("verify success") -------------------------------------------------------------------------------- /machine_translation/nmt/translation.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Machine translation models and translators.""" 20 | 21 | 22 | __all__ = ['BeamSearchTranslator'] 23 | 24 | import numpy as np 25 | import mxnet as mx 26 | from gluonnlp.model import BeamSearchScorer, BeamSearchSampler 27 | 28 | class BeamSearchTranslator(object): 29 | """Beam Search Translator 30 | 31 | Parameters 32 | ---------- 33 | model : NMTModel 34 | The neural machine translation model 35 | beam_size : int 36 | Size of the beam 37 | scorer : BeamSearchScorer 38 | Score function used in beamsearch 39 | max_length : int 40 | The maximum decoding length 41 | """ 42 | def __init__(self, model, beam_size=1, scorer=BeamSearchScorer(), max_length=100): 43 | self._model = model 44 | self._sampler = BeamSearchSampler( 45 | decoder=self._decode_logprob, 46 | beam_size=beam_size, 47 | eos_id=model.tgt_vocab.token_to_idx[model.tgt_vocab.eos_token], 48 | scorer=scorer, 49 | max_length=max_length) 50 | 51 | def _decode_logprob(self, step_input, states): 52 | out, states, _ = self._model.decode_step(step_input, states) 53 | return mx.nd.log_softmax(out), states 54 | 55 | def translate(self, src_seq, src_valid_length): 56 | """Get the translation result given the input sentence. 57 | 58 | Parameters 59 | ---------- 60 | src_seq : mx.nd.NDArray 61 | Shape (batch_size, length) 62 | src_valid_length : mx.nd.NDArray 63 | Shape (batch_size,) 64 | 65 | Returns 66 | ------- 67 | samples : NDArray 68 | Samples draw by beam search. Shape (batch_size, beam_size, length). dtype is int32. 69 | scores : NDArray 70 | Scores of the samples. Shape (batch_size, beam_size). We make sure that scores[i, :] are 71 | in descending order. 72 | valid_length : NDArray 73 | The valid length of the samples. Shape (batch_size, beam_size). dtype will be int32. 74 | """ 75 | batch_size = src_seq.shape[0] 76 | encoder_outputs, _ = self._model.encode(src_seq, valid_length=src_valid_length) 77 | decoder_states = self._model.decoder.init_state_from_encoder(encoder_outputs, 78 | src_valid_length) 79 | inputs = mx.nd.full(shape=(batch_size,), ctx=src_seq.context, dtype=np.float32, 80 | val=self._model.tgt_vocab.token_to_idx[self._model.tgt_vocab.bos_token]) 81 | samples, scores, sample_valid_length = self._sampler(inputs, decoder_states) 82 | return samples, scores, sample_valid_length 83 | -------------------------------------------------------------------------------- /intent_classification_and_slot_labelling/README.md: -------------------------------------------------------------------------------- 1 | # Joint Intent Classification and Slot Labeling with GluonNLP 2 | 3 | 4 | ## Introduction 5 | Intent classification and slot labeling are two essential problems in Natural Language Understanding (NLU). In _intent classification_, the agent needs to detect the intention that the speaker's utterance conveys. For example, when the speaker says "Book a flight from Long Beach to Seattle", the intention is to book a flight ticket. In _slot labeling_, the agent needs to extract the semantic entities that are related to the intent. 
In our previous example, "Long Beach" and "Seattle" are two semantic constituents related to the flight, i.e., the origin and the destination.
 6 | 
 7 | Essentially, _intent classification_ can be viewed as a sequence classification problem and _slot labeling_ can be viewed as a sequence tagging problem similar to Named-entity Recognition (NER). Due to their inner correlation, these two tasks are usually trained jointly with a multi-task objective function.
 8 | 
 9 | Here's one example from the ATIS dataset; it uses the [IOB2 format](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)).
10 | 
11 | | Sentence  | Tags                     | Intent Label |
12 | | --------- | ------------------------ | ------------ |
13 | | are       | O                        | atis_flight  |
14 | | there     | O                        |              |
15 | | any       | O                        |              |
16 | | flights   | O                        |              |
17 | | from      | O                        |              |
18 | | long      | B-fromloc.city_name      |              |
19 | | beach     | I-fromloc.city_name      |              |
20 | | to        | O                        |              |
21 | | columbus  | B-toloc.city_name        |              |
22 | | on        | O                        |              |
23 | | wednesday | B-depart_date.day_name   |              |
24 | | april     | B-depart_date.month_name |              |
25 | | sixteen   | B-depart_date.day_number |              |
26 | 
27 | 
28 | 
29 | In this example, we demonstrate how to use GluonNLP to build a model to perform joint intent classification and slot labeling. We choose to finetune a pretrained BERT model. We use two datasets, [ATIS](https://github.com/yvchen/JointSLU) and [SNIPS](https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines).
30 | 
31 | ## Requirements
32 | 
33 | ```
34 | mxnet
35 | gluonnlp
36 | seqeval
37 | ```
38 | 
39 | You may use pip or other tools to install these packages.
40 | 
41 | ## Experiment
42 | For the ATIS dataset, use the following command to run the experiment:
43 | ```bash
44 | python finetune_icsl.py --gpu 0 --dataset atis
45 | ```
46 | 
47 | It produces a final slot labeling F1 of `95.83%` and an intent classification accuracy of `98.66%`.
48 | 
49 | For the SNIPS dataset, use the following command to run the experiment:
50 | ```bash
51 | python finetune_icsl.py --gpu 0 --dataset snips
52 | ```
53 | It produces a final slot labeling F1 of `96.06%` and an intent classification accuracy of `98.71%`.
54 | 
55 | We also train the models with three random seeds and report the mean and standard deviation.
56 | 
57 | For ATIS:
58 | 
59 | | Models | Intent Acc (%) | Slot F1 (%) |
60 | | ------ | ------------------------ | ----------- |
61 | | [Intent Gating & self-attention, EMNLP 2018](https://www.aclweb.org/anthology/D18-1417) | 98.77 | 96.52 |
62 | | [BLSTM-CRF + ELMo, AAAI 2019](https://arxiv.org/abs/1811.05370) | 97.42 | 95.62 |
63 | | [Joint BERT, Arxiv 2019](https://arxiv.org/pdf/1902.10909.pdf) | 97.5 | 96.1 |
64 | | Ours | 98.66±0.00 | 95.88±0.04 |
65 | 
66 | For SNIPS:
67 | 
68 | | Models | Intent Acc (%) | Slot F1 (%) |
69 | | ------ | ------------------------ | ----------- |
70 | | [BLSTM-CRF + ELMo, AAAI 2019](https://arxiv.org/abs/1811.05370) | 99.29 | 93.90 |
71 | | [Joint BERT, Arxiv 2019](https://arxiv.org/pdf/1902.10909.pdf) | 98.60 | 97.00 |
72 | | Ours | 98.81±0.13 | 95.94±0.10 |
73 | 
--------------------------------------------------------------------------------
/machine_translation/nmt/dataset.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | # Licensed to the Apache Software Foundation (ASF) under one
 4 | # or more contributor license agreements. See the NOTICE file
 5 | # distributed with this work for additional information
 6 | # regarding copyright ownership.
The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # pylint:disable=redefined-outer-name,logging-format-interpolation 20 | """Translation datasets.""" 21 | 22 | 23 | __all__ = ['TOY'] 24 | 25 | import os 26 | from gluonnlp.base import get_home_dir 27 | from gluonnlp.data.translation import _TranslationDataset, _get_pair_key 28 | from gluonnlp.data.registry import register 29 | 30 | 31 | @register(segment=['train', 'val', 'test']) 32 | class TOY(_TranslationDataset): 33 | """A Small Translation Dataset for Testing Scripts. 34 | 35 | Parameters 36 | ---------- 37 | segment : str or list of str, default 'train' 38 | Dataset segment. Options are 'train', 'val', 'test' or their combinations. 39 | src_lang : str, default 'en' 40 | The source language. Option for source and target languages are 'en' <-> 'de' 41 | tgt_lang : str, default 'de' 42 | The target language. Option for source and target languages are 'en' <-> 'de' 43 | root : str, default '$MXNET_HOME/datasets/translation_test' 44 | Path to temp folder for storing data. 45 | MXNET_HOME defaults to '~/.mxnet'. 46 | """ 47 | def __init__(self, segment='train', src_lang='en', tgt_lang='de', 48 | root=os.path.join(get_home_dir(), 'datasets', 'translation_test')): 49 | self._supported_segments = ['train', 'val', 'test'] 50 | self._archive_file = {_get_pair_key('en', 'de'): 51 | ('translation_test.zip', 52 | '14f6c8e31ac6ec84ce469b4c196d60b4c86a179d')} 53 | self._data_file = {_get_pair_key('en', 'de'): 54 | {'train_en': ('train.en', 55 | 'aa7f22b91eb93390fd342a57a81f51f53ed29542'), 56 | 'train_de': ('train.de', 57 | 'f914217ce23ddd8cac07e761a75685c043d4f6d3'), 58 | 'val_en': ('train.en', 59 | 'aa7f22b91eb93390fd342a57a81f51f53ed29542'), 60 | 'val_de': ('train.de', 61 | 'f914217ce23ddd8cac07e761a75685c043d4f6d3'), 62 | 'test_en': ('train.en', 63 | 'aa7f22b91eb93390fd342a57a81f51f53ed29542'), 64 | 'test_de': ('train.de', 65 | 'f914217ce23ddd8cac07e761a75685c043d4f6d3'), 66 | 'vocab_en': ('vocab.en.json', 67 | 'c7c6af4603ea70f0a4af2460a622333fbd014050'), 68 | 'vocab_de' : ('vocab.de.json', 69 | '5b6f1be36a3e3cb9946b86e5d0fc73d164fda99f')}} 70 | super(TOY, self).__init__('translation_test', segment=segment, src_lang=src_lang, 71 | tgt_lang=tgt_lang, root=root) 72 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/ner_utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Common utilities for the named entity recognition task.""" 20 | 21 | import argparse 22 | import pickle 23 | from collections import namedtuple 24 | 25 | import mxnet as mx 26 | import gluonnlp as nlp 27 | 28 | __all__ = ['get_bert_model', 'get_bert_dataset_name', 'get_context', 29 | 'dump_metadata'] 30 | 31 | BERTModelMetadata = namedtuple('BERTModelMetadata', ['config', 'tag_vocab']) 32 | 33 | def _metadata_file_path(checkpoint_prefix): 34 | """Gets the file path for meta data""" 35 | return checkpoint_prefix + '_metadata.pkl' 36 | 37 | 38 | def dump_metadata(config, tag_vocab): 39 | """Dumps meta-data to the configured path""" 40 | metadata = BERTModelMetadata(config=config, tag_vocab=tag_vocab) 41 | with open(_metadata_file_path(config.save_checkpoint_prefix), 'wb') as ofp: 42 | pickle.dump(metadata, ofp) 43 | 44 | 45 | def load_metadata(checkpoint_prefix): 46 | """Loads meta-data to the configured path""" 47 | with open(_metadata_file_path(checkpoint_prefix), 'rb') as ifp: 48 | metadata = pickle.load(ifp) 49 | return metadata.config, metadata.tag_vocab 50 | 51 | 52 | def get_context(gpu_index): 53 | """This method gets context of execution""" 54 | context = None 55 | if gpu_index is None or gpu_index == '': 56 | context = mx.cpu() 57 | if isinstance(gpu_index, int): 58 | context = mx.gpu(gpu_index) 59 | return context 60 | 61 | 62 | def str2bool(v): 63 | """Utility function for parsing boolean in argparse 64 | 65 | https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse 66 | 67 | :param v: value of the argument 68 | :return: 69 | """ 70 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 71 | return True 72 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 73 | return False 74 | else: 75 | raise argparse.ArgumentTypeError('Boolean value expected.') 76 | 77 | 78 | def get_bert_dataset_name(is_cased): 79 | """Returns relevant BERT dataset name, depending on whether we are using a cased model. 80 | 81 | Parameters 82 | ---------- 83 | is_cased: bool 84 | Whether we are using a cased model. 85 | 86 | Returns 87 | ------- 88 | str: Named of the BERT dataset. 89 | 90 | """ 91 | if is_cased: 92 | return 'book_corpus_wiki_en_cased' 93 | else: 94 | return 'book_corpus_wiki_en_uncased' 95 | 96 | 97 | def get_bert_model(bert_model, cased, ctx, dropout_prob): 98 | """Get pre-trained BERT model.""" 99 | bert_dataset_name = get_bert_dataset_name(cased) 100 | 101 | return nlp.model.get_model( 102 | name=bert_model, 103 | dataset_name=bert_dataset_name, 104 | pretrained=True, 105 | ctx=ctx, 106 | use_pooler=False, 107 | use_decoder=False, 108 | use_classifier=False, 109 | dropout=dropout_prob, 110 | embed_dropout=dropout_prob) 111 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/model/qa.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. 
See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """BertForQA models.""" 20 | 21 | __all__ = ['BertForQA', 'BertForQALoss'] 22 | 23 | from mxnet.gluon import Block, loss, nn 24 | from mxnet.gluon.loss import Loss 25 | 26 | 27 | class BertForQA(Block): 28 | """Model for SQuAD task with BERT. 29 | 30 | The model feeds token ids and token type ids into BERT to get the 31 | pooled BERT sequence representation, then apply a Dense layer for QA task. 32 | 33 | Parameters 34 | ---------- 35 | bert: BERTModel 36 | Bidirectional encoder with transformer. 37 | prefix : str or None 38 | See document of `mx.gluon.Block`. 39 | params : ParameterDict or None 40 | See document of `mx.gluon.Block`. 41 | """ 42 | 43 | def __init__(self, bert, prefix=None, params=None): 44 | super(BertForQA, self).__init__(prefix=prefix, params=params) 45 | self.bert = bert 46 | with self.name_scope(): 47 | self.span_classifier = nn.Dense(units=2, flatten=False) 48 | 49 | def forward(self, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ 50 | """Generate the unnormalized score for the given the input sequences. 51 | 52 | Parameters 53 | ---------- 54 | inputs : NDArray, shape (batch_size, seq_length) 55 | Input words for the sequences. 56 | token_types : NDArray, shape (batch_size, seq_length) 57 | Token types for the sequences, used to indicate whether the word belongs to the 58 | first sentence or the second one. 59 | valid_length : NDArray or None, shape (batch_size,) 60 | Valid length of the sequence. This is used to mask the padded tokens. 61 | 62 | Returns 63 | ------- 64 | outputs : NDArray 65 | Shape (batch_size, seq_length, 2) 66 | """ 67 | bert_output = self.bert(inputs, token_types, valid_length) 68 | output = self.span_classifier(bert_output) 69 | return output 70 | 71 | 72 | class BertForQALoss(Loss): 73 | """Loss for SQuAD task with BERT. 74 | 75 | """ 76 | 77 | def __init__(self, weight=None, batch_axis=0, **kwargs): # pylint: disable=unused-argument 78 | super(BertForQALoss, self).__init__( 79 | weight=None, batch_axis=0, **kwargs) 80 | self.loss = loss.SoftmaxCELoss() 81 | 82 | def hybrid_forward(self, F, pred, label): # pylint: disable=arguments-differ 83 | """ 84 | Parameters 85 | ---------- 86 | pred : NDArray, shape (batch_size, seq_length, 2) 87 | BERTSquad forward output. 88 | label : list, length is 2, each shape is (batch_size,1) 89 | label[0] is the starting position of the answer, 90 | label[1] is the ending position of the answer. 
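For example, if an answer span in the batch starts at token position 5 and ends at token position 8, the corresponding entries of label[0] and label[1] are 5 and 8.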
91 | 92 | Returns 93 | ------- 94 | outputs : NDArray 95 | Shape (batch_size,) 96 | """ 97 | pred = F.split(pred, axis=2, num_outputs=2) 98 | start_pred = pred[0].reshape((0, -3)) 99 | start_label = label[0] 100 | end_pred = pred[1].reshape((0, -3)) 101 | end_label = label[1] 102 | return (self.loss(start_pred, start_label) + self.loss( 103 | end_pred, end_label)) / 2 104 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/model/ner.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Gluon model block for the named entity recognition task.""" 20 | 21 | import mxnet as mx 22 | from mxnet.gluon import Block, nn 23 | 24 | 25 | class BERTTagger(Block): 26 | """Model for sequence tagging with BERT 27 | 28 | Parameters 29 | ---------- 30 | bert_model: BERTModel 31 | Bidirectional encoder with transformer. 32 | num_tag_types: int 33 | number of possible tags 34 | dropout_prob: float 35 | dropout probability for the last layer 36 | prefix: str or None 37 | See document of `mx.gluon.Block`. 38 | params: ParameterDict or None 39 | See document of `mx.gluon.Block`. 40 | """ 41 | 42 | def __init__(self, bert_model, num_tag_types, dropout_prob, prefix=None, params=None): 43 | super(BERTTagger, self).__init__(prefix=prefix, params=params) 44 | self.bert_model = bert_model 45 | with self.name_scope(): 46 | self.tag_classifier = nn.Dense(units=num_tag_types, flatten=False) 47 | self.dropout = nn.Dropout(rate=dropout_prob) 48 | 49 | def forward(self, token_ids, token_types, valid_length): # pylint: disable=arguments-differ 50 | """Generate an unnormalized score for the tag of each token 51 | 52 | Parameters 53 | ---------- 54 | token_ids: NDArray, shape (batch_size, seq_length) 55 | ID of tokens in sentences 56 | See `input` of `glounnlp.model.BERTModel` 57 | token_types: NDArray, shape (batch_size, seq_length) 58 | See `glounnlp.model.BERTModel` 59 | valid_length: NDArray, shape (batch_size,) 60 | See `glounnlp.model.BERTModel` 61 | 62 | Returns 63 | ------- 64 | NDArray, shape (batch_size, seq_length, num_tag_types): 65 | Unnormalized prediction scores for each tag on each position. 66 | """ 67 | bert_output = self.dropout(self.bert_model(token_ids, token_types, valid_length)) 68 | output = self.tag_classifier(bert_output) 69 | return output 70 | 71 | 72 | def attach_prediction(data_loader, net, ctx, is_train): 73 | """Attach the prediction from a model to a data loader as the last field. 74 | 75 | Parameters 76 | ---------- 77 | data_loader: mx.gluon.data.DataLoader 78 | Input data from `bert_model.BERTTaggingDataset._encode_as_input`. 
79 | net: mx.gluon.Block 80 | gluon `Block` for making the prediction. 81 | ctx: 82 | The context that the data should be loaded to. 83 | is_train: 84 | Whether the forward pass should be made with `mx.autograd.record()`. 85 | 86 | Returns 87 | ------- 88 | All fields from `bert_model.BERTTaggingDataset._encode_as_input`, 89 | as well as the prediction of the model. 90 | 91 | """ 92 | for data in data_loader: 93 | text_ids, token_types, valid_length, tag_ids, flag_nonnull_tag = \ 94 | [x.astype('float32').as_in_context(ctx) for x in data] 95 | 96 | from contextlib import ExitStack 97 | with ExitStack() as stack: 98 | if is_train: 99 | stack.enter_context(mx.autograd.record()) 100 | out = net(text_ids, token_types, valid_length) 101 | yield text_ids, token_types, valid_length, tag_ids, flag_nonnull_tag, out 102 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # 'License'); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # pylint:disable=redefined-outer-name,logging-format-interpolation 20 | """PyTorch BERT parameter naming to Gluon BERT parameter naming. 21 | 22 | Given a Gluon BERT model (e.g. obtained with the convert_tf_gluon.py script) and 23 | a pytorch_model.bin containing the same parameters, this script infers the 24 | naming convention of PyTorch.
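A typical invocation (the checkpoint directory below is a placeholder) is:

    python infer_pytorch_gluon_parameter_name_mapping.py \
        --model bert_12_768_12 \
        --dataset_name scibert_scivocab_uncased \
        --pytorch_checkpoint_dir /path/to/pytorch_checkpoint \
        --out gluon_to_pytorch_naming.json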
25 | 26 | """ 27 | 28 | import argparse 29 | import json 30 | import logging 31 | import os 32 | import sys 33 | 34 | import gluonnlp as nlp 35 | import torch 36 | 37 | sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) 38 | from utils import load_text_vocab, tf_vocab_to_gluon_vocab 39 | 40 | parser = argparse.ArgumentParser(description='Pytorch BERT Naming Convention', 41 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 42 | parser.add_argument('--model', type=str, default='bert_12_768_12', 43 | choices=['bert_12_768_12', 'bert_24_1024_16'], help='BERT model name') 44 | parser.add_argument('--dataset_name', type=str, default='scibert_scivocab_uncased', 45 | help='Dataset name') 46 | parser.add_argument('--pytorch_checkpoint_dir', type=str, 47 | help='Path to Tensorflow checkpoint folder.') 48 | parser.add_argument('--debug', action='store_true', help='debugging mode') 49 | parser.add_argument('--out', default='gluon_to_pytorch_naming.json', 50 | help='Output file to store gluon to pytorch name mapping.') 51 | args = parser.parse_args() 52 | logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO) 53 | logging.info(args) 54 | 55 | # Load Gluon Model 56 | bert, vocab = nlp.model.get_model(args.model, dataset_name=args.dataset_name, pretrained=True) 57 | parameters = bert._collect_params_with_prefix() 58 | parameters = {k: v.data().asnumpy() for k, v in parameters.items()} 59 | 60 | # Load PyTorch Model 61 | pytorch_parameters = torch.load(os.path.join(args.pytorch_checkpoint_dir, 'pytorch_model.bin'), 62 | map_location=lambda storage, loc: storage) 63 | pytorch_vocab = tf_vocab_to_gluon_vocab( 64 | load_text_vocab(os.path.join(args.pytorch_checkpoint_dir, 'vocab.txt'))) 65 | pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()} 66 | 67 | # Assert that vocabularies are equal 68 | assert pytorch_vocab.idx_to_token == vocab.idx_to_token 69 | 70 | mapping = dict() 71 | 72 | for name, param in parameters.items(): 73 | found_match = False 74 | for pytorch_name, pytorch_param in pytorch_parameters.items(): 75 | if param.shape == pytorch_param.shape: 76 | if (param == pytorch_param).all(): 77 | if found_match: 78 | print('Found multiple matches for {}. ' 79 | 'Ignoring new match {}'.format(name, pytorch_name)) 80 | else: 81 | found_match = True 82 | mapping.update({name: pytorch_name}) 83 | 84 | # We don't break here, in case there are mulitple matches 85 | 86 | if not found_match: 87 | raise RuntimeError('Pytorch and Gluon model do not match. ' 88 | 'Cannot infer mapping of names.') 89 | 90 | assert len(mapping) == len(parameters) 91 | 92 | with open(args.out, 'w') as f: 93 | json.dump(mapping, f, indent=" ") 94 | print('Wrote mapping to {}'.format(args.out)) 95 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/model/classification.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """BERT models.""" 20 | 21 | __all__ = ['BERTClassifier', 'BERTRegression'] 22 | 23 | from mxnet.gluon import Block 24 | from mxnet.gluon import nn 25 | 26 | class BERTRegression(Block): 27 | """Model for sentence (pair) regression task with BERT. 28 | 29 | The model feeds token ids and token type ids into BERT to get the 30 | pooled BERT sequence representation, then apply a Dense layer for 31 | regression. 32 | 33 | Parameters 34 | ---------- 35 | bert: BERTModel 36 | Bidirectional encoder with transformer. 37 | dropout : float or None, default 0.0. 38 | Dropout probability for the bert output. 39 | prefix : str or None 40 | See document of `mx.gluon.Block`. 41 | params : ParameterDict or None 42 | See document of `mx.gluon.Block`. 43 | """ 44 | 45 | def __init__(self, bert, dropout=0.0, prefix=None, params=None): 46 | super(BERTRegression, self).__init__(prefix=prefix, params=params) 47 | self.bert = bert 48 | with self.name_scope(): 49 | self.regression = nn.HybridSequential(prefix=prefix) 50 | if dropout: 51 | self.regression.add(nn.Dropout(rate=dropout)) 52 | self.regression.add(nn.Dense(1)) 53 | 54 | def forward(self, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ 55 | """Generate the unnormalized score for the given the input sequences. 56 | 57 | Parameters 58 | ---------- 59 | inputs : NDArray, shape (batch_size, seq_length) 60 | Input words for the sequences. 61 | token_types : NDArray, shape (batch_size, seq_length) 62 | Token types for the sequences, used to indicate whether the word belongs to the 63 | first sentence or the second one. 64 | valid_length : NDArray or None, shape (batch_size) 65 | Valid length of the sequence. This is used to mask the padded tokens. 66 | 67 | Returns 68 | ------- 69 | outputs : NDArray 70 | Shape (batch_size, num_classes) 71 | """ 72 | _, pooler_out = self.bert(inputs, token_types, valid_length) 73 | return self.regression(pooler_out) 74 | 75 | 76 | class BERTClassifier(Block): 77 | """Model for sentence (pair) classification task with BERT. 78 | 79 | The model feeds token ids and token type ids into BERT to get the 80 | pooled BERT sequence representation, then apply a Dense layer for 81 | classification. 82 | 83 | Parameters 84 | ---------- 85 | bert: BERTModel 86 | Bidirectional encoder with transformer. 87 | num_classes : int, default is 2 88 | The number of target classes. 89 | dropout : float or None, default 0.0. 90 | Dropout probability for the bert output. 91 | prefix : str or None 92 | See document of `mx.gluon.Block`. 93 | params : ParameterDict or None 94 | See document of `mx.gluon.Block`. 
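A minimal usage sketch (the model and dataset names below are only examples;
any pre-trained BERT obtained from `gluonnlp.model.get_model` with its pooler
enabled works):

    import gluonnlp as nlp
    bert, _ = nlp.model.get_model('bert_12_768_12',
                                  dataset_name='book_corpus_wiki_en_uncased',
                                  pretrained=True, use_decoder=False,
                                  use_classifier=False)
    model = BERTClassifier(bert, num_classes=2, dropout=0.1)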
95 | """ 96 | 97 | def __init__(self, 98 | bert, 99 | num_classes=2, 100 | dropout=0.0, 101 | prefix=None, 102 | params=None): 103 | super(BERTClassifier, self).__init__(prefix=prefix, params=params) 104 | self.bert = bert 105 | with self.name_scope(): 106 | self.classifier = nn.HybridSequential(prefix=prefix) 107 | if dropout: 108 | self.classifier.add(nn.Dropout(rate=dropout)) 109 | self.classifier.add(nn.Dense(units=num_classes)) 110 | 111 | def forward(self, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ 112 | """Generate the unnormalized score for the given the input sequences. 113 | 114 | Parameters 115 | ---------- 116 | inputs : NDArray, shape (batch_size, seq_length) 117 | Input words for the sequences. 118 | token_types : NDArray, shape (batch_size, seq_length) 119 | Token types for the sequences, used to indicate whether the word belongs to the 120 | first sentence or the second one. 121 | valid_length : NDArray or None, shape (batch_size) 122 | Valid length of the sequence. This is used to mask the padded tokens. 123 | 124 | Returns 125 | ------- 126 | outputs : NDArray 127 | Shape (batch_size, num_classes) 128 | """ 129 | _, pooler_out = self.bert(inputs, token_types, valid_length) 130 | return self.classifier(pooler_out) 131 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/data/transform.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and DMLC. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """BERT dataset transform.""" 16 | 17 | from __future__ import absolute_import 18 | 19 | __all__ = ['BERTDatasetTransform'] 20 | 21 | import numpy as np 22 | from gluonnlp.data import BERTSentenceTransform 23 | 24 | class BERTDatasetTransform(object): 25 | """Dataset transformation for BERT-style sentence classification or regression. 26 | 27 | Parameters 28 | ---------- 29 | tokenizer : BERTTokenizer. 30 | Tokenizer for the sentences. 31 | max_seq_length : int. 32 | Maximum sequence length of the sentences. 33 | labels : list of int , float or None. defaults None 34 | List of all label ids for the classification task and regressing task. 35 | If labels is None, the default task is regression 36 | pad : bool, default True 37 | Whether to pad the sentences to maximum length. 38 | pair : bool, default True 39 | Whether to transform sentences or sentence pairs. 
40 | label_dtype: int32 or float32, default float32 41 | label_dtype = int32 for classification task 42 | label_dtype = float32 for regression task 43 | """ 44 | 45 | def __init__(self, 46 | tokenizer, 47 | max_seq_length, 48 | class_labels=None, 49 | label_alias=None, 50 | pad=True, 51 | pair=True, 52 | has_label=True): 53 | self.class_labels = class_labels 54 | self.has_label = has_label 55 | self._label_dtype = 'int32' if class_labels else 'float32' 56 | if has_label and class_labels: 57 | self._label_map = {} 58 | for (i, label) in enumerate(class_labels): 59 | self._label_map[label] = i 60 | if label_alias: 61 | for key in label_alias: 62 | self._label_map[key] = self._label_map[label_alias[key]] 63 | self._bert_xform = BERTSentenceTransform( 64 | tokenizer, max_seq_length, pad=pad, pair=pair) 65 | 66 | def __call__(self, line): 67 | """Perform transformation for sequence pairs or single sequences. 68 | 69 | The transformation is processed in the following steps: 70 | - tokenize the input sequences 71 | - insert [CLS], [SEP] as necessary 72 | - generate type ids to indicate whether a token belongs to the first 73 | sequence or the second sequence. 74 | - generate valid length 75 | 76 | For sequence pairs, the input is a tuple of 3 strings: 77 | text_a, text_b and label. 78 | 79 | Inputs: 80 | text_a: 'is this jacksonville ?' 81 | text_b: 'no it is not' 82 | label: '0' 83 | Tokenization: 84 | text_a: 'is this jack ##son ##ville ?' 85 | text_b: 'no it is not .' 86 | Processed: 87 | tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]' 88 | type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 89 | valid_length: 14 90 | label: 0 91 | 92 | For single sequences, the input is a tuple of 2 strings: text_a and label. 93 | Inputs: 94 | text_a: 'the dog is hairy .' 95 | label: '1' 96 | Tokenization: 97 | text_a: 'the dog is hairy .' 98 | Processed: 99 | text_a: '[CLS] the dog is hairy . [SEP]' 100 | type_ids: 0 0 0 0 0 0 0 101 | valid_length: 7 102 | label: 1 103 | 104 | Parameters 105 | ---------- 106 | line: tuple of str 107 | Input strings. For sequence pairs, the input is a tuple of 3 strings: 108 | (text_a, text_b, label). For single sequences, the input is a tuple 109 | of 2 strings: (text_a, label). 110 | 111 | Returns 112 | ------- 113 | np.array: input token ids in 'int32', shape (batch_size, seq_length) 114 | np.array: valid length in 'int32', shape (batch_size,) 115 | np.array: input token type ids in 'int32', shape (batch_size, seq_length) 116 | np.array: classification task: label id in 'int32', shape (batch_size, 1), 117 | regression task: label in 'float32', shape (batch_size, 1) 118 | """ 119 | if self.has_label: 120 | input_ids, valid_length, segment_ids = self._bert_xform(line[:-1]) 121 | label = line[-1] 122 | # map to int if class labels are available 123 | if self.class_labels: 124 | label = self._label_map[label] 125 | label = np.array([label], dtype=self._label_dtype) 126 | return input_ids, valid_length, segment_ids, label 127 | else: 128 | return self._bert_xform(line) 129 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/predict_ner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Licensed to the Apache Software Foundation (ASF) under one 5 | # or more contributor license agreements. See the NOTICE file 6 | # distributed with this work for additional information 7 | # regarding copyright ownership. 
The ASF licenses this file 8 | # to you under the Apache License, Version 2.0 (the 9 | # "License"); you may not use this file except in compliance 10 | # with the License. You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, 15 | # software distributed under the License is distributed on an 16 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17 | # KIND, either express or implied. See the License for the 18 | # specific language governing permissions and limitations 19 | # under the License. 20 | """Script for NER prediction.""" 21 | 22 | import argparse 23 | import logging 24 | import os 25 | 26 | import mxnet as mx 27 | from ner_utils import get_bert_model, get_context 28 | from ner_utils import load_metadata 29 | from data.ner import BERTTaggingDataset, convert_arrays_to_text 30 | from model.ner import BERTTagger 31 | 32 | # TODO(bikestra): Currently, our evaluation is dependent on this package. 33 | # Figure out whether to take actual dependency on it. 34 | try: 35 | import seqeval.metrics 36 | except ImportError: 37 | raise ImportError('seqeval is required to run NER on BERT. Please ' 38 | 'install it via pip3 install seqeval --user') 39 | 40 | 41 | def _find_model_file_from_checkpoint(checkpoint_prefix: str): 42 | """Load model checkpoint""" 43 | dirname, file_prefix = os.path.split(checkpoint_prefix) 44 | # find checkpoint file names and sort by name to find the most recent one. 45 | checkpoint_filenames = ([f for f in os.listdir(dirname) 46 | if f.startswith(file_prefix) 47 | and f.endswith(os.path.extsep + 'params')]) 48 | last_checkpoint_filename = max(checkpoint_filenames) 49 | logging.info('found checkpoint filename: {:s}'.format(last_checkpoint_filename)) 50 | last_checkpoint_path = os.path.join(dirname, last_checkpoint_filename) 51 | return last_checkpoint_path 52 | 53 | 54 | def parse_args(): 55 | """Parse command line arguments.""" 56 | arg_parser = argparse.ArgumentParser( 57 | description='Predict on CoNLL format data using BERT-based named entity recognition model', 58 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 59 | 60 | # data file paths 61 | arg_parser.add_argument('--test-path', type=str, required=True, 62 | help='Path to the test data file') 63 | arg_parser.add_argument('--seq-len', type=int, default=200, 64 | help='The length of the sequence input to BERT.' 65 | ' An exception will raised if this is not large enough.') 66 | arg_parser.add_argument('--load-checkpoint-prefix', type=str, required=False, default=None, 67 | help='Prefix of model checkpoint file') 68 | 69 | arg_parser.add_argument('--gpu', type=int, 70 | help='Number (index) of GPU to run on, e.g. 0. 
' 71 | 'If not specified, CPU context is used.') 72 | arg_parser.add_argument('--batch-size', type=int, default=32, help='Batch size for training') 73 | args = arg_parser.parse_args() 74 | return args 75 | 76 | 77 | def main(config): 78 | """Main method for predicting BERT-based NER model on CoNLL-formatted test data.""" 79 | train_config, tag_vocab = load_metadata(config.load_checkpoint_prefix) 80 | 81 | ctx = get_context(config.gpu) 82 | bert_model, text_vocab = get_bert_model(train_config.bert_model, train_config.cased, ctx, 83 | train_config.dropout_prob) 84 | 85 | dataset = BERTTaggingDataset(text_vocab, None, None, config.test_path, 86 | config.seq_len, train_config.cased, tag_vocab=tag_vocab) 87 | 88 | test_data_loader = dataset.get_test_data_loader(config.batch_size) 89 | 90 | net = BERTTagger(bert_model, dataset.num_tag_types, train_config.dropout_prob) 91 | model_filename = _find_model_file_from_checkpoint(config.load_checkpoint_prefix) 92 | net.load_parameters(model_filename, ctx=ctx) 93 | 94 | net.hybridize(static_alloc=True) 95 | 96 | loss_function = mx.gluon.loss.SoftmaxCrossEntropyLoss() 97 | loss_function.hybridize(static_alloc=True) 98 | 99 | # TODO(bikestra): make it not redundant between train and predict 100 | def evaluate(data_loader): 101 | """Eval function""" 102 | predictions = [] 103 | 104 | for batch_id, data in enumerate(data_loader): 105 | logging.info('evaluating on batch index: %d/%d', batch_id, len(data_loader)) 106 | text_ids, token_types, valid_length, tag_ids, _ = \ 107 | [x.astype('float32').as_in_context(ctx) for x in data] 108 | out = net(text_ids, token_types, valid_length) 109 | 110 | # convert results to numpy arrays for easier access 111 | np_text_ids = text_ids.astype('int32').asnumpy() 112 | np_pred_tags = out.argmax(axis=-1).asnumpy() 113 | np_valid_length = valid_length.astype('int32').asnumpy() 114 | np_true_tags = tag_ids.asnumpy() 115 | 116 | predictions += convert_arrays_to_text(text_vocab, dataset.tag_vocab, np_text_ids, 117 | np_true_tags, np_pred_tags, np_valid_length) 118 | 119 | all_true_tags = [[entry.true_tag for entry in entries] for entries in predictions] 120 | all_pred_tags = [[entry.pred_tag for entry in entries] for entries in predictions] 121 | seqeval_f1 = seqeval.metrics.f1_score(all_true_tags, all_pred_tags) 122 | return seqeval_f1 123 | 124 | test_f1 = evaluate(test_data_loader) 125 | logging.info('test f1: {:.3f}'.format(test_f1)) 126 | 127 | 128 | if __name__ == '__main__': 129 | logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', 130 | level=logging.DEBUG, datefmt='%Y-%m-%d %I:%M:%S') 131 | logging.getLogger().setLevel(logging.INFO) 132 | main(parse_args()) 133 | -------------------------------------------------------------------------------- /sequence_generation/text_generation/index.rst: -------------------------------------------------------------------------------- 1 | Text Generation 2 | --------------- 3 | 4 | :download:`[Download] ` 5 | 6 | Sampling a Language Model 7 | +++++++++++++++++++++++++ 8 | 9 | This script can be used to generate sentences using beam search or a sequence sampler, to sample from a pre-trained language model such as GPT-2. For example: 10 | 11 | .. code-block:: console 12 | 13 | $ python sequence_sampling.py random-sample \ 14 | --bos 'Deep learning and natural language processing' \ 15 | --beam-size 1 --print-num 1 \ 16 | --lm-model gpt2_345m # options are {gpt2_117m, gpt2_345m} \ 17 | --max-length 1024 18 | 19 | Output is 20 | 21 | .. 
code-block:: console 22 | 23 | Sampling Parameters: beam_size=1, temperature=1.0, use_top_k=None 24 | Generation Result: 25 | ['Deep learning and natural language processing brought application choice in healthcare and perception of sounds and heat to new heights, enriching our physical communities with medical devices and creating vibrant cultures. Anecdote is slowly diminishing but is hardly obsolete nor more appealing than experience.Despite those last words of wisdom, most headset makers even spook us with the complexity and poor code quality. the hard set a mere $150 and beginner creates center for getting started. Temp cheap:\nPosted by Fleegu at 12:02 PM<|endoftext|>', -461.15128] 26 | 27 | Sequence Sampler 28 | ~~~~~~~~~~~~~~~~ 29 | 30 | Use the following command to decode to sample from the multinomial distribution. 31 | 32 | .. code-block:: console 33 | 34 | $ python sequence_sampling.py random-sample --bos 'I love it' --beam-size 5 --print-num 5 35 | 36 | Output is 37 | 38 | .. code-block:: console 39 | 40 | Sampling Parameters: beam_size=5, temperature=1.0, use_top_k=None 41 | Generation Result: 42 | ['I love it in reference to the northwestern country. replay Liberties were raised from the late 1943 to June ', -89.459656] 43 | ['I love it to them. Very account suggests that there is no basis as to whether the constellations are ', -72.687996] 44 | ['I love it for quick attempts. It does not have any factors, and [the cause] has ', -64.87619] 45 | ['I love it one in the English language, and say it was not for English the same standard than ', -71.51008] 46 | ['I love it to take care of her; following many attempts to appease the Canadian military and making some ', -75.5512] 47 | 48 | You can also try a lower temperature such as 0.95, which results in sharper distribution. 49 | 50 | .. code-block:: console 51 | 52 | $ python sequence_sampling.py random-sample --bos 'I love it' --beam-size 5 --print-num 5 --temperature 0.95 53 | 54 | Output is 55 | 56 | .. code-block:: console 57 | 58 | Sampling Parameters: beam_size=5, temperature=0.95, use_top_k=None 59 | Generation Result: 60 | ['I love it and flew by (a colleague Due to his delicate and non-serious attacks ', -85.825195] 61 | ['I love it in a short anticipated 1927 hiatus. As a result, it was able to withstand changes ', -71.8867] 62 | ['I love it for analysis. ', -15.78739] 63 | ['I love it his own. The total of one hundred lives of all other documented in the Congo ', -68.57835] 64 | ['I love it in his Why My Woman to Get Out of Graham Your Way. ', -65.74211] 65 | 66 | Finally, you can also try to constrain the sampling to sample only from the top-k tokens. 67 | 68 | .. code-block:: console 69 | 70 | $ python sequence_sampling.py random-sample --bos 'I love it' --beam-size 5 --print-num 5 --temperature 0.95 --use-top-k 800 71 | 72 | Output is 73 | 74 | .. code-block:: console 75 | 76 | Sampling Parameters: beam_size=5, temperature=0.95, use_top_k=800 77 | Generation Result: 78 | ['I love it. It is the same as the Old Age. The best known of this is the ', -30.544556] 79 | ['I love it and had a weak start by a group of only three-year-old fans. ', -44.970097] 80 | ['I love it ". ', -4.725212] 81 | ['I love it with the . ', -7.236909] 82 | ['I love it and its working-based ". ', -25.340023] 83 | 84 | Beam Search Generator 85 | ~~~~~~~~~~~~~~~~~~~~~ 86 | 87 | Use the following command to decode using beam search. 88 | 89 | .. 
code-block:: console 90 | 91 | $ python sequence_sampling.py beam-search --bos 'I love it' --beam-size 5 --print-num 5 92 | 93 | Output is 94 | 95 | .. code-block:: console 96 | 97 | Beam Seach Parameters: beam_size=5, alpha=0.0, K=5 98 | Generation Result: 99 | ['I love it. ', -2.6606221] 100 | ['I love it. "', -4.072001] 101 | ['I love it, and the of the . ', -14.573] 102 | ['I love it, and the of the . The of the , the , ', -28.968985] 103 | ['I love it, and the of the . The of the , the and ', -30.064144] 104 | 105 | You can also try a larger beam size, such as 15. 106 | 107 | .. code-block:: console 108 | 109 | $ python sequence_sampling.py beam-search --bos 'I love it' --beam-size 15 --print-num 15 110 | 111 | Output is 112 | 113 | .. code-block:: console 114 | 115 | Beam Seach Parameters: beam_size=15, alpha=0.0, K=5 116 | Generation Result: 117 | ['I love it. ', -2.6606221] 118 | ['I love it. "', -4.072001] 119 | ['I love it ". ', -5.222643] 120 | ['I love it, and the of the . ', -14.573] 121 | ['I love it. It was the first time in the history of the history of the history of the ', -21.041868] 122 | ['I love it. It was the first time in the history of the history of the country. ', -21.262276] 123 | ['I love it. It was the first time in the history of the history of the United States. ', -21.826159] 124 | ['I love it. It was the first time in the history of the history of the world. ', -21.930265] 125 | ['I love it. It was the first time in the history of the history of the country. The ', -21.94392] 126 | ['I love it. It was the first time in the history of the history of the city. ', -22.00894] 127 | ['I love it. It was the first time in the history of the history of the country that the ', -22.152416] 128 | ['I love it. It was the first time in the history of the history of the United States, ', -22.170143] 129 | ['I love it. It was the first time in the history of the history of the country, and ', -22.188667] 130 | ['I love it. It was the first time in the history of the history of the United States that ', -22.254015] 131 | ['I love it. It was the first time in the history of the history of the state. ', -22.398975] 132 | -------------------------------------------------------------------------------- /machine_translation/dataprocessor.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | """Data preprocessing for transformer.""" 20 | 21 | import os 22 | import io 23 | import time 24 | import numpy as np 25 | import mxnet as mx 26 | from mxnet import gluon 27 | import gluonnlp as nlp 28 | import nmt 29 | import hyperparameters as hparams 30 | 31 | def cache_dataset(dataset, prefix): 32 | """Cache the processed dataset into an npz file. 33 | 34 | Parameters 35 | ---------- 36 | dataset : SimpleDataset 37 | prefix : str 38 | """ 39 | if not os.path.exists(nmt._constants.CACHE_PATH): 40 | os.makedirs(nmt._constants.CACHE_PATH) 41 | src_data = np.concatenate([e[0] for e in dataset]) 42 | tgt_data = np.concatenate([e[1] for e in dataset]) 43 | src_cumlen = np.cumsum([0]+[len(e[0]) for e in dataset]) 44 | tgt_cumlen = np.cumsum([0]+[len(e[1]) for e in dataset]) 45 | np.savez(os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz'), 46 | src_data=src_data, tgt_data=tgt_data, 47 | src_cumlen=src_cumlen, tgt_cumlen=tgt_cumlen) 48 | 49 | 50 | def load_cached_dataset(prefix): 51 | cached_file_path = os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz') 52 | if os.path.exists(cached_file_path): 53 | print('Loading dataset...') 54 | npz_data = np.load(cached_file_path) 55 | src_data, tgt_data, src_cumlen, tgt_cumlen = [npz_data[n] for n in 56 | ['src_data', 'tgt_data', 'src_cumlen', 'tgt_cumlen']] 57 | src_data = np.array([src_data[low:high] for low, high in zip(src_cumlen[:-1], src_cumlen[1:])]) 58 | tgt_data = np.array([tgt_data[low:high] for low, high in zip(tgt_cumlen[:-1], tgt_cumlen[1:])]) 59 | return gluon.data.ArrayDataset(np.array(src_data), np.array(tgt_data)) 60 | else: 61 | return None 62 | 63 | 64 | class TrainValDataTransform(object): 65 | """Transform the machine translation dataset. 66 | 67 | Clip the source and target sentences to the maximum length. For the source sentence, append the 68 | EOS. For the target sentence, append BOS and EOS.
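For example, assuming the vocabularies use the default '<bos>' and '<eos>' special tokens, TrainValDataTransform(src_vocab, tgt_vocab, 50, 50)('hello world .', 'hallo welt .') returns two int32 index arrays corresponding to 'hello world . <eos>' and '<bos> hallo welt . <eos>'.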
69 | 70 | Parameters 71 | ---------- 72 | src_vocab : Vocab 73 | tgt_vocab : Vocab 74 | src_max_len : int 75 | tgt_max_len : int 76 | """ 77 | 78 | def __init__(self, src_vocab, tgt_vocab, src_max_len=None, tgt_max_len=None): 79 | self._src_vocab = src_vocab 80 | self._tgt_vocab = tgt_vocab 81 | self._src_max_len = src_max_len 82 | self._tgt_max_len = tgt_max_len 83 | 84 | def __call__(self, src, tgt): 85 | if self._src_max_len: 86 | src_sentence = self._src_vocab[src.split()[:self._src_max_len]] 87 | else: 88 | src_sentence = self._src_vocab[src.split()] 89 | if self._tgt_max_len: 90 | tgt_sentence = self._tgt_vocab[tgt.split()[:self._tgt_max_len]] 91 | else: 92 | tgt_sentence = self._tgt_vocab[tgt.split()] 93 | src_sentence.append(self._src_vocab[self._src_vocab.eos_token]) 94 | tgt_sentence.insert(0, self._tgt_vocab[self._tgt_vocab.bos_token]) 95 | tgt_sentence.append(self._tgt_vocab[self._tgt_vocab.eos_token]) 96 | src_npy = np.array(src_sentence, dtype=np.int32) 97 | tgt_npy = np.array(tgt_sentence, dtype=np.int32) 98 | return src_npy, tgt_npy 99 | 100 | 101 | def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1): 102 | start = time.time() 103 | dataset_processed = dataset.transform(TrainValDataTransform(src_vocab, tgt_vocab, 104 | src_max_len, 105 | tgt_max_len), lazy=False) 106 | end = time.time() 107 | print('Processing Time spent: {}'.format(end - start)) 108 | return dataset_processed 109 | 110 | 111 | def load_translation_data(dataset, src_lang='en', tgt_lang='de'): 112 | """Load translation dataset 113 | 114 | Parameters 115 | ---------- 116 | dataset : str 117 | src_lang : str, default 'en' 118 | tgt_lang : str, default 'de' 119 | 120 | Returns 121 | ------- 122 | 123 | """ 124 | if dataset == 'WMT2014BPE': 125 | common_prefix = 'WMT2014BPE_{}_{}_{}_{}'.format(src_lang, tgt_lang, 126 | hparams.src_max_len, hparams.tgt_max_len) 127 | data_train = nlp.data.WMT2014BPE('train', src_lang=src_lang, tgt_lang=tgt_lang) 128 | data_val = nlp.data.WMT2014BPE('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang) 129 | data_test = nlp.data.WMT2014BPE('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang, 130 | full=False) 131 | elif dataset == 'TOY': 132 | common_prefix = 'TOY_{}_{}_{}_{}'.format(src_lang, tgt_lang, 133 | hparams.src_max_len, hparams.tgt_max_len) 134 | data_train = nmt.dataset.TOY('train', src_lang=src_lang, tgt_lang=tgt_lang) 135 | data_val = nmt.dataset.TOY('val', src_lang=src_lang, tgt_lang=tgt_lang) 136 | data_test = nmt.dataset.TOY('test', src_lang=src_lang, tgt_lang=tgt_lang) 137 | else: 138 | raise NotImplementedError 139 | src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab 140 | data_train_processed = load_cached_dataset(common_prefix + '_train') 141 | if not data_train_processed: 142 | data_train_processed = process_dataset(data_train, src_vocab, tgt_vocab, 143 | hparams.src_max_len, hparams.tgt_max_len) 144 | cache_dataset(data_train_processed, common_prefix + '_train') 145 | data_val_processed = load_cached_dataset(common_prefix + '_val') 146 | if not data_val_processed: 147 | data_val_processed = process_dataset(data_val, src_vocab, tgt_vocab) 148 | cache_dataset(data_val_processed, common_prefix + '_val') 149 | data_test_processed = load_cached_dataset(common_prefix + '_' + str(False) + '_test') 150 | if not data_test_processed: 151 | data_test_processed = process_dataset(data_test, src_vocab, tgt_vocab) 152 | cache_dataset(data_test_processed, common_prefix + '_' + str(False) + '_test') 153 | fetch_tgt_sentence = 
lambda src, tgt: tgt 154 | if dataset == 'WMT2014BPE': 155 | val_text = nlp.data.WMT2014('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang) 156 | test_text = nlp.data.WMT2014('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang, 157 | full=False) 158 | elif dataset == 'TOY': 159 | val_text = data_val 160 | test_text = data_test 161 | else: 162 | raise NotImplementedError 163 | val_tgt_sentences = list(val_text.transform(fetch_tgt_sentence)) 164 | test_tgt_sentences = list(test_text.transform(fetch_tgt_sentence)) 165 | return data_train_processed, data_val_processed, data_test_processed, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab 166 | 167 | 168 | def get_data_lengths(dataset): 169 | return list(dataset.transform(lambda srg, tgt: (len(srg), len(tgt)))) 170 | -------------------------------------------------------------------------------- /machine_translation/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | """Utilities for transformer.""" 20 | 21 | import numpy as np 22 | import math 23 | import mxnet as mx 24 | import time 25 | import logging 26 | import io 27 | import nmt 28 | import hyperparameters as hparams 29 | 30 | def evaluate(model, data_loader, test_loss_function, translator, tgt_vocab, detokenizer, context): 31 | """Evaluate given the data loader 32 | 33 | Parameters 34 | ---------- 35 | data_loader : DataLoader 36 | 37 | Returns 38 | ------- 39 | avg_loss : float 40 | Average loss 41 | real_translation_out : list of list of str 42 | The translation output 43 | """ 44 | translation_out = [] 45 | all_inst_ids = [] 46 | avg_loss_denom = 0 47 | avg_loss = 0.0 48 | for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \ 49 | in enumerate(data_loader): 50 | src_seq = src_seq.as_in_context(context) 51 | tgt_seq = tgt_seq.as_in_context(context) 52 | src_valid_length = src_valid_length.as_in_context(context) 53 | tgt_valid_length = tgt_valid_length.as_in_context(context) 54 | # Calculating Loss 55 | out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1) 56 | loss = test_loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar() 57 | all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist()) 58 | avg_loss += loss * (tgt_seq.shape[1] - 1) 59 | avg_loss_denom += (tgt_seq.shape[1] - 1) 60 | # Translate 61 | samples, _, sample_valid_length = \ 62 | translator.translate(src_seq=src_seq, src_valid_length=src_valid_length) 63 | max_score_sample = samples[:, 0, :].asnumpy() 64 | sample_valid_length = sample_valid_length[:, 0].asnumpy() 65 | for i in range(max_score_sample.shape[0]): 66 | translation_out.append( 67 | [tgt_vocab.idx_to_token[ele] for ele in 68 | max_score_sample[i][1:(sample_valid_length[i] - 1)]]) 69 | avg_loss = avg_loss / avg_loss_denom 70 | real_translation_out = [None for _ in range(len(all_inst_ids))] 71 | for ind, sentence in zip(all_inst_ids, translation_out): 72 | real_translation_out[ind] = detokenizer(nmt.bleu._bpe_to_words(sentence), 73 | return_str=True) 74 | return avg_loss, real_translation_out 75 | 76 | def translate(translator, src_seq, src_vocab, tgt_vocab, detokenizer, ctx): 77 | src_sentence = src_vocab[src_seq.split()] 78 | src_sentence.append(src_vocab[src_vocab.eos_token]) 79 | src_npy = np.array(src_sentence, dtype=np.int32) 80 | src_nd = mx.nd.array(src_npy) 81 | src_nd = src_nd.reshape((1, -1)).as_in_context(ctx) 82 | src_valid_length = mx.nd.array([src_nd.shape[1]]).as_in_context(ctx) 83 | samples, _, sample_valid_length = \ 84 | translator.translate(src_seq=src_nd, src_valid_length=src_valid_length) 85 | max_score_sample = samples[:, 0, :].asnumpy() 86 | 87 | sample_valid_length = sample_valid_length[:, 0].asnumpy() 88 | translation_out = [] 89 | for i in range(max_score_sample.shape[0]): 90 | translation_out.append( 91 | [tgt_vocab.idx_to_token[ele] for ele in 92 | max_score_sample[i][1:(sample_valid_length[i] - 1)]]) 93 | real_translation_out = [None for _ in range(len(translation_out))] 94 | for ind, sentence in enumerate(translation_out): 95 | real_translation_out[ind] = detokenizer(nmt.bleu._bpe_to_words(sentence), 96 | return_str=True) 97 | return real_translation_out 98 | 99 | def train_one_epoch(epoch_id, model, train_data_loader, trainer, label_smoothing, loss_function, grad_interval, average_param_dict, update_average_param_dict, step_num, ctx): 100 | log_avg_loss = 0 101 | log_wc = 0 102 | loss_denom = 0 103 | step_loss = 0 104 | log_start_time = time.time() 105 | for 
batch_id, seqs in enumerate(train_data_loader): 106 | if batch_id % grad_interval == 0: 107 | step_num += 1 108 | new_lr = hparams.lr / math.sqrt(hparams.num_units) * min(1. / math.sqrt(step_num), step_num * hparams.warmup_steps ** (-1.5)) 109 | trainer.set_learning_rate(new_lr) 110 | src_wc, tgt_wc, bs = np.sum([(shard[2].sum(), shard[3].sum(), shard[0].shape[0]) 111 | for shard in seqs], axis=0) 112 | src_wc = src_wc.asscalar() 113 | tgt_wc = tgt_wc.asscalar() 114 | loss_denom += tgt_wc - bs 115 | seqs = [[seq.as_in_context(context) for seq in shard] 116 | for context, shard in zip([ctx], seqs)] 117 | Ls = [] 118 | with mx.autograd.record(): 119 | for src_seq, tgt_seq, src_valid_length, tgt_valid_length in seqs: 120 | out, _ = model(src_seq, tgt_seq[:, :-1], 121 | src_valid_length, tgt_valid_length - 1) 122 | smoothed_label = label_smoothing(tgt_seq[:, 1:]) 123 | ls = loss_function(out, smoothed_label, tgt_valid_length - 1).sum() 124 | Ls.append((ls * (tgt_seq.shape[1] - 1)) / hparams.batch_size / 100.0) 125 | for L in Ls: 126 | L.backward() 127 | if batch_id % grad_interval == grad_interval - 1 or\ 128 | batch_id == len(train_data_loader) - 1: 129 | if update_average_param_dict: 130 | for k, v in model.collect_params().items(): 131 | average_param_dict[k] = v.data(ctx).copy() 132 | update_average_param_dict = False 133 | 134 | trainer.step(float(loss_denom) / hparams.batch_size / 100.0) 135 | param_dict = model.collect_params() 136 | param_dict.zero_grad() 137 | if step_num > hparams.average_start: 138 | alpha = 1. / max(1, step_num - hparams.average_start) 139 | for name, average_param in average_param_dict.items(): 140 | average_param[:] += alpha * (param_dict[name].data(ctx) - average_param) 141 | step_loss += sum([L.asscalar() for L in Ls]) 142 | if batch_id % grad_interval == grad_interval - 1 or\ 143 | batch_id == len(train_data_loader) - 1: 144 | log_avg_loss += step_loss / loss_denom * hparams.batch_size * 100.0 145 | loss_denom = 0 146 | step_loss = 0 147 | log_wc += src_wc + tgt_wc 148 | if (batch_id + 1) % (hparams.log_interval * grad_interval) == 0: 149 | wps = log_wc / (time.time() - log_start_time) 150 | logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, ' 151 | 'throughput={:.2f}K wps, wc={:.2f}K' 152 | .format(epoch_id, batch_id + 1, len(train_data_loader), 153 | log_avg_loss / hparams.log_interval, 154 | np.exp(log_avg_loss / hparams.log_interval), 155 | wps / 1000, log_wc / 1000)) 156 | log_start_time = time.time() 157 | log_avg_loss = 0 158 | log_wc = 0 -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/convert_pytorch_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # 'License'); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. 
See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # pylint:disable=redefined-outer-name,logging-format-interpolation 20 | """ Script for converting PyTorch Model to Gluon. """ 21 | 22 | import argparse 23 | import json 24 | import logging 25 | import os 26 | import sys 27 | 28 | import mxnet as mx 29 | import gluonnlp as nlp 30 | import torch 31 | from gluonnlp.model import BERTEncoder, BERTModel 32 | from gluonnlp.model.bert import bert_hparams 33 | 34 | sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) 35 | from utils import get_hash, load_text_vocab, tf_vocab_to_gluon_vocab 36 | 37 | parser = argparse.ArgumentParser(description='Conversion script for PyTorch BERT model', 38 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 39 | parser.add_argument('--model', type=str, default='bert_12_768_12', 40 | choices=['bert_12_768_12', 'bert_24_1024_16'], help='BERT model name') 41 | parser.add_argument('--pytorch_checkpoint_dir', type=str, 42 | help='Path to Tensorflow checkpoint folder.') 43 | parser.add_argument('--vocab_file', type=str, help='Full path to the vocab.txt') 44 | parser.add_argument('--gluon_pytorch_name_mapping', type=str, 45 | default='gluon_to_pytorch_naming.json', 46 | help='Output of infer_pytorch_gluon_parameter_name_mapping.py') 47 | parser.add_argument('--out_dir', type=str, default=os.path.join('~', 'output'), 48 | help='Path to output folder. The folder must exist.') 49 | parser.add_argument('--debug', action='store_true', help='debugging mode') 50 | args = parser.parse_args() 51 | logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO) 52 | logging.info(args) 53 | 54 | # convert vocabulary 55 | vocab = tf_vocab_to_gluon_vocab(load_text_vocab(args.vocab_file)) 56 | 57 | # vocab serialization 58 | tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp')) 59 | with open(tmp_file_path, 'w') as f: 60 | f.write(vocab.to_json()) 61 | hash_full, hash_short = get_hash(tmp_file_path) 62 | gluon_vocab_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.vocab')) 63 | with open(gluon_vocab_path, 'w') as f: 64 | f.write(vocab.to_json()) 65 | logging.info('vocab file saved to %s. 
hash = %s', gluon_vocab_path, hash_full) 66 | 67 | # Load PyTorch Model 68 | pytorch_parameters = torch.load(os.path.join(args.pytorch_checkpoint_dir, 'pytorch_model.bin'), 69 | map_location=lambda storage, loc: storage) 70 | pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()} 71 | 72 | # Make sure vocab fits to model 73 | assert pytorch_parameters['bert.embeddings.word_embeddings.weight'].shape[0] == len( 74 | vocab.idx_to_token) 75 | 76 | # Load Mapping 77 | with open(args.gluon_pytorch_name_mapping, 'r') as f: 78 | mapping = json.load(f) 79 | 80 | # BERT config 81 | tf_config_names_to_gluon_config_names = { 82 | 'attention_probs_dropout_prob': 'embed_dropout', 83 | 'hidden_act': None, 84 | 'hidden_dropout_prob': 'dropout', 85 | 'hidden_size': 'units', 86 | 'initializer_range': None, 87 | 'intermediate_size': 'hidden_size', 88 | 'max_position_embeddings': 'max_length', 89 | 'num_attention_heads': 'num_heads', 90 | 'num_hidden_layers': 'num_layers', 91 | 'type_vocab_size': 'token_type_vocab_size', 92 | 'vocab_size': None 93 | } 94 | predefined_args = bert_hparams[args.model] 95 | with open(os.path.join(args.pytorch_checkpoint_dir, 'bert_config.json'), 'r') as f: 96 | tf_config = json.load(f) 97 | assert len(tf_config) == len(tf_config_names_to_gluon_config_names) 98 | for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items(): 99 | if tf_name is None or gluon_name is None: 100 | continue 101 | assert tf_config[tf_name] == predefined_args[gluon_name] 102 | 103 | # BERT encoder 104 | encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'], 105 | num_layers=predefined_args['num_layers'], units=predefined_args['units'], 106 | hidden_size=predefined_args['hidden_size'], 107 | max_length=predefined_args['max_length'], 108 | num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'], 109 | dropout=predefined_args['dropout'], 110 | use_residual=predefined_args['use_residual']) 111 | 112 | # Infer enabled BERTModel components 113 | use_pooler = any('pooler' in n for n in pytorch_parameters) 114 | use_decoder = any('cls.predictions.transform.dense.weight' in n for n in pytorch_parameters) 115 | use_classifier = any('cls.seq_relationship.weight' in n for n in pytorch_parameters) 116 | 117 | if not use_classifier and 'classifier.weight' in pytorch_parameters and \ 118 | pytorch_parameters['classifier.weight'].shape[0] == 2: 119 | logging.info('Assuming classifier weights in provided Pytorch model are ' 120 | 'from next sentence prediction task.') 121 | use_classifier = True 122 | 123 | logging.info('Inferred that the pytorch model provides the following parameters:') 124 | logging.info('- use_pooler = {}'.format(use_pooler)) 125 | logging.info('- use_decoder = {}'.format(use_decoder)) 126 | logging.info('- use_classifier = {}'.format(use_classifier)) 127 | 128 | # BERT model 129 | bert = BERTModel(encoder, len(vocab), 130 | token_type_vocab_size=predefined_args['token_type_vocab_size'], 131 | units=predefined_args['units'], embed_size=predefined_args['embed_size'], 132 | embed_dropout=predefined_args['embed_dropout'], 133 | word_embed=predefined_args['word_embed'], use_pooler=use_pooler, 134 | use_decoder=use_decoder, use_classifier=use_classifier) 135 | 136 | bert.initialize(init=mx.init.Normal(0.02)) 137 | 138 | ones = mx.nd.ones((2, 8)) 139 | out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]])) 140 | params = bert._collect_params_with_prefix() 141 | assert len(params) == len(pytorch_parameters), "Gluon model does not 
match PyTorch model. " \ 142 | "Please fix the BERTModel hyperparameters" 143 | 144 | # set parameter data 145 | loaded_params = {} 146 | for name in params: 147 | if name not in mapping: 148 | raise RuntimeError('Invalid json mapping file. ' 149 | 'The parameter {} is not described in the mapping file.'.format(name)) 150 | pytorch_name = mapping[name] 151 | if pytorch_name not in pytorch_parameters.keys(): 152 | # Handle inconsistent naming in PyTorch 153 | # The Expected names here are based on the PyTorch version of SciBert. 154 | # The Inconsistencies were found in ClinicalBert 155 | if 'LayerNorm' in pytorch_name: 156 | pytorch_name = pytorch_name.replace('weight', 'gamma') 157 | pytorch_name = pytorch_name.replace('bias', 'beta') 158 | assert pytorch_name in pytorch_parameters.keys() 159 | 160 | if 'cls.seq_relationship' in pytorch_name: 161 | pytorch_name = pytorch_name.replace('cls.seq_relationship', 'classifier') 162 | 163 | arr = mx.nd.array(pytorch_parameters[pytorch_name]) 164 | 165 | assert arr.shape == params[name].shape 166 | params[name].set_data(arr) 167 | loaded_params[name] = True 168 | 169 | if len(params) != len(loaded_params): 170 | raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, ' 171 | 'but {} have been extracted from the pytorch model. '.format( 172 | len(params), len(loaded_params))) 173 | 174 | # param serialization 175 | bert.save_parameters(tmp_file_path) 176 | hash_full, hash_short = get_hash(tmp_file_path) 177 | gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params')) 178 | logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full) 179 | bert.save_parameters(gluon_param_path) 180 | mx.nd.waitall() 181 | -------------------------------------------------------------------------------- /word_embedding/model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable= 21 | """Word embedding models.""" 22 | 23 | import mxnet as mx 24 | import numpy as np 25 | 26 | import gluonnlp as nlp 27 | 28 | 29 | class Net(mx.gluon.HybridBlock): 30 | """Base class for word2vec and fastText SkipGram and CBOW networks. 31 | 32 | Parameters 33 | ---------- 34 | token_to_idx : dict 35 | token_to_idx mapping of the vocabulary that this model is to be trained 36 | with. token_to_idx is used for __getitem__ and __contains__. For 37 | len(token_to_idx) is used during initialization to obtain the input_dim 38 | of the embedding matrix. 39 | output_dim : int 40 | Dimension of the dense embedding. 41 | batch_size : int 42 | Batchsize this model will be trained with. 
TODO temporary until 43 | random_like ops are supported 44 | negatives_weights : mxnet.nd.NDArray 45 | Weights for UnigramCandidateSampler for sampling negatives. 46 | smoothing : float, default 0.75 47 | Smoothing factor applied to negatives_weights. Final weights are 48 | mxnet.nd.power(negative_weights, smoothing). 49 | num_negatives : int, default 5 50 | Number of negatives to sample for each real sample. 51 | sparse_grad : bool, default True 52 | Specifies mxnet.gluon.nn.Embedding sparse_grad argument. 53 | dtype : str, default 'float32' 54 | dtype argument passed to gluon.nn.Embedding 55 | 56 | """ 57 | 58 | # pylint: disable=abstract-method 59 | def __init__(self, token_to_idx, output_dim, batch_size, negatives_weights, 60 | subword_function=None, num_negatives=5, smoothing=0.75, 61 | sparse_grad=True, dtype='float32', **kwargs): 62 | super(Net, self).__init__(**kwargs) 63 | 64 | self._kwargs = dict( 65 | input_dim=len(token_to_idx), output_dim=output_dim, dtype=dtype, 66 | sparse_grad=sparse_grad, num_negatives=num_negatives) 67 | 68 | with self.name_scope(): 69 | if subword_function is not None: 70 | self.embedding = nlp.model.train.FasttextEmbeddingModel( 71 | token_to_idx=token_to_idx, 72 | subword_function=subword_function, 73 | output_dim=output_dim, 74 | weight_initializer=mx.init.Uniform(scale=1 / output_dim), 75 | sparse_grad=sparse_grad, 76 | ) 77 | else: 78 | self.embedding = nlp.model.train.CSREmbeddingModel( 79 | token_to_idx=token_to_idx, 80 | output_dim=output_dim, 81 | weight_initializer=mx.init.Uniform(scale=1 / output_dim), 82 | sparse_grad=sparse_grad, 83 | ) 84 | self.embedding_out = mx.gluon.nn.Embedding( 85 | len(token_to_idx), output_dim=output_dim, 86 | weight_initializer=mx.init.Zero(), sparse_grad=sparse_grad, 87 | dtype=dtype) 88 | 89 | self.negatives_sampler = nlp.data.UnigramCandidateSampler( 90 | weights=negatives_weights**smoothing, shape=(batch_size, ), 91 | dtype='int64') 92 | 93 | def __getitem__(self, tokens): 94 | return self.embedding[tokens] 95 | 96 | 97 | class SG(Net): 98 | """SkipGram network""" 99 | 100 | # pylint: disable=arguments-differ 101 | def hybrid_forward(self, F, center, context, center_words): 102 | """SkipGram forward pass. 103 | 104 | Parameters 105 | ---------- 106 | center : mxnet.nd.NDArray or mxnet.sym.Symbol 107 | Sparse CSR array of word / subword indices of shape (batch_size, 108 | len(token_to_idx) + num_subwords). Embedding for center words are 109 | computed via F.sparse.dot between the CSR center array and the 110 | weight matrix. 111 | context : mxnet.nd.NDArray or mxnet.sym.Symbol 112 | Dense array of context words of shape (batch_size, ). Also used for 113 | row-wise independently masking negatives equal to one of context. 114 | center_words : mxnet.nd.NDArray or mxnet.sym.Symbol 115 | Dense array of center words of shape (batch_size, ). Only used for 116 | row-wise independently masking negatives equal to one of 117 | center_words. 
118 | """ 119 | 120 | # negatives sampling 121 | negatives = [] 122 | mask = [] 123 | for _ in range(self._kwargs['num_negatives']): 124 | negatives.append(self.negatives_sampler(center_words)) 125 | mask_ = negatives[-1] != center_words 126 | mask_ = F.stack(mask_, (negatives[-1] != context)) 127 | mask.append(mask_.min(axis=0)) 128 | 129 | negatives = F.stack(*negatives, axis=1) 130 | mask = F.stack(*mask, axis=1).astype(np.float32) 131 | 132 | # center - context pairs 133 | emb_center = self.embedding(center).expand_dims(1) 134 | emb_context = self.embedding_out(context).expand_dims(2) 135 | pred_pos = F.batch_dot(emb_center, emb_context).squeeze() 136 | loss_pos = (F.relu(pred_pos) - pred_pos + F.Activation( 137 | -F.abs(pred_pos), act_type='softrelu')) / (mask.sum(axis=1) + 1) 138 | 139 | # center - negatives pairs 140 | emb_negatives = self.embedding_out(negatives).reshape( 141 | (-1, self._kwargs['num_negatives'], 142 | self._kwargs['output_dim'])).swapaxes(1, 2) 143 | pred_neg = F.batch_dot(emb_center, emb_negatives).squeeze() 144 | mask = mask.reshape((-1, self._kwargs['num_negatives'])) 145 | loss_neg = (F.relu(pred_neg) + F.Activation( 146 | -F.abs(pred_neg), act_type='softrelu')) * mask 147 | loss_neg = loss_neg.sum(axis=1) / (mask.sum(axis=1) + 1) 148 | 149 | return loss_pos + loss_neg 150 | 151 | 152 | class CBOW(Net): 153 | """CBOW network""" 154 | 155 | # pylint: disable=arguments-differ 156 | def hybrid_forward(self, F, center, context): 157 | """CBOW forward pass. 158 | 159 | Parameters 160 | ---------- 161 | center : mxnet.nd.NDArray or mxnet.sym.Symbol 162 | Dense array of center words of shape (batch_size, ). 163 | context : mxnet.nd.NDArray or mxnet.sym.Symbol 164 | Sparse CSR array of word / subword indices of shape (batch_size, 165 | len(vocab) + num_subwords). Embedding for context words are 166 | computed via F.sparse.dot between the CSR center array and the 167 | weight matrix. 168 | 169 | """ 170 | # negatives sampling 171 | negatives = [] 172 | mask = [] 173 | for _ in range(self._kwargs['num_negatives']): 174 | negatives.append(self.negatives_sampler(center)) 175 | mask.append(negatives[-1] != center) 176 | 177 | negatives = F.stack(*negatives, axis=1) 178 | mask = F.stack(*mask, axis=1).astype(np.float32) 179 | 180 | # context - center samples 181 | emb_context = self.embedding(context).expand_dims(1) 182 | emb_center = self.embedding_out(center).expand_dims(2) 183 | pred_pos = F.batch_dot(emb_context, emb_center).squeeze() 184 | loss_pos = (F.relu(pred_pos) - pred_pos + F.Activation( 185 | -F.abs(pred_pos), act_type='softrelu')) / (mask.sum(axis=1) + 1) 186 | 187 | # context - negatives samples 188 | emb_negatives = self.embedding_out(negatives).reshape( 189 | (-1, self._kwargs['num_negatives'], 190 | self._kwargs['output_dim'])).swapaxes(1, 2) 191 | pred_neg = F.batch_dot(emb_context, emb_negatives).squeeze() 192 | mask = mask.reshape((-1, self._kwargs['num_negatives'])) 193 | loss_neg = (F.relu(pred_neg) + F.Activation( 194 | -F.abs(pred_neg), act_type='softrelu')) * mask 195 | loss_neg = loss_neg.sum(axis=1) / (mask.sum(axis=1) + 1) 196 | 197 | return loss_pos + loss_neg 198 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/compare_tf_gluon_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. 
See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # 'License'); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Script for model comparison between TF and Gluon.""" 20 | 21 | # pylint: disable=wrong-import-position, wrong-import-order, wildcard-import 22 | 23 | import sys 24 | import os 25 | import argparse 26 | import numpy as np 27 | import mxnet as mx 28 | import gluonnlp as nlp 29 | 30 | sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) 31 | 32 | parser = argparse.ArgumentParser(description='Comparison script for BERT model in Tensorflow' 33 | 'and that in Gluon. This script works with ' 34 | 'google/bert@f39e881b', 35 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 36 | parser.add_argument('--input_file', type=str, default='input.txt', 37 | help='sample input file for testing') 38 | parser.add_argument('--tf_bert_repo_dir', type=str, 39 | default='~/bert/', 40 | help='path to the original Tensorflow bert repository. ' 41 | 'The repo should be at f39e881b.') 42 | parser.add_argument('--tf_model_dir', type=str, 43 | default='~/uncased_L-12_H-768_A-12/', 44 | help='path to the original Tensorflow bert checkpoint directory.') 45 | parser.add_argument('--tf_model_prefix', type=str, 46 | default='bert_model.ckpt', 47 | help='name of bert checkpoint file.') 48 | parser.add_argument('--tf_config_name', type=str, 49 | default='bert_config.json', 50 | help='Name of Bert config file') 51 | parser.add_argument('--cased', action='store_true', 52 | help='if not set, inputs are converted to lower case') 53 | parser.add_argument('--gluon_dataset', type=str, default='book_corpus_wiki_en_uncased', 54 | help='gluon dataset name') 55 | parser.add_argument('--gluon_model', type=str, default='bert_12_768_12', 56 | help='gluon model name') 57 | parser.add_argument('--gluon_parameter_file', type=str, default=None, 58 | help='gluon parameter file name.') 59 | parser.add_argument('--gluon_vocab_file', type=str, default=None, 60 | help='gluon vocab file corresponding to --gluon_parameter_file.') 61 | 62 | args = parser.parse_args() 63 | 64 | input_file = os.path.expanduser(args.input_file) 65 | tf_bert_repo_dir = os.path.expanduser(args.tf_bert_repo_dir) 66 | tf_model_dir = os.path.expanduser(args.tf_model_dir) 67 | vocab_file = os.path.join(tf_model_dir, 'vocab.txt') 68 | bert_config_file = os.path.join(tf_model_dir, args.tf_config_name) 69 | init_checkpoint = os.path.join(tf_model_dir, args.tf_model_prefix) 70 | do_lower_case = not args.cased 71 | max_length = 128 72 | 73 | ############################################################################### 74 | # Tensorflow MODEL # 75 | ############################################################################### 76 | # import tensorflow modules 77 | sys.path.insert(0, tf_bert_repo_dir) 78 | 79 | # tensorflow model inference 80 | import modeling 81 | import tokenization 82 | from 
extract_features import * 83 | 84 | # data 85 | num_layers = int(args.gluon_model.split('_')[1]) 86 | layer_indexes = list(range(num_layers)) 87 | bert_config = modeling.BertConfig.from_json_file(bert_config_file) 88 | tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) 89 | examples = read_examples(input_file) 90 | 91 | features = convert_examples_to_features( 92 | examples=examples, seq_length=max_length, tokenizer=tokenizer) 93 | 94 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 95 | run_config = tf.contrib.tpu.RunConfig( 96 | master=None, 97 | tpu_config=tf.contrib.tpu.TPUConfig( 98 | num_shards=1, 99 | per_host_input_for_training=is_per_host)) 100 | # model 101 | model_fn = model_fn_builder( 102 | bert_config=bert_config, 103 | init_checkpoint=init_checkpoint, 104 | layer_indexes=layer_indexes, 105 | use_tpu=False, 106 | use_one_hot_embeddings=False) 107 | 108 | estimator = tf.contrib.tpu.TPUEstimator( 109 | use_tpu=False, 110 | model_fn=model_fn, 111 | config=run_config, 112 | predict_batch_size=1) 113 | 114 | input_fn = input_fn_builder( 115 | features=features, seq_length=max_length) 116 | 117 | tensorflow_all_out = [] 118 | for result in estimator.predict(input_fn, yield_single_examples=True): 119 | output_json = collections.OrderedDict() 120 | tensorflow_all_out_features = [] 121 | all_layers = [] 122 | for (j, layer_index) in enumerate(layer_indexes): 123 | layer_output = result['layer_output_%d' % j] 124 | layers = collections.OrderedDict() 125 | layers['index'] = layer_index 126 | layers['values'] = layer_output 127 | all_layers.append(layers) 128 | tensorflow_out_features = collections.OrderedDict() 129 | tensorflow_out_features['layers'] = all_layers 130 | tensorflow_all_out_features.append(tensorflow_out_features) 131 | 132 | output_json['features'] = tensorflow_all_out_features 133 | tensorflow_all_out.append(output_json) 134 | 135 | tf_outputs = [tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes] 136 | 137 | ############################################################################### 138 | # Gluon MODEL # 139 | ############################################################################### 140 | 141 | if args.gluon_parameter_file: 142 | assert args.gluon_vocab_file, \ 143 | 'Must specify --gluon_vocab_file when specifying --gluon_parameter_file' 144 | with open(args.gluon_vocab_file, 'r') as f: 145 | vocabulary = nlp.Vocab.from_json(f.read()) 146 | bert, vocabulary = nlp.model.get_model(args.gluon_model, 147 | dataset_name=None, 148 | vocab=vocabulary, 149 | pretrained=not args.gluon_parameter_file, 150 | use_pooler=False, 151 | use_decoder=False, 152 | use_classifier=False) 153 | try: 154 | bert.cast('float16') 155 | bert.load_parameters(args.gluon_parameter_file, ignore_extra=True) 156 | bert.cast('float32') 157 | except AssertionError: 158 | bert.cast('float32') 159 | bert.load_parameters(args.gluon_parameter_file, ignore_extra=True) 160 | else: 161 | assert not args.gluon_vocab_file, \ 162 | 'Cannot specify --gluon_vocab_file without specifying --gluon_parameter_file' 163 | bert, vocabulary = nlp.model.get_model(args.gluon_model, 164 | dataset_name=args.gluon_dataset, 165 | pretrained=not args.gluon_parameter_file, 166 | use_pooler=False, 167 | use_decoder=False, 168 | use_classifier=False) 169 | 170 | print(bert) 171 | tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=do_lower_case) 172 | dataset = nlp.data.TSVDataset(input_file, field_separator=nlp.data.Splitter(' ||| ')) 
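# The ' ||| ' splitter above mirrors the input convention of the TF extract_features.py
# script: one example per line, with an optional second sentence after ' ||| '.
# A minimal, hypothetical input.txt could therefore look like:
#
#   Who was Jim Henson ? ||| Jim Henson was a puppeteer
#
# A quick way to sanity-check the pipeline defined next (BERTSentenceTransform -> DataLoader)
# is to transform a single sample by hand; the transform yields token ids, a valid length and
# segment type ids, which is exactly what the verification loop at the bottom of this script
# unpacks:
#
#   >>> trans = nlp.data.BERTSentenceTransform(tokenizer, max_length)
#   >>> token_ids, valid_length, segment_ids = trans(dataset[0])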
173 | 174 | trans = nlp.data.BERTSentenceTransform(tokenizer, max_length) 175 | dataset = dataset.transform(trans) 176 | 177 | bert_dataloader = mx.gluon.data.DataLoader(dataset, batch_size=1, 178 | shuffle=True, last_batch='rollover') 179 | 180 | # verify the output of the first sample 181 | for i, seq in enumerate(bert_dataloader): 182 | input_ids, valid_length, type_ids = seq 183 | out = bert(input_ids, type_ids, 184 | valid_length.astype('float32')) 185 | length = valid_length.asscalar() 186 | a = tf_outputs[-1][:length] 187 | b = out[0][:length].asnumpy() 188 | 189 | print('stdev = %s' % (np.std(a - b))) 190 | mx.test_utils.assert_almost_equal(a, b, atol=5e-6, rtol=5e-6) 191 | break 192 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/data/baidu_ernie_data.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=line-too-long 21 | """Baidu ernie data, contains XNLI.""" 22 | 23 | __all__ = ['BaiduErnieXNLI', 'BaiduErnieLCQMC', 'BaiduErnieChnSentiCorp'] 24 | 25 | import os 26 | import sys 27 | import tarfile 28 | from gluonnlp.data.dataset import TSVDataset 29 | from gluonnlp.data.registry import register 30 | from gluonnlp.base import get_home_dir 31 | if sys.version_info[0] >= 3: 32 | from urllib.request import urlretrieve 33 | else: 34 | from urllib import urlretrieve 35 | 36 | _baidu_ernie_data_url = 'https://ernie.bj.bcebos.com/task_data.tgz' 37 | 38 | class _BaiduErnieDataset(TSVDataset): 39 | def __init__(self, root, dataset_name, segment, **kwargs): 40 | root = os.path.expanduser(root) 41 | if not os.path.isdir(root): 42 | os.makedirs(root) 43 | self._root = root 44 | download_data_path = os.path.join(self._root, 'task_data.tgz') 45 | if not os.path.exists(download_data_path): 46 | urlretrieve(_baidu_ernie_data_url, download_data_path) 47 | tar_file = tarfile.open(download_data_path, mode='r:gz') 48 | tar_file.extractall(self._root) 49 | filename = os.path.join(self._root, 'task_data', dataset_name, '%s.tsv' % segment) 50 | super(_BaiduErnieDataset, self).__init__(filename, **kwargs) 51 | 52 | 53 | @register(segment=['train', 'dev', 'test']) 54 | class BaiduErnieXNLI(_BaiduErnieDataset): 55 | """ The XNLI dataset redistributed by Baidu 56 | . 57 | 58 | Original from: 59 | Conneau, Alexis, et al. "Xnli: Evaluating cross-lingual sentence representations." 60 | arXiv preprint arXiv:1809.05053 (2018). 61 | https://github.com/facebookresearch/XNLI 62 | 63 | Licensed under a Creative Commons Attribution-NonCommercial 4.0 International License. 
64 | License details: https://creativecommons.org/licenses/by-nc/4.0/ 65 | 66 | Parameters 67 | ---------- 68 | segment : {'train', 'dev', 'test'}, default 'train' 69 | Dataset segment. 70 | root : str, default '$MXNET_HOME/datasets/baidu_ernie_task_data' 71 | Path to temp folder for storing data. 72 | MXNET_HOME defaults to '~/.mxnet'. 73 | return_all_fields : bool, default False 74 | Return all fields available in the dataset. 75 | 76 | Examples 77 | -------- 78 | >>> xnli_dev = BaiduErnieXNLI('dev', root='./datasets/baidu_ernie_task_data/') 79 | -etc- 80 | >>> len(xnli_dev) 81 | 2490 82 | >>> len(xnli_dev[0]) 83 | 3 84 | >>> xnli_dev[0] 85 | ['他说,妈妈,我回来了。', '校车把他放下后,他立即给他妈妈打了电话。', 'neutral'] 86 | >>> xnli_test = BaiduErnieXNLI('test', root='./datasets/baidu_ernie_task_data/') 87 | -etc- 88 | >>> len(xnli_test) 89 | 5010 90 | >>> len(xnli_test[0]) 91 | 2 92 | >>> xnli_test[0] 93 | ['嗯,我根本没想过,但是我很沮丧,最后我又和他说话了。', '我还没有和他再次谈论。'] 94 | """ 95 | def __init__(self, segment='train', 96 | root=os.path.join(get_home_dir(), 'datasets', 'baidu_ernie_data'), 97 | return_all_fields=False): 98 | A_IDX, B_IDX, LABEL_IDX = 0, 1, 2 99 | if segment in ['train', 'dev']: 100 | field_indices = [A_IDX, B_IDX, LABEL_IDX] if not return_all_fields else None 101 | num_discard_samples = 1 102 | elif segment == 'test': 103 | field_indices = [A_IDX, B_IDX] if not return_all_fields else None 104 | num_discard_samples = 1 105 | 106 | super(BaiduErnieXNLI, self).__init__(root, 'xnli', segment, 107 | num_discard_samples=num_discard_samples, 108 | field_indices=field_indices) 109 | 110 | @register(segment=['train', 'dev', 'test']) 111 | class BaiduErnieLCQMC(_BaiduErnieDataset): 112 | """ The LCQMC dataset redistributed by Baidu 113 | . 114 | 115 | Original from: 116 | Xin Liu, Qingcai Chen, Chong Deng, Huajun Zeng, Jing Chen, Dongfang Li, Buzhou Tang, 117 | LCQMC: A Large-scale Chinese Question Matching Corpus,COLING2018. 118 | Licensed under a Creative Commons Attribution 4.0 International License. License details: 119 | http://creativecommons.org/licenses/by/4.0/ 120 | 121 | Parameters 122 | ---------- 123 | segment : {'train', 'dev', 'test'}, default 'train' 124 | Dataset segment. 125 | root : str, default '$MXNET_HOME/datasets/baidu_ernie_task_data' 126 | Path to temp folder for storing data. 127 | MXNET_HOME defaults to '~/.mxnet'. 128 | return_all_fields : bool, default False 129 | Return all fields available in the dataset. 
130 | 131 | Examples 132 | -------- 133 | >>> lcqmc_dev = BaiduErnieLCQMC('dev', root='./datasets/baidu_ernie_task_data/') 134 | -etc- 135 | >>> len(lcqmc_dev) 136 | 8802 137 | >>> len(lcqmc_dev[0]) 138 | 3 139 | >>> lcqmc_dev[0] 140 | ['开初婚未育证明怎么弄?', '初婚未育情况证明怎么开?', '1'] 141 | >>> lcqmc_test = BaiduErnieLCQMC('test', root='./datasets/baidu_ernie_task_data/') 142 | -etc- 143 | >>> len(lcqmc_test) 144 | 12500 145 | >>> len(lcqmc_test[0]) 146 | 2 147 | >>> lcqmc_test[0] 148 | ['谁有狂三这张高清的', '这张高清图,谁有'] 149 | """ 150 | def __init__(self, segment='train', 151 | root=os.path.join(get_home_dir(), 'datasets', 'baidu_ernie_data'), 152 | return_all_fields=False): 153 | A_IDX, B_IDX, LABEL_IDX = 0, 1, 2 154 | if segment in ['train', 'dev']: 155 | field_indices = [A_IDX, B_IDX, LABEL_IDX] if not return_all_fields else None 156 | num_discard_samples = 1 157 | elif segment == 'test': 158 | field_indices = [A_IDX, B_IDX] if not return_all_fields else None 159 | num_discard_samples = 1 160 | 161 | super(BaiduErnieLCQMC, self).__init__(root, 'lcqmc', segment, 162 | num_discard_samples=num_discard_samples, 163 | field_indices=field_indices) 164 | 165 | 166 | @register(segment=['train', 'dev', 'test']) 167 | class BaiduErnieChnSentiCorp(_BaiduErnieDataset): 168 | """ The ChnSentiCorp dataset redistributed by Baidu 169 | . 170 | 171 | Original from Tan Songbo (Chinese Academy of Sciences, tansongbo@software.ict.ac.cn). 172 | 173 | Parameters 174 | ---------- 175 | segment : {'train', 'dev', 'test'}, default 'train' 176 | Dataset segment. 177 | root : str, default '$MXNET_HOME/datasets/baidu_ernie_task_data' 178 | Path to temp folder for storing data. 179 | MXNET_HOME defaults to '~/.mxnet'. 180 | return_all_fields : bool, default False 181 | Return all fields available in the dataset. 
182 | 183 | Examples 184 | -------- 185 | >>> chnsenticorp_dev = BaiduErnieChnSentiCorp('dev', root='./datasets/baidu_ernie_task_data/') 186 | -etc- 187 | >>> len(chnsenticorp_dev) 188 | 1200 189 | >>> len(chnsenticorp_dev[0]) 190 | 2 191 | >>> chnsenticorp_dev[2] 192 | ['商品的不足暂时还没发现,京东的订单处理速度实在.......周二就打包完成,周五才发货...', '0'] 193 | >>> chnsenticorp_test = BaiduErnieChnSentiCorp('test', root='./datasets/baidu_ernie_task_data/') 194 | -etc- 195 | >>> len(chnsenticorp_test) 196 | 1200 197 | >>> len(chnsenticorp_test[0]) 198 | 1 199 | >>> chnsenticorp_test[0] 200 | ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'] 201 | """ 202 | def __init__(self, segment='train', 203 | root=os.path.join(get_home_dir(), 'datasets', 'baidu_ernie_data'), 204 | return_all_fields=False): 205 | LABEL_IDX, A_IDX = 0, 1 206 | if segment in ['train', 'dev']: 207 | field_indices = [A_IDX, LABEL_IDX] if not return_all_fields else None 208 | num_discard_samples = 1 209 | elif segment == 'test': 210 | field_indices = [A_IDX] if not return_all_fields else None 211 | num_discard_samples = 1 212 | 213 | super(BaiduErnieChnSentiCorp, self).__init__(root, 'chnsenticorp', segment, 214 | num_discard_samples=num_discard_samples, 215 | field_indices=field_indices) 216 | -------------------------------------------------------------------------------- /sequence_generation/text_generation/sequence_sampling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate Sentences by Sampling and Beam Search 3 | ============================================== 4 | 5 | This example shows how to load a pre-trained language model on wikitext-2 in Gluon NLP Toolkit model 6 | zoo, and use sequence sampler and beam search sampler on the language model to generate sentences. 7 | """ 8 | 9 | # coding: utf-8 10 | 11 | # Licensed to the Apache Software Foundation (ASF) under one 12 | # or more contributor license agreements. See the NOTICE file 13 | # distributed with this work for additional information 14 | # regarding copyright ownership. The ASF licenses this file 15 | # to you under the Apache License, Version 2.0 (the 16 | # "License"); you may not use this file except in compliance 17 | # with the License. You may obtain a copy of the License at 18 | # 19 | # http://www.apache.org/licenses/LICENSE-2.0 20 | # 21 | # Unless required by applicable law or agreed to in writing, 22 | # software distributed under the License is distributed on an 23 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 24 | # KIND, either express or implied. See the License for the 25 | # specific language governing permissions and limitations 26 | # under the License. 27 | # pylint:disable=missing-docstring 28 | import argparse 29 | 30 | import numpy as np 31 | import mxnet as mx 32 | import gluonnlp as nlp 33 | 34 | import model # local 'model' module with the addition of GPT-2 35 | 36 | 37 | parser = argparse.ArgumentParser(description='Generate sentences by beam search. 
' 38 | 'We load a LSTM model that is pre-trained on ' 39 | 'WikiText as our encoder.') 40 | 41 | # beam search sampler options 42 | subparsers = parser.add_subparsers(help='Sequence generation methods.', 43 | dest='command') 44 | subparsers.required = True 45 | beam_search_parser = subparsers.add_parser('beam-search', help='Use beam search for decoding.') 46 | beam_search_parser.add_argument('--alpha', type=float, default=0.0, 47 | help='Alpha in the length penalty term.') 48 | beam_search_parser.add_argument('--k', type=int, default=5, help='K in the length penalty term.') 49 | 50 | # random sampler options 51 | random_sample_parser = subparsers.add_parser('random-sample', 52 | help='Use random sampling for decoding.') 53 | random_sample_parser.add_argument('--temperature', type=float, default=1.0, 54 | help='Softmax temperature used in sampling.') 55 | random_sample_parser.add_argument('--use-top-k', type=int, required=False, 56 | help='Sample only from the top-k candidates.') 57 | 58 | # shared options 59 | for p in [beam_search_parser, random_sample_parser]: 60 | p.add_argument('--gpu', type=int, default=0, 61 | help='id of the gpu to use. Set it to empty means to use cpu.') 62 | p.add_argument('--lm-model', type=str, default='awd_lstm_lm_1150', 63 | help='type of the pre-trained model to load, can be "standard_lstm_lm_200", ' 64 | '"standard_lstm_lm_650", "standard_lstm_lm_1500", ' 65 | '"awd_lstm_lm_1150", etc.') 66 | p.add_argument('--max-length', type=int, default=20, help='Maximum sentence length.') 67 | p.add_argument('--print-num', type=int, default=3, help='Number of sentences to display.') 68 | p.add_argument('--bos', type=str, default='I think this works') 69 | p.add_argument('--beam-size', type=int, default=5, 70 | help='Beam size in the beam search sampler.') 71 | 72 | args = parser.parse_args() 73 | 74 | print(args) 75 | if args.gpu is not None and args.gpu < mx.context.num_gpus(): 76 | ctx = mx.gpu(args.gpu) 77 | else: 78 | if args.gpu: 79 | print('Specified GPU id {} does not exist. Available #GPUs: {}. 
Using CPU instead.'\ 80 | .format(args.gpu, mx.context.num_gpus())) 81 | ctx = mx.cpu() 82 | 83 | assert 0 < args.print_num <= args.beam_size,\ 84 | 'print_num must be between {} and {}, received={}'.format(1, args.beam_size, args.print_num) 85 | 86 | 87 | # Define the decoder function, we use log_softmax to map the output scores to log-likelihoods 88 | # Also, we transform the layout to NTC 89 | class LMDecoder(object): 90 | def __init__(self, net): 91 | self.net = net 92 | 93 | def __call__(self, inputs, states): 94 | outputs, states = self.net(mx.nd.expand_dims(inputs, axis=0), states) 95 | return outputs[0], states 96 | 97 | def state_info(self, *arg, **kwargs): 98 | return self.net.state_info(*arg, **kwargs) 99 | 100 | class GPT2Decoder(LMDecoder): 101 | def __call__(self, inputs, states): 102 | inputs = mx.nd.expand_dims(inputs, axis=1) 103 | out, new_states = self.net(inputs, states) 104 | out = mx.nd.slice_axis(out, axis=1, begin=0, end=1).reshape((inputs.shape[0], -1)) 105 | return out, new_states 106 | 107 | def get_decoder_vocab(lm_model): 108 | if lm_model.startswith('gpt2'): 109 | dataset_name = 'openai_webtext' 110 | decoder_cls = GPT2Decoder 111 | else: 112 | dataset_name = 'wikitext-2' 113 | decoder_cls = LMDecoder 114 | lm_model, vocab = model.get_model(name=lm_model, 115 | dataset_name=dataset_name, 116 | pretrained=True, 117 | ctx=ctx) 118 | decoder = decoder_cls(lm_model) 119 | return decoder, vocab 120 | 121 | def get_tokenizer(lm_model): 122 | if lm_model.startswith('gpt2'): 123 | return nlp.data.GPT2BPETokenizer(), nlp.data.GPT2BPEDetokenizer() 124 | else: 125 | return nlp.data.SacreMosesTokenizer(), nlp.data.SacreMosesDetokenizer(return_str=True) 126 | 127 | def get_initial_input_state(decoder, bos_ids): 128 | if isinstance(decoder, GPT2Decoder): 129 | inputs, begin_states = decoder.net( 130 | mx.nd.array([bos_ids], dtype=np.int32, ctx=ctx), None) 131 | inputs = inputs[:, -1, :] 132 | smoothed_probs = (inputs / args.temperature).softmax(axis=1) 133 | inputs = mx.nd.sample_multinomial(smoothed_probs, dtype=np.int32) 134 | return inputs, begin_states 135 | else: 136 | begin_states = decoder.net.begin_state(batch_size=1, ctx=ctx) 137 | if len(bos_ids) > 1: 138 | _, begin_states = decoder.net(mx.nd.expand_dims(mx.nd.array(bos_ids[:-1], ctx=ctx), 139 | axis=1), 140 | begin_states) 141 | inputs = mx.nd.full(shape=(1,), ctx=ctx, val=bos_ids[-1]) 142 | return inputs, begin_states 143 | 144 | 145 | def generate(): 146 | assert not args.lm_model.startswith('gpt2') or args.command != 'beam-search' 147 | decoder, vocab = get_decoder_vocab(args.lm_model) 148 | tokenizer, detokenizer = get_tokenizer(args.lm_model) 149 | bos_str = args.bos 150 | if not bos_str.startswith(' '): 151 | bos_str = ' ' + bos_str 152 | bos_tokens = tokenizer(bos_str) 153 | bos_ids = vocab[bos_tokens] 154 | eos_id = vocab[vocab.eos_token] 155 | if args.command == 'random-sample': 156 | print('Sampling Parameters: beam_size={}, temperature={}, use_top_k={}'\ 157 | .format(args.beam_size, args.temperature, args.use_top_k)) 158 | sampler = nlp.model.SequenceSampler(beam_size=args.beam_size, 159 | decoder=decoder, 160 | eos_id=eos_id, 161 | max_length=args.max_length - len(bos_tokens), 162 | temperature=args.temperature, 163 | top_k=args.use_top_k) 164 | else: 165 | print('Beam Seach Parameters: beam_size={}, alpha={}, K={}'\ 166 | .format(args.beam_size, args.alpha, args.k)) 167 | scorer = nlp.model.BeamSearchScorer(alpha=args.alpha, K=args.k, from_logits=False) 168 | sampler = 
nlp.model.BeamSearchSampler(beam_size=args.beam_size, 169 | decoder=decoder, 170 | eos_id=eos_id, 171 | scorer=scorer, 172 | max_length=args.max_length - len(bos_tokens)) 173 | inputs, begin_states = get_initial_input_state(decoder, bos_ids) 174 | # samples have shape (1, beam_size, length), scores have shape (1, beam_size) 175 | samples, scores, valid_lengths = sampler(inputs, begin_states) 176 | samples = samples[0].asnumpy() 177 | scores = scores[0].asnumpy() 178 | valid_lengths = valid_lengths[0].asnumpy() 179 | 180 | print('Generation Result:') 181 | for i in range(args.print_num): 182 | generated_tokens = [vocab.idx_to_token[ele] for ele in samples[i][:valid_lengths[i]]] 183 | tokens = bos_tokens + generated_tokens[1:] 184 | print([detokenizer(tokens).strip(), scores[i]]) 185 | 186 | 187 | if __name__ == '__main__': 188 | generate() 189 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/export/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Export the BERT Model for Deployment 3 | 4 | ==================================== 5 | 6 | This script exports the BERT model to a hybrid model serialized as a symbol.json file, 7 | which is suitable for deployment, or use with MXNet Module API. 8 | 9 | @article{devlin2018bert, 10 | title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, 11 | author={Devlin, Jacob and Chang, Ming- \ 12 | Wei and Lee, Kenton and Toutanova, Kristina}, 13 | journal={arXiv preprint arXiv:1810.04805}, 14 | year={2018} 15 | } 16 | """ 17 | 18 | # coding=utf-8 19 | 20 | # Licensed to the Apache Software Foundation (ASF) under one 21 | # or more contributor license agreements. See the NOTICE file 22 | # distributed with this work for additional information 23 | # regarding copyright ownership. The ASF licenses this file 24 | # to you under the Apache License, Version 2.0 (the 25 | # "License"); you may not use this file except in compliance 26 | # with the License. You may obtain a copy of the License at 27 | # 28 | # http://www.apache.org/licenses/LICENSE-2.0 29 | # 30 | # Unless required by applicable law or agreed to in writing, 31 | # software distributed under the License is distributed on an 32 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 33 | # KIND, either express or implied. See the License for the 34 | # specific language governing permissions and limitations 35 | # under the License. 36 | # pylint:disable=redefined-outer-name,logging-format-interpolation 37 | 38 | import argparse 39 | import logging 40 | import warnings 41 | import os 42 | import time 43 | 44 | import mxnet as mx 45 | import gluonnlp as nlp 46 | from hybrid_bert import get_hybrid_model 47 | from hybrid_bert import HybridBERTClassifier, HybridBERTRegression, HybridBERTForQA 48 | 49 | parser = argparse.ArgumentParser(description='Export hybrid BERT base model.') 50 | 51 | parser.add_argument('--model_parameters', 52 | type=str, 53 | default=None, 54 | help='The model parameter file saved from training.') 55 | 56 | parser.add_argument('--model_name', 57 | type=str, 58 | default='bert_12_768_12', 59 | choices=['bert_12_768_12', 'bert_24_1024_16'], 60 | help='BERT model name. Options are "bert_12_768_12" and "bert_24_1024_16"') 61 | 62 | parser.add_argument('--task', 63 | type=str, 64 | choices=['classification', 'regression', 'question_answering'], 65 | required=True, 66 | help='Task to export. 
Options are "classification", "regression", ' 67 | '"question_answering"') 68 | 69 | parser.add_argument('--dataset_name', 70 | type=str, 71 | default='book_corpus_wiki_en_uncased', 72 | choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased', 73 | 'wiki_multilingual_uncased', 'wiki_multilingual_cased', 74 | 'wiki_cn_cased'], 75 | help='BERT dataset name. Options include ' 76 | '"book_corpus_wiki_en_uncased", "book_corpus_wiki_en_cased", ' 77 | '"wiki_multilingual_uncased", "wiki_multilingual_cased", ' 78 | '"wiki_cn_cased"') 79 | 80 | parser.add_argument('--output_dir', 81 | type=str, 82 | default='./output_dir', 83 | help='The directory where the exported model symbol will be created. ' 84 | 'The default is ./output_dir') 85 | 86 | parser.add_argument('--seq_length', 87 | type=int, 88 | default=384, 89 | help='The maximum total input sequence length after WordPiece tokenization.' 90 | 'Sequences longer than this needs to be truncated, and sequences shorter ' 91 | 'than this needs to be padded. Default is 384') 92 | 93 | parser.add_argument('--dropout', 94 | type=float, 95 | default=0.1, 96 | help='The dropout probability for the classification/regression head.') 97 | 98 | args = parser.parse_args() 99 | 100 | # create output dir 101 | output_dir = args.output_dir 102 | nlp.utils.mkdir(output_dir) 103 | 104 | ############################################################################### 105 | # Logging # 106 | ############################################################################### 107 | 108 | log = logging.getLogger('gluonnlp') 109 | log.setLevel(logging.DEBUG) 110 | formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', 111 | datefmt='%H:%M:%S') 112 | fh = logging.FileHandler(os.path.join(args.output_dir, 'hybrid_export_bert.log'), mode='w') 113 | fh.setLevel(logging.INFO) 114 | fh.setFormatter(formatter) 115 | console = logging.StreamHandler() 116 | console.setLevel(logging.INFO) 117 | console.setFormatter(formatter) 118 | log.addHandler(console) 119 | log.addHandler(fh) 120 | log.info(args) 121 | 122 | ############################################################################### 123 | # Hybridize the model # 124 | ############################################################################### 125 | 126 | seq_length = args.seq_length 127 | 128 | if args.task == 'classification': 129 | bert, _ = get_hybrid_model( 130 | name=args.model_name, 131 | dataset_name=args.dataset_name, 132 | pretrained=False, 133 | use_pooler=True, 134 | use_decoder=False, 135 | use_classifier=False, 136 | seq_length=args.seq_length) 137 | net = HybridBERTClassifier(bert, num_classes=2, dropout=args.dropout) 138 | elif args.task == 'regression': 139 | bert, _ = get_hybrid_model( 140 | name=args.model_name, 141 | dataset_name=args.dataset_name, 142 | pretrained=False, 143 | use_pooler=True, 144 | use_decoder=False, 145 | use_classifier=False, 146 | seq_length=args.seq_length) 147 | net = HybridBERTRegression(bert, dropout=args.dropout) 148 | elif args.task == 'question_answering': 149 | bert, _ = get_hybrid_model( 150 | name=args.model_name, 151 | dataset_name=args.dataset_name, 152 | pretrained=False, 153 | use_pooler=False, 154 | use_decoder=False, 155 | use_classifier=False, 156 | seq_length=args.seq_length) 157 | net = HybridBERTForQA(bert) 158 | else: 159 | raise ValueError('unknown task: %s'%args.task) 160 | 161 | if args.model_parameters: 162 | net.load_parameters(args.model_parameters) 163 | else: 164 | net.initialize() 165 | 
warnings.warn('--model_parameters is not provided. The parameter checkpoint (.params) ' 166 | 'file will be created based on default parameter intialization.') 167 | 168 | net.hybridize(static_alloc=True, static_shape=True) 169 | 170 | ############################################################################### 171 | # Prepare dummy input data # 172 | ############################################################################### 173 | 174 | test_batch_size = 1 175 | 176 | inputs = mx.nd.arange(test_batch_size * seq_length) 177 | inputs = inputs.reshape(shape=(test_batch_size, seq_length)) 178 | token_types = mx.nd.zeros_like(inputs) 179 | valid_length = mx.nd.arange(test_batch_size) 180 | batch = inputs, token_types, valid_length 181 | 182 | def export(batch, prefix): 183 | """Export the model.""" 184 | log.info('Exporting the model ... ') 185 | inputs, token_types, valid_length = batch 186 | net(inputs, token_types, valid_length) 187 | net.export(prefix, epoch=0) 188 | assert os.path.isfile(prefix + '-symbol.json') 189 | assert os.path.isfile(prefix + '-0000.params') 190 | 191 | def infer(batch, prefix): 192 | """Evaluate the model on a mini-batch.""" 193 | log.info('Start inference ... ') 194 | 195 | # import with SymbolBlock. Alternatively, you can use Module.load APIs. 196 | imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json', 197 | ['data0', 'data1', 'data2'], 198 | prefix + '-0000.params') 199 | tic = time.time() 200 | # run forward inference 201 | inputs, token_types, valid_length = batch 202 | num_trials = 10 203 | for _ in range(num_trials): 204 | imported_net(inputs, token_types, valid_length) 205 | mx.nd.waitall() 206 | toc = time.time() 207 | log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' 208 | .format(toc - tic, num_trials / (toc - tic))) 209 | 210 | 211 | ############################################################################### 212 | # Export the model # 213 | ############################################################################### 214 | if __name__ == '__main__': 215 | prefix = os.path.join(args.output_dir, args.task) 216 | export(batch, prefix) 217 | infer(batch, prefix) 218 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/embedding.py: -------------------------------------------------------------------------------- 1 | """BERT embedding.""" 2 | import argparse 3 | import io 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import mxnet as mx 9 | 10 | from mxnet.gluon.data import DataLoader 11 | 12 | import gluonnlp 13 | from gluonnlp.data import BERTTokenizer, BERTSentenceTransform 14 | from gluonnlp.base import get_home_dir 15 | 16 | try: 17 | from data.embedding import BertEmbeddingDataset 18 | except ImportError: 19 | from .data.embedding import BertEmbeddingDataset 20 | 21 | try: 22 | unicode 23 | except NameError: 24 | # Define `unicode` for Python3 25 | def unicode(s, *_): 26 | return s 27 | 28 | 29 | def to_unicode(s): 30 | return unicode(s, 'utf-8') 31 | 32 | 33 | __all__ = ['BertEmbedding'] 34 | 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | 39 | class BertEmbedding(object): 40 | """ 41 | Encoding from BERT model. 42 | 43 | Parameters 44 | ---------- 45 | ctx : Context. 46 | running BertEmbedding on which gpu device id. 47 | dtype: str 48 | data type to use for the model. 49 | model : str, default bert_12_768_12. 50 | pre-trained BERT model 51 | dataset_name : str, default book_corpus_wiki_en_uncased. 
52 | pre-trained model dataset 53 | params_path: str, default None 54 | path to a parameters file to load instead of the pretrained model. 55 | max_seq_length : int, default 25 56 | max length of each sequence 57 | batch_size : int, default 256 58 | batch size 59 | root : str, default '$MXNET_HOME/models' with MXNET_HOME defaults to '~/.mxnet' 60 | Location for keeping the model parameters. 61 | """ 62 | def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12', 63 | dataset_name='book_corpus_wiki_en_uncased', params_path=None, 64 | max_seq_length=25, batch_size=256, 65 | root=os.path.join(get_home_dir(), 'models')): 66 | self.ctx = ctx 67 | self.dtype = dtype 68 | self.max_seq_length = max_seq_length 69 | self.batch_size = batch_size 70 | self.dataset_name = dataset_name 71 | 72 | # Don't download the pretrained models if we have a parameter path 73 | self.bert, self.vocab = gluonnlp.model.get_model(model, 74 | dataset_name=self.dataset_name, 75 | pretrained=params_path is None, 76 | ctx=self.ctx, 77 | use_pooler=False, 78 | use_decoder=False, 79 | use_classifier=False, 80 | root=root) 81 | self.bert.cast(self.dtype) 82 | 83 | if params_path: 84 | logger.info('Loading params from %s', params_path) 85 | self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True) 86 | 87 | lower = 'uncased' in self.dataset_name 88 | self.tokenizer = BERTTokenizer(self.vocab, lower=lower) 89 | self.transform = BERTSentenceTransform(tokenizer=self.tokenizer, 90 | max_seq_length=self.max_seq_length, 91 | pair=False) 92 | 93 | def __call__(self, sentences, oov_way='avg'): 94 | return self.embedding(sentences, oov_way='avg') 95 | 96 | def embedding(self, sentences, oov_way='avg'): 97 | """ 98 | Get tokens, tokens embedding 99 | 100 | Parameters 101 | ---------- 102 | sentences : List[str] 103 | sentences for encoding. 104 | oov_way : str, default avg. 105 | use **avg**, **sum** or **last** to get token embedding for those out of 106 | vocabulary words 107 | 108 | Returns 109 | ------- 110 | List[(List[str], List[ndarray])] 111 | List of tokens, and tokens embedding 112 | """ 113 | data_iter = self.data_loader(sentences=sentences) 114 | batches = [] 115 | for token_ids, valid_length, token_types in data_iter: 116 | token_ids = token_ids.as_in_context(self.ctx) 117 | valid_length = valid_length.as_in_context(self.ctx) 118 | token_types = token_types.as_in_context(self.ctx) 119 | sequence_outputs = self.bert(token_ids, token_types, 120 | valid_length.astype(self.dtype)) 121 | for token_id, sequence_output in zip(token_ids.asnumpy(), 122 | sequence_outputs.asnumpy()): 123 | batches.append((token_id, sequence_output)) 124 | return self.oov(batches, oov_way) 125 | 126 | def data_loader(self, sentences, shuffle=False): 127 | """Load, tokenize and prepare the input sentences.""" 128 | dataset = BertEmbeddingDataset(sentences, self.transform) 129 | return DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=shuffle) 130 | 131 | def oov(self, batches, oov_way='avg'): 132 | """ 133 | How to handle oov. Also filter out [CLS], [SEP] tokens. 134 | 135 | Parameters 136 | ---------- 137 | batches : List[(tokens_id, 138 | sequence_outputs, 139 | pooled_output]. 
140 | batch token_ids (max_seq_length, ), 141 | sequence_outputs (max_seq_length, dim, ), 142 | pooled_output (dim, ) 143 | oov_way : str 144 | use **avg**, **sum** or **last** to get token embedding for those out of 145 | vocabulary words 146 | 147 | Returns 148 | ------- 149 | List[(List[str], List[ndarray])] 150 | List of tokens, and tokens embedding 151 | """ 152 | sentences = [] 153 | for token_ids, sequence_outputs in batches: 154 | tokens = [] 155 | tensors = [] 156 | oov_len = 1 157 | for token_id, sequence_output in zip(token_ids, sequence_outputs): 158 | if token_id == 1: 159 | # [PAD] token, sequence is finished. 160 | break 161 | if token_id in (2, 3): 162 | # [CLS], [SEP] 163 | continue 164 | token = self.vocab.idx_to_token[token_id] 165 | if token.startswith('##'): 166 | token = token[2:] 167 | tokens[-1] += token 168 | if oov_way == 'last': 169 | tensors[-1] = sequence_output 170 | else: 171 | tensors[-1] += sequence_output 172 | if oov_way == 'avg': 173 | oov_len += 1 174 | else: # iv, avg last oov 175 | if oov_len > 1: 176 | tensors[-1] /= oov_len 177 | oov_len = 1 178 | tokens.append(token) 179 | tensors.append(sequence_output) 180 | if oov_len > 1: # if the whole sentence is one oov, handle this special case 181 | tensors[-1] /= oov_len 182 | sentences.append((tokens, tensors)) 183 | return sentences 184 | 185 | 186 | if __name__ == '__main__': 187 | np.set_printoptions(threshold=5) 188 | parser = argparse.ArgumentParser(description='Get embeddings from BERT', 189 | formatter_class=argparse.RawTextHelpFormatter) 190 | parser.add_argument('--gpu', type=int, default=None, 191 | help='id of the gpu to use. Set it to empty means to use cpu.') 192 | parser.add_argument('--dtype', type=str, default='float32', help='data dtype') 193 | parser.add_argument('--model', type=str, default='bert_12_768_12', 194 | help='pre-trained model') 195 | parser.add_argument('--dataset_name', type=str, default='book_corpus_wiki_en_uncased', 196 | help='dataset') 197 | parser.add_argument('--params_path', type=str, default=None, 198 | help='path to a params file to load instead of the pretrained model.') 199 | parser.add_argument('--max_seq_length', type=int, default=25, 200 | help='max length of each sequence') 201 | parser.add_argument('--batch_size', type=int, default=256, 202 | help='batch size') 203 | parser.add_argument('--oov_way', type=str, default='avg', 204 | help='how to handle oov\n' 205 | 'avg: average all oov embeddings to represent the original token\n' 206 | 'sum: sum all oov embeddings to represent the original token\n' 207 | 'last: use last oov embeddings to represent the original token\n') 208 | parser.add_argument('--sentences', type=to_unicode, nargs='+', default=None, 209 | help='sentence for encoding') 210 | parser.add_argument('--file', type=str, default=None, 211 | help='file for encoding') 212 | parser.add_argument('--verbose', action='store_true', help='verbose logging') 213 | args = parser.parse_args() 214 | 215 | level = logging.DEBUG if args.verbose else logging.INFO 216 | logging.getLogger().setLevel(level) 217 | logging.info(args) 218 | 219 | if args.gpu is not None: 220 | context = mx.gpu(args.gpu) 221 | else: 222 | context = mx.cpu() 223 | bert_embedding = BertEmbedding(ctx=context, model=args.model, dataset_name=args.dataset_name, 224 | max_seq_length=args.max_seq_length, batch_size=args.batch_size) 225 | result = [] 226 | sents = [] 227 | if args.sentences: 228 | sents = args.sentences 229 | result = bert_embedding(sents, oov_way=args.oov_way) 230 | elif 
args.file: 231 | with io.open(args.file, 'r', encoding='utf8') as in_file: 232 | for line in in_file: 233 | sents.append(line.strip()) 234 | result = bert_embedding(sents, oov_way=args.oov_way) 235 | else: 236 | logger.error('Please specify --sentence or --file') 237 | 238 | if result: 239 | for sent, embeddings in zip(sents, result): 240 | print('Text: {}'.format(sent)) 241 | _, tokens_embedding = embeddings 242 | print('Tokens embedding: {}'.format(tokens_embedding)) 243 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/finetune_ner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Licensed to the Apache Software Foundation (ASF) under one 5 | # or more contributor license agreements. See the NOTICE file 6 | # distributed with this work for additional information 7 | # regarding copyright ownership. The ASF licenses this file 8 | # to you under the Apache License, Version 2.0 (the 9 | # "License"); you may not use this file except in compliance 10 | # with the License. You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, 15 | # software distributed under the License is distributed on an 16 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17 | # KIND, either express or implied. See the License for the 18 | # specific language governing permissions and limitations 19 | # under the License. 20 | """Provides command-line interace for training BERT-based named entity recognition model.""" 21 | 22 | # coding: utf-8 23 | import argparse 24 | import logging 25 | import random 26 | 27 | import numpy as np 28 | import mxnet as mx 29 | 30 | import gluonnlp as nlp 31 | 32 | from ner_utils import get_context, get_bert_model, dump_metadata, str2bool 33 | from data.ner import BERTTaggingDataset, convert_arrays_to_text 34 | from model.ner import BERTTagger, attach_prediction 35 | 36 | # seqeval is a dependency that is specific to named entity recognition. 
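# If seqeval is not available in the active environment, it can usually be installed with
# `pip install seqeval`. Its f1_score takes lists of IOB tag sequences (one list per sentence)
# and reports entity-level F1, e.g. (illustrative tags only):
#
#   >>> seqeval.metrics.f1_score([['B-PER', 'I-PER', 'O']], [['B-PER', 'I-PER', 'O']])
#   1.0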
37 | import seqeval.metrics 38 | 39 | 40 | def parse_args(): 41 | """Parse command line arguments.""" 42 | arg_parser = argparse.ArgumentParser( 43 | description='Train a BERT-based named entity recognition model', 44 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 45 | 46 | # data file paths 47 | arg_parser.add_argument('--train-path', type=str, required=True, 48 | help='Path to the training data file') 49 | arg_parser.add_argument('--dev-path', type=str, required=True, 50 | help='Path to the development data file') 51 | arg_parser.add_argument('--test-path', type=str, required=True, 52 | help='Path to the test data file') 53 | 54 | arg_parser.add_argument('--save-checkpoint-prefix', type=str, required=False, default=None, 55 | help='Prefix of model checkpoint file') 56 | 57 | # bert options 58 | arg_parser.add_argument('--bert-model', type=str, default='bert_12_768_12', 59 | help='Name of the BERT model') 60 | arg_parser.add_argument('--cased', type=str2bool, default=True, 61 | help='Path to the development data file') 62 | arg_parser.add_argument('--dropout-prob', type=float, default=0.1, 63 | help='Dropout probability for the last layer') 64 | 65 | # optimization parameters 66 | arg_parser.add_argument('--seed', type=int, default=13531, 67 | help='Random number seed.') 68 | arg_parser.add_argument('--seq-len', type=int, default=180, 69 | help='The length of the sequence input to BERT.' 70 | ' An exception will raised if this is not large enough.') 71 | arg_parser.add_argument('--gpu', type=int, 72 | help='Number (index) of GPU to run on, e.g. 0. ' 73 | 'If not specified, uses CPU.') 74 | arg_parser.add_argument('--batch-size', type=int, default=32, help='Batch size for training') 75 | arg_parser.add_argument('--num-epochs', type=int, default=4, help='Number of epochs to train') 76 | arg_parser.add_argument('--optimizer', type=str, default='bertadam', 77 | help='Optimization algorithm to use') 78 | arg_parser.add_argument('--learning-rate', type=float, default=5e-5, 79 | help='Learning rate for optimization') 80 | arg_parser.add_argument('--warmup-ratio', type=float, default=0.1, 81 | help='Warmup ratio for learning rate scheduling') 82 | args = arg_parser.parse_args() 83 | return args 84 | 85 | 86 | def main(config): 87 | """Main method for training BERT-based NER model.""" 88 | # provide random seed for every RNGs we use 89 | np.random.seed(config.seed) 90 | random.seed(config.seed) 91 | mx.random.seed(config.seed) 92 | 93 | ctx = get_context(config.gpu) 94 | 95 | logging.info('Loading BERT model...') 96 | bert_model, text_vocab = get_bert_model(config.bert_model, config.cased, ctx, 97 | config.dropout_prob) 98 | 99 | dataset = BERTTaggingDataset(text_vocab, config.train_path, config.dev_path, config.test_path, 100 | config.seq_len, config.cased) 101 | 102 | train_data_loader = dataset.get_train_data_loader(config.batch_size) 103 | dev_data_loader = dataset.get_dev_data_loader(config.batch_size) 104 | test_data_loader = dataset.get_test_data_loader(config.batch_size) 105 | 106 | net = BERTTagger(bert_model, dataset.num_tag_types, config.dropout_prob) 107 | net.tag_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx) 108 | net.hybridize(static_alloc=True) 109 | 110 | loss_function = mx.gluon.loss.SoftmaxCrossEntropyLoss() 111 | loss_function.hybridize(static_alloc=True) 112 | 113 | # step size adaptation, adopted from: https://github.com/dmlc/gluon-nlp/blob/ 114 | # 87d36e3cc7c615f93732d01048cf7ce3b3b09eb7/scripts/bert/finetune_classifier.py#L348-L351 115 | step_size = 
config.batch_size 116 | num_train_steps = int(len(dataset.train_inputs) / step_size * config.num_epochs) 117 | num_warmup_steps = int(num_train_steps * config.warmup_ratio) 118 | 119 | optimizer_params = {'learning_rate': config.learning_rate} 120 | try: 121 | trainer = mx.gluon.Trainer(net.collect_params(), config.optimizer, optimizer_params) 122 | except ValueError as e: 123 | print(e) 124 | logging.warning('AdamW optimizer is not found. Please consider upgrading to ' 125 | 'mxnet>=1.5.0. Now the original Adam optimizer is used instead.') 126 | trainer = mx.gluon.Trainer(net.collect_params(), 'adam', optimizer_params) 127 | 128 | # collect differentiable parameters 129 | logging.info('Collect params...') 130 | # do not apply weight decay on LayerNorm and bias terms 131 | for _, v in net.collect_params('.*beta|.*gamma|.*bias').items(): 132 | v.wd_mult = 0.0 133 | params = [p for p in net.collect_params().values() if p.grad_req != 'null'] 134 | 135 | if config.save_checkpoint_prefix is not None: 136 | logging.info('dumping metadata...') 137 | dump_metadata(config, tag_vocab=dataset.tag_vocab) 138 | 139 | def train(data_loader, start_step_num): 140 | """Training loop.""" 141 | step_num = start_step_num 142 | logging.info('current starting step num: %d', step_num) 143 | for batch_id, (_, _, _, tag_ids, flag_nonnull_tag, out) in \ 144 | enumerate(attach_prediction(data_loader, net, ctx, is_train=True)): 145 | logging.info('training on batch index: %d/%d', batch_id, len(data_loader)) 146 | 147 | # step size adjustments 148 | step_num += 1 149 | if step_num < num_warmup_steps: 150 | new_lr = config.learning_rate * step_num / num_warmup_steps 151 | else: 152 | offset = ((step_num - num_warmup_steps) * config.learning_rate / 153 | (num_train_steps - num_warmup_steps)) 154 | new_lr = config.learning_rate - offset 155 | trainer.set_learning_rate(new_lr) 156 | 157 | with mx.autograd.record(): 158 | loss_value = loss_function(out, tag_ids, 159 | flag_nonnull_tag.expand_dims(axis=2)).mean() 160 | 161 | loss_value.backward() 162 | nlp.utils.clip_grad_global_norm(params, 1) 163 | trainer.step(1) 164 | 165 | pred_tags = out.argmax(axis=-1) 166 | logging.info('loss_value: %6f', loss_value.asscalar()) 167 | 168 | num_tag_preds = flag_nonnull_tag.sum().asscalar() 169 | logging.info( 170 | 'accuracy: %6f', (((pred_tags == tag_ids) * flag_nonnull_tag).sum().asscalar() 171 | / num_tag_preds)) 172 | return step_num 173 | 174 | def evaluate(data_loader): 175 | """Eval loop.""" 176 | predictions = [] 177 | 178 | for batch_id, (text_ids, _, valid_length, tag_ids, _, out) in \ 179 | enumerate(attach_prediction(data_loader, net, ctx, is_train=False)): 180 | logging.info('evaluating on batch index: %d/%d', batch_id, len(data_loader)) 181 | 182 | # convert results to numpy arrays for easier access 183 | np_text_ids = text_ids.astype('int32').asnumpy() 184 | np_pred_tags = out.argmax(axis=-1).asnumpy() 185 | np_valid_length = valid_length.astype('int32').asnumpy() 186 | np_true_tags = tag_ids.asnumpy() 187 | 188 | predictions += convert_arrays_to_text(text_vocab, dataset.tag_vocab, np_text_ids, 189 | np_true_tags, np_pred_tags, np_valid_length) 190 | 191 | all_true_tags = [[entry.true_tag for entry in entries] for entries in predictions] 192 | all_pred_tags = [[entry.pred_tag for entry in entries] for entries in predictions] 193 | seqeval_f1 = seqeval.metrics.f1_score(all_true_tags, all_pred_tags) 194 | return seqeval_f1 195 | 196 | best_dev_f1 = 0.0 197 | last_test_f1 = 0.0 198 | best_epoch = -1 199 | 200 | 
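    # The schedule inside train() warms the learning rate up linearly over the first
    # num_warmup_steps updates and then decays it linearly towards zero. Worked example with
    # the default learning_rate=5e-5 and warmup_ratio=0.1 and a hypothetical
    # num_train_steps=1000 (so num_warmup_steps=100):
    #   step   50: lr = 5e-5 *  50 / 100                 = 2.5e-5  (warmup)
    #   step  550: lr = 5e-5 - (550 - 100) * 5e-5 / 900  = 2.5e-5  (linear decay)
    #   step 1000: lr = 5e-5 - (900)       * 5e-5 / 900  = 0.0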
last_epoch_step_num = 0 201 | for epoch_index in range(config.num_epochs): 202 | last_epoch_step_num = train(train_data_loader, last_epoch_step_num) 203 | train_f1 = evaluate(train_data_loader) 204 | logging.info('train f1: %3f', train_f1) 205 | dev_f1 = evaluate(dev_data_loader) 206 | logging.info('dev f1: %3f, previous best dev f1: %3f', dev_f1, best_dev_f1) 207 | if dev_f1 > best_dev_f1: 208 | best_dev_f1 = dev_f1 209 | best_epoch = epoch_index 210 | logging.info('update the best dev f1 to be: %3f', best_dev_f1) 211 | test_f1 = evaluate(test_data_loader) 212 | logging.info('test f1: %3f', test_f1) 213 | last_test_f1 = test_f1 214 | 215 | # save params 216 | params_file = config.save_checkpoint_prefix + '_{:03d}.params'.format(epoch_index) 217 | logging.info('saving current checkpoint to: %s', params_file) 218 | net.save_parameters(params_file) 219 | 220 | logging.info('current best epoch: %d', best_epoch) 221 | 222 | logging.info('best epoch: %d, best dev f1: %3f, test f1 at tha epoch: %3f', 223 | best_epoch, best_dev_f1, last_test_f1) 224 | 225 | 226 | if __name__ == '__main__': 227 | logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', 228 | level=logging.DEBUG, datefmt='%Y-%m-%d %I:%M:%S') 229 | logging.getLogger().setLevel(logging.INFO) 230 | main(parse_args()) 231 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/convert_tf_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # 'License'); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # pylint:disable=redefined-outer-name,logging-format-interpolation 20 | """ Script for converting TF Model to Gluon. 
""" 21 | 22 | import argparse 23 | import json 24 | import logging 25 | import os 26 | import sys 27 | 28 | import mxnet as mx 29 | import gluonnlp as nlp 30 | from gluonnlp.model import BERTEncoder, BERTModel 31 | from gluonnlp.model.bert import bert_hparams 32 | 33 | sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) 34 | 35 | from utils import (get_hash, load_text_vocab, read_tf_checkpoint, 36 | tf_vocab_to_gluon_vocab) 37 | 38 | 39 | parser = argparse.ArgumentParser( 40 | description='Conversion script for Tensorflow BERT model', 41 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 42 | parser.add_argument('--model', 43 | type=str, 44 | default='bert_12_768_12', 45 | choices=['bert_12_768_12', 'bert_24_1024_16'], 46 | help='BERT model name') 47 | parser.add_argument('--tf_checkpoint_dir', 48 | type=str, 49 | help='Path to Tensorflow checkpoint folder.') 50 | parser.add_argument('--tf_model_prefix', type=str, 51 | default='bert_model.ckpt', 52 | help='name of bert checkpoint file.') 53 | parser.add_argument('--tf_config_name', type=str, 54 | default='bert_config.json', 55 | help='Name of Bert config file') 56 | parser.add_argument('--out_dir', 57 | type=str, 58 | default=os.path.join('~', 'output'), 59 | help='Path to output folder. The folder must exist.') 60 | parser.add_argument('--debug', action='store_true', help='debugging mode') 61 | args = parser.parse_args() 62 | logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO) 63 | logging.info(args) 64 | 65 | # convert vocabulary 66 | vocab_path = os.path.join(args.tf_checkpoint_dir, 'vocab.txt') 67 | vocab = tf_vocab_to_gluon_vocab(load_text_vocab(vocab_path)) 68 | 69 | # vocab serialization 70 | out_dir = os.path.expanduser(args.out_dir) 71 | nlp.utils.mkdir(out_dir) 72 | tmp_file_path = os.path.join(out_dir, 'tmp') 73 | with open(tmp_file_path, 'w') as f: 74 | f.write(vocab.to_json()) 75 | hash_full, hash_short = get_hash(tmp_file_path) 76 | gluon_vocab_path = os.path.join(out_dir, hash_short + '.vocab') 77 | with open(gluon_vocab_path, 'w') as f: 78 | f.write(vocab.to_json()) 79 | logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full) 80 | 81 | # load tf model 82 | tf_checkpoint_file = os.path.expanduser( 83 | os.path.join(args.tf_checkpoint_dir, args.tf_model_prefix)) 84 | logging.info('loading Tensorflow checkpoint %s ...', tf_checkpoint_file) 85 | tf_tensors = read_tf_checkpoint(tf_checkpoint_file) 86 | tf_names = sorted(tf_tensors.keys()) 87 | 88 | tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names) 89 | tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names) 90 | tf_names = filter(lambda name: name != 'global_step', tf_names) 91 | tf_names = list(tf_names) 92 | if len(tf_tensors) != len(tf_names): 93 | logging.info('Tensorflow model was saved with Optimizer parameters. 
' 94 | 'Ignoring them.') 95 | 96 | for name in tf_names: 97 | logging.debug('%s: %s', name, tf_tensors[name].shape) 98 | 99 | # replace tensorflow parameter names with gluon parameter names 100 | NAME_MAP = [ 101 | ('bert/encoder/layer_', 'encoder.transformer_cells.'), 102 | ('/attention/self/', '.attention_cell.'), 103 | ('key', 'proj_key'), 104 | ('query', 'proj_query'), 105 | ('value', 'proj_value'), 106 | ('/attention/output/LayerNorm/', '.layer_norm.'), 107 | ('/attention/output/dense/', '.proj.'), 108 | ('cls/seq_relationship/output_weights', 'classifier.weight'), 109 | ('cls/seq_relationship/output_bias', 'classifier.bias'), 110 | ('cls/predictions/output_bias', 'decoder.3.bias'), 111 | ('cls/predictions/transform/dense/', 'decoder.0.'), 112 | ('cls/predictions/transform/LayerNorm/', 'decoder.2.'), 113 | ('kernel', 'weight'), 114 | ('/intermediate/dense/', '.ffn.ffn_1.'), 115 | ('/output/dense/', '.ffn.ffn_2.'), 116 | ('/output/LayerNorm/', '.ffn.layer_norm.'), 117 | ('bert/embeddings/LayerNorm/', 'encoder.layer_norm.'), 118 | ('bert/embeddings/position_embeddings', 'encoder.position_weight'), 119 | ('bert/embeddings/token_type_embeddings', 'token_type_embed.0.weight'), 120 | ('bert/embeddings/word_embeddings', 'word_embed.0.weight'), 121 | ('bert/pooler/dense/', 'pooler.'), 122 | ('/', '.'), 123 | ] 124 | 125 | # convert to gluon parameters 126 | mx_tensors = {} 127 | logging.info('converting to Gluon checkpoint ... ') 128 | for source_name in tf_names: 129 | # get the source tensor and its transpose 130 | source, source_t = tf_tensors[source_name], tf_tensors[source_name].T 131 | target, target_name = source, source_name 132 | for old, new in NAME_MAP: 133 | target_name = target_name.replace(old, new) 134 | # transpose kernel layer parameters 135 | if 'kernel' in source_name: 136 | target = source_t 137 | mx_tensors[target_name] = target 138 | if source_t.shape == source.shape and len(source.shape) > 1 and target is not source_t: 139 | logging.info('warning: %s has symmetric shape %s', target_name, target.shape) 140 | logging.debug('%s: %s', target_name, target.shape) 141 | 142 | # BERT config 143 | tf_config_names_to_gluon_config_names = { 144 | 'attention_probs_dropout_prob': 'embed_dropout', 145 | 'hidden_act': None, 146 | 'hidden_dropout_prob': 'dropout', 147 | 'hidden_size': 'units', 148 | 'initializer_range': None, 149 | 'intermediate_size': 'hidden_size', 150 | 'max_position_embeddings': 'max_length', 151 | 'num_attention_heads': 'num_heads', 152 | 'num_hidden_layers': 'num_layers', 153 | 'type_vocab_size': 'token_type_vocab_size', 154 | 'vocab_size': None 155 | } 156 | predefined_args = bert_hparams[args.model] 157 | with open(os.path.join(args.tf_checkpoint_dir, args.tf_config_name), 'r') as f: 158 | tf_config = json.load(f) 159 | assert len(tf_config) == len(tf_config_names_to_gluon_config_names) 160 | for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items(): 161 | if tf_name is None or gluon_name is None: 162 | continue 163 | assert tf_config[tf_name] == predefined_args[gluon_name] 164 | 165 | # BERT encoder 166 | encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'], 167 | num_layers=predefined_args['num_layers'], 168 | units=predefined_args['units'], 169 | hidden_size=predefined_args['hidden_size'], 170 | max_length=predefined_args['max_length'], 171 | num_heads=predefined_args['num_heads'], 172 | scaled=predefined_args['scaled'], 173 | dropout=predefined_args['dropout'], 174 | use_residual=predefined_args['use_residual']) 175 | 176 | 
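# --- Editor's illustrative aside (not part of convert_tf_model.py) --------------
# The conversion loop above renames each TF variable by applying the NAME_MAP
# string replacements in order. Tracing one typical attention weight (the input
# name here is an example for illustration, not read from any checkpoint):
def to_gluon_name(tf_name):
    for old, new in NAME_MAP:
        tf_name = tf_name.replace(old, new)
    return tf_name

# 'bert/encoder/layer_' -> 'encoder.transformer_cells.', '/attention/self/' ->
# '.attention_cell.', 'query' -> 'proj_query', 'kernel' -> 'weight', '/' -> '.'
assert (to_gluon_name('bert/encoder/layer_0/attention/self/query/kernel')
        == 'encoder.transformer_cells.0.attention_cell.proj_query.weight')
# Because the source name contains 'kernel', the loop above also stores the
# transposed tensor in mx_tensors for this parameter.
# --------------------------------------------------------------------------------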
# Infer enabled BERTModel components 177 | use_pooler = any('pooler' in n for n in mx_tensors) 178 | use_decoder = any('decoder.0' in n for n in mx_tensors) 179 | use_classifier = any('classifier.weight' in n for n in mx_tensors) 180 | 181 | logging.info('Inferred that the tensorflow model provides the following parameters:') 182 | logging.info('- use_pooler = {}'.format(use_pooler)) 183 | logging.info('- use_decoder = {}'.format(use_decoder)) 184 | logging.info('- use_classifier = {}'.format(use_classifier)) 185 | 186 | # post processings for parameters: 187 | # - handle tied decoder weight 188 | logging.info('total number of tf parameters = %d', len(tf_names)) 189 | if use_decoder: 190 | mx_tensors['decoder.3.weight'] = mx_tensors['word_embed.0.weight'] 191 | logging.info('total number of mx parameters = %d' 192 | ' (including decoder param for weight tying)', len(mx_tensors)) 193 | else: 194 | logging.info('total number of mx parameters = %d', len(mx_tensors)) 195 | 196 | # BERT model 197 | bert = BERTModel(encoder, len(vocab), 198 | token_type_vocab_size=predefined_args['token_type_vocab_size'], 199 | units=predefined_args['units'], 200 | embed_size=predefined_args['embed_size'], 201 | embed_dropout=predefined_args['embed_dropout'], 202 | word_embed=predefined_args['word_embed'], 203 | use_pooler=use_pooler, use_decoder=use_decoder, 204 | use_classifier=use_classifier) 205 | 206 | bert.initialize(init=mx.init.Normal(0.02)) 207 | 208 | ones = mx.nd.ones((2, 8)) 209 | out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]])) 210 | params = bert._collect_params_with_prefix() 211 | if len(params) != len(mx_tensors): 212 | raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, ' 213 | 'but {} have been extracted from the tf model. ' 214 | 'Most likely the BERTModel hyperparameters do not match ' 215 | 'the hyperparameters of the tf model.'.format(len(params), len(mx_tensors))) 216 | 217 | # set parameter data 218 | loaded_params = {} 219 | for name in params: 220 | try: 221 | arr = mx.nd.array(mx_tensors[name]) 222 | params[name].set_data(arr) 223 | loaded_params[name] = True 224 | # pylint: disable=broad-except 225 | except Exception: 226 | if name not in mx_tensors: 227 | raise RuntimeError('cannot initialize %s from tf checkpoint' % name) 228 | else: 229 | raise RuntimeError('cannot initialize %s. Expect shape = %s, but found %s' % 230 | (name, params[name].shape, arr.shape)) 231 | 232 | logging.info('num loaded params = %d, total num params = %d', 233 | len(loaded_params), len(mx_tensors)) 234 | for name in mx_tensors: 235 | if name not in loaded_params: 236 | logging.info('%s is not loaded', name) 237 | 238 | # param serialization 239 | bert.save_parameters(tmp_file_path) 240 | hash_full, hash_short = get_hash(tmp_file_path) 241 | gluon_param_path = os.path.join(out_dir, hash_short + '.params') 242 | logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full) 243 | bert.save_parameters(gluon_param_path) 244 | mx.nd.waitall() 245 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/fp16_utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership.
The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | """Trainer for mixed precision training.""" 20 | import warnings 21 | import collections 22 | import numpy as np 23 | import mxnet as mx 24 | from mxnet import nd 25 | 26 | def grad_global_norm(parameters, max_norm): 27 | """Calculate the 2-norm of gradients of parameters, and how much they should be scaled down 28 | such that their 2-norm does not exceed `max_norm`. 29 | 30 | If gradients exist for more than one context for a parameter, user needs to explicitly call 31 | ``trainer.allreduce_grads`` so that the gradients are summed first before calculating 32 | the 2-norm. 33 | 34 | .. note:: 35 | 36 | This function is only for use when `update_on_kvstore` is set to False in trainer. 37 | 38 | Example:: 39 | 40 | trainer = Trainer(net.collect_params(), update_on_kvstore=False, ...) 41 | for x, y in mx.gluon.utils.split_and_load(X, [mx.gpu(0), mx.gpu(1)]): 42 | with mx.autograd.record(): 43 | y = net(x) 44 | loss = loss_fn(y, label) 45 | loss.backward() 46 | trainer.allreduce_grads() 47 | norm, ratio = grad_global_norm(net.collect_params().values(), max_norm) 48 | trainer.update(batch_size * ratio) 49 | ... 50 | 51 | Parameters 52 | ---------- 53 | parameters : list of Parameters 54 | 55 | Returns 56 | ------- 57 | NDArray 58 | Total norm. Shape is (1,) 59 | NDArray 60 | Ratio for rescaling gradients based on max_norm s.t. grad = grad / ratio. 61 | If total norm is NaN, ratio will be NaN, too. Shape is (1,) 62 | NDArray 63 | Whether the total norm is finite. Shape is (1,) 64 | """ 65 | # collect gradient arrays 66 | arrays = [] 67 | idx = 0 68 | for p in parameters: 69 | if p.grad_req != 'null': 70 | p_grads = p.list_grad() 71 | arrays.append(p_grads[idx % len(p_grads)]) 72 | idx += 1 73 | assert len(arrays) > 0, 'No parameter found available for gradient norm.' 74 | 75 | # compute gradient norms 76 | def _norm(array): 77 | # TODO(haibin) norm operator does not support fp16 safe reduction. 78 | # Issue is tracked at: https://github.com/apache/incubator-mxnet/issues/14126 79 | x = array.reshape((-1,)).astype('float32', copy=False) 80 | return nd.dot(x, x) 81 | 82 | norm_arrays = [_norm(arr) for arr in arrays] 83 | 84 | # group norm arrays by ctx 85 | def group_by_ctx(arr_list): 86 | groups = collections.defaultdict(list) 87 | for arr in arr_list: 88 | ctx = arr.context 89 | groups[ctx].append(arr) 90 | return groups 91 | norm_groups = group_by_ctx(norm_arrays) 92 | 93 | # reduce 94 | ctx, dtype = arrays[0].context, 'float32' 95 | norms = [nd.add_n(*g).as_in_context(ctx) for g in norm_groups.values()] 96 | total_norm = nd.add_n(*norms).sqrt() 97 | scale = total_norm / max_norm 98 | # is_finite = 0 if NaN or Inf, 1 otherwise. 99 | is_finite = nd.contrib.isfinite(scale) 100 | # if scale is finite, nd.maximum selects the max between scale and 1. That is, 101 | # 1 is returned if total_norm does not exceed max_norm. 
102 | # if scale = NaN or Inf, the result of nd.maximum is undefined. Therefore, we use 103 | # choices.take to return NaN or Inf. 104 | scale_or_one = nd.maximum(nd.ones((1,), dtype=dtype, ctx=ctx), scale) 105 | choices = nd.concat(scale, scale_or_one, dim=0) 106 | chosen_scale = choices.take(is_finite) 107 | return total_norm, chosen_scale, is_finite 108 | 109 | 110 | class FP16Trainer(object): 111 | """ Trainer for mixed precision training. 112 | 113 | Parameters 114 | ---------- 115 | trainer: gluon.Trainer 116 | the original gluon Trainer object for fp32 training. 117 | dynamic_loss_scale: bool. Default is True 118 | whether to use dynamic loss scaling. This is recommended for optimizing model 119 | parameters using FP16. 120 | loss_scaler_params : dict 121 | Keyword arguments to be passed to the loss scaler constructor. For example, 122 | `{"init_scale" : 2.**15, "scale_window" : 2000, "tolerance" : 0.05}` 123 | for `DynamicLossScaler`. 124 | See each `LossScaler` for a list of supported arguments. 125 | """ 126 | def __init__(self, trainer, dynamic_loss_scale=True, loss_scaler_params=None): 127 | if trainer._kvstore_params['update_on_kvstore'] is not False and trainer._kvstore: 128 | err = 'Only gluon.Trainer created with update_on_kvstore=False is supported.' 129 | raise NotImplementedError(err) 130 | self.fp32_trainer = trainer 131 | loss_scaler_params = loss_scaler_params if loss_scaler_params else {} 132 | self._scaler = DynamicLossScaler(**loss_scaler_params) if dynamic_loss_scale \ 133 | else StaticLossScaler(**loss_scaler_params) 134 | # if the optimizer supports NaN check, we can always defer the NaN check to the optimizer 135 | # TODO(haibin) this should be added via registry 136 | self._support_nan_check = trainer._optimizer.__class__.__name__ == 'BERTAdam' 137 | 138 | def backward(self, loss): 139 | """backward propagation with loss""" 140 | with mx.autograd.record(): 141 | if isinstance(loss, (tuple, list)): 142 | ls = [l * self._scaler.loss_scale for l in loss] 143 | else: 144 | ls = loss * self._scaler.loss_scale 145 | mx.autograd.backward(ls) 146 | 147 | def step(self, batch_size, max_norm=None): 148 | """Makes one step of parameter update. Should be called after 149 | `fp16_optimizer.backward()`, and outside of `record()` scope. 150 | 151 | Parameters 152 | ---------- 153 | batch_size : int 154 | Batch size of data processed. Gradient will be normalized by `1/batch_size`. 155 | Set this to 1 if you normalized loss manually with `loss = mean(loss)`. 156 | max_norm : NDArray, optional, default is None 157 | max value for global 2-norm of gradients.
158 | """ 159 | self.fp32_trainer.allreduce_grads() 160 | step_size = batch_size * self._scaler.loss_scale 161 | if max_norm: 162 | norm, ratio, is_finite = grad_global_norm(self.fp32_trainer._params, 163 | max_norm * self._scaler.loss_scale) 164 | step_size = ratio * step_size 165 | if self._support_nan_check: 166 | self.fp32_trainer.update(step_size) 167 | overflow = is_finite.asscalar() < 1 168 | else: 169 | overflow = not np.isfinite(norm.asscalar()) 170 | if not overflow: 171 | self.fp32_trainer.update(step_size) 172 | else: 173 | # TODO(haibin) optimize the performance when max_norm is not present 174 | # sequentially adding isnan/isinf results may be slow 175 | if self._support_nan_check: 176 | self.fp32_trainer.update(step_size) 177 | overflow = self._scaler.has_overflow(self.fp32_trainer._params) 178 | else: 179 | overflow = self._scaler.has_overflow(self.fp32_trainer._params) 180 | if not overflow: 181 | self.fp32_trainer.update(step_size) 182 | # update scale based on overflow information 183 | self._scaler.update_scale(overflow) 184 | 185 | class LossScaler(object): 186 | """Abstract loss scaler""" 187 | def has_overflow(self, params): 188 | """ detect inf and nan """ 189 | is_not_finite = 0 190 | for param in params: 191 | if param.grad_req != 'null': 192 | grad = param.list_grad()[0] 193 | is_not_finite += mx.nd.contrib.isnan(grad).sum() 194 | is_not_finite += mx.nd.contrib.isinf(grad).sum() 195 | # NDArray is implicitly converted to bool 196 | if is_not_finite == 0: 197 | return False 198 | else: 199 | return True 200 | 201 | def update_scale(self, overflow): 202 | raise NotImplementedError() 203 | 204 | class StaticLossScaler(LossScaler): 205 | """Static loss scaler""" 206 | def __init__(self, init_scale=1): 207 | self.loss_scale = init_scale 208 | 209 | def update_scale(self, overflow): 210 | """update loss scale""" 211 | pass 212 | 213 | class DynamicLossScaler(LossScaler): 214 | """Class that manages dynamic loss scaling. 215 | 216 | There are two problems regarding gradient scale when fp16 is used for training. 217 | One is overflow: the fp16 gradient can become so large that it causes NaN. 218 | To combat such an issue, we need to scale down the gradient when such an event 219 | is detected. The other is underflow: the gradient is so small that 220 | precision suffers. This is hard to detect though. What the dynamic loss scaler does 221 | is start the scale at a relatively large value (e.g. 2**15). 222 | Every time a NaN is detected in the gradient, the scale is reduced (by default) 223 | by 2x. On the other hand, if a NaN is not detected for a long time 224 | (e.g. 
2000 steps), then the scale is increased (by default) by 2x.""" 225 | def __init__(self, init_scale=2.**15, scale_factor=2., scale_window=2000, 226 | tolerance=0.01): 227 | self.loss_scale = init_scale 228 | self.scale_factor = scale_factor 229 | self.scale_window = scale_window 230 | self.tolerance = tolerance 231 | self._num_steps = 0 232 | self._last_overflow_iter = -1 233 | self._last_rescale_iter = -1 234 | self._overflows_since_rescale = 0 235 | 236 | def update_scale(self, overflow): 237 | """dynamically update loss scale""" 238 | iter_since_rescale = self._num_steps - self._last_rescale_iter 239 | if overflow: 240 | self._last_overflow_iter = self._num_steps 241 | self._overflows_since_rescale += 1 242 | percentage = self._overflows_since_rescale / float(iter_since_rescale) 243 | # we tolerate a certain amount of NaNs before actually scaling it down 244 | if percentage >= self.tolerance: 245 | self.loss_scale /= self.scale_factor 246 | self._last_rescale_iter = self._num_steps 247 | self._overflows_since_rescale = 0 248 | if self.loss_scale < 1: 249 | warnings.warn('DynamicLossScaler: overflow detected. set loss_scale = %s'% 250 | self.loss_scale) 251 | elif (self._num_steps - self._last_overflow_iter) % self.scale_window == 0: 252 | self.loss_scale *= self.scale_factor 253 | self._last_rescale_iter = self._num_steps 254 | self._num_steps += 1 255 | -------------------------------------------------------------------------------- /gluon_basics/autograd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Automatic Differentiation with `autograd`" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "skip" 19 | } 20 | }, 21 | "source": [ 22 | "We train models to get better and better as a function of experience. Usually, getting better means minimizing a loss function. To achieve this goal, we often iteratively compute the gradient of the loss with respect to weights and then update the weights accordingly. While the gradient calculations are straightforward through the chain rule, for complex models, working them out by hand can be a pain.\n", 23 | "\n", 24 | "Before diving deep into the model training, let's go through how MXNet’s `autograd` package expedites this work by automatically calculating derivatives." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "slide" 32 | } 33 | }, 34 | "source": [ 35 | "## Basic usage\n", 36 | "\n", 37 | "Let's first import the `autograd` package." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": { 44 | "ExecuteTime": { 45 | "end_time": "2019-06-13T16:36:29.255811Z", 46 | "start_time": "2019-06-13T16:36:28.119160Z" 47 | }, 48 | "slideshow": { 49 | "slide_type": "fragment" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "from mxnet import nd\n", 55 | "from mxnet import autograd" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "slideshow": { 62 | "slide_type": "slide" 63 | } 64 | }, 65 | "source": [ 66 | "### Case Study: Autograd for $f(x) = 2 x^2$" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "slideshow": { 73 | "slide_type": "slide" 74 | } 75 | }, 76 | "source": [ 77 | "Let's start by assigning an initial value of $x$."
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 2, 83 | "metadata": { 84 | "ExecuteTime": { 85 | "end_time": "2019-06-13T16:36:29.262414Z", 86 | "start_time": "2019-06-13T16:36:29.258567Z" 87 | }, 88 | "attributes": { 89 | "classes": [], 90 | "id": "", 91 | "n": "3" 92 | }, 93 | "slideshow": { 94 | "slide_type": "fragment" 95 | } 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "x = nd.array([[1, 2], [3, 4]])" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "slideshow": { 106 | "slide_type": "skip" 107 | } 108 | }, 109 | "source": [ 110 | "In MXNet, we can tell an NDArray that we plan to calculate and store a gradient by invoking its `attach_grad` method." 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "slideshow": { 117 | "slide_type": "fragment" 118 | } 119 | }, 120 | "source": [ 121 | "#### Attach Gradient Storage\n", 122 | "\n", 123 | "Calculating gradients require extra computation, and we’ll need a place to store it." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 3, 129 | "metadata": { 130 | "ExecuteTime": { 131 | "end_time": "2019-06-13T16:36:29.267429Z", 132 | "start_time": "2019-06-13T16:36:29.264217Z" 133 | }, 134 | "attributes": { 135 | "classes": [], 136 | "id": "", 137 | "n": "6" 138 | }, 139 | "slideshow": { 140 | "slide_type": "fragment" 141 | } 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "x.attach_grad()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "slideshow": { 152 | "slide_type": "subslide" 153 | } 154 | }, 155 | "source": [ 156 | "#### Define and Record y = f(x)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": { 162 | "slideshow": { 163 | "slide_type": "skip" 164 | } 165 | }, 166 | "source": [ 167 | "Now we’re going to define the function $y=f(x)$. To let MXNet store $y$, so that we can compute gradients later, we need to put the definition inside a `autograd.record()` scope." 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 4, 173 | "metadata": { 174 | "ExecuteTime": { 175 | "end_time": "2019-06-13T16:36:29.274746Z", 176 | "start_time": "2019-06-13T16:36:29.269163Z" 177 | }, 178 | "attributes": { 179 | "classes": [], 180 | "id": "", 181 | "n": "7" 182 | }, 183 | "slideshow": { 184 | "slide_type": "fragment" 185 | } 186 | }, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "\n", 193 | "[[ 2. 8.]\n", 194 | " [18. 32.]]\n", 195 | "\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "with autograd.record():\n", 201 | " y = 2 * x**2\n", 202 | "print(y)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": { 208 | "slideshow": { 209 | "slide_type": "fragment" 210 | } 211 | }, 212 | "source": [ 213 | "#### Invoke Back Propagation" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": { 219 | "slideshow": { 220 | "slide_type": "skip" 221 | } 222 | }, 223 | "source": [ 224 | "Let’s invoke back propagation (backprop) by calling `y.backward()`." 
225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 5, 230 | "metadata": { 231 | "ExecuteTime": { 232 | "end_time": "2019-06-13T16:36:29.280657Z", 233 | "start_time": "2019-06-13T16:36:29.277604Z" 234 | }, 235 | "attributes": { 236 | "classes": [], 237 | "id": "", 238 | "n": "8" 239 | }, 240 | "slideshow": { 241 | "slide_type": "fragment" 242 | } 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "y.backward()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": { 252 | "slideshow": { 253 | "slide_type": "subslide" 254 | } 255 | }, 256 | "source": [ 257 | "#### Verify Computed Gradients\n", 258 | "\n", 259 | "Note that $y=2x^2$ and $\\frac{dy}{dx} = 4x$, which should be\n", 260 | "\n", 261 | "`[[4, 8],[12, 16]]`" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 6, 267 | "metadata": { 268 | "ExecuteTime": { 269 | "end_time": "2019-06-13T16:36:29.293743Z", 270 | "start_time": "2019-06-13T16:36:29.282783Z" 271 | }, 272 | "attributes": { 273 | "classes": [], 274 | "id": "", 275 | "n": "9" 276 | }, 277 | "slideshow": { 278 | "slide_type": "fragment" 279 | } 280 | }, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/plain": [ 285 | "\n", 286 | "[[ 4. 8.]\n", 287 | " [12. 16.]]\n", 288 | "" 289 | ] 290 | }, 291 | "execution_count": 6, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "x.grad" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": { 303 | "slideshow": { 304 | "slide_type": "slide" 305 | } 306 | }, 307 | "source": [ 308 | "## Using Python Control Flows" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": { 314 | "slideshow": { 315 | "slide_type": "notes" 316 | } 317 | }, 318 | "source": [ 319 | "Sometimes we want to write dynamic programs where the execution depends on some real-time values. MXNet will record the execution trace and compute the gradient as well." 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "slideshow": { 326 | "slide_type": "subslide" 327 | } 328 | }, 329 | "source": [ 330 | "### Cast Study: \n", 331 | "\n", 332 | "`f(a)`: it doubles `a` until `norm(a)` reaches 1000. Then it selects one element depending on the sum of its elements." 
333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 7, 338 | "metadata": { 339 | "ExecuteTime": { 340 | "end_time": "2019-06-13T16:36:29.301810Z", 341 | "start_time": "2019-06-13T16:36:29.296328Z" 342 | }, 343 | "slideshow": { 344 | "slide_type": "fragment" 345 | } 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "def f(a):\n", 350 | " # a is a vector of two elements\n", 351 | " b = a * 2\n", 352 | " while b.norm() < 1000:\n", 353 | " b = b * 2\n", 354 | " return b[0] if b.sum() >= 0 else b[1]" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "slideshow": { 361 | "slide_type": "subslide" 362 | } 363 | }, 364 | "source": [ 365 | "#### Feed in a Random Value and Record:" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 8, 371 | "metadata": { 372 | "ExecuteTime": { 373 | "end_time": "2019-06-13T16:36:29.316038Z", 374 | "start_time": "2019-06-13T16:36:29.303593Z" 375 | }, 376 | "slideshow": { 377 | "slide_type": "fragment" 378 | } 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "a = nd.random.uniform(shape=2)\n", 383 | "a.attach_grad()\n", 384 | "with autograd.record():\n", 385 | " c = f(a)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": { 391 | "slideshow": { 392 | "slide_type": "subslide" 393 | } 394 | }, 395 | "source": [ 396 | "#### Compute and Verify Gradients\n", 397 | "\n", 398 | "`b` is a linear function of `a`, and `c` is chosen from `b`. \n", 399 | "\n", 400 | "The gradient w.r.t. `a` be will be either `[c/a[0], 0]` or `[0, c/a[1]]`.\n", 401 | "\n", 402 | "```python\n", 403 | "def f(a):\n", 404 | " b = a * 2\n", 405 | " while b.norm() < 1000:\n", 406 | " b = b * 2\n", 407 | " return b[0] if b.sum() >= 0 else b[1]\n", 408 | "```" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 9, 414 | "metadata": { 415 | "ExecuteTime": { 416 | "end_time": "2019-06-13T16:36:29.323825Z", 417 | "start_time": "2019-06-13T16:36:29.317884Z" 418 | }, 419 | "slideshow": { 420 | "slide_type": "subslide" 421 | } 422 | }, 423 | "outputs": [ 424 | { 425 | "data": { 426 | "text/plain": [ 427 | "[\n", 428 | " [2048. 0.]\n", 429 | " , \n", 430 | " [2048. 
1895.8933]\n", 431 | " ]" 432 | ] 433 | }, 434 | "execution_count": 9, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "c.backward()\n", 441 | "[a.grad, c/a]" 442 | ] 443 | } 444 | ], 445 | "metadata": { 446 | "kernelspec": { 447 | "display_name": "conda_mxnet_p36", 448 | "language": "python", 449 | "name": "conda_mxnet_p36" 450 | }, 451 | "language_info": { 452 | "codemirror_mode": { 453 | "name": "ipython", 454 | "version": 3 455 | }, 456 | "file_extension": ".py", 457 | "mimetype": "text/x-python", 458 | "name": "python", 459 | "nbconvert_exporter": "python", 460 | "pygments_lexer": "ipython3", 461 | "version": "3.6.5" 462 | } 463 | }, 464 | "nbformat": 4, 465 | "nbformat_minor": 2 466 | } 467 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/convert_paddle_to_gluon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | from __future__ import unicode_literals 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | 7 | import collections 8 | import os 9 | import sys 10 | import numpy as np 11 | import argparse 12 | import logging 13 | import json 14 | import mxnet as mx 15 | import gluonnlp as nlp 16 | import paddle.fluid as fluid 17 | 18 | from gluonnlp.model import BERTEncoder, BERTModel 19 | from gluonnlp.model.bert import bert_hparams 20 | from utils import get_hash, tf_vocab_to_gluon_vocab, load_text_vocab 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("--gluon_bert_model_base", default='ernie_12_768_12', type=str, help=".") 24 | parser.add_argument("--init_pretraining_params", default='./ERNIE_stable-1.0.1/params', 25 | type=str, help=".") 26 | parser.add_argument("--ernie_config_path", default='./ERNIE_stable-1.0.1/ernie_config.json', 27 | type=str, help=".") 28 | parser.add_argument("--ernie_vocab_path", default='./ERNIE_stable-1.0.1/vocab.txt', 29 | type=str, help=".") 30 | parser.add_argument("--out_dir", default='./ernie_gluon_model2', type=str, help=".") 31 | parser.add_argument("--baidu_lark_repo_dir", default='../../../../LARK', type=str, 32 | help='path to the original baidu lark repository. ' 33 | 'The repo should be at f97e3c8581e36dc1979560d62f75df862acd9585.' 
34 | '(https://github.com/PaddlePaddle/LARK.git)') 35 | args = parser.parse_args() 36 | 37 | sys.path = [os.path.join(args.baidu_lark_repo_dir,'ERNIE')] + sys.path 38 | try: 39 | from model.ernie import ErnieConfig 40 | from finetune.classifier import create_model 41 | except: 42 | raise ImportError('Place clone ERNIE first') 43 | 44 | def if_exist(var): 45 | return os.path.exists(os.path.join(args.init_pretraining_params, var.name)) 46 | 47 | 48 | def build_weight_map(): 49 | weight_map = collections.OrderedDict({ 50 | 'word_embedding': 'word_embed.0.weight', 51 | 'pos_embedding': 'encoder.position_weight', 52 | 'sent_embedding': 'token_type_embed.0.weight', 53 | 'pre_encoder_layer_norm_scale': 'encoder.layer_norm.gamma', 54 | 'pre_encoder_layer_norm_bias': 'encoder.layer_norm.beta', 55 | }) 56 | 57 | def add_w_and_b(ernie_pre, gluon_pre): 58 | weight_map[ernie_pre + ".w_0"] = gluon_pre + ".weight" 59 | weight_map[ernie_pre + ".b_0"] = gluon_pre + ".bias" 60 | 61 | def add_one_encoder_layer(layer_number): 62 | # attention 63 | add_w_and_b("encoder_layer_{}_multi_head_att_query_fc".format(layer_number), 64 | "encoder.transformer_cells.{}.attention_cell.proj_query".format(layer_number)) 65 | add_w_and_b("encoder_layer_{}_multi_head_att_key_fc".format(layer_number), 66 | "encoder.transformer_cells.{}.attention_cell.proj_key".format(layer_number)) 67 | add_w_and_b("encoder_layer_{}_multi_head_att_value_fc".format(layer_number), 68 | "encoder.transformer_cells.{}.attention_cell.proj_value".format(layer_number)) 69 | add_w_and_b("encoder_layer_{}_multi_head_att_output_fc".format(layer_number), 70 | "encoder.transformer_cells.{}.proj".format(layer_number)) 71 | weight_map["encoder_layer_{}_post_att_layer_norm_bias".format(layer_number)] = \ 72 | "encoder.transformer_cells.{}.layer_norm.beta".format(layer_number) 73 | weight_map["encoder_layer_{}_post_att_layer_norm_scale".format(layer_number)] = \ 74 | "encoder.transformer_cells.{}.layer_norm.gamma".format(layer_number) 75 | # intermediate 76 | add_w_and_b("encoder_layer_{}_ffn_fc_0".format(layer_number), 77 | "encoder.transformer_cells.{}.ffn.ffn_1".format(layer_number)) 78 | # output 79 | add_w_and_b("encoder_layer_{}_ffn_fc_1".format(layer_number), 80 | "encoder.transformer_cells.{}.ffn.ffn_2".format(layer_number)) 81 | weight_map["encoder_layer_{}_post_ffn_layer_norm_bias".format(layer_number)] = \ 82 | "encoder.transformer_cells.{}.ffn.layer_norm.beta".format(layer_number) 83 | weight_map["encoder_layer_{}_post_ffn_layer_norm_scale".format(layer_number)] = \ 84 | "encoder.transformer_cells.{}.ffn.layer_norm.gamma".format(layer_number) 85 | 86 | for i in range(12): 87 | add_one_encoder_layer(i) 88 | add_w_and_b('pooled_fc', 'pooler') 89 | return weight_map 90 | 91 | 92 | def extract_weights(args): 93 | # add ERNIE to environment 94 | print('extract weights start'.center(60, '=')) 95 | startup_prog = fluid.Program() 96 | test_prog = fluid.Program() 97 | place = fluid.CPUPlace() 98 | exe = fluid.Executor(place) 99 | exe.run(startup_prog) 100 | args.max_seq_len = 512 101 | args.use_fp16 = False 102 | args.num_labels = 2 103 | args.loss_scaling = 1.0 104 | print('model config:') 105 | ernie_config = ErnieConfig(args.ernie_config_path) 106 | ernie_config.print_config() 107 | with fluid.program_guard(test_prog, startup_prog): 108 | with fluid.unique_name.guard(): 109 | _, _ = create_model( 110 | args, 111 | pyreader_name='train', 112 | ernie_config=ernie_config) 113 | fluid.io.load_vars(exe, args.init_pretraining_params, main_program=test_prog, 
predicate=if_exist) 114 | state_dict = collections.OrderedDict() 115 | weight_map = build_weight_map() 116 | for ernie_name, gluon_name in weight_map.items(): 117 | fluid_tensor = fluid.global_scope().find_var(ernie_name).get_tensor() 118 | fluid_array = np.array(fluid_tensor, dtype=np.float32) 119 | if 'w_0' in ernie_name: 120 | fluid_array = fluid_array.transpose() 121 | state_dict[gluon_name] = fluid_array 122 | print(f'{ernie_name} -> {gluon_name} {fluid_array.shape}') 123 | print('extract weights done!'.center(60, '=')) 124 | return state_dict 125 | 126 | 127 | def save_model(new_gluon_parameters, output_dir): 128 | print('save model start'.center(60, '=')) 129 | if not os.path.exists(output_dir): 130 | os.makedirs(output_dir) 131 | # save model 132 | # load vocab 133 | vocab_f = open(os.path.join(output_dir, "vocab.txt"), "wt", encoding='utf-8') 134 | with open(args.ernie_vocab_path, "rt", encoding='utf-8') as f: 135 | for line in f: 136 | data = line.strip().split("\t") 137 | vocab_f.writelines(data[0] + "\n") 138 | vocab_f.close() 139 | vocab = tf_vocab_to_gluon_vocab(load_text_vocab(os.path.join(output_dir, "vocab.txt"))) 140 | # vocab serialization 141 | tmp_file_path = os.path.expanduser(os.path.join(output_dir, 'tmp')) 142 | if not os.path.exists(os.path.join(args.out_dir)): 143 | os.makedirs(os.path.join(args.out_dir)) 144 | with open(tmp_file_path, 'w') as f: 145 | f.write(vocab.to_json()) 146 | hash_full, hash_short = get_hash(tmp_file_path) 147 | gluon_vocab_path = os.path.expanduser(os.path.join(output_dir, hash_short + '.vocab')) 148 | with open(gluon_vocab_path, 'w') as f: 149 | f.write(vocab.to_json()) 150 | logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full) 151 | 152 | # BERT config 153 | tf_config_names_to_gluon_config_names = { 154 | 'attention_probs_dropout_prob': 'embed_dropout', 155 | 'hidden_act': None, 156 | 'hidden_dropout_prob': 'dropout', 157 | 'hidden_size': 'units', 158 | 'initializer_range': None, 159 | # 'intermediate_size': 'hidden_size', 160 | 'max_position_embeddings': 'max_length', 161 | 'num_attention_heads': 'num_heads', 162 | 'num_hidden_layers': 'num_layers', 163 | 'type_vocab_size': 'token_type_vocab_size', 164 | 'vocab_size': None 165 | } 166 | predefined_args = bert_hparams[args.gluon_bert_model_base] 167 | with open(args.ernie_config_path, 'r') as f: 168 | tf_config = json.load(f) 169 | if 'layer_norm_eps' in tf_config: # ignore layer_norm_eps 170 | del tf_config['layer_norm_eps'] 171 | assert len(tf_config) == len(tf_config_names_to_gluon_config_names) 172 | for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items(): 173 | if tf_name is None or gluon_name is None: 174 | continue 175 | if gluon_name != 'max_length': 176 | assert tf_config[tf_name] == predefined_args[gluon_name] 177 | 178 | encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'], 179 | num_layers=predefined_args['num_layers'], units=predefined_args['units'], 180 | hidden_size=predefined_args['hidden_size'], 181 | max_length=predefined_args['max_length'], 182 | num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'], 183 | dropout=predefined_args['dropout'], 184 | use_residual=predefined_args['use_residual'], 185 | activation='relu') 186 | 187 | bert = BERTModel(encoder, len(vocab), 188 | token_type_vocab_size=predefined_args['token_type_vocab_size'], 189 | units=predefined_args['units'], embed_size=predefined_args['embed_size'], 190 | embed_dropout=predefined_args['embed_dropout'], 191 | 
word_embed=predefined_args['word_embed'], use_pooler=True, 192 | use_decoder=False, use_classifier=False) 193 | 194 | bert.initialize(init=mx.init.Normal(0.02)) 195 | 196 | ones = mx.nd.ones((2, 8)) 197 | out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]])) 198 | params = bert._collect_params_with_prefix() 199 | assert len(params) == len(new_gluon_parameters), "Gluon model does not match paddle model. " \ 200 | "Please fix the BERTModel hyperparameters" 201 | 202 | # post processings for parameters: 203 | # - handle tied decoder weight 204 | new_gluon_parameters['decoder.3.weight'] = new_gluon_parameters['word_embed.0.weight'] 205 | # set parameter data 206 | loaded_params = {} 207 | for name in params: 208 | if name == 'word_embed.0.weight': 209 | arr = mx.nd.array(new_gluon_parameters[name][:params[name].shape[0]]) 210 | else: 211 | arr = mx.nd.array(new_gluon_parameters[name]) 212 | try: 213 | assert arr.shape == params[name].shape 214 | except: 215 | print(name) 216 | params[name].set_data(arr) 217 | loaded_params[name] = True 218 | 219 | # post processings for parameters: 220 | # - handle tied decoder weight 221 | # - update word embedding for reserved tokens 222 | 223 | if len(params) != len(loaded_params): 224 | raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, ' 225 | 'but {} have been extracted from the paddle model. '.format( 226 | len(params), len(loaded_params))) 227 | 228 | # param serialization 229 | bert.save_parameters(tmp_file_path) 230 | hash_full, hash_short = get_hash(tmp_file_path) 231 | gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params')) 232 | logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full) 233 | bert.save_parameters(gluon_param_path) 234 | mx.nd.waitall() 235 | # save config 236 | print('finish save vocab') 237 | print('save model done!'.center(60, '=')) 238 | 239 | 240 | if __name__ == "__main__": 241 | state_dict = extract_weights(args) 242 | save_model(state_dict, args.out_dir) --------------------------------------------------------------------------------
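Editor's note: both conversion scripts above finish with the same load-and-verify pattern: run a dummy forward pass to trigger parameter creation, then copy each converted array into the matching Gluon parameter with a shape check. The helper below is a minimal sketch of that shared step; the function name and error messages are the editor's own, and it assumes `block` has already been initialized (e.g. via a dummy forward pass, as in the scripts).

```python
import mxnet as mx

def set_params_from_numpy(block, np_tensors):
    """Copy a dict of numpy arrays into an initialized Gluon block, name by name."""
    params = block._collect_params_with_prefix()
    for name, param in params.items():
        if name not in np_tensors:
            raise RuntimeError('no source tensor for parameter %s' % name)
        arr = mx.nd.array(np_tensors[name])
        if arr.shape != param.shape:
            # fail loudly on hyperparameter mismatches, as both scripts do
            raise RuntimeError('shape mismatch for %s: expected %s, but found %s'
                               % (name, param.shape, arr.shape))
        param.set_data(arr)
```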