├── sentiment_analysis
│   ├── samodel-v3.png
│   └── utils.py
├── machine_translation
│   ├── transformer.png
│   ├── nmt
│   │   ├── __init__.py
│   │   ├── _constants.py
│   │   ├── hyperparameters.py
│   │   ├── utils.py
│   │   ├── index.rst
│   │   ├── translation.py
│   │   └── dataset.py
│   ├── hyperparameters.py
│   ├── dataprocessor.py
│   └── utils.py
├── sequence_generation
│   ├── cache_model.png
│   ├── language_model_intro.png
│   └── text_generation
│       ├── __init__.py
│       ├── model
│       │   └── __init__.py
│       ├── index.rst
│       └── sequence_sampling.py
├── natural_language_understanding
│   ├── bert.png
│   ├── qa.png
│   ├── bert-embed.png
│   ├── bert-sentence-pair.png
│   ├── bert
│   │   ├── conversion_tools
│   │   │   ├── ernie_top_layer_emb.npy
│   │   │   ├── compare_gluon_ernie.py
│   │   │   ├── infer_pytorch_gluon_parameter_name_mapping.py
│   │   │   ├── convert_pytorch_model.py
│   │   │   ├── compare_tf_gluon_model.py
│   │   │   ├── convert_tf_model.py
│   │   │   └── convert_paddle_to_gluon.py
│   │   ├── __init__.py
│   │   ├── model
│   │   │   ├── __init__.py
│   │   │   ├── qa.py
│   │   │   ├── ner.py
│   │   │   └── classification.py
│   │   ├── export
│   │   │   ├── __init__.py
│   │   │   └── export.py
│   │   ├── data
│   │   │   ├── __init__.py
│   │   │   ├── embedding.py
│   │   │   ├── transform.py
│   │   │   └── baidu_ernie_data.py
│   │   ├── utils.py
│   │   ├── ner_utils.py
│   │   ├── predict_ner.py
│   │   ├── embedding.py
│   │   ├── finetune_ner.py
│   │   └── fp16_utils.py
│   └── qa_utils.py
├── README.md
├── intent_classification_and_slot_labelling
│   ├── explain_subword_tagging.png
│   └── README.md
├── gluon_basics
│   ├── mlp_utils.py
│   └── autograd.ipynb
├── env
│   └── nlp.yml
├── word_embedding
│   ├── utils.py
│   └── model.py
└── .gitignore

/sentiment_analysis/samodel-v3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/sentiment_analysis/samodel-v3.png
--------------------------------------------------------------------------------
/machine_translation/transformer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/machine_translation/transformer.png
--------------------------------------------------------------------------------
/sequence_generation/cache_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/sequence_generation/cache_model.png
--------------------------------------------------------------------------------
/natural_language_understanding/bert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/natural_language_understanding/bert.png
--------------------------------------------------------------------------------
/natural_language_understanding/qa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/natural_language_understanding/qa.png
--------------------------------------------------------------------------------
/sequence_generation/language_model_intro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/sequence_generation/language_model_intro.png
--------------------------------------------------------------------------------
/natural_language_understanding/bert-embed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/natural_language_understanding/bert-embed.png
--------------------------------------------------------------------------------
/natural_language_understanding/bert-sentence-pair.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/natural_language_understanding/bert-sentence-pair.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # nlp-notebooks
 2 | 
 3 | ## Environment Setup
 4 | 
 5 | - install conda
 6 | - conda env create -f nlp-notebooks/env/nlp.yml
 7 | - source activate nlp
 8 | - jupyter notebook
 9 | 
--------------------------------------------------------------------------------
/intent_classification_and_slot_labelling/explain_subword_tagging.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/intent_classification_and_slot_labelling/explain_subword_tagging.png
--------------------------------------------------------------------------------
/natural_language_understanding/bert/conversion_tools/ernie_top_layer_emb.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric-haibin-lin/nlp-notebooks/HEAD/natural_language_understanding/bert/conversion_tools/ernie_top_layer_emb.npy
--------------------------------------------------------------------------------
/gluon_basics/mlp_utils.py:
--------------------------------------------------------------------------------
 1 | def show_fashion_mnist(images, labels):
 2 |     import d2l
 3 |     d2l.use_svg_display()
 4 |     # Here _ means that we ignore (not use) variables.
 5 |     _, figs = d2l.plt.subplots(1, len(images), figsize=(12, 12))
 6 |     for f, img, lbl in zip(figs, images, labels):
 7 |         f.imshow(img.reshape((28, 28)).asnumpy())
 8 |         f.set_title(lbl)
 9 |         f.axes.get_xaxis().set_visible(False)
10 |         f.axes.get_yaxis().set_visible(False)
11 | 
--------------------------------------------------------------------------------
/env/nlp.yml:
--------------------------------------------------------------------------------
 1 | name: nlp
 2 | channels:
 3 |   - conda-forge
 4 | dependencies:
 5 |   - python=3.6
 6 |   - pip=18.1
 7 |   - spacy
 8 |   - nltk
 9 |   - ipython
10 |   - ipykernel
11 |   - jupyter=1.0.0
12 |   - matplotlib=2.2.2
13 |   - pandas=0.23.4
14 |   - regex
15 |   - pip:
16 |     - mxnet-cu100mkl>=1.5.0b20190630
17 |     - sacremoses
18 |     - sentencepiece<0.2
19 |     - seaborn
20 |     - jieba
21 |     - d2l==0.9.2
22 |     - environment_kernels
23 |     - jupyter_contrib_nbextensions
24 |     - jupyter_nbextensions_configurator
25 |     - gluonnlp
26 | 
--------------------------------------------------------------------------------
/natural_language_understanding/bert/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | # Licensed to the Apache Software Foundation (ASF) under one
 4 | # or more contributor license agreements. See the NOTICE file
 5 | # distributed with this work for additional information
 6 | # regarding copyright ownership. The ASF licenses this file
 7 | # to you under the Apache License, Version 2.0 (the
 8 | # "License"); you may not use this file except in compliance
 9 | # with the License.
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """BERT Module.""" 22 | from . import model, data 23 | -------------------------------------------------------------------------------- /sequence_generation/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """Text Generation Module.""" 22 | from . import model 23 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """BERT model.""" 22 | from . import classification, ner, qa 23 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/export/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. 
The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """Hybrid BERT for deployment.""" 22 | from . import hybrid_bert 23 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/data/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """BERT data.""" 22 | from . import qa, classification, embedding, transform, ner 23 | -------------------------------------------------------------------------------- /machine_translation/nmt/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """NMT example.""" 22 | from . import _constants, bleu, dataset, \ 23 | gnmt, translation, utils 24 | -------------------------------------------------------------------------------- /machine_translation/nmt/_constants.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. 
See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Constants used in the NMT examples.""" 20 | import os 21 | 22 | __all__ = ['CACHE_PATH'] 23 | 24 | CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached')) 25 | -------------------------------------------------------------------------------- /natural_language_understanding/qa_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import collections 3 | import mxnet as mx 4 | import gluonnlp as nlp 5 | import bert 6 | from mxnet.gluon.model_zoo import model_store 7 | 8 | def download_qa_ckpt(): 9 | model_store._model_sha1['bert_qa'] = '7eb11865ecac2a412457a7c8312d37a1456af7fc' 10 | result = model_store.get_model_file('bert_qa', root='./temp') 11 | print('Downloaded checkpoint to {}'.format(result)) 12 | return result 13 | 14 | def predict(dataset, all_results, vocab): 15 | tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=True) 16 | transform = bert.data.qa.SQuADTransform(tokenizer, is_pad=False, is_training=False, do_lookup=False) 17 | dev_dataset = dataset.transform(transform._transform) 18 | from bert.bert_qa_evaluate import PredResult, predict 19 | all_predictions = collections.OrderedDict() 20 | for features in dev_dataset: 21 | results = all_results[features[0].example_id] 22 | 23 | prediction, nbest = predict( 24 | features=features, 25 | results=results, 26 | tokenizer=nlp.data.BERTBasicTokenizer(lower=True)) 27 | 28 | print('\nContext: %s\n'%(' '.join(features[0].doc_tokens))) 29 | question = features[0].input_ids.index('[SEP]') 30 | print('Question: %s\n'%(' '.join((features[0].input_ids[1:question])))) 31 | print('Top predictions: ') 32 | for i in range(3): 33 | print('%.2f%% \t %s'%(nbest[i][1] * 100, nbest[i][0])) 34 | print('') 35 | -------------------------------------------------------------------------------- /machine_translation/hyperparameters.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. 
See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Hyperparameters for transformer.""" 20 | 21 | import nmt 22 | 23 | # parameters for dataset 24 | src_lang = 'en' 25 | tgt_lang = 'de' 26 | src_max_len = -1 27 | tgt_max_len = -1 28 | 29 | # parameters for model 30 | num_units = 512 31 | hidden_size = 2048 32 | dropout = 0.1 33 | epsilon = 0.1 34 | num_layers = 6 35 | num_heads = 8 36 | scaled = True 37 | 38 | # parameters for training 39 | optimizer = 'adam' 40 | epochs = 3 41 | batch_size = 2700 42 | test_batch_size = 256 43 | num_accumulated = 1 44 | lr = 2 45 | warmup_steps = 1 46 | save_dir = 'transformer_en_de_u512' 47 | average_start = 1 48 | num_buckets = 20 49 | log_interval = 10 50 | bleu = '13a' 51 | 52 | #parameters for testing 53 | beam_size = 4 54 | lp_alpha = 0.6 55 | lp_k = 5 -------------------------------------------------------------------------------- /machine_translation/nmt/hyperparameters.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Hyperparameters for transformer, for past reference only.""" 20 | 21 | # parameters for dataset 22 | src_lang = 'en' 23 | tgt_lang = 'de' 24 | src_max_len = -1 25 | tgt_max_len = -1 26 | 27 | # parameters for model 28 | num_units = 512 29 | hidden_size = 2048 30 | dropout = 0.1 31 | epsilon = 0.1 32 | num_layers = 6 33 | num_heads = 8 34 | scaled = True 35 | 36 | # parameters for training 37 | optimizer = 'adam' 38 | epochs = 3 39 | batch_size = 2700 40 | test_batch_size = 256 41 | num_accumulated = 1 42 | lr = 2 43 | warmup_steps = 1 44 | save_dir = 'transformer_en_de_u512' 45 | average_start = 1 46 | num_buckets = 20 47 | log_interval = 10 48 | bleu = '13a' 49 | 50 | #parameters for testing 51 | beam_size = 4 52 | lp_alpha = 0.6 53 | lp_k = 5 54 | -------------------------------------------------------------------------------- /word_embedding/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Word Embeddings Training Utilities 20 | ===================================== 21 | 22 | """ 23 | 24 | import logging 25 | import time 26 | from contextlib import contextmanager 27 | 28 | import mxnet as mx 29 | 30 | 31 | def get_context(args): 32 | if args.gpu is None or args.gpu == '': 33 | context = [mx.cpu()] 34 | elif isinstance(args.gpu, int): 35 | context = [mx.gpu(args.gpu)] 36 | else: 37 | context = [mx.gpu(int(i)) for i in args.gpu] 38 | return context 39 | 40 | 41 | @contextmanager 42 | def print_time(task): 43 | start_time = time.time() 44 | logging.info('Starting to %s', task) 45 | yield 46 | logging.info('Finished to {} in {:.2f} seconds'.format( 47 | task, 48 | time.time() - start_time)) 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/data/embedding.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and DMLC. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """BERT embedding datasets.""" 16 | from mxnet.gluon.data import Dataset 17 | 18 | __all__ = ['BertEmbeddingDataset'] 19 | 20 | class BertEmbeddingDataset(Dataset): 21 | """Dataset for BERT Embedding 22 | 23 | Parameters 24 | ---------- 25 | sentences : List[str]. 26 | Sentences for embeddings. 27 | transform : BERTDatasetTransform, default None. 28 | transformer for BERT input format 29 | """ 30 | 31 | def __init__(self, sentences, transform=None): 32 | """Dataset for BERT Embedding 33 | 34 | Parameters 35 | ---------- 36 | sentences : List[str]. 37 | Sentences for embeddings. 38 | transform : BERTDatasetTransform, default None. 39 | transformer for BERT input format 40 | """ 41 | self.sentences = sentences 42 | self.transform = transform 43 | 44 | def __getitem__(self, idx): 45 | sentence = (self.sentences[idx], 0) 46 | if self.transform: 47 | return self.transform(sentence) 48 | else: 49 | return sentence 50 | 51 | def __len__(self): 52 | return len(self.sentences) 53 | -------------------------------------------------------------------------------- /machine_translation/nmt/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Utility functions.""" 20 | 21 | import os 22 | import logging 23 | import inspect 24 | 25 | __all__ = ['logging_config'] 26 | 27 | 28 | def logging_config(folder=None, name=None, 29 | level=logging.DEBUG, 30 | console_level=logging.INFO, 31 | no_console=False): 32 | """ Config the logging. 33 | 34 | Parameters 35 | ---------- 36 | folder : str or None 37 | name : str or None 38 | level : int 39 | console_level 40 | no_console: bool 41 | Whether to disable the console log 42 | Returns 43 | ------- 44 | folder : str 45 | Folder that the logging file will be saved into. 
46 |     """
47 |     if name is None:
48 |         name = inspect.stack()[1][1].split('.')[0]
49 |     if folder is None:
50 |         folder = os.path.join(os.getcwd(), name)
51 |     if not os.path.exists(folder):
52 |         os.makedirs(folder)
53 |     # Remove all the current handlers
54 |     for handler in logging.root.handlers:
55 |         logging.root.removeHandler(handler)
56 |     logging.root.handlers = []
57 |     logpath = os.path.join(folder, name + '.log')
58 |     print('All Logs will be saved to {}'.format(logpath))
59 |     logging.root.setLevel(level)
60 |     formatter = logging.Formatter('%(asctime)s - %(name)s - %(message)s')
61 |     logfile = logging.FileHandler(logpath)
62 |     logfile.setLevel(level)
63 |     logfile.setFormatter(formatter)
64 |     logging.root.addHandler(logfile)
65 |     if not no_console:
66 |         # Initialize the console logging
67 |         logconsole = logging.StreamHandler()
68 |         logconsole.setLevel(console_level)
69 |         logconsole.setFormatter(formatter)
70 |         logging.root.addHandler(logconsole)
71 |     return folder
72 | 
--------------------------------------------------------------------------------
/machine_translation/nmt/index.rst:
--------------------------------------------------------------------------------
 1 | Machine Translation
 2 | -------------------
 3 | 
 4 | :download:`Download scripts `
 5 | 
 6 | Google Neural Machine Translation
 7 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 8 | 
 9 | Use the following command to train the GNMT model on the IWSLT2015 dataset.
10 | 
11 | .. code-block:: console
12 | 
13 |    $ MXNET_GPU_MEM_POOL_TYPE=Round python train_gnmt.py --src_lang en --tgt_lang vi --batch_size 128 \
14 |        --optimizer adam --lr 0.001 --lr_update_factor 0.5 --beam_size 10 --bucket_scheme exp \
15 |        --num_hidden 512 --save_dir gnmt_en_vi_l2_h512_beam10 --epochs 12 --gpu 0
16 | 
17 | It achieves a test BLEU score of 26.20.
18 | 
19 | Transformers
20 | ~~~~~~~~~~~~
21 | 
22 | Use the following commands to train the Transformer model on the WMT14 dataset for English to German translation.
23 | 
24 | .. code-block:: console
25 | 
26 |    $ MXNET_GPU_MEM_POOL_TYPE=Round python train_transformer.py --dataset WMT2014BPE \
27 |        --src_lang en --tgt_lang de --batch_size 2700 \
28 |        --optimizer adam --num_accumulated 16 --lr 2.0 --warmup_steps 4000 \
29 |        --save_dir transformer_en_de_u512 --epochs 30 --gpus 0,1,2,3,4,5,6,7 --scaled \
30 |        --average_start 5 --num_buckets 20 --bucket_scheme exp --bleu 13a --log_interval 10
31 | 
32 | It achieves an official mteval-v13a BLEU score of 27.09 on newstest2014 (http://statmt.org/wmt14/test-filtered.tgz).
33 | This result is obtained by using averaged SGD over the last 5 epochs. If we use international tokenization (i.e., ``--bleu intl``),
34 | we obtain a BLEU score of 27.89. If we use ``--bleu tweaked``, we obtain a test BLEU score of 28.96.
35 | The latter result is obtained on a tweaked reference, where the tokenized reference text is put in ATAT format for historical reasons
36 | and the following preprocessing pipeline is applied:
37 | 
38 | .. code-block:: console
39 | 
40 |     mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l de
41 |     mosesdecoder/scripts/tokenizer/remove-non-printing-char.perl
42 |     mosesdecoder/scripts/tokenizer/tokenizer.perl -q -no-escape -protected mosesdecoder/scripts/tokenizer/basic-protected-patterns -l de
43 | 
44 | If we turn on ``--full``, the testing is performed on newstest2014 (http://statmt.org/wmt14/test-full.tgz).
Then, we can 45 | obtain BLEU=27.05 with ``--bleu 13a``, BLEU=27.81 with ``--bleu intl``, and BLEU=28.80 with ``--bleu tweaked`` 46 | 47 | The pre-trained model can be downloaded from http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip. 48 | 49 | For the users from China, it might be faster with this link instead: https://apache-mxnet.s3.cn-north-1.amazonaws.com.cn/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip. 50 | -------------------------------------------------------------------------------- /sequence_generation/text_generation/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=wildcard-import 21 | """Text generation models.""" 22 | from gluonnlp.model import get_model as _get_model 23 | from .gpt import * 24 | 25 | def get_model(name, **kwargs): 26 | """Returns a pre-defined model by name. 27 | 28 | In addition to the models in GluonNLP model API, this API supports getting GPT-2 models. 29 | 30 | Parameters 31 | ---------- 32 | name : str 33 | Name of the model. 34 | dataset_name : str or None, default None 35 | The dataset name on which the pre-trained model is trained. 36 | For language model, options are 'wikitext-2'. 37 | For ELMo, Options are 'gbw' and '5bw'. 38 | 'gbw' represents 1 Billion Word Language Model Benchmark 39 | http://www.statmt.org/lm-benchmark/; 40 | '5bw' represents a dataset of 5.5B tokens consisting of 41 | Wikipedia (1.9B) and all of the monolingual news crawl data from WMT 2008-2012 (3.6B). 42 | If specified, then the returned vocabulary is extracted from 43 | the training set of the dataset. 44 | If None, then vocab is required, for specifying embedding weight size, and is directly 45 | returned. 46 | vocab : gluonnlp.Vocab or None, default None 47 | Vocabulary object to be used with the language model. 48 | Required when dataset_name is not specified. 49 | None Vocabulary object is required with the ELMo model. 50 | pretrained : bool, default False 51 | Whether to load the pre-trained weights for model. 52 | ctx : Context, default CPU 53 | The context in which to load the pre-trained weights. 54 | root : str, default '$MXNET_HOME/models' with MXNET_HOME defaults to '~/.mxnet' 55 | Location for keeping the model parameters. 
56 | 57 | Returns 58 | ------- 59 | gluon.Block, gluonnlp.Vocab, (optional) gluonnlp.Vocab 60 | """ 61 | models = {'gpt2_117m' : gpt2_117m, 62 | 'gpt2_345m' : gpt2_345m} 63 | name = name.lower() 64 | if name not in models: 65 | return _get_model(name, **kwargs) 66 | return models[name](**kwargs) 67 | -------------------------------------------------------------------------------- /sentiment_analysis/utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import sys 3 | import collections 4 | import os 5 | import sys 6 | import numpy as np 7 | import math 8 | from matplotlib import pyplot as plt 9 | from mxnet import nd, autograd, gluon, init, context, image 10 | from mxnet.gluon import nn, rnn 11 | import random 12 | import re 13 | import time 14 | import tarfile 15 | import zipfile 16 | 17 | import mxnet as mx 18 | import gluonnlp as nlp 19 | 20 | import d2l 21 | 22 | 23 | 24 | def load_data_imdb(batch_size, num_steps=500): 25 | d2l.download_imdb() 26 | train_data, test_data = d2l.read_imdb('train'), d2l.read_imdb('test') 27 | train_tokens = d2l.tokenize(train_data[0], token='word') 28 | test_tokens = d2l.tokenize(test_data[0], token='word') 29 | vocab = nlp.Vocab(nlp.data.count_tokens(itertools.chain.from_iterable(train_tokens)), min_freq=5) 30 | train_features = mx.nd.array([d2l.trim_pad(vocab[line], num_steps, vocab[vocab.unknown_token]) 31 | for line in train_tokens]) 32 | test_features = mx.nd.array([d2l.trim_pad(vocab[line], num_steps, vocab[vocab.unknown_token]) 33 | for line in test_tokens]) 34 | train_iter = d2l.load_array((train_features, train_data[1]), batch_size) 35 | test_iter = d2l.load_array((test_features, test_data[1]), batch_size, 36 | is_train=False) 37 | return train_iter, test_iter, vocab 38 | 39 | 40 | # from d2l import train_ch12 as train 41 | def train_batch_ch12(net, features, labels, loss, trainer, ctx_list): 42 | Xs, ys = d2l.split_batch(features, labels, ctx_list) 43 | with autograd.record(): 44 | pys = [net(X) for X in Xs] 45 | ls = [loss(py, y) for py, y in zip(pys, ys)] 46 | for l in ls: 47 | l.backward() 48 | trainer.step(features.shape[0]) 49 | train_loss_sum = sum([l.sum().asscalar() for l in ls]) 50 | train_acc_sum = sum(d2l.accuracy(py, y) for py, y in zip(pys, ys)) 51 | return train_loss_sum, train_acc_sum 52 | 53 | def train(net, train_iter, test_iter, loss, trainer, num_epochs, 54 | ctx_list=d2l.try_all_gpus()): 55 | num_batches, timer = len(train_iter), d2l.Timer() 56 | for epoch in range(num_epochs): 57 | # store training_loss, training_accuracy, num_examples, num_features 58 | metric = [0.0] * 4 59 | for i, (features, labels) in enumerate(train_iter): 60 | timer.start() 61 | l, acc = d2l.train_batch_ch12( 62 | net, features, labels, loss, trainer, ctx_list) 63 | metric = [a+b for a, b in zip(metric, (l, acc, labels.shape[0], labels.size))] 64 | timer.stop() 65 | if (i+1) % (num_batches // 5) == 0: 66 | print(epoch+i/num_batches, 67 | (metric[0]/metric[2], metric[1]/metric[3], None)) 68 | test_acc = d2l.evaluate_accuracy_gpus(net, test_iter) 69 | print('loss %.3f, train acc %.3f, test acc %.3f' % ( 70 | metric[0]/metric[2], metric[1]/metric[3], test_acc)) 71 | print('%.1f exampes/sec on %s' % ( 72 | metric[2]*num_epochs/timer.sum(), ctx_list)) 73 | 74 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # 
Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Utility functions for BERT.""" 20 | 21 | import logging 22 | import collections 23 | import hashlib 24 | import io 25 | 26 | import mxnet as mx 27 | import gluonnlp as nlp 28 | 29 | __all__ = ['tf_vocab_to_gluon_vocab', 'load_text_vocab'] 30 | 31 | 32 | def tf_vocab_to_gluon_vocab(tf_vocab): 33 | special_tokens = ['[UNK]', '[PAD]', '[SEP]', '[MASK]', '[CLS]'] 34 | assert all(t in tf_vocab for t in special_tokens) 35 | counter = nlp.data.count_tokens(tf_vocab.keys()) 36 | vocab = nlp.vocab.BERTVocab(counter, token_to_idx=tf_vocab) 37 | return vocab 38 | 39 | 40 | def get_hash(filename): 41 | sha1 = hashlib.sha1() 42 | with open(filename, 'rb') as f: 43 | while True: 44 | data = f.read(1048576) 45 | if not data: 46 | break 47 | sha1.update(data) 48 | return sha1.hexdigest(), str(sha1.hexdigest())[:8] 49 | 50 | 51 | def read_tf_checkpoint(path): 52 | """read tensorflow checkpoint""" 53 | from tensorflow.python import pywrap_tensorflow 54 | tensors = {} 55 | reader = pywrap_tensorflow.NewCheckpointReader(path) 56 | var_to_shape_map = reader.get_variable_to_shape_map() 57 | for key in sorted(var_to_shape_map): 58 | tensor = reader.get_tensor(key) 59 | tensors[key] = tensor 60 | return tensors 61 | 62 | def profile(curr_step, start_step, end_step, profile_name='profile.json', 63 | early_exit=True): 64 | """profile the program between [start_step, end_step).""" 65 | if curr_step == start_step: 66 | mx.nd.waitall() 67 | mx.profiler.set_config(profile_memory=False, profile_symbolic=True, 68 | profile_imperative=True, filename=profile_name, 69 | aggregate_stats=True) 70 | mx.profiler.set_state('run') 71 | elif curr_step == end_step: 72 | mx.nd.waitall() 73 | mx.profiler.set_state('stop') 74 | logging.info(mx.profiler.dumps()) 75 | mx.profiler.dump() 76 | if early_exit: 77 | exit() 78 | 79 | def load_text_vocab(vocab_file): 80 | """Loads a vocabulary file into a dictionary.""" 81 | vocab = collections.OrderedDict() 82 | index = 0 83 | with io.open(vocab_file, 'r') as reader: 84 | while True: 85 | token = reader.readline() 86 | if not token: 87 | break 88 | token = token.strip() 89 | vocab[token] = index 90 | index += 1 91 | return vocab 92 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/compare_gluon_ernie.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | import gluonnlp as nlp 6 | import argparse 7 | import os 8 | import mxnet as mx 9 | import json 10 | 11 | parser = 
argparse.ArgumentParser(description='inference compare script for ernie model in gluon', 12 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 13 | parser.add_argument('--input_file', type=str, default='input_cn.txt', 14 | help='sample input file for testing') 15 | parser.add_argument('--cased', action='store_true', 16 | help='if not set, inputs are converted to lower case') 17 | parser.add_argument('--gluon_dataset', type=str, default='baidu_ernie_uncased', 18 | help='gluon dataset name') 19 | parser.add_argument('--gluon_model', type=str, default='ernie_12_768_12', 20 | help='gluon model name') 21 | parser.add_argument('--gluon_parameter_file', type=str, default=None, 22 | help='gluon parameter file name.') 23 | parser.add_argument('--gluon_vocab_file', type=str, default=None, 24 | help='gluon vocab file corresponding to --gluon_parameter_file.') 25 | 26 | args = parser.parse_args() 27 | 28 | input_file = os.path.expanduser(args.input_file) 29 | do_lower_case = not args.cased 30 | max_length = 11 31 | if not args.gluon_dataset: 32 | with open(args.gluon_vocab_file) as f: 33 | vocab_str = json.load(f) 34 | vocab = nlp.vocab.BERTVocab.from_json(json.dumps(vocab_str)) 35 | else: 36 | vocab = None 37 | bert, vocabulary = nlp.model.get_model(args.gluon_model, 38 | dataset_name=args.gluon_dataset, 39 | vocab=vocab, 40 | pretrained=not args.gluon_parameter_file, 41 | use_pooler=False, 42 | use_decoder=False, 43 | use_classifier=False) 44 | if args.gluon_parameter_file: 45 | try: 46 | bert.cast('float16') 47 | bert.load_parameters(args.gluon_parameter_file, ignore_extra=True) 48 | bert.cast('float32') 49 | except AssertionError: 50 | bert.cast('float32') 51 | bert.load_parameters(args.gluon_parameter_file, ignore_extra=True) 52 | 53 | print(bert) 54 | tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=do_lower_case) 55 | dataset = nlp.data.TSVDataset(input_file, field_separator=nlp.data.Splitter('|||')) 56 | 57 | trans = nlp.data.BERTSentenceTransform(tokenizer, max_length) 58 | dataset = dataset.transform(trans) 59 | 60 | bert_dataloader = mx.gluon.data.DataLoader(dataset, batch_size=1, 61 | shuffle=True, last_batch='rollover') 62 | 63 | # verify the output of the first sample 64 | for i, seq in enumerate(bert_dataloader): 65 | input_ids, valid_length, type_ids = seq 66 | out = bert(input_ids, type_ids, 67 | valid_length.astype('float32')) 68 | length = valid_length.asscalar() 69 | gluon_np = out.asnumpy().squeeze(0) 70 | print(out) 71 | import numpy as np 72 | paddle_np = np.load(os.path.expanduser( 73 | 'ernie_top_layer_emb.npy')) 74 | np.testing.assert_array_almost_equal(paddle_np, gluon_np, decimal=6) 75 | break 76 | print("verify success") -------------------------------------------------------------------------------- /machine_translation/nmt/translation.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Machine translation models and translators.""" 20 | 21 | 22 | __all__ = ['BeamSearchTranslator'] 23 | 24 | import numpy as np 25 | import mxnet as mx 26 | from gluonnlp.model import BeamSearchScorer, BeamSearchSampler 27 | 28 | class BeamSearchTranslator(object): 29 | """Beam Search Translator 30 | 31 | Parameters 32 | ---------- 33 | model : NMTModel 34 | The neural machine translation model 35 | beam_size : int 36 | Size of the beam 37 | scorer : BeamSearchScorer 38 | Score function used in beamsearch 39 | max_length : int 40 | The maximum decoding length 41 | """ 42 | def __init__(self, model, beam_size=1, scorer=BeamSearchScorer(), max_length=100): 43 | self._model = model 44 | self._sampler = BeamSearchSampler( 45 | decoder=self._decode_logprob, 46 | beam_size=beam_size, 47 | eos_id=model.tgt_vocab.token_to_idx[model.tgt_vocab.eos_token], 48 | scorer=scorer, 49 | max_length=max_length) 50 | 51 | def _decode_logprob(self, step_input, states): 52 | out, states, _ = self._model.decode_step(step_input, states) 53 | return mx.nd.log_softmax(out), states 54 | 55 | def translate(self, src_seq, src_valid_length): 56 | """Get the translation result given the input sentence. 57 | 58 | Parameters 59 | ---------- 60 | src_seq : mx.nd.NDArray 61 | Shape (batch_size, length) 62 | src_valid_length : mx.nd.NDArray 63 | Shape (batch_size,) 64 | 65 | Returns 66 | ------- 67 | samples : NDArray 68 | Samples draw by beam search. Shape (batch_size, beam_size, length). dtype is int32. 69 | scores : NDArray 70 | Scores of the samples. Shape (batch_size, beam_size). We make sure that scores[i, :] are 71 | in descending order. 72 | valid_length : NDArray 73 | The valid length of the samples. Shape (batch_size, beam_size). dtype will be int32. 74 | """ 75 | batch_size = src_seq.shape[0] 76 | encoder_outputs, _ = self._model.encode(src_seq, valid_length=src_valid_length) 77 | decoder_states = self._model.decoder.init_state_from_encoder(encoder_outputs, 78 | src_valid_length) 79 | inputs = mx.nd.full(shape=(batch_size,), ctx=src_seq.context, dtype=np.float32, 80 | val=self._model.tgt_vocab.token_to_idx[self._model.tgt_vocab.bos_token]) 81 | samples, scores, sample_valid_length = self._sampler(inputs, decoder_states) 82 | return samples, scores, sample_valid_length 83 | -------------------------------------------------------------------------------- /intent_classification_and_slot_labelling/README.md: -------------------------------------------------------------------------------- 1 | # Joint Intent Classification and Slot Labeling with GluonNLP 2 | 3 | 4 | ## Introduction 5 | Intent classification and slot labeling are two essential problems in Natural Language Understanding (NLU). In _intent classification_, the agent needs to detect the intention that the speaker's utterance conveys. For example, when the speaker says "Book a flight from Long Beach to Seattle", the intention is to book a flight ticket. In _slot labeling_, the agent needs to extract the semantic entities that are related to the intent. 
In our previous example, "Long Beach" and "Seattle" are two semantic constituents related to the flight, i.e., the origin and the destination.
 6 | 
 7 | Essentially, _intent classification_ can be viewed as a sequence classification problem and _slot labeling_ can be viewed as a sequence tagging problem similar to Named-entity Recognition (NER). Due to their inner correlation, these two tasks are usually trained jointly with a multi-task objective function.
 8 | 
 9 | Here's one example from the ATIS dataset; it uses the [IOB2 format](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)).
10 | 
11 | | Sentence  | Tags                     | Intent Label |
12 | | --------- | ------------------------ | ------------ |
13 | | are       | O                        | atis_flight  |
14 | | there     | O                        |              |
15 | | any       | O                        |              |
16 | | flights   | O                        |              |
17 | | from      | O                        |              |
18 | | long      | B-fromloc.city_name      |              |
19 | | beach     | I-fromloc.city_name      |              |
20 | | to        | O                        |              |
21 | | columbus  | B-toloc.city_name        |              |
22 | | on        | O                        |              |
23 | | wednesday | B-depart_date.day_name   |              |
24 | | april     | B-depart_date.month_name |              |
25 | | sixteen   | B-depart_date.day_number |              |
26 | 
27 | 
28 | 
29 | In this example, we demonstrate how to use GluonNLP to build a model to perform joint intent classification and slot labeling. We choose to finetune a pretrained BERT model. We use two datasets, [ATIS](https://github.com/yvchen/JointSLU) and [SNIPS](https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines).
30 | 
31 | ## Requirements
32 | 
33 | ```
34 | mxnet
35 | gluonnlp
36 | seqeval
37 | ```
38 | 
39 | You may use pip or other tools to install these packages.
40 | 
41 | ## Experiment
42 | For the ATIS dataset, use the following command to run the experiment:
43 | ```bash
44 | python finetune_icsl.py --gpu 0 --dataset atis
45 | ```
46 | 
47 | It produces a final slot labeling F1 of `95.83%` and an intent classification accuracy of `98.66%`.
48 | 
49 | For the SNIPS dataset, use the following command to run the experiment:
50 | ```bash
51 | python finetune_icsl.py --gpu 0 --dataset snips
52 | ```
53 | It produces a final slot labeling F1 of `96.06%` and an intent classification accuracy of `98.71%`.
54 | 
55 | We also train the models with three random seeds and report the mean and standard deviation.
56 | 
57 | For ATIS:
58 | 
59 | | Models | Intent Acc (%) | Slot F1 (%) |
60 | | ------ | ------------------------ | ----------- |
61 | | [Intent Gating & self-attention, EMNLP 2018](https://www.aclweb.org/anthology/D18-1417) | 98.77 | 96.52 |
62 | | [BLSTM-CRF + ELMo, AAAI 2019](https://arxiv.org/abs/1811.05370) | 97.42 | 95.62 |
63 | | [Joint BERT, Arxiv 2019](https://arxiv.org/pdf/1902.10909.pdf) | 97.5 | 96.1 |
64 | | Ours | 98.66±0.00 | 95.88±0.04 |
65 | 
66 | For SNIPS:
67 | 
68 | | Models | Intent Acc (%) | Slot F1 (%) |
69 | | ------ | ------------------------ | ----------- |
70 | | [BLSTM-CRF + ELMo, AAAI 2019](https://arxiv.org/abs/1811.05370) | 99.29 | 93.90 |
71 | | [Joint BERT, Arxiv 2019](https://arxiv.org/pdf/1902.10909.pdf) | 98.60 | 97.00 |
72 | | Ours | 98.81±0.13 | 95.94±0.10 |
73 | 
--------------------------------------------------------------------------------
/machine_translation/nmt/dataset.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | # Licensed to the Apache Software Foundation (ASF) under one
 4 | # or more contributor license agreements. See the NOTICE file
 5 | # distributed with this work for additional information
 6 | # regarding copyright ownership.
The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # pylint:disable=redefined-outer-name,logging-format-interpolation 20 | """Translation datasets.""" 21 | 22 | 23 | __all__ = ['TOY'] 24 | 25 | import os 26 | from gluonnlp.base import get_home_dir 27 | from gluonnlp.data.translation import _TranslationDataset, _get_pair_key 28 | from gluonnlp.data.registry import register 29 | 30 | 31 | @register(segment=['train', 'val', 'test']) 32 | class TOY(_TranslationDataset): 33 | """A Small Translation Dataset for Testing Scripts. 34 | 35 | Parameters 36 | ---------- 37 | segment : str or list of str, default 'train' 38 | Dataset segment. Options are 'train', 'val', 'test' or their combinations. 39 | src_lang : str, default 'en' 40 | The source language. Option for source and target languages are 'en' <-> 'de' 41 | tgt_lang : str, default 'de' 42 | The target language. Option for source and target languages are 'en' <-> 'de' 43 | root : str, default '$MXNET_HOME/datasets/translation_test' 44 | Path to temp folder for storing data. 45 | MXNET_HOME defaults to '~/.mxnet'. 46 | """ 47 | def __init__(self, segment='train', src_lang='en', tgt_lang='de', 48 | root=os.path.join(get_home_dir(), 'datasets', 'translation_test')): 49 | self._supported_segments = ['train', 'val', 'test'] 50 | self._archive_file = {_get_pair_key('en', 'de'): 51 | ('translation_test.zip', 52 | '14f6c8e31ac6ec84ce469b4c196d60b4c86a179d')} 53 | self._data_file = {_get_pair_key('en', 'de'): 54 | {'train_en': ('train.en', 55 | 'aa7f22b91eb93390fd342a57a81f51f53ed29542'), 56 | 'train_de': ('train.de', 57 | 'f914217ce23ddd8cac07e761a75685c043d4f6d3'), 58 | 'val_en': ('train.en', 59 | 'aa7f22b91eb93390fd342a57a81f51f53ed29542'), 60 | 'val_de': ('train.de', 61 | 'f914217ce23ddd8cac07e761a75685c043d4f6d3'), 62 | 'test_en': ('train.en', 63 | 'aa7f22b91eb93390fd342a57a81f51f53ed29542'), 64 | 'test_de': ('train.de', 65 | 'f914217ce23ddd8cac07e761a75685c043d4f6d3'), 66 | 'vocab_en': ('vocab.en.json', 67 | 'c7c6af4603ea70f0a4af2460a622333fbd014050'), 68 | 'vocab_de' : ('vocab.de.json', 69 | '5b6f1be36a3e3cb9946b86e5d0fc73d164fda99f')}} 70 | super(TOY, self).__init__('translation_test', segment=segment, src_lang=src_lang, 71 | tgt_lang=tgt_lang, root=root) 72 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/ner_utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Common utilities for the named entity recognition task.""" 20 | 21 | import argparse 22 | import pickle 23 | from collections import namedtuple 24 | 25 | import mxnet as mx 26 | import gluonnlp as nlp 27 | 28 | __all__ = ['get_bert_model', 'get_bert_dataset_name', 'get_context', 29 | 'dump_metadata'] 30 | 31 | BERTModelMetadata = namedtuple('BERTModelMetadata', ['config', 'tag_vocab']) 32 | 33 | def _metadata_file_path(checkpoint_prefix): 34 | """Gets the file path for meta data""" 35 | return checkpoint_prefix + '_metadata.pkl' 36 | 37 | 38 | def dump_metadata(config, tag_vocab): 39 | """Dumps meta-data to the configured path""" 40 | metadata = BERTModelMetadata(config=config, tag_vocab=tag_vocab) 41 | with open(_metadata_file_path(config.save_checkpoint_prefix), 'wb') as ofp: 42 | pickle.dump(metadata, ofp) 43 | 44 | 45 | def load_metadata(checkpoint_prefix): 46 | """Loads meta-data to the configured path""" 47 | with open(_metadata_file_path(checkpoint_prefix), 'rb') as ifp: 48 | metadata = pickle.load(ifp) 49 | return metadata.config, metadata.tag_vocab 50 | 51 | 52 | def get_context(gpu_index): 53 | """This method gets context of execution""" 54 | context = None 55 | if gpu_index is None or gpu_index == '': 56 | context = mx.cpu() 57 | if isinstance(gpu_index, int): 58 | context = mx.gpu(gpu_index) 59 | return context 60 | 61 | 62 | def str2bool(v): 63 | """Utility function for parsing boolean in argparse 64 | 65 | https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse 66 | 67 | :param v: value of the argument 68 | :return: 69 | """ 70 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 71 | return True 72 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 73 | return False 74 | else: 75 | raise argparse.ArgumentTypeError('Boolean value expected.') 76 | 77 | 78 | def get_bert_dataset_name(is_cased): 79 | """Returns relevant BERT dataset name, depending on whether we are using a cased model. 80 | 81 | Parameters 82 | ---------- 83 | is_cased: bool 84 | Whether we are using a cased model. 85 | 86 | Returns 87 | ------- 88 | str: Named of the BERT dataset. 89 | 90 | """ 91 | if is_cased: 92 | return 'book_corpus_wiki_en_cased' 93 | else: 94 | return 'book_corpus_wiki_en_uncased' 95 | 96 | 97 | def get_bert_model(bert_model, cased, ctx, dropout_prob): 98 | """Get pre-trained BERT model.""" 99 | bert_dataset_name = get_bert_dataset_name(cased) 100 | 101 | return nlp.model.get_model( 102 | name=bert_model, 103 | dataset_name=bert_dataset_name, 104 | pretrained=True, 105 | ctx=ctx, 106 | use_pooler=False, 107 | use_decoder=False, 108 | use_classifier=False, 109 | dropout=dropout_prob, 110 | embed_dropout=dropout_prob) 111 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/model/qa.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. 
See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """BertForQA models.""" 20 | 21 | __all__ = ['BertForQA', 'BertForQALoss'] 22 | 23 | from mxnet.gluon import Block, loss, nn 24 | from mxnet.gluon.loss import Loss 25 | 26 | 27 | class BertForQA(Block): 28 | """Model for SQuAD task with BERT. 29 | 30 | The model feeds token ids and token type ids into BERT to get the 31 | pooled BERT sequence representation, then apply a Dense layer for QA task. 32 | 33 | Parameters 34 | ---------- 35 | bert: BERTModel 36 | Bidirectional encoder with transformer. 37 | prefix : str or None 38 | See document of `mx.gluon.Block`. 39 | params : ParameterDict or None 40 | See document of `mx.gluon.Block`. 41 | """ 42 | 43 | def __init__(self, bert, prefix=None, params=None): 44 | super(BertForQA, self).__init__(prefix=prefix, params=params) 45 | self.bert = bert 46 | with self.name_scope(): 47 | self.span_classifier = nn.Dense(units=2, flatten=False) 48 | 49 | def forward(self, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ 50 | """Generate the unnormalized score for the given the input sequences. 51 | 52 | Parameters 53 | ---------- 54 | inputs : NDArray, shape (batch_size, seq_length) 55 | Input words for the sequences. 56 | token_types : NDArray, shape (batch_size, seq_length) 57 | Token types for the sequences, used to indicate whether the word belongs to the 58 | first sentence or the second one. 59 | valid_length : NDArray or None, shape (batch_size,) 60 | Valid length of the sequence. This is used to mask the padded tokens. 61 | 62 | Returns 63 | ------- 64 | outputs : NDArray 65 | Shape (batch_size, seq_length, 2) 66 | """ 67 | bert_output = self.bert(inputs, token_types, valid_length) 68 | output = self.span_classifier(bert_output) 69 | return output 70 | 71 | 72 | class BertForQALoss(Loss): 73 | """Loss for SQuAD task with BERT. 74 | 75 | """ 76 | 77 | def __init__(self, weight=None, batch_axis=0, **kwargs): # pylint: disable=unused-argument 78 | super(BertForQALoss, self).__init__( 79 | weight=None, batch_axis=0, **kwargs) 80 | self.loss = loss.SoftmaxCELoss() 81 | 82 | def hybrid_forward(self, F, pred, label): # pylint: disable=arguments-differ 83 | """ 84 | Parameters 85 | ---------- 86 | pred : NDArray, shape (batch_size, seq_length, 2) 87 | BERTSquad forward output. 88 | label : list, length is 2, each shape is (batch_size,1) 89 | label[0] is the starting position of the answer, 90 | label[1] is the ending position of the answer. 
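For example, if an answer span in the batch starts at token position 5 and ends at token position 8, the corresponding entries of label[0] and label[1] are 5 and 8.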
91 | 92 | Returns 93 | ------- 94 | outputs : NDArray 95 | Shape (batch_size,) 96 | """ 97 | pred = F.split(pred, axis=2, num_outputs=2) 98 | start_pred = pred[0].reshape((0, -3)) 99 | start_label = label[0] 100 | end_pred = pred[1].reshape((0, -3)) 101 | end_label = label[1] 102 | return (self.loss(start_pred, start_label) + self.loss( 103 | end_pred, end_label)) / 2 104 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/model/ner.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Gluon model block for the named entity recognition task.""" 20 | 21 | import mxnet as mx 22 | from mxnet.gluon import Block, nn 23 | 24 | 25 | class BERTTagger(Block): 26 | """Model for sequence tagging with BERT 27 | 28 | Parameters 29 | ---------- 30 | bert_model: BERTModel 31 | Bidirectional encoder with transformer. 32 | num_tag_types: int 33 | number of possible tags 34 | dropout_prob: float 35 | dropout probability for the last layer 36 | prefix: str or None 37 | See document of `mx.gluon.Block`. 38 | params: ParameterDict or None 39 | See document of `mx.gluon.Block`. 40 | """ 41 | 42 | def __init__(self, bert_model, num_tag_types, dropout_prob, prefix=None, params=None): 43 | super(BERTTagger, self).__init__(prefix=prefix, params=params) 44 | self.bert_model = bert_model 45 | with self.name_scope(): 46 | self.tag_classifier = nn.Dense(units=num_tag_types, flatten=False) 47 | self.dropout = nn.Dropout(rate=dropout_prob) 48 | 49 | def forward(self, token_ids, token_types, valid_length): # pylint: disable=arguments-differ 50 | """Generate an unnormalized score for the tag of each token 51 | 52 | Parameters 53 | ---------- 54 | token_ids: NDArray, shape (batch_size, seq_length) 55 | ID of tokens in sentences 56 | See `input` of `glounnlp.model.BERTModel` 57 | token_types: NDArray, shape (batch_size, seq_length) 58 | See `glounnlp.model.BERTModel` 59 | valid_length: NDArray, shape (batch_size,) 60 | See `glounnlp.model.BERTModel` 61 | 62 | Returns 63 | ------- 64 | NDArray, shape (batch_size, seq_length, num_tag_types): 65 | Unnormalized prediction scores for each tag on each position. 66 | """ 67 | bert_output = self.dropout(self.bert_model(token_ids, token_types, valid_length)) 68 | output = self.tag_classifier(bert_output) 69 | return output 70 | 71 | 72 | def attach_prediction(data_loader, net, ctx, is_train): 73 | """Attach the prediction from a model to a data loader as the last field. 74 | 75 | Parameters 76 | ---------- 77 | data_loader: mx.gluon.data.DataLoader 78 | Input data from `bert_model.BERTTaggingDataset._encode_as_input`. 
79 | net: mx.gluon.Block 80 | gluon `Block` for making the prediction. 81 | ctx: 82 | The context that the data should be loaded to. 83 | is_train: 84 | Whether the forward pass should be made with `mx.autograd.record()`. 85 | 86 | Returns 87 | ------- 88 | All fields from `bert_model.BERTTaggingDataset._encode_as_input`, 89 | as well as the prediction of the model. 90 | 91 | """ 92 | for data in data_loader: 93 | text_ids, token_types, valid_length, tag_ids, flag_nonnull_tag = \ 94 | [x.astype('float32').as_in_context(ctx) for x in data] 95 | 96 | from contextlib import ExitStack 97 | with ExitStack() as stack: 98 | if is_train: 99 | stack.enter_context(mx.autograd.record()) 100 | out = net(text_ids, token_types, valid_length) 101 | yield text_ids, token_types, valid_length, tag_ids, flag_nonnull_tag, out 102 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # 'License'); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # pylint:disable=redefined-outer-name,logging-format-interpolation 20 | """PyTorch BERT parameter naming to Gluon BERT parameter naming. 21 | 22 | Given a Gluon BERT model (e.g. obtained with the convert_tf_gluon.py script) and 23 | a pytorch_model.bin containing the same parameters, this script infers the 24 | naming convention of PyTorch.
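A typical invocation (the checkpoint directory below is a placeholder) is:

    python infer_pytorch_gluon_parameter_name_mapping.py \
        --model bert_12_768_12 \
        --dataset_name scibert_scivocab_uncased \
        --pytorch_checkpoint_dir /path/to/pytorch_checkpoint \
        --out gluon_to_pytorch_naming.json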
25 | 26 | """ 27 | 28 | import argparse 29 | import json 30 | import logging 31 | import os 32 | import sys 33 | 34 | import gluonnlp as nlp 35 | import torch 36 | 37 | sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) 38 | from utils import load_text_vocab, tf_vocab_to_gluon_vocab 39 | 40 | parser = argparse.ArgumentParser(description='Pytorch BERT Naming Convention', 41 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 42 | parser.add_argument('--model', type=str, default='bert_12_768_12', 43 | choices=['bert_12_768_12', 'bert_24_1024_16'], help='BERT model name') 44 | parser.add_argument('--dataset_name', type=str, default='scibert_scivocab_uncased', 45 | help='Dataset name') 46 | parser.add_argument('--pytorch_checkpoint_dir', type=str, 47 | help='Path to Tensorflow checkpoint folder.') 48 | parser.add_argument('--debug', action='store_true', help='debugging mode') 49 | parser.add_argument('--out', default='gluon_to_pytorch_naming.json', 50 | help='Output file to store gluon to pytorch name mapping.') 51 | args = parser.parse_args() 52 | logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO) 53 | logging.info(args) 54 | 55 | # Load Gluon Model 56 | bert, vocab = nlp.model.get_model(args.model, dataset_name=args.dataset_name, pretrained=True) 57 | parameters = bert._collect_params_with_prefix() 58 | parameters = {k: v.data().asnumpy() for k, v in parameters.items()} 59 | 60 | # Load PyTorch Model 61 | pytorch_parameters = torch.load(os.path.join(args.pytorch_checkpoint_dir, 'pytorch_model.bin'), 62 | map_location=lambda storage, loc: storage) 63 | pytorch_vocab = tf_vocab_to_gluon_vocab( 64 | load_text_vocab(os.path.join(args.pytorch_checkpoint_dir, 'vocab.txt'))) 65 | pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()} 66 | 67 | # Assert that vocabularies are equal 68 | assert pytorch_vocab.idx_to_token == vocab.idx_to_token 69 | 70 | mapping = dict() 71 | 72 | for name, param in parameters.items(): 73 | found_match = False 74 | for pytorch_name, pytorch_param in pytorch_parameters.items(): 75 | if param.shape == pytorch_param.shape: 76 | if (param == pytorch_param).all(): 77 | if found_match: 78 | print('Found multiple matches for {}. ' 79 | 'Ignoring new match {}'.format(name, pytorch_name)) 80 | else: 81 | found_match = True 82 | mapping.update({name: pytorch_name}) 83 | 84 | # We don't break here, in case there are mulitple matches 85 | 86 | if not found_match: 87 | raise RuntimeError('Pytorch and Gluon model do not match. ' 88 | 'Cannot infer mapping of names.') 89 | 90 | assert len(mapping) == len(parameters) 91 | 92 | with open(args.out, 'w') as f: 93 | json.dump(mapping, f, indent=" ") 94 | print('Wrote mapping to {}'.format(args.out)) 95 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/model/classification.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """BERT models.""" 20 | 21 | __all__ = ['BERTClassifier', 'BERTRegression'] 22 | 23 | from mxnet.gluon import Block 24 | from mxnet.gluon import nn 25 | 26 | class BERTRegression(Block): 27 | """Model for sentence (pair) regression task with BERT. 28 | 29 | The model feeds token ids and token type ids into BERT to get the 30 | pooled BERT sequence representation, then apply a Dense layer for 31 | regression. 32 | 33 | Parameters 34 | ---------- 35 | bert: BERTModel 36 | Bidirectional encoder with transformer. 37 | dropout : float or None, default 0.0. 38 | Dropout probability for the bert output. 39 | prefix : str or None 40 | See document of `mx.gluon.Block`. 41 | params : ParameterDict or None 42 | See document of `mx.gluon.Block`. 43 | """ 44 | 45 | def __init__(self, bert, dropout=0.0, prefix=None, params=None): 46 | super(BERTRegression, self).__init__(prefix=prefix, params=params) 47 | self.bert = bert 48 | with self.name_scope(): 49 | self.regression = nn.HybridSequential(prefix=prefix) 50 | if dropout: 51 | self.regression.add(nn.Dropout(rate=dropout)) 52 | self.regression.add(nn.Dense(1)) 53 | 54 | def forward(self, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ 55 | """Generate the unnormalized score for the given the input sequences. 56 | 57 | Parameters 58 | ---------- 59 | inputs : NDArray, shape (batch_size, seq_length) 60 | Input words for the sequences. 61 | token_types : NDArray, shape (batch_size, seq_length) 62 | Token types for the sequences, used to indicate whether the word belongs to the 63 | first sentence or the second one. 64 | valid_length : NDArray or None, shape (batch_size) 65 | Valid length of the sequence. This is used to mask the padded tokens. 66 | 67 | Returns 68 | ------- 69 | outputs : NDArray 70 | Shape (batch_size, num_classes) 71 | """ 72 | _, pooler_out = self.bert(inputs, token_types, valid_length) 73 | return self.regression(pooler_out) 74 | 75 | 76 | class BERTClassifier(Block): 77 | """Model for sentence (pair) classification task with BERT. 78 | 79 | The model feeds token ids and token type ids into BERT to get the 80 | pooled BERT sequence representation, then apply a Dense layer for 81 | classification. 82 | 83 | Parameters 84 | ---------- 85 | bert: BERTModel 86 | Bidirectional encoder with transformer. 87 | num_classes : int, default is 2 88 | The number of target classes. 89 | dropout : float or None, default 0.0. 90 | Dropout probability for the bert output. 91 | prefix : str or None 92 | See document of `mx.gluon.Block`. 93 | params : ParameterDict or None 94 | See document of `mx.gluon.Block`. 
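A minimal usage sketch (the model and dataset names below are only examples;
any pre-trained BERT obtained from `gluonnlp.model.get_model` with its pooler
enabled works):

    import gluonnlp as nlp
    bert, _ = nlp.model.get_model('bert_12_768_12',
                                  dataset_name='book_corpus_wiki_en_uncased',
                                  pretrained=True, use_decoder=False,
                                  use_classifier=False)
    model = BERTClassifier(bert, num_classes=2, dropout=0.1)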
95 | """ 96 | 97 | def __init__(self, 98 | bert, 99 | num_classes=2, 100 | dropout=0.0, 101 | prefix=None, 102 | params=None): 103 | super(BERTClassifier, self).__init__(prefix=prefix, params=params) 104 | self.bert = bert 105 | with self.name_scope(): 106 | self.classifier = nn.HybridSequential(prefix=prefix) 107 | if dropout: 108 | self.classifier.add(nn.Dropout(rate=dropout)) 109 | self.classifier.add(nn.Dense(units=num_classes)) 110 | 111 | def forward(self, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ 112 | """Generate the unnormalized score for the given the input sequences. 113 | 114 | Parameters 115 | ---------- 116 | inputs : NDArray, shape (batch_size, seq_length) 117 | Input words for the sequences. 118 | token_types : NDArray, shape (batch_size, seq_length) 119 | Token types for the sequences, used to indicate whether the word belongs to the 120 | first sentence or the second one. 121 | valid_length : NDArray or None, shape (batch_size) 122 | Valid length of the sequence. This is used to mask the padded tokens. 123 | 124 | Returns 125 | ------- 126 | outputs : NDArray 127 | Shape (batch_size, num_classes) 128 | """ 129 | _, pooler_out = self.bert(inputs, token_types, valid_length) 130 | return self.classifier(pooler_out) 131 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/data/transform.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and DMLC. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """BERT dataset transform.""" 16 | 17 | from __future__ import absolute_import 18 | 19 | __all__ = ['BERTDatasetTransform'] 20 | 21 | import numpy as np 22 | from gluonnlp.data import BERTSentenceTransform 23 | 24 | class BERTDatasetTransform(object): 25 | """Dataset transformation for BERT-style sentence classification or regression. 26 | 27 | Parameters 28 | ---------- 29 | tokenizer : BERTTokenizer. 30 | Tokenizer for the sentences. 31 | max_seq_length : int. 32 | Maximum sequence length of the sentences. 33 | labels : list of int , float or None. defaults None 34 | List of all label ids for the classification task and regressing task. 35 | If labels is None, the default task is regression 36 | pad : bool, default True 37 | Whether to pad the sentences to maximum length. 38 | pair : bool, default True 39 | Whether to transform sentences or sentence pairs. 
40 | label_dtype: int32 or float32, default float32 41 | label_dtype = int32 for classification task 42 | label_dtype = float32 for regression task 43 | """ 44 | 45 | def __init__(self, 46 | tokenizer, 47 | max_seq_length, 48 | class_labels=None, 49 | label_alias=None, 50 | pad=True, 51 | pair=True, 52 | has_label=True): 53 | self.class_labels = class_labels 54 | self.has_label = has_label 55 | self._label_dtype = 'int32' if class_labels else 'float32' 56 | if has_label and class_labels: 57 | self._label_map = {} 58 | for (i, label) in enumerate(class_labels): 59 | self._label_map[label] = i 60 | if label_alias: 61 | for key in label_alias: 62 | self._label_map[key] = self._label_map[label_alias[key]] 63 | self._bert_xform = BERTSentenceTransform( 64 | tokenizer, max_seq_length, pad=pad, pair=pair) 65 | 66 | def __call__(self, line): 67 | """Perform transformation for sequence pairs or single sequences. 68 | 69 | The transformation is processed in the following steps: 70 | - tokenize the input sequences 71 | - insert [CLS], [SEP] as necessary 72 | - generate type ids to indicate whether a token belongs to the first 73 | sequence or the second sequence. 74 | - generate valid length 75 | 76 | For sequence pairs, the input is a tuple of 3 strings: 77 | text_a, text_b and label. 78 | 79 | Inputs: 80 | text_a: 'is this jacksonville ?' 81 | text_b: 'no it is not' 82 | label: '0' 83 | Tokenization: 84 | text_a: 'is this jack ##son ##ville ?' 85 | text_b: 'no it is not .' 86 | Processed: 87 | tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]' 88 | type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 89 | valid_length: 14 90 | label: 0 91 | 92 | For single sequences, the input is a tuple of 2 strings: text_a and label. 93 | Inputs: 94 | text_a: 'the dog is hairy .' 95 | label: '1' 96 | Tokenization: 97 | text_a: 'the dog is hairy .' 98 | Processed: 99 | text_a: '[CLS] the dog is hairy . [SEP]' 100 | type_ids: 0 0 0 0 0 0 0 101 | valid_length: 7 102 | label: 1 103 | 104 | Parameters 105 | ---------- 106 | line: tuple of str 107 | Input strings. For sequence pairs, the input is a tuple of 3 strings: 108 | (text_a, text_b, label). For single sequences, the input is a tuple 109 | of 2 strings: (text_a, label). 110 | 111 | Returns 112 | ------- 113 | np.array: input token ids in 'int32', shape (batch_size, seq_length) 114 | np.array: valid length in 'int32', shape (batch_size,) 115 | np.array: input token type ids in 'int32', shape (batch_size, seq_length) 116 | np.array: classification task: label id in 'int32', shape (batch_size, 1), 117 | regression task: label in 'float32', shape (batch_size, 1) 118 | """ 119 | if self.has_label: 120 | input_ids, valid_length, segment_ids = self._bert_xform(line[:-1]) 121 | label = line[-1] 122 | # map to int if class labels are available 123 | if self.class_labels: 124 | label = self._label_map[label] 125 | label = np.array([label], dtype=self._label_dtype) 126 | return input_ids, valid_length, segment_ids, label 127 | else: 128 | return self._bert_xform(line) 129 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/predict_ner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Licensed to the Apache Software Foundation (ASF) under one 5 | # or more contributor license agreements. See the NOTICE file 6 | # distributed with this work for additional information 7 | # regarding copyright ownership. 
The ASF licenses this file 8 | # to you under the Apache License, Version 2.0 (the 9 | # "License"); you may not use this file except in compliance 10 | # with the License. You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, 15 | # software distributed under the License is distributed on an 16 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17 | # KIND, either express or implied. See the License for the 18 | # specific language governing permissions and limitations 19 | # under the License. 20 | """Script for NER prediction.""" 21 | 22 | import argparse 23 | import logging 24 | import os 25 | 26 | import mxnet as mx 27 | from ner_utils import get_bert_model, get_context 28 | from ner_utils import load_metadata 29 | from data.ner import BERTTaggingDataset, convert_arrays_to_text 30 | from model.ner import BERTTagger 31 | 32 | # TODO(bikestra): Currently, our evaluation is dependent on this package. 33 | # Figure out whether to take actual dependency on it. 34 | try: 35 | import seqeval.metrics 36 | except ImportError: 37 | raise ImportError('seqeval is required to run NER on BERT. Please ' 38 | 'install it via pip3 install seqeval --user') 39 | 40 | 41 | def _find_model_file_from_checkpoint(checkpoint_prefix: str): 42 | """Load model checkpoint""" 43 | dirname, file_prefix = os.path.split(checkpoint_prefix) 44 | # find checkpoint file names and sort by name to find the most recent one. 45 | checkpoint_filenames = ([f for f in os.listdir(dirname) 46 | if f.startswith(file_prefix) 47 | and f.endswith(os.path.extsep + 'params')]) 48 | last_checkpoint_filename = max(checkpoint_filenames) 49 | logging.info('found checkpoint filename: {:s}'.format(last_checkpoint_filename)) 50 | last_checkpoint_path = os.path.join(dirname, last_checkpoint_filename) 51 | return last_checkpoint_path 52 | 53 | 54 | def parse_args(): 55 | """Parse command line arguments.""" 56 | arg_parser = argparse.ArgumentParser( 57 | description='Predict on CoNLL format data using BERT-based named entity recognition model', 58 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 59 | 60 | # data file paths 61 | arg_parser.add_argument('--test-path', type=str, required=True, 62 | help='Path to the test data file') 63 | arg_parser.add_argument('--seq-len', type=int, default=200, 64 | help='The length of the sequence input to BERT.' 65 | ' An exception will raised if this is not large enough.') 66 | arg_parser.add_argument('--load-checkpoint-prefix', type=str, required=False, default=None, 67 | help='Prefix of model checkpoint file') 68 | 69 | arg_parser.add_argument('--gpu', type=int, 70 | help='Number (index) of GPU to run on, e.g. 0. 
' 71 | 'If not specified, CPU context is used.') 72 | arg_parser.add_argument('--batch-size', type=int, default=32, help='Batch size for training') 73 | args = arg_parser.parse_args() 74 | return args 75 | 76 | 77 | def main(config): 78 | """Main method for predicting BERT-based NER model on CoNLL-formatted test data.""" 79 | train_config, tag_vocab = load_metadata(config.load_checkpoint_prefix) 80 | 81 | ctx = get_context(config.gpu) 82 | bert_model, text_vocab = get_bert_model(train_config.bert_model, train_config.cased, ctx, 83 | train_config.dropout_prob) 84 | 85 | dataset = BERTTaggingDataset(text_vocab, None, None, config.test_path, 86 | config.seq_len, train_config.cased, tag_vocab=tag_vocab) 87 | 88 | test_data_loader = dataset.get_test_data_loader(config.batch_size) 89 | 90 | net = BERTTagger(bert_model, dataset.num_tag_types, train_config.dropout_prob) 91 | model_filename = _find_model_file_from_checkpoint(config.load_checkpoint_prefix) 92 | net.load_parameters(model_filename, ctx=ctx) 93 | 94 | net.hybridize(static_alloc=True) 95 | 96 | loss_function = mx.gluon.loss.SoftmaxCrossEntropyLoss() 97 | loss_function.hybridize(static_alloc=True) 98 | 99 | # TODO(bikestra): make it not redundant between train and predict 100 | def evaluate(data_loader): 101 | """Eval function""" 102 | predictions = [] 103 | 104 | for batch_id, data in enumerate(data_loader): 105 | logging.info('evaluating on batch index: %d/%d', batch_id, len(data_loader)) 106 | text_ids, token_types, valid_length, tag_ids, _ = \ 107 | [x.astype('float32').as_in_context(ctx) for x in data] 108 | out = net(text_ids, token_types, valid_length) 109 | 110 | # convert results to numpy arrays for easier access 111 | np_text_ids = text_ids.astype('int32').asnumpy() 112 | np_pred_tags = out.argmax(axis=-1).asnumpy() 113 | np_valid_length = valid_length.astype('int32').asnumpy() 114 | np_true_tags = tag_ids.asnumpy() 115 | 116 | predictions += convert_arrays_to_text(text_vocab, dataset.tag_vocab, np_text_ids, 117 | np_true_tags, np_pred_tags, np_valid_length) 118 | 119 | all_true_tags = [[entry.true_tag for entry in entries] for entries in predictions] 120 | all_pred_tags = [[entry.pred_tag for entry in entries] for entries in predictions] 121 | seqeval_f1 = seqeval.metrics.f1_score(all_true_tags, all_pred_tags) 122 | return seqeval_f1 123 | 124 | test_f1 = evaluate(test_data_loader) 125 | logging.info('test f1: {:.3f}'.format(test_f1)) 126 | 127 | 128 | if __name__ == '__main__': 129 | logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', 130 | level=logging.DEBUG, datefmt='%Y-%m-%d %I:%M:%S') 131 | logging.getLogger().setLevel(logging.INFO) 132 | main(parse_args()) 133 | -------------------------------------------------------------------------------- /sequence_generation/text_generation/index.rst: -------------------------------------------------------------------------------- 1 | Text Generation 2 | --------------- 3 | 4 | :download:`[Download] ` 5 | 6 | Sampling a Language Model 7 | +++++++++++++++++++++++++ 8 | 9 | This script can be used to generate sentences using beam search or a sequence sampler, to sample from a pre-trained language model such as GPT-2. For example: 10 | 11 | .. code-block:: console 12 | 13 | $ python sequence_sampling.py random-sample \ 14 | --bos 'Deep learning and natural language processing' \ 15 | --beam-size 1 --print-num 1 \ 16 | --lm-model gpt2_345m # options are {gpt2_117m, gpt2_345m} \ 17 | --max-length 1024 18 | 19 | Output is 20 | 21 | .. 
code-block:: console 22 | 23 | Sampling Parameters: beam_size=1, temperature=1.0, use_top_k=None 24 | Generation Result: 25 | ['Deep learning and natural language processing brought application choice in healthcare and perception of sounds and heat to new heights, enriching our physical communities with medical devices and creating vibrant cultures. Anecdote is slowly diminishing but is hardly obsolete nor more appealing than experience.Despite those last words of wisdom, most headset makers even spook us with the complexity and poor code quality. the hard set a mere $150 and beginner creates center for getting started. Temp cheap:\nPosted by Fleegu at 12:02 PM<|endoftext|>', -461.15128] 26 | 27 | Sequence Sampler 28 | ~~~~~~~~~~~~~~~~ 29 | 30 | Use the following command to decode to sample from the multinomial distribution. 31 | 32 | .. code-block:: console 33 | 34 | $ python sequence_sampling.py random-sample --bos 'I love it' --beam-size 5 --print-num 5 35 | 36 | Output is 37 | 38 | .. code-block:: console 39 | 40 | Sampling Parameters: beam_size=5, temperature=1.0, use_top_k=None 41 | Generation Result: 42 | ['I love it in reference to the northwestern country. replay Liberties were raised from the late 1943 to June ', -89.459656] 43 | ['I love it to them. Very account suggests that there is no basis as to whether the constellations are ', -72.687996] 44 | ['I love it for quick attempts. It does not have any factors, and [the cause] has ', -64.87619] 45 | ['I love it one in the English language, and say it was not for English the same standard than ', -71.51008] 46 | ['I love it to take care of her; following many attempts to appease the Canadian military and making some ', -75.5512] 47 | 48 | You can also try a lower temperature such as 0.95, which results in sharper distribution. 49 | 50 | .. code-block:: console 51 | 52 | $ python sequence_sampling.py random-sample --bos 'I love it' --beam-size 5 --print-num 5 --temperature 0.95 53 | 54 | Output is 55 | 56 | .. code-block:: console 57 | 58 | Sampling Parameters: beam_size=5, temperature=0.95, use_top_k=None 59 | Generation Result: 60 | ['I love it and flew by (a colleague Due to his delicate and non-serious attacks ', -85.825195] 61 | ['I love it in a short anticipated 1927 hiatus. As a result, it was able to withstand changes ', -71.8867] 62 | ['I love it for analysis. ', -15.78739] 63 | ['I love it his own. The total of one hundred lives of all other documented in the Congo ', -68.57835] 64 | ['I love it in his Why My Woman to Get Out of Graham Your Way. ', -65.74211] 65 | 66 | Finally, you can also try to constrain the sampling to sample only from the top-k tokens. 67 | 68 | .. code-block:: console 69 | 70 | $ python sequence_sampling.py random-sample --bos 'I love it' --beam-size 5 --print-num 5 --temperature 0.95 --use-top-k 800 71 | 72 | Output is 73 | 74 | .. code-block:: console 75 | 76 | Sampling Parameters: beam_size=5, temperature=0.95, use_top_k=800 77 | Generation Result: 78 | ['I love it. It is the same as the Old Age. The best known of this is the ', -30.544556] 79 | ['I love it and had a weak start by a group of only three-year-old fans. ', -44.970097] 80 | ['I love it ". ', -4.725212] 81 | ['I love it with the . ', -7.236909] 82 | ['I love it and its working-based ". ', -25.340023] 83 | 84 | Beam Search Generator 85 | ~~~~~~~~~~~~~~~~~~~~~ 86 | 87 | Use the following command to decode using beam search. 88 | 89 | .. 
code-block:: console 90 | 91 | $ python sequence_sampling.py beam-search --bos 'I love it' --beam-size 5 --print-num 5 92 | 93 | Output is 94 | 95 | .. code-block:: console 96 | 97 | Beam Seach Parameters: beam_size=5, alpha=0.0, K=5 98 | Generation Result: 99 | ['I love it. ', -2.6606221] 100 | ['I love it. "', -4.072001] 101 | ['I love it, and the of the . ', -14.573] 102 | ['I love it, and the of the . The of the , the , ', -28.968985] 103 | ['I love it, and the of the . The of the , the and ', -30.064144] 104 | 105 | You can also try a larger beam size, such as 15. 106 | 107 | .. code-block:: console 108 | 109 | $ python sequence_sampling.py beam-search --bos 'I love it' --beam-size 15 --print-num 15 110 | 111 | Output is 112 | 113 | .. code-block:: console 114 | 115 | Beam Seach Parameters: beam_size=15, alpha=0.0, K=5 116 | Generation Result: 117 | ['I love it. ', -2.6606221] 118 | ['I love it. "', -4.072001] 119 | ['I love it ". ', -5.222643] 120 | ['I love it, and the of the . ', -14.573] 121 | ['I love it. It was the first time in the history of the history of the history of the ', -21.041868] 122 | ['I love it. It was the first time in the history of the history of the country. ', -21.262276] 123 | ['I love it. It was the first time in the history of the history of the United States. ', -21.826159] 124 | ['I love it. It was the first time in the history of the history of the world. ', -21.930265] 125 | ['I love it. It was the first time in the history of the history of the country. The ', -21.94392] 126 | ['I love it. It was the first time in the history of the history of the city. ', -22.00894] 127 | ['I love it. It was the first time in the history of the history of the country that the ', -22.152416] 128 | ['I love it. It was the first time in the history of the history of the United States, ', -22.170143] 129 | ['I love it. It was the first time in the history of the history of the country, and ', -22.188667] 130 | ['I love it. It was the first time in the history of the history of the United States that ', -22.254015] 131 | ['I love it. It was the first time in the history of the history of the state. ', -22.398975] 132 | -------------------------------------------------------------------------------- /machine_translation/dataprocessor.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | """Data preprocessing for transformer.""" 20 | 21 | import os 22 | import io 23 | import time 24 | import numpy as np 25 | import mxnet as mx 26 | from mxnet import gluon 27 | import gluonnlp as nlp 28 | import nmt 29 | import hyperparameters as hparams 30 | 31 | def cache_dataset(dataset, prefix): 32 | """Cache the processed dataset into an npz file. 33 | 34 | Parameters 35 | ---------- 36 | dataset : SimpleDataset 37 | prefix : str 38 | """ 39 | if not os.path.exists(nmt._constants.CACHE_PATH): 40 | os.makedirs(nmt._constants.CACHE_PATH) 41 | src_data = np.concatenate([e[0] for e in dataset]) 42 | tgt_data = np.concatenate([e[1] for e in dataset]) 43 | src_cumlen = np.cumsum([0]+[len(e[0]) for e in dataset]) 44 | tgt_cumlen = np.cumsum([0]+[len(e[1]) for e in dataset]) 45 | np.savez(os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz'), 46 | src_data=src_data, tgt_data=tgt_data, 47 | src_cumlen=src_cumlen, tgt_cumlen=tgt_cumlen) 48 | 49 | 50 | def load_cached_dataset(prefix): 51 | cached_file_path = os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz') 52 | if os.path.exists(cached_file_path): 53 | print('Loading dataset...') 54 | npz_data = np.load(cached_file_path) 55 | src_data, tgt_data, src_cumlen, tgt_cumlen = [npz_data[n] for n in 56 | ['src_data', 'tgt_data', 'src_cumlen', 'tgt_cumlen']] 57 | src_data = np.array([src_data[low:high] for low, high in zip(src_cumlen[:-1], src_cumlen[1:])]) 58 | tgt_data = np.array([tgt_data[low:high] for low, high in zip(tgt_cumlen[:-1], tgt_cumlen[1:])]) 59 | return gluon.data.ArrayDataset(np.array(src_data), np.array(tgt_data)) 60 | else: 61 | return None 62 | 63 | 64 | class TrainValDataTransform(object): 65 | """Transform the machine translation dataset. 66 | 67 | Clip the source and target sentences to the maximum length. For the source sentence, append the 68 | EOS. For the target sentence, append BOS and EOS.
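For example, assuming the vocabularies use the default '<bos>' and '<eos>' special tokens, TrainValDataTransform(src_vocab, tgt_vocab, 50, 50)('hello world .', 'hallo welt .') returns two int32 index arrays corresponding to 'hello world . <eos>' and '<bos> hallo welt . <eos>'.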
69 | 70 | Parameters 71 | ---------- 72 | src_vocab : Vocab 73 | tgt_vocab : Vocab 74 | src_max_len : int 75 | tgt_max_len : int 76 | """ 77 | 78 | def __init__(self, src_vocab, tgt_vocab, src_max_len=None, tgt_max_len=None): 79 | self._src_vocab = src_vocab 80 | self._tgt_vocab = tgt_vocab 81 | self._src_max_len = src_max_len 82 | self._tgt_max_len = tgt_max_len 83 | 84 | def __call__(self, src, tgt): 85 | if self._src_max_len: 86 | src_sentence = self._src_vocab[src.split()[:self._src_max_len]] 87 | else: 88 | src_sentence = self._src_vocab[src.split()] 89 | if self._tgt_max_len: 90 | tgt_sentence = self._tgt_vocab[tgt.split()[:self._tgt_max_len]] 91 | else: 92 | tgt_sentence = self._tgt_vocab[tgt.split()] 93 | src_sentence.append(self._src_vocab[self._src_vocab.eos_token]) 94 | tgt_sentence.insert(0, self._tgt_vocab[self._tgt_vocab.bos_token]) 95 | tgt_sentence.append(self._tgt_vocab[self._tgt_vocab.eos_token]) 96 | src_npy = np.array(src_sentence, dtype=np.int32) 97 | tgt_npy = np.array(tgt_sentence, dtype=np.int32) 98 | return src_npy, tgt_npy 99 | 100 | 101 | def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1): 102 | start = time.time() 103 | dataset_processed = dataset.transform(TrainValDataTransform(src_vocab, tgt_vocab, 104 | src_max_len, 105 | tgt_max_len), lazy=False) 106 | end = time.time() 107 | print('Processing Time spent: {}'.format(end - start)) 108 | return dataset_processed 109 | 110 | 111 | def load_translation_data(dataset, src_lang='en', tgt_lang='de'): 112 | """Load translation dataset 113 | 114 | Parameters 115 | ---------- 116 | dataset : str 117 | src_lang : str, default 'en' 118 | tgt_lang : str, default 'de' 119 | 120 | Returns 121 | ------- 122 | 123 | """ 124 | if dataset == 'WMT2014BPE': 125 | common_prefix = 'WMT2014BPE_{}_{}_{}_{}'.format(src_lang, tgt_lang, 126 | hparams.src_max_len, hparams.tgt_max_len) 127 | data_train = nlp.data.WMT2014BPE('train', src_lang=src_lang, tgt_lang=tgt_lang) 128 | data_val = nlp.data.WMT2014BPE('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang) 129 | data_test = nlp.data.WMT2014BPE('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang, 130 | full=False) 131 | elif dataset == 'TOY': 132 | common_prefix = 'TOY_{}_{}_{}_{}'.format(src_lang, tgt_lang, 133 | hparams.src_max_len, hparams.tgt_max_len) 134 | data_train = nmt.dataset.TOY('train', src_lang=src_lang, tgt_lang=tgt_lang) 135 | data_val = nmt.dataset.TOY('val', src_lang=src_lang, tgt_lang=tgt_lang) 136 | data_test = nmt.dataset.TOY('test', src_lang=src_lang, tgt_lang=tgt_lang) 137 | else: 138 | raise NotImplementedError 139 | src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab 140 | data_train_processed = load_cached_dataset(common_prefix + '_train') 141 | if not data_train_processed: 142 | data_train_processed = process_dataset(data_train, src_vocab, tgt_vocab, 143 | hparams.src_max_len, hparams.tgt_max_len) 144 | cache_dataset(data_train_processed, common_prefix + '_train') 145 | data_val_processed = load_cached_dataset(common_prefix + '_val') 146 | if not data_val_processed: 147 | data_val_processed = process_dataset(data_val, src_vocab, tgt_vocab) 148 | cache_dataset(data_val_processed, common_prefix + '_val') 149 | data_test_processed = load_cached_dataset(common_prefix + '_' + str(False) + '_test') 150 | if not data_test_processed: 151 | data_test_processed = process_dataset(data_test, src_vocab, tgt_vocab) 152 | cache_dataset(data_test_processed, common_prefix + '_' + str(False) + '_test') 153 | fetch_tgt_sentence = 
lambda src, tgt: tgt 154 | if dataset == 'WMT2014BPE': 155 | val_text = nlp.data.WMT2014('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang) 156 | test_text = nlp.data.WMT2014('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang, 157 | full=False) 158 | elif dataset == 'TOY': 159 | val_text = data_val 160 | test_text = data_test 161 | else: 162 | raise NotImplementedError 163 | val_tgt_sentences = list(val_text.transform(fetch_tgt_sentence)) 164 | test_tgt_sentences = list(test_text.transform(fetch_tgt_sentence)) 165 | return data_train_processed, data_val_processed, data_test_processed, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab 166 | 167 | 168 | def get_data_lengths(dataset): 169 | return list(dataset.transform(lambda srg, tgt: (len(srg), len(tgt)))) 170 | -------------------------------------------------------------------------------- /machine_translation/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | """Utilities for transformer.""" 20 | 21 | import numpy as np 22 | import math 23 | import mxnet as mx 24 | import time 25 | import logging 26 | import io 27 | import nmt 28 | import hyperparameters as hparams 29 | 30 | def evaluate(model, data_loader, test_loss_function, translator, tgt_vocab, detokenizer, context): 31 | """Evaluate given the data loader 32 | 33 | Parameters 34 | ---------- 35 | data_loader : DataLoader 36 | 37 | Returns 38 | ------- 39 | avg_loss : float 40 | Average loss 41 | real_translation_out : list of list of str 42 | The translation output 43 | """ 44 | translation_out = [] 45 | all_inst_ids = [] 46 | avg_loss_denom = 0 47 | avg_loss = 0.0 48 | for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \ 49 | in enumerate(data_loader): 50 | src_seq = src_seq.as_in_context(context) 51 | tgt_seq = tgt_seq.as_in_context(context) 52 | src_valid_length = src_valid_length.as_in_context(context) 53 | tgt_valid_length = tgt_valid_length.as_in_context(context) 54 | # Calculating Loss 55 | out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1) 56 | loss = test_loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar() 57 | all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist()) 58 | avg_loss += loss * (tgt_seq.shape[1] - 1) 59 | avg_loss_denom += (tgt_seq.shape[1] - 1) 60 | # Translate 61 | samples, _, sample_valid_length = \ 62 | translator.translate(src_seq=src_seq, src_valid_length=src_valid_length) 63 | max_score_sample = samples[:, 0, :].asnumpy() 64 | sample_valid_length = sample_valid_length[:, 0].asnumpy() 65 | for i in range(max_score_sample.shape[0]): 66 | translation_out.append( 67 | [tgt_vocab.idx_to_token[ele] for ele in 68 | max_score_sample[i][1:(sample_valid_length[i] - 1)]]) 69 | avg_loss = avg_loss / avg_loss_denom 70 | real_translation_out = [None for _ in range(len(all_inst_ids))] 71 | for ind, sentence in zip(all_inst_ids, translation_out): 72 | real_translation_out[ind] = detokenizer(nmt.bleu._bpe_to_words(sentence), 73 | return_str=True) 74 | return avg_loss, real_translation_out 75 | 76 | def translate(translator, src_seq, src_vocab, tgt_vocab, detokenizer, ctx): 77 | src_sentence = src_vocab[src_seq.split()] 78 | src_sentence.append(src_vocab[src_vocab.eos_token]) 79 | src_npy = np.array(src_sentence, dtype=np.int32) 80 | src_nd = mx.nd.array(src_npy) 81 | src_nd = src_nd.reshape((1, -1)).as_in_context(ctx) 82 | src_valid_length = mx.nd.array([src_nd.shape[1]]).as_in_context(ctx) 83 | samples, _, sample_valid_length = \ 84 | translator.translate(src_seq=src_nd, src_valid_length=src_valid_length) 85 | max_score_sample = samples[:, 0, :].asnumpy() 86 | 87 | sample_valid_length = sample_valid_length[:, 0].asnumpy() 88 | translation_out = [] 89 | for i in range(max_score_sample.shape[0]): 90 | translation_out.append( 91 | [tgt_vocab.idx_to_token[ele] for ele in 92 | max_score_sample[i][1:(sample_valid_length[i] - 1)]]) 93 | real_translation_out = [None for _ in range(len(translation_out))] 94 | for ind, sentence in enumerate(translation_out): 95 | real_translation_out[ind] = detokenizer(nmt.bleu._bpe_to_words(sentence), 96 | return_str=True) 97 | return real_translation_out 98 | 99 | def train_one_epoch(epoch_id, model, train_data_loader, trainer, label_smoothing, loss_function, grad_interval, average_param_dict, update_average_param_dict, step_num, ctx): 100 | log_avg_loss = 0 101 | log_wc = 0 102 | loss_denom = 0 103 | step_loss = 0 104 | log_start_time = time.time() 105 | for 
batch_id, seqs in enumerate(train_data_loader): 106 | if batch_id % grad_interval == 0: 107 | step_num += 1 108 | new_lr = hparams.lr / math.sqrt(hparams.num_units) * min(1. / math.sqrt(step_num), step_num * hparams.warmup_steps ** (-1.5)) 109 | trainer.set_learning_rate(new_lr) 110 | src_wc, tgt_wc, bs = np.sum([(shard[2].sum(), shard[3].sum(), shard[0].shape[0]) 111 | for shard in seqs], axis=0) 112 | src_wc = src_wc.asscalar() 113 | tgt_wc = tgt_wc.asscalar() 114 | loss_denom += tgt_wc - bs 115 | seqs = [[seq.as_in_context(context) for seq in shard] 116 | for context, shard in zip([ctx], seqs)] 117 | Ls = [] 118 | with mx.autograd.record(): 119 | for src_seq, tgt_seq, src_valid_length, tgt_valid_length in seqs: 120 | out, _ = model(src_seq, tgt_seq[:, :-1], 121 | src_valid_length, tgt_valid_length - 1) 122 | smoothed_label = label_smoothing(tgt_seq[:, 1:]) 123 | ls = loss_function(out, smoothed_label, tgt_valid_length - 1).sum() 124 | Ls.append((ls * (tgt_seq.shape[1] - 1)) / hparams.batch_size / 100.0) 125 | for L in Ls: 126 | L.backward() 127 | if batch_id % grad_interval == grad_interval - 1 or\ 128 | batch_id == len(train_data_loader) - 1: 129 | if update_average_param_dict: 130 | for k, v in model.collect_params().items(): 131 | average_param_dict[k] = v.data(ctx).copy() 132 | update_average_param_dict = False 133 | 134 | trainer.step(float(loss_denom) / hparams.batch_size / 100.0) 135 | param_dict = model.collect_params() 136 | param_dict.zero_grad() 137 | if step_num > hparams.average_start: 138 | alpha = 1. / max(1, step_num - hparams.average_start) 139 | for name, average_param in average_param_dict.items(): 140 | average_param[:] += alpha * (param_dict[name].data(ctx) - average_param) 141 | step_loss += sum([L.asscalar() for L in Ls]) 142 | if batch_id % grad_interval == grad_interval - 1 or\ 143 | batch_id == len(train_data_loader) - 1: 144 | log_avg_loss += step_loss / loss_denom * hparams.batch_size * 100.0 145 | loss_denom = 0 146 | step_loss = 0 147 | log_wc += src_wc + tgt_wc 148 | if (batch_id + 1) % (hparams.log_interval * grad_interval) == 0: 149 | wps = log_wc / (time.time() - log_start_time) 150 | logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, ' 151 | 'throughput={:.2f}K wps, wc={:.2f}K' 152 | .format(epoch_id, batch_id + 1, len(train_data_loader), 153 | log_avg_loss / hparams.log_interval, 154 | np.exp(log_avg_loss / hparams.log_interval), 155 | wps / 1000, log_wc / 1000)) 156 | log_start_time = time.time() 157 | log_avg_loss = 0 158 | log_wc = 0 -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/convert_pytorch_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # 'License'); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. 
See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # pylint:disable=redefined-outer-name,logging-format-interpolation 20 | """ Script for converting PyTorch Model to Gluon. """ 21 | 22 | import argparse 23 | import json 24 | import logging 25 | import os 26 | import sys 27 | 28 | import mxnet as mx 29 | import gluonnlp as nlp 30 | import torch 31 | from gluonnlp.model import BERTEncoder, BERTModel 32 | from gluonnlp.model.bert import bert_hparams 33 | 34 | sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) 35 | from utils import get_hash, load_text_vocab, tf_vocab_to_gluon_vocab 36 | 37 | parser = argparse.ArgumentParser(description='Conversion script for PyTorch BERT model', 38 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 39 | parser.add_argument('--model', type=str, default='bert_12_768_12', 40 | choices=['bert_12_768_12', 'bert_24_1024_16'], help='BERT model name') 41 | parser.add_argument('--pytorch_checkpoint_dir', type=str, 42 | help='Path to Tensorflow checkpoint folder.') 43 | parser.add_argument('--vocab_file', type=str, help='Full path to the vocab.txt') 44 | parser.add_argument('--gluon_pytorch_name_mapping', type=str, 45 | default='gluon_to_pytorch_naming.json', 46 | help='Output of infer_pytorch_gluon_parameter_name_mapping.py') 47 | parser.add_argument('--out_dir', type=str, default=os.path.join('~', 'output'), 48 | help='Path to output folder. The folder must exist.') 49 | parser.add_argument('--debug', action='store_true', help='debugging mode') 50 | args = parser.parse_args() 51 | logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO) 52 | logging.info(args) 53 | 54 | # convert vocabulary 55 | vocab = tf_vocab_to_gluon_vocab(load_text_vocab(args.vocab_file)) 56 | 57 | # vocab serialization 58 | tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp')) 59 | with open(tmp_file_path, 'w') as f: 60 | f.write(vocab.to_json()) 61 | hash_full, hash_short = get_hash(tmp_file_path) 62 | gluon_vocab_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.vocab')) 63 | with open(gluon_vocab_path, 'w') as f: 64 | f.write(vocab.to_json()) 65 | logging.info('vocab file saved to %s. 
hash = %s', gluon_vocab_path, hash_full) 66 | 67 | # Load PyTorch Model 68 | pytorch_parameters = torch.load(os.path.join(args.pytorch_checkpoint_dir, 'pytorch_model.bin'), 69 | map_location=lambda storage, loc: storage) 70 | pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()} 71 | 72 | # Make sure vocab fits to model 73 | assert pytorch_parameters['bert.embeddings.word_embeddings.weight'].shape[0] == len( 74 | vocab.idx_to_token) 75 | 76 | # Load Mapping 77 | with open(args.gluon_pytorch_name_mapping, 'r') as f: 78 | mapping = json.load(f) 79 | 80 | # BERT config 81 | tf_config_names_to_gluon_config_names = { 82 | 'attention_probs_dropout_prob': 'embed_dropout', 83 | 'hidden_act': None, 84 | 'hidden_dropout_prob': 'dropout', 85 | 'hidden_size': 'units', 86 | 'initializer_range': None, 87 | 'intermediate_size': 'hidden_size', 88 | 'max_position_embeddings': 'max_length', 89 | 'num_attention_heads': 'num_heads', 90 | 'num_hidden_layers': 'num_layers', 91 | 'type_vocab_size': 'token_type_vocab_size', 92 | 'vocab_size': None 93 | } 94 | predefined_args = bert_hparams[args.model] 95 | with open(os.path.join(args.pytorch_checkpoint_dir, 'bert_config.json'), 'r') as f: 96 | tf_config = json.load(f) 97 | assert len(tf_config) == len(tf_config_names_to_gluon_config_names) 98 | for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items(): 99 | if tf_name is None or gluon_name is None: 100 | continue 101 | assert tf_config[tf_name] == predefined_args[gluon_name] 102 | 103 | # BERT encoder 104 | encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'], 105 | num_layers=predefined_args['num_layers'], units=predefined_args['units'], 106 | hidden_size=predefined_args['hidden_size'], 107 | max_length=predefined_args['max_length'], 108 | num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'], 109 | dropout=predefined_args['dropout'], 110 | use_residual=predefined_args['use_residual']) 111 | 112 | # Infer enabled BERTModel components 113 | use_pooler = any('pooler' in n for n in pytorch_parameters) 114 | use_decoder = any('cls.predictions.transform.dense.weight' in n for n in pytorch_parameters) 115 | use_classifier = any('cls.seq_relationship.weight' in n for n in pytorch_parameters) 116 | 117 | if not use_classifier and 'classifier.weight' in pytorch_parameters and \ 118 | pytorch_parameters['classifier.weight'].shape[0] == 2: 119 | logging.info('Assuming classifier weights in provided Pytorch model are ' 120 | 'from next sentence prediction task.') 121 | use_classifier = True 122 | 123 | logging.info('Inferred that the pytorch model provides the following parameters:') 124 | logging.info('- use_pooler = {}'.format(use_pooler)) 125 | logging.info('- use_decoder = {}'.format(use_decoder)) 126 | logging.info('- use_classifier = {}'.format(use_classifier)) 127 | 128 | # BERT model 129 | bert = BERTModel(encoder, len(vocab), 130 | token_type_vocab_size=predefined_args['token_type_vocab_size'], 131 | units=predefined_args['units'], embed_size=predefined_args['embed_size'], 132 | embed_dropout=predefined_args['embed_dropout'], 133 | word_embed=predefined_args['word_embed'], use_pooler=use_pooler, 134 | use_decoder=use_decoder, use_classifier=use_classifier) 135 | 136 | bert.initialize(init=mx.init.Normal(0.02)) 137 | 138 | ones = mx.nd.ones((2, 8)) 139 | out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]])) 140 | params = bert._collect_params_with_prefix() 141 | assert len(params) == len(pytorch_parameters), "Gluon model does not 
match PyTorch model. " \ 142 | "Please fix the BERTModel hyperparameters" 143 | 144 | # set parameter data 145 | loaded_params = {} 146 | for name in params: 147 | if name not in mapping: 148 | raise RuntimeError('Invalid json mapping file. ' 149 | 'The parameter {} is not described in the mapping file.'.format(name)) 150 | pytorch_name = mapping[name] 151 | if pytorch_name not in pytorch_parameters.keys(): 152 | # Handle inconsistent naming in PyTorch 153 | # The Expected names here are based on the PyTorch version of SciBert. 154 | # The Inconsistencies were found in ClinicalBert 155 | if 'LayerNorm' in pytorch_name: 156 | pytorch_name = pytorch_name.replace('weight', 'gamma') 157 | pytorch_name = pytorch_name.replace('bias', 'beta') 158 | assert pytorch_name in pytorch_parameters.keys() 159 | 160 | if 'cls.seq_relationship' in pytorch_name: 161 | pytorch_name = pytorch_name.replace('cls.seq_relationship', 'classifier') 162 | 163 | arr = mx.nd.array(pytorch_parameters[pytorch_name]) 164 | 165 | assert arr.shape == params[name].shape 166 | params[name].set_data(arr) 167 | loaded_params[name] = True 168 | 169 | if len(params) != len(loaded_params): 170 | raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, ' 171 | 'but {} have been extracted from the pytorch model. '.format( 172 | len(params), len(loaded_params))) 173 | 174 | # param serialization 175 | bert.save_parameters(tmp_file_path) 176 | hash_full, hash_short = get_hash(tmp_file_path) 177 | gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params')) 178 | logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full) 179 | bert.save_parameters(gluon_param_path) 180 | mx.nd.waitall() 181 | -------------------------------------------------------------------------------- /word_embedding/model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable= 21 | """Word embedding models.""" 22 | 23 | import mxnet as mx 24 | import numpy as np 25 | 26 | import gluonnlp as nlp 27 | 28 | 29 | class Net(mx.gluon.HybridBlock): 30 | """Base class for word2vec and fastText SkipGram and CBOW networks. 31 | 32 | Parameters 33 | ---------- 34 | token_to_idx : dict 35 | token_to_idx mapping of the vocabulary that this model is to be trained 36 | with. token_to_idx is used for __getitem__ and __contains__. For 37 | len(token_to_idx) is used during initialization to obtain the input_dim 38 | of the embedding matrix. 39 | output_dim : int 40 | Dimension of the dense embedding. 41 | batch_size : int 42 | Batchsize this model will be trained with. 
TODO temporary until 43 | random_like ops are supported 44 | negatives_weights : mxnet.nd.NDArray 45 | Weights for UnigramCandidateSampler for sampling negatives. 46 | smoothing : float, default 0.75 47 | Smoothing factor applied to negatives_weights. Final weights are 48 | mxnet.nd.power(negative_weights, smoothing). 49 | num_negatives : int, default 5 50 | Number of negatives to sample for each real sample. 51 | sparse_grad : bool, default True 52 | Specifies mxnet.gluon.nn.Embedding sparse_grad argument. 53 | dtype : str, default 'float32' 54 | dtype argument passed to gluon.nn.Embedding 55 | 56 | """ 57 | 58 | # pylint: disable=abstract-method 59 | def __init__(self, token_to_idx, output_dim, batch_size, negatives_weights, 60 | subword_function=None, num_negatives=5, smoothing=0.75, 61 | sparse_grad=True, dtype='float32', **kwargs): 62 | super(Net, self).__init__(**kwargs) 63 | 64 | self._kwargs = dict( 65 | input_dim=len(token_to_idx), output_dim=output_dim, dtype=dtype, 66 | sparse_grad=sparse_grad, num_negatives=num_negatives) 67 | 68 | with self.name_scope(): 69 | if subword_function is not None: 70 | self.embedding = nlp.model.train.FasttextEmbeddingModel( 71 | token_to_idx=token_to_idx, 72 | subword_function=subword_function, 73 | output_dim=output_dim, 74 | weight_initializer=mx.init.Uniform(scale=1 / output_dim), 75 | sparse_grad=sparse_grad, 76 | ) 77 | else: 78 | self.embedding = nlp.model.train.CSREmbeddingModel( 79 | token_to_idx=token_to_idx, 80 | output_dim=output_dim, 81 | weight_initializer=mx.init.Uniform(scale=1 / output_dim), 82 | sparse_grad=sparse_grad, 83 | ) 84 | self.embedding_out = mx.gluon.nn.Embedding( 85 | len(token_to_idx), output_dim=output_dim, 86 | weight_initializer=mx.init.Zero(), sparse_grad=sparse_grad, 87 | dtype=dtype) 88 | 89 | self.negatives_sampler = nlp.data.UnigramCandidateSampler( 90 | weights=negatives_weights**smoothing, shape=(batch_size, ), 91 | dtype='int64') 92 | 93 | def __getitem__(self, tokens): 94 | return self.embedding[tokens] 95 | 96 | 97 | class SG(Net): 98 | """SkipGram network""" 99 | 100 | # pylint: disable=arguments-differ 101 | def hybrid_forward(self, F, center, context, center_words): 102 | """SkipGram forward pass. 103 | 104 | Parameters 105 | ---------- 106 | center : mxnet.nd.NDArray or mxnet.sym.Symbol 107 | Sparse CSR array of word / subword indices of shape (batch_size, 108 | len(token_to_idx) + num_subwords). Embedding for center words are 109 | computed via F.sparse.dot between the CSR center array and the 110 | weight matrix. 111 | context : mxnet.nd.NDArray or mxnet.sym.Symbol 112 | Dense array of context words of shape (batch_size, ). Also used for 113 | row-wise independently masking negatives equal to one of context. 114 | center_words : mxnet.nd.NDArray or mxnet.sym.Symbol 115 | Dense array of center words of shape (batch_size, ). Only used for 116 | row-wise independently masking negatives equal to one of 117 | center_words. 
118 | """ 119 | 120 | # negatives sampling 121 | negatives = [] 122 | mask = [] 123 | for _ in range(self._kwargs['num_negatives']): 124 | negatives.append(self.negatives_sampler(center_words)) 125 | mask_ = negatives[-1] != center_words 126 | mask_ = F.stack(mask_, (negatives[-1] != context)) 127 | mask.append(mask_.min(axis=0)) 128 | 129 | negatives = F.stack(*negatives, axis=1) 130 | mask = F.stack(*mask, axis=1).astype(np.float32) 131 | 132 | # center - context pairs 133 | emb_center = self.embedding(center).expand_dims(1) 134 | emb_context = self.embedding_out(context).expand_dims(2) 135 | pred_pos = F.batch_dot(emb_center, emb_context).squeeze() 136 | loss_pos = (F.relu(pred_pos) - pred_pos + F.Activation( 137 | -F.abs(pred_pos), act_type='softrelu')) / (mask.sum(axis=1) + 1) 138 | 139 | # center - negatives pairs 140 | emb_negatives = self.embedding_out(negatives).reshape( 141 | (-1, self._kwargs['num_negatives'], 142 | self._kwargs['output_dim'])).swapaxes(1, 2) 143 | pred_neg = F.batch_dot(emb_center, emb_negatives).squeeze() 144 | mask = mask.reshape((-1, self._kwargs['num_negatives'])) 145 | loss_neg = (F.relu(pred_neg) + F.Activation( 146 | -F.abs(pred_neg), act_type='softrelu')) * mask 147 | loss_neg = loss_neg.sum(axis=1) / (mask.sum(axis=1) + 1) 148 | 149 | return loss_pos + loss_neg 150 | 151 | 152 | class CBOW(Net): 153 | """CBOW network""" 154 | 155 | # pylint: disable=arguments-differ 156 | def hybrid_forward(self, F, center, context): 157 | """CBOW forward pass. 158 | 159 | Parameters 160 | ---------- 161 | center : mxnet.nd.NDArray or mxnet.sym.Symbol 162 | Dense array of center words of shape (batch_size, ). 163 | context : mxnet.nd.NDArray or mxnet.sym.Symbol 164 | Sparse CSR array of word / subword indices of shape (batch_size, 165 | len(vocab) + num_subwords). Embedding for context words are 166 | computed via F.sparse.dot between the CSR center array and the 167 | weight matrix. 168 | 169 | """ 170 | # negatives sampling 171 | negatives = [] 172 | mask = [] 173 | for _ in range(self._kwargs['num_negatives']): 174 | negatives.append(self.negatives_sampler(center)) 175 | mask.append(negatives[-1] != center) 176 | 177 | negatives = F.stack(*negatives, axis=1) 178 | mask = F.stack(*mask, axis=1).astype(np.float32) 179 | 180 | # context - center samples 181 | emb_context = self.embedding(context).expand_dims(1) 182 | emb_center = self.embedding_out(center).expand_dims(2) 183 | pred_pos = F.batch_dot(emb_context, emb_center).squeeze() 184 | loss_pos = (F.relu(pred_pos) - pred_pos + F.Activation( 185 | -F.abs(pred_pos), act_type='softrelu')) / (mask.sum(axis=1) + 1) 186 | 187 | # context - negatives samples 188 | emb_negatives = self.embedding_out(negatives).reshape( 189 | (-1, self._kwargs['num_negatives'], 190 | self._kwargs['output_dim'])).swapaxes(1, 2) 191 | pred_neg = F.batch_dot(emb_context, emb_negatives).squeeze() 192 | mask = mask.reshape((-1, self._kwargs['num_negatives'])) 193 | loss_neg = (F.relu(pred_neg) + F.Activation( 194 | -F.abs(pred_neg), act_type='softrelu')) * mask 195 | loss_neg = loss_neg.sum(axis=1) / (mask.sum(axis=1) + 1) 196 | 197 | return loss_pos + loss_neg 198 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/compare_tf_gluon_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. 
See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # 'License'); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | """Script for model comparison between TF and Gluon.""" 20 | 21 | # pylint: disable=wrong-import-position, wrong-import-order, wildcard-import 22 | 23 | import sys 24 | import os 25 | import argparse 26 | import numpy as np 27 | import mxnet as mx 28 | import gluonnlp as nlp 29 | 30 | sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) 31 | 32 | parser = argparse.ArgumentParser(description='Comparison script for BERT model in Tensorflow' 33 | 'and that in Gluon. This script works with ' 34 | 'google/bert@f39e881b', 35 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 36 | parser.add_argument('--input_file', type=str, default='input.txt', 37 | help='sample input file for testing') 38 | parser.add_argument('--tf_bert_repo_dir', type=str, 39 | default='~/bert/', 40 | help='path to the original Tensorflow bert repository. ' 41 | 'The repo should be at f39e881b.') 42 | parser.add_argument('--tf_model_dir', type=str, 43 | default='~/uncased_L-12_H-768_A-12/', 44 | help='path to the original Tensorflow bert checkpoint directory.') 45 | parser.add_argument('--tf_model_prefix', type=str, 46 | default='bert_model.ckpt', 47 | help='name of bert checkpoint file.') 48 | parser.add_argument('--tf_config_name', type=str, 49 | default='bert_config.json', 50 | help='Name of Bert config file') 51 | parser.add_argument('--cased', action='store_true', 52 | help='if not set, inputs are converted to lower case') 53 | parser.add_argument('--gluon_dataset', type=str, default='book_corpus_wiki_en_uncased', 54 | help='gluon dataset name') 55 | parser.add_argument('--gluon_model', type=str, default='bert_12_768_12', 56 | help='gluon model name') 57 | parser.add_argument('--gluon_parameter_file', type=str, default=None, 58 | help='gluon parameter file name.') 59 | parser.add_argument('--gluon_vocab_file', type=str, default=None, 60 | help='gluon vocab file corresponding to --gluon_parameter_file.') 61 | 62 | args = parser.parse_args() 63 | 64 | input_file = os.path.expanduser(args.input_file) 65 | tf_bert_repo_dir = os.path.expanduser(args.tf_bert_repo_dir) 66 | tf_model_dir = os.path.expanduser(args.tf_model_dir) 67 | vocab_file = os.path.join(tf_model_dir, 'vocab.txt') 68 | bert_config_file = os.path.join(tf_model_dir, args.tf_config_name) 69 | init_checkpoint = os.path.join(tf_model_dir, args.tf_model_prefix) 70 | do_lower_case = not args.cased 71 | max_length = 128 72 | 73 | ############################################################################### 74 | # Tensorflow MODEL # 75 | ############################################################################### 76 | # import tensorflow modules 77 | sys.path.insert(0, tf_bert_repo_dir) 78 | 79 | # tensorflow model inference 80 | import modeling 81 | import tokenization 82 | from 
extract_features import * 83 | 84 | # data 85 | num_layers = int(args.gluon_model.split('_')[1]) 86 | layer_indexes = list(range(num_layers)) 87 | bert_config = modeling.BertConfig.from_json_file(bert_config_file) 88 | tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) 89 | examples = read_examples(input_file) 90 | 91 | features = convert_examples_to_features( 92 | examples=examples, seq_length=max_length, tokenizer=tokenizer) 93 | 94 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 95 | run_config = tf.contrib.tpu.RunConfig( 96 | master=None, 97 | tpu_config=tf.contrib.tpu.TPUConfig( 98 | num_shards=1, 99 | per_host_input_for_training=is_per_host)) 100 | # model 101 | model_fn = model_fn_builder( 102 | bert_config=bert_config, 103 | init_checkpoint=init_checkpoint, 104 | layer_indexes=layer_indexes, 105 | use_tpu=False, 106 | use_one_hot_embeddings=False) 107 | 108 | estimator = tf.contrib.tpu.TPUEstimator( 109 | use_tpu=False, 110 | model_fn=model_fn, 111 | config=run_config, 112 | predict_batch_size=1) 113 | 114 | input_fn = input_fn_builder( 115 | features=features, seq_length=max_length) 116 | 117 | tensorflow_all_out = [] 118 | for result in estimator.predict(input_fn, yield_single_examples=True): 119 | output_json = collections.OrderedDict() 120 | tensorflow_all_out_features = [] 121 | all_layers = [] 122 | for (j, layer_index) in enumerate(layer_indexes): 123 | layer_output = result['layer_output_%d' % j] 124 | layers = collections.OrderedDict() 125 | layers['index'] = layer_index 126 | layers['values'] = layer_output 127 | all_layers.append(layers) 128 | tensorflow_out_features = collections.OrderedDict() 129 | tensorflow_out_features['layers'] = all_layers 130 | tensorflow_all_out_features.append(tensorflow_out_features) 131 | 132 | output_json['features'] = tensorflow_all_out_features 133 | tensorflow_all_out.append(output_json) 134 | 135 | tf_outputs = [tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes] 136 | 137 | ############################################################################### 138 | # Gluon MODEL # 139 | ############################################################################### 140 | 141 | if args.gluon_parameter_file: 142 | assert args.gluon_vocab_file, \ 143 | 'Must specify --gluon_vocab_file when specifying --gluon_parameter_file' 144 | with open(args.gluon_vocab_file, 'r') as f: 145 | vocabulary = nlp.Vocab.from_json(f.read()) 146 | bert, vocabulary = nlp.model.get_model(args.gluon_model, 147 | dataset_name=None, 148 | vocab=vocabulary, 149 | pretrained=not args.gluon_parameter_file, 150 | use_pooler=False, 151 | use_decoder=False, 152 | use_classifier=False) 153 | try: 154 | bert.cast('float16') 155 | bert.load_parameters(args.gluon_parameter_file, ignore_extra=True) 156 | bert.cast('float32') 157 | except AssertionError: 158 | bert.cast('float32') 159 | bert.load_parameters(args.gluon_parameter_file, ignore_extra=True) 160 | else: 161 | assert not args.gluon_vocab_file, \ 162 | 'Cannot specify --gluon_vocab_file without specifying --gluon_parameter_file' 163 | bert, vocabulary = nlp.model.get_model(args.gluon_model, 164 | dataset_name=args.gluon_dataset, 165 | pretrained=not args.gluon_parameter_file, 166 | use_pooler=False, 167 | use_decoder=False, 168 | use_classifier=False) 169 | 170 | print(bert) 171 | tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=do_lower_case) 172 | dataset = nlp.data.TSVDataset(input_file, field_separator=nlp.data.Splitter(' ||| ')) 
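# The ' ||| ' splitter above mirrors the input convention of the TF extract_features.py
# script: one example per line, with an optional second sentence after ' ||| '.
# A minimal, hypothetical input.txt could therefore look like:
#
#   Who was Jim Henson ? ||| Jim Henson was a puppeteer
#
# A quick way to sanity-check the pipeline defined next (BERTSentenceTransform -> DataLoader)
# is to transform a single sample by hand; the transform yields token ids, a valid length and
# segment type ids, which is exactly what the verification loop at the bottom of this script
# unpacks:
#
#   >>> trans = nlp.data.BERTSentenceTransform(tokenizer, max_length)
#   >>> token_ids, valid_length, segment_ids = trans(dataset[0])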
173 | 174 | trans = nlp.data.BERTSentenceTransform(tokenizer, max_length) 175 | dataset = dataset.transform(trans) 176 | 177 | bert_dataloader = mx.gluon.data.DataLoader(dataset, batch_size=1, 178 | shuffle=True, last_batch='rollover') 179 | 180 | # verify the output of the first sample 181 | for i, seq in enumerate(bert_dataloader): 182 | input_ids, valid_length, type_ids = seq 183 | out = bert(input_ids, type_ids, 184 | valid_length.astype('float32')) 185 | length = valid_length.asscalar() 186 | a = tf_outputs[-1][:length] 187 | b = out[0][:length].asnumpy() 188 | 189 | print('stdev = %s' % (np.std(a - b))) 190 | mx.test_utils.assert_almost_equal(a, b, atol=5e-6, rtol=5e-6) 191 | break 192 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/data/baidu_ernie_data.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | # pylint: disable=line-too-long 21 | """Baidu ernie data, contains XNLI.""" 22 | 23 | __all__ = ['BaiduErnieXNLI', 'BaiduErnieLCQMC', 'BaiduErnieChnSentiCorp'] 24 | 25 | import os 26 | import sys 27 | import tarfile 28 | from gluonnlp.data.dataset import TSVDataset 29 | from gluonnlp.data.registry import register 30 | from gluonnlp.base import get_home_dir 31 | if sys.version_info[0] >= 3: 32 | from urllib.request import urlretrieve 33 | else: 34 | from urllib import urlretrieve 35 | 36 | _baidu_ernie_data_url = 'https://ernie.bj.bcebos.com/task_data.tgz' 37 | 38 | class _BaiduErnieDataset(TSVDataset): 39 | def __init__(self, root, dataset_name, segment, **kwargs): 40 | root = os.path.expanduser(root) 41 | if not os.path.isdir(root): 42 | os.makedirs(root) 43 | self._root = root 44 | download_data_path = os.path.join(self._root, 'task_data.tgz') 45 | if not os.path.exists(download_data_path): 46 | urlretrieve(_baidu_ernie_data_url, download_data_path) 47 | tar_file = tarfile.open(download_data_path, mode='r:gz') 48 | tar_file.extractall(self._root) 49 | filename = os.path.join(self._root, 'task_data', dataset_name, '%s.tsv' % segment) 50 | super(_BaiduErnieDataset, self).__init__(filename, **kwargs) 51 | 52 | 53 | @register(segment=['train', 'dev', 'test']) 54 | class BaiduErnieXNLI(_BaiduErnieDataset): 55 | """ The XNLI dataset redistributed by Baidu 56 | . 57 | 58 | Original from: 59 | Conneau, Alexis, et al. "Xnli: Evaluating cross-lingual sentence representations." 60 | arXiv preprint arXiv:1809.05053 (2018). 61 | https://github.com/facebookresearch/XNLI 62 | 63 | Licensed under a Creative Commons Attribution-NonCommercial 4.0 International License. 
64 | License details: https://creativecommons.org/licenses/by-nc/4.0/ 65 | 66 | Parameters 67 | ---------- 68 | segment : {'train', 'dev', 'test'}, default 'train' 69 | Dataset segment. 70 | root : str, default '$MXNET_HOME/datasets/baidu_ernie_task_data' 71 | Path to temp folder for storing data. 72 | MXNET_HOME defaults to '~/.mxnet'. 73 | return_all_fields : bool, default False 74 | Return all fields available in the dataset. 75 | 76 | Examples 77 | -------- 78 | >>> xnli_dev = BaiduErnieXNLI('dev', root='./datasets/baidu_ernie_task_data/') 79 | -etc- 80 | >>> len(xnli_dev) 81 | 2490 82 | >>> len(xnli_dev[0]) 83 | 3 84 | >>> xnli_dev[0] 85 | ['他说,妈妈,我回来了。', '校车把他放下后,他立即给他妈妈打了电话。', 'neutral'] 86 | >>> xnli_test = BaiduErnieXNLI('test', root='./datasets/baidu_ernie_task_data/') 87 | -etc- 88 | >>> len(xnli_test) 89 | 5010 90 | >>> len(xnli_test[0]) 91 | 2 92 | >>> xnli_test[0] 93 | ['嗯,我根本没想过,但是我很沮丧,最后我又和他说话了。', '我还没有和他再次谈论。'] 94 | """ 95 | def __init__(self, segment='train', 96 | root=os.path.join(get_home_dir(), 'datasets', 'baidu_ernie_data'), 97 | return_all_fields=False): 98 | A_IDX, B_IDX, LABEL_IDX = 0, 1, 2 99 | if segment in ['train', 'dev']: 100 | field_indices = [A_IDX, B_IDX, LABEL_IDX] if not return_all_fields else None 101 | num_discard_samples = 1 102 | elif segment == 'test': 103 | field_indices = [A_IDX, B_IDX] if not return_all_fields else None 104 | num_discard_samples = 1 105 | 106 | super(BaiduErnieXNLI, self).__init__(root, 'xnli', segment, 107 | num_discard_samples=num_discard_samples, 108 | field_indices=field_indices) 109 | 110 | @register(segment=['train', 'dev', 'test']) 111 | class BaiduErnieLCQMC(_BaiduErnieDataset): 112 | """ The LCQMC dataset redistributed by Baidu 113 | . 114 | 115 | Original from: 116 | Xin Liu, Qingcai Chen, Chong Deng, Huajun Zeng, Jing Chen, Dongfang Li, Buzhou Tang, 117 | LCQMC: A Large-scale Chinese Question Matching Corpus,COLING2018. 118 | Licensed under a Creative Commons Attribution 4.0 International License. License details: 119 | http://creativecommons.org/licenses/by/4.0/ 120 | 121 | Parameters 122 | ---------- 123 | segment : {'train', 'dev', 'test'}, default 'train' 124 | Dataset segment. 125 | root : str, default '$MXNET_HOME/datasets/baidu_ernie_task_data' 126 | Path to temp folder for storing data. 127 | MXNET_HOME defaults to '~/.mxnet'. 128 | return_all_fields : bool, default False 129 | Return all fields available in the dataset. 
130 | 131 | Examples 132 | -------- 133 | >>> lcqmc_dev = BaiduErnieLCQMC('dev', root='./datasets/baidu_ernie_task_data/') 134 | -etc- 135 | >>> len(lcqmc_dev) 136 | 8802 137 | >>> len(lcqmc_dev[0]) 138 | 3 139 | >>> lcqmc_dev[0] 140 | ['开初婚未育证明怎么弄?', '初婚未育情况证明怎么开?', '1'] 141 | >>> lcqmc_test = BaiduErnieLCQMC('test', root='./datasets/baidu_ernie_task_data/') 142 | -etc- 143 | >>> len(lcqmc_test) 144 | 12500 145 | >>> len(lcqmc_test[0]) 146 | 2 147 | >>> lcqmc_test[0] 148 | ['谁有狂三这张高清的', '这张高清图,谁有'] 149 | """ 150 | def __init__(self, segment='train', 151 | root=os.path.join(get_home_dir(), 'datasets', 'baidu_ernie_data'), 152 | return_all_fields=False): 153 | A_IDX, B_IDX, LABEL_IDX = 0, 1, 2 154 | if segment in ['train', 'dev']: 155 | field_indices = [A_IDX, B_IDX, LABEL_IDX] if not return_all_fields else None 156 | num_discard_samples = 1 157 | elif segment == 'test': 158 | field_indices = [A_IDX, B_IDX] if not return_all_fields else None 159 | num_discard_samples = 1 160 | 161 | super(BaiduErnieLCQMC, self).__init__(root, 'lcqmc', segment, 162 | num_discard_samples=num_discard_samples, 163 | field_indices=field_indices) 164 | 165 | 166 | @register(segment=['train', 'dev', 'test']) 167 | class BaiduErnieChnSentiCorp(_BaiduErnieDataset): 168 | """ The ChnSentiCorp dataset redistributed by Baidu 169 | . 170 | 171 | Original from Tan Songbo (Chinese Academy of Sciences, tansongbo@software.ict.ac.cn). 172 | 173 | Parameters 174 | ---------- 175 | segment : {'train', 'dev', 'test'}, default 'train' 176 | Dataset segment. 177 | root : str, default '$MXNET_HOME/datasets/baidu_ernie_task_data' 178 | Path to temp folder for storing data. 179 | MXNET_HOME defaults to '~/.mxnet'. 180 | return_all_fields : bool, default False 181 | Return all fields available in the dataset. 
182 | 183 | Examples 184 | -------- 185 | >>> chnsenticorp_dev = BaiduErnieChnSentiCorp('dev', root='./datasets/baidu_ernie_task_data/') 186 | -etc- 187 | >>> len(chnsenticorp_dev) 188 | 1200 189 | >>> len(chnsenticorp_dev[0]) 190 | 2 191 | >>> chnsenticorp_dev[2] 192 | ['商品的不足暂时还没发现,京东的订单处理速度实在.......周二就打包完成,周五才发货...', '0'] 193 | >>> chnsenticorp_test = BaiduErnieChnSentiCorp('test', root='./datasets/baidu_ernie_task_data/') 194 | -etc- 195 | >>> len(chnsenticorp_test) 196 | 1200 197 | >>> len(chnsenticorp_test[0]) 198 | 1 199 | >>> chnsenticorp_test[0] 200 | ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'] 201 | """ 202 | def __init__(self, segment='train', 203 | root=os.path.join(get_home_dir(), 'datasets', 'baidu_ernie_data'), 204 | return_all_fields=False): 205 | LABEL_IDX, A_IDX = 0, 1 206 | if segment in ['train', 'dev']: 207 | field_indices = [A_IDX, LABEL_IDX] if not return_all_fields else None 208 | num_discard_samples = 1 209 | elif segment == 'test': 210 | field_indices = [A_IDX] if not return_all_fields else None 211 | num_discard_samples = 1 212 | 213 | super(BaiduErnieChnSentiCorp, self).__init__(root, 'chnsenticorp', segment, 214 | num_discard_samples=num_discard_samples, 215 | field_indices=field_indices) 216 | -------------------------------------------------------------------------------- /sequence_generation/text_generation/sequence_sampling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate Sentences by Sampling and Beam Search 3 | ============================================== 4 | 5 | This example shows how to load a pre-trained language model on wikitext-2 in Gluon NLP Toolkit model 6 | zoo, and use sequence sampler and beam search sampler on the language model to generate sentences. 7 | """ 8 | 9 | # coding: utf-8 10 | 11 | # Licensed to the Apache Software Foundation (ASF) under one 12 | # or more contributor license agreements. See the NOTICE file 13 | # distributed with this work for additional information 14 | # regarding copyright ownership. The ASF licenses this file 15 | # to you under the Apache License, Version 2.0 (the 16 | # "License"); you may not use this file except in compliance 17 | # with the License. You may obtain a copy of the License at 18 | # 19 | # http://www.apache.org/licenses/LICENSE-2.0 20 | # 21 | # Unless required by applicable law or agreed to in writing, 22 | # software distributed under the License is distributed on an 23 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 24 | # KIND, either express or implied. See the License for the 25 | # specific language governing permissions and limitations 26 | # under the License. 27 | # pylint:disable=missing-docstring 28 | import argparse 29 | 30 | import numpy as np 31 | import mxnet as mx 32 | import gluonnlp as nlp 33 | 34 | import model # local 'model' module with the addition of GPT-2 35 | 36 | 37 | parser = argparse.ArgumentParser(description='Generate sentences by beam search. 
' 38 | 'We load a LSTM model that is pre-trained on ' 39 | 'WikiText as our encoder.') 40 | 41 | # beam search sampler options 42 | subparsers = parser.add_subparsers(help='Sequence generation methods.', 43 | dest='command') 44 | subparsers.required = True 45 | beam_search_parser = subparsers.add_parser('beam-search', help='Use beam search for decoding.') 46 | beam_search_parser.add_argument('--alpha', type=float, default=0.0, 47 | help='Alpha in the length penalty term.') 48 | beam_search_parser.add_argument('--k', type=int, default=5, help='K in the length penalty term.') 49 | 50 | # random sampler options 51 | random_sample_parser = subparsers.add_parser('random-sample', 52 | help='Use random sampling for decoding.') 53 | random_sample_parser.add_argument('--temperature', type=float, default=1.0, 54 | help='Softmax temperature used in sampling.') 55 | random_sample_parser.add_argument('--use-top-k', type=int, required=False, 56 | help='Sample only from the top-k candidates.') 57 | 58 | # shared options 59 | for p in [beam_search_parser, random_sample_parser]: 60 | p.add_argument('--gpu', type=int, default=0, 61 | help='id of the gpu to use. Set it to empty means to use cpu.') 62 | p.add_argument('--lm-model', type=str, default='awd_lstm_lm_1150', 63 | help='type of the pre-trained model to load, can be "standard_lstm_lm_200", ' 64 | '"standard_lstm_lm_650", "standard_lstm_lm_1500", ' 65 | '"awd_lstm_lm_1150", etc.') 66 | p.add_argument('--max-length', type=int, default=20, help='Maximum sentence length.') 67 | p.add_argument('--print-num', type=int, default=3, help='Number of sentences to display.') 68 | p.add_argument('--bos', type=str, default='I think this works') 69 | p.add_argument('--beam-size', type=int, default=5, 70 | help='Beam size in the beam search sampler.') 71 | 72 | args = parser.parse_args() 73 | 74 | print(args) 75 | if args.gpu is not None and args.gpu < mx.context.num_gpus(): 76 | ctx = mx.gpu(args.gpu) 77 | else: 78 | if args.gpu: 79 | print('Specified GPU id {} does not exist. Available #GPUs: {}. 
Using CPU instead.'\ 80 | .format(args.gpu, mx.context.num_gpus())) 81 | ctx = mx.cpu() 82 | 83 | assert 0 < args.print_num <= args.beam_size,\ 84 | 'print_num must be between {} and {}, received={}'.format(1, args.beam_size, args.print_num) 85 | 86 | 87 | # Define the decoder function, we use log_softmax to map the output scores to log-likelihoods 88 | # Also, we transform the layout to NTC 89 | class LMDecoder(object): 90 | def __init__(self, net): 91 | self.net = net 92 | 93 | def __call__(self, inputs, states): 94 | outputs, states = self.net(mx.nd.expand_dims(inputs, axis=0), states) 95 | return outputs[0], states 96 | 97 | def state_info(self, *arg, **kwargs): 98 | return self.net.state_info(*arg, **kwargs) 99 | 100 | class GPT2Decoder(LMDecoder): 101 | def __call__(self, inputs, states): 102 | inputs = mx.nd.expand_dims(inputs, axis=1) 103 | out, new_states = self.net(inputs, states) 104 | out = mx.nd.slice_axis(out, axis=1, begin=0, end=1).reshape((inputs.shape[0], -1)) 105 | return out, new_states 106 | 107 | def get_decoder_vocab(lm_model): 108 | if lm_model.startswith('gpt2'): 109 | dataset_name = 'openai_webtext' 110 | decoder_cls = GPT2Decoder 111 | else: 112 | dataset_name = 'wikitext-2' 113 | decoder_cls = LMDecoder 114 | lm_model, vocab = model.get_model(name=lm_model, 115 | dataset_name=dataset_name, 116 | pretrained=True, 117 | ctx=ctx) 118 | decoder = decoder_cls(lm_model) 119 | return decoder, vocab 120 | 121 | def get_tokenizer(lm_model): 122 | if lm_model.startswith('gpt2'): 123 | return nlp.data.GPT2BPETokenizer(), nlp.data.GPT2BPEDetokenizer() 124 | else: 125 | return nlp.data.SacreMosesTokenizer(), nlp.data.SacreMosesDetokenizer(return_str=True) 126 | 127 | def get_initial_input_state(decoder, bos_ids): 128 | if isinstance(decoder, GPT2Decoder): 129 | inputs, begin_states = decoder.net( 130 | mx.nd.array([bos_ids], dtype=np.int32, ctx=ctx), None) 131 | inputs = inputs[:, -1, :] 132 | smoothed_probs = (inputs / args.temperature).softmax(axis=1) 133 | inputs = mx.nd.sample_multinomial(smoothed_probs, dtype=np.int32) 134 | return inputs, begin_states 135 | else: 136 | begin_states = decoder.net.begin_state(batch_size=1, ctx=ctx) 137 | if len(bos_ids) > 1: 138 | _, begin_states = decoder.net(mx.nd.expand_dims(mx.nd.array(bos_ids[:-1], ctx=ctx), 139 | axis=1), 140 | begin_states) 141 | inputs = mx.nd.full(shape=(1,), ctx=ctx, val=bos_ids[-1]) 142 | return inputs, begin_states 143 | 144 | 145 | def generate(): 146 | assert not args.lm_model.startswith('gpt2') or args.command != 'beam-search' 147 | decoder, vocab = get_decoder_vocab(args.lm_model) 148 | tokenizer, detokenizer = get_tokenizer(args.lm_model) 149 | bos_str = args.bos 150 | if not bos_str.startswith(' '): 151 | bos_str = ' ' + bos_str 152 | bos_tokens = tokenizer(bos_str) 153 | bos_ids = vocab[bos_tokens] 154 | eos_id = vocab[vocab.eos_token] 155 | if args.command == 'random-sample': 156 | print('Sampling Parameters: beam_size={}, temperature={}, use_top_k={}'\ 157 | .format(args.beam_size, args.temperature, args.use_top_k)) 158 | sampler = nlp.model.SequenceSampler(beam_size=args.beam_size, 159 | decoder=decoder, 160 | eos_id=eos_id, 161 | max_length=args.max_length - len(bos_tokens), 162 | temperature=args.temperature, 163 | top_k=args.use_top_k) 164 | else: 165 | print('Beam Seach Parameters: beam_size={}, alpha={}, K={}'\ 166 | .format(args.beam_size, args.alpha, args.k)) 167 | scorer = nlp.model.BeamSearchScorer(alpha=args.alpha, K=args.k, from_logits=False) 168 | sampler = 
nlp.model.BeamSearchSampler(beam_size=args.beam_size, 169 | decoder=decoder, 170 | eos_id=eos_id, 171 | scorer=scorer, 172 | max_length=args.max_length - len(bos_tokens)) 173 | inputs, begin_states = get_initial_input_state(decoder, bos_ids) 174 | # samples have shape (1, beam_size, length), scores have shape (1, beam_size) 175 | samples, scores, valid_lengths = sampler(inputs, begin_states) 176 | samples = samples[0].asnumpy() 177 | scores = scores[0].asnumpy() 178 | valid_lengths = valid_lengths[0].asnumpy() 179 | 180 | print('Generation Result:') 181 | for i in range(args.print_num): 182 | generated_tokens = [vocab.idx_to_token[ele] for ele in samples[i][:valid_lengths[i]]] 183 | tokens = bos_tokens + generated_tokens[1:] 184 | print([detokenizer(tokens).strip(), scores[i]]) 185 | 186 | 187 | if __name__ == '__main__': 188 | generate() 189 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/export/export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Export the BERT Model for Deployment 3 | 4 | ==================================== 5 | 6 | This script exports the BERT model to a hybrid model serialized as a symbol.json file, 7 | which is suitable for deployment, or use with MXNet Module API. 8 | 9 | @article{devlin2018bert, 10 | title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, 11 | author={Devlin, Jacob and Chang, Ming- \ 12 | Wei and Lee, Kenton and Toutanova, Kristina}, 13 | journal={arXiv preprint arXiv:1810.04805}, 14 | year={2018} 15 | } 16 | """ 17 | 18 | # coding=utf-8 19 | 20 | # Licensed to the Apache Software Foundation (ASF) under one 21 | # or more contributor license agreements. See the NOTICE file 22 | # distributed with this work for additional information 23 | # regarding copyright ownership. The ASF licenses this file 24 | # to you under the Apache License, Version 2.0 (the 25 | # "License"); you may not use this file except in compliance 26 | # with the License. You may obtain a copy of the License at 27 | # 28 | # http://www.apache.org/licenses/LICENSE-2.0 29 | # 30 | # Unless required by applicable law or agreed to in writing, 31 | # software distributed under the License is distributed on an 32 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 33 | # KIND, either express or implied. See the License for the 34 | # specific language governing permissions and limitations 35 | # under the License. 36 | # pylint:disable=redefined-outer-name,logging-format-interpolation 37 | 38 | import argparse 39 | import logging 40 | import warnings 41 | import os 42 | import time 43 | 44 | import mxnet as mx 45 | import gluonnlp as nlp 46 | from hybrid_bert import get_hybrid_model 47 | from hybrid_bert import HybridBERTClassifier, HybridBERTRegression, HybridBERTForQA 48 | 49 | parser = argparse.ArgumentParser(description='Export hybrid BERT base model.') 50 | 51 | parser.add_argument('--model_parameters', 52 | type=str, 53 | default=None, 54 | help='The model parameter file saved from training.') 55 | 56 | parser.add_argument('--model_name', 57 | type=str, 58 | default='bert_12_768_12', 59 | choices=['bert_12_768_12', 'bert_24_1024_16'], 60 | help='BERT model name. Options are "bert_12_768_12" and "bert_24_1024_16"') 61 | 62 | parser.add_argument('--task', 63 | type=str, 64 | choices=['classification', 'regression', 'question_answering'], 65 | required=True, 66 | help='Task to export. 
Options are "classification", "regression", ' 67 | '"question_answering"') 68 | 69 | parser.add_argument('--dataset_name', 70 | type=str, 71 | default='book_corpus_wiki_en_uncased', 72 | choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased', 73 | 'wiki_multilingual_uncased', 'wiki_multilingual_cased', 74 | 'wiki_cn_cased'], 75 | help='BERT dataset name. Options include ' 76 | '"book_corpus_wiki_en_uncased", "book_corpus_wiki_en_cased", ' 77 | '"wiki_multilingual_uncased", "wiki_multilingual_cased", ' 78 | '"wiki_cn_cased"') 79 | 80 | parser.add_argument('--output_dir', 81 | type=str, 82 | default='./output_dir', 83 | help='The directory where the exported model symbol will be created. ' 84 | 'The default is ./output_dir') 85 | 86 | parser.add_argument('--seq_length', 87 | type=int, 88 | default=384, 89 | help='The maximum total input sequence length after WordPiece tokenization.' 90 | 'Sequences longer than this needs to be truncated, and sequences shorter ' 91 | 'than this needs to be padded. Default is 384') 92 | 93 | parser.add_argument('--dropout', 94 | type=float, 95 | default=0.1, 96 | help='The dropout probability for the classification/regression head.') 97 | 98 | args = parser.parse_args() 99 | 100 | # create output dir 101 | output_dir = args.output_dir 102 | nlp.utils.mkdir(output_dir) 103 | 104 | ############################################################################### 105 | # Logging # 106 | ############################################################################### 107 | 108 | log = logging.getLogger('gluonnlp') 109 | log.setLevel(logging.DEBUG) 110 | formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', 111 | datefmt='%H:%M:%S') 112 | fh = logging.FileHandler(os.path.join(args.output_dir, 'hybrid_export_bert.log'), mode='w') 113 | fh.setLevel(logging.INFO) 114 | fh.setFormatter(formatter) 115 | console = logging.StreamHandler() 116 | console.setLevel(logging.INFO) 117 | console.setFormatter(formatter) 118 | log.addHandler(console) 119 | log.addHandler(fh) 120 | log.info(args) 121 | 122 | ############################################################################### 123 | # Hybridize the model # 124 | ############################################################################### 125 | 126 | seq_length = args.seq_length 127 | 128 | if args.task == 'classification': 129 | bert, _ = get_hybrid_model( 130 | name=args.model_name, 131 | dataset_name=args.dataset_name, 132 | pretrained=False, 133 | use_pooler=True, 134 | use_decoder=False, 135 | use_classifier=False, 136 | seq_length=args.seq_length) 137 | net = HybridBERTClassifier(bert, num_classes=2, dropout=args.dropout) 138 | elif args.task == 'regression': 139 | bert, _ = get_hybrid_model( 140 | name=args.model_name, 141 | dataset_name=args.dataset_name, 142 | pretrained=False, 143 | use_pooler=True, 144 | use_decoder=False, 145 | use_classifier=False, 146 | seq_length=args.seq_length) 147 | net = HybridBERTRegression(bert, dropout=args.dropout) 148 | elif args.task == 'question_answering': 149 | bert, _ = get_hybrid_model( 150 | name=args.model_name, 151 | dataset_name=args.dataset_name, 152 | pretrained=False, 153 | use_pooler=False, 154 | use_decoder=False, 155 | use_classifier=False, 156 | seq_length=args.seq_length) 157 | net = HybridBERTForQA(bert) 158 | else: 159 | raise ValueError('unknown task: %s'%args.task) 160 | 161 | if args.model_parameters: 162 | net.load_parameters(args.model_parameters) 163 | else: 164 | net.initialize() 165 | 
warnings.warn('--model_parameters is not provided. The parameter checkpoint (.params) ' 166 | 'file will be created based on default parameter intialization.') 167 | 168 | net.hybridize(static_alloc=True, static_shape=True) 169 | 170 | ############################################################################### 171 | # Prepare dummy input data # 172 | ############################################################################### 173 | 174 | test_batch_size = 1 175 | 176 | inputs = mx.nd.arange(test_batch_size * seq_length) 177 | inputs = inputs.reshape(shape=(test_batch_size, seq_length)) 178 | token_types = mx.nd.zeros_like(inputs) 179 | valid_length = mx.nd.arange(test_batch_size) 180 | batch = inputs, token_types, valid_length 181 | 182 | def export(batch, prefix): 183 | """Export the model.""" 184 | log.info('Exporting the model ... ') 185 | inputs, token_types, valid_length = batch 186 | net(inputs, token_types, valid_length) 187 | net.export(prefix, epoch=0) 188 | assert os.path.isfile(prefix + '-symbol.json') 189 | assert os.path.isfile(prefix + '-0000.params') 190 | 191 | def infer(batch, prefix): 192 | """Evaluate the model on a mini-batch.""" 193 | log.info('Start inference ... ') 194 | 195 | # import with SymbolBlock. Alternatively, you can use Module.load APIs. 196 | imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json', 197 | ['data0', 'data1', 'data2'], 198 | prefix + '-0000.params') 199 | tic = time.time() 200 | # run forward inference 201 | inputs, token_types, valid_length = batch 202 | num_trials = 10 203 | for _ in range(num_trials): 204 | imported_net(inputs, token_types, valid_length) 205 | mx.nd.waitall() 206 | toc = time.time() 207 | log.info('Inference time cost={:.2f} s, Thoughput={:.2f} samples/s' 208 | .format(toc - tic, num_trials / (toc - tic))) 209 | 210 | 211 | ############################################################################### 212 | # Export the model # 213 | ############################################################################### 214 | if __name__ == '__main__': 215 | prefix = os.path.join(args.output_dir, args.task) 216 | export(batch, prefix) 217 | infer(batch, prefix) 218 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/embedding.py: -------------------------------------------------------------------------------- 1 | """BERT embedding.""" 2 | import argparse 3 | import io 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import mxnet as mx 9 | 10 | from mxnet.gluon.data import DataLoader 11 | 12 | import gluonnlp 13 | from gluonnlp.data import BERTTokenizer, BERTSentenceTransform 14 | from gluonnlp.base import get_home_dir 15 | 16 | try: 17 | from data.embedding import BertEmbeddingDataset 18 | except ImportError: 19 | from .data.embedding import BertEmbeddingDataset 20 | 21 | try: 22 | unicode 23 | except NameError: 24 | # Define `unicode` for Python3 25 | def unicode(s, *_): 26 | return s 27 | 28 | 29 | def to_unicode(s): 30 | return unicode(s, 'utf-8') 31 | 32 | 33 | __all__ = ['BertEmbedding'] 34 | 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | 39 | class BertEmbedding(object): 40 | """ 41 | Encoding from BERT model. 42 | 43 | Parameters 44 | ---------- 45 | ctx : Context. 46 | running BertEmbedding on which gpu device id. 47 | dtype: str 48 | data type to use for the model. 49 | model : str, default bert_12_768_12. 50 | pre-trained BERT model 51 | dataset_name : str, default book_corpus_wiki_en_uncased. 
52 | pre-trained model dataset 53 | params_path: str, default None 54 | path to a parameters file to load instead of the pretrained model. 55 | max_seq_length : int, default 25 56 | max length of each sequence 57 | batch_size : int, default 256 58 | batch size 59 | root : str, default '$MXNET_HOME/models' with MXNET_HOME defaults to '~/.mxnet' 60 | Location for keeping the model parameters. 61 | """ 62 | def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12', 63 | dataset_name='book_corpus_wiki_en_uncased', params_path=None, 64 | max_seq_length=25, batch_size=256, 65 | root=os.path.join(get_home_dir(), 'models')): 66 | self.ctx = ctx 67 | self.dtype = dtype 68 | self.max_seq_length = max_seq_length 69 | self.batch_size = batch_size 70 | self.dataset_name = dataset_name 71 | 72 | # Don't download the pretrained models if we have a parameter path 73 | self.bert, self.vocab = gluonnlp.model.get_model(model, 74 | dataset_name=self.dataset_name, 75 | pretrained=params_path is None, 76 | ctx=self.ctx, 77 | use_pooler=False, 78 | use_decoder=False, 79 | use_classifier=False, 80 | root=root) 81 | self.bert.cast(self.dtype) 82 | 83 | if params_path: 84 | logger.info('Loading params from %s', params_path) 85 | self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True) 86 | 87 | lower = 'uncased' in self.dataset_name 88 | self.tokenizer = BERTTokenizer(self.vocab, lower=lower) 89 | self.transform = BERTSentenceTransform(tokenizer=self.tokenizer, 90 | max_seq_length=self.max_seq_length, 91 | pair=False) 92 | 93 | def __call__(self, sentences, oov_way='avg'): 94 | return self.embedding(sentences, oov_way='avg') 95 | 96 | def embedding(self, sentences, oov_way='avg'): 97 | """ 98 | Get tokens, tokens embedding 99 | 100 | Parameters 101 | ---------- 102 | sentences : List[str] 103 | sentences for encoding. 104 | oov_way : str, default avg. 105 | use **avg**, **sum** or **last** to get token embedding for those out of 106 | vocabulary words 107 | 108 | Returns 109 | ------- 110 | List[(List[str], List[ndarray])] 111 | List of tokens, and tokens embedding 112 | """ 113 | data_iter = self.data_loader(sentences=sentences) 114 | batches = [] 115 | for token_ids, valid_length, token_types in data_iter: 116 | token_ids = token_ids.as_in_context(self.ctx) 117 | valid_length = valid_length.as_in_context(self.ctx) 118 | token_types = token_types.as_in_context(self.ctx) 119 | sequence_outputs = self.bert(token_ids, token_types, 120 | valid_length.astype(self.dtype)) 121 | for token_id, sequence_output in zip(token_ids.asnumpy(), 122 | sequence_outputs.asnumpy()): 123 | batches.append((token_id, sequence_output)) 124 | return self.oov(batches, oov_way) 125 | 126 | def data_loader(self, sentences, shuffle=False): 127 | """Load, tokenize and prepare the input sentences.""" 128 | dataset = BertEmbeddingDataset(sentences, self.transform) 129 | return DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=shuffle) 130 | 131 | def oov(self, batches, oov_way='avg'): 132 | """ 133 | How to handle oov. Also filter out [CLS], [SEP] tokens. 134 | 135 | Parameters 136 | ---------- 137 | batches : List[(tokens_id, 138 | sequence_outputs, 139 | pooled_output]. 
140 | batch token_ids (max_seq_length, ), 141 | sequence_outputs (max_seq_length, dim, ), 142 | pooled_output (dim, ) 143 | oov_way : str 144 | use **avg**, **sum** or **last** to get token embedding for those out of 145 | vocabulary words 146 | 147 | Returns 148 | ------- 149 | List[(List[str], List[ndarray])] 150 | List of tokens, and tokens embedding 151 | """ 152 | sentences = [] 153 | for token_ids, sequence_outputs in batches: 154 | tokens = [] 155 | tensors = [] 156 | oov_len = 1 157 | for token_id, sequence_output in zip(token_ids, sequence_outputs): 158 | if token_id == 1: 159 | # [PAD] token, sequence is finished. 160 | break 161 | if token_id in (2, 3): 162 | # [CLS], [SEP] 163 | continue 164 | token = self.vocab.idx_to_token[token_id] 165 | if token.startswith('##'): 166 | token = token[2:] 167 | tokens[-1] += token 168 | if oov_way == 'last': 169 | tensors[-1] = sequence_output 170 | else: 171 | tensors[-1] += sequence_output 172 | if oov_way == 'avg': 173 | oov_len += 1 174 | else: # iv, avg last oov 175 | if oov_len > 1: 176 | tensors[-1] /= oov_len 177 | oov_len = 1 178 | tokens.append(token) 179 | tensors.append(sequence_output) 180 | if oov_len > 1: # if the whole sentence is one oov, handle this special case 181 | tensors[-1] /= oov_len 182 | sentences.append((tokens, tensors)) 183 | return sentences 184 | 185 | 186 | if __name__ == '__main__': 187 | np.set_printoptions(threshold=5) 188 | parser = argparse.ArgumentParser(description='Get embeddings from BERT', 189 | formatter_class=argparse.RawTextHelpFormatter) 190 | parser.add_argument('--gpu', type=int, default=None, 191 | help='id of the gpu to use. Set it to empty means to use cpu.') 192 | parser.add_argument('--dtype', type=str, default='float32', help='data dtype') 193 | parser.add_argument('--model', type=str, default='bert_12_768_12', 194 | help='pre-trained model') 195 | parser.add_argument('--dataset_name', type=str, default='book_corpus_wiki_en_uncased', 196 | help='dataset') 197 | parser.add_argument('--params_path', type=str, default=None, 198 | help='path to a params file to load instead of the pretrained model.') 199 | parser.add_argument('--max_seq_length', type=int, default=25, 200 | help='max length of each sequence') 201 | parser.add_argument('--batch_size', type=int, default=256, 202 | help='batch size') 203 | parser.add_argument('--oov_way', type=str, default='avg', 204 | help='how to handle oov\n' 205 | 'avg: average all oov embeddings to represent the original token\n' 206 | 'sum: sum all oov embeddings to represent the original token\n' 207 | 'last: use last oov embeddings to represent the original token\n') 208 | parser.add_argument('--sentences', type=to_unicode, nargs='+', default=None, 209 | help='sentence for encoding') 210 | parser.add_argument('--file', type=str, default=None, 211 | help='file for encoding') 212 | parser.add_argument('--verbose', action='store_true', help='verbose logging') 213 | args = parser.parse_args() 214 | 215 | level = logging.DEBUG if args.verbose else logging.INFO 216 | logging.getLogger().setLevel(level) 217 | logging.info(args) 218 | 219 | if args.gpu is not None: 220 | context = mx.gpu(args.gpu) 221 | else: 222 | context = mx.cpu() 223 | bert_embedding = BertEmbedding(ctx=context, model=args.model, dataset_name=args.dataset_name, 224 | max_seq_length=args.max_seq_length, batch_size=args.batch_size) 225 | result = [] 226 | sents = [] 227 | if args.sentences: 228 | sents = args.sentences 229 | result = bert_embedding(sents, oov_way=args.oov_way) 230 | elif 
args.file: 231 | with io.open(args.file, 'r', encoding='utf8') as in_file: 232 | for line in in_file: 233 | sents.append(line.strip()) 234 | result = bert_embedding(sents, oov_way=args.oov_way) 235 | else: 236 | logger.error('Please specify --sentence or --file') 237 | 238 | if result: 239 | for sent, embeddings in zip(sents, result): 240 | print('Text: {}'.format(sent)) 241 | _, tokens_embedding = embeddings 242 | print('Tokens embedding: {}'.format(tokens_embedding)) 243 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/finetune_ner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Licensed to the Apache Software Foundation (ASF) under one 5 | # or more contributor license agreements. See the NOTICE file 6 | # distributed with this work for additional information 7 | # regarding copyright ownership. The ASF licenses this file 8 | # to you under the Apache License, Version 2.0 (the 9 | # "License"); you may not use this file except in compliance 10 | # with the License. You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, 15 | # software distributed under the License is distributed on an 16 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17 | # KIND, either express or implied. See the License for the 18 | # specific language governing permissions and limitations 19 | # under the License. 20 | """Provides command-line interace for training BERT-based named entity recognition model.""" 21 | 22 | # coding: utf-8 23 | import argparse 24 | import logging 25 | import random 26 | 27 | import numpy as np 28 | import mxnet as mx 29 | 30 | import gluonnlp as nlp 31 | 32 | from ner_utils import get_context, get_bert_model, dump_metadata, str2bool 33 | from data.ner import BERTTaggingDataset, convert_arrays_to_text 34 | from model.ner import BERTTagger, attach_prediction 35 | 36 | # seqeval is a dependency that is specific to named entity recognition. 
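# If seqeval is not available in the active environment, it can usually be installed with
# `pip install seqeval`. Its f1_score takes lists of IOB tag sequences (one list per sentence)
# and reports entity-level F1, e.g. (illustrative tags only):
#
#   >>> seqeval.metrics.f1_score([['B-PER', 'I-PER', 'O']], [['B-PER', 'I-PER', 'O']])
#   1.0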
37 | import seqeval.metrics 38 | 39 | 40 | def parse_args(): 41 | """Parse command line arguments.""" 42 | arg_parser = argparse.ArgumentParser( 43 | description='Train a BERT-based named entity recognition model', 44 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 45 | 46 | # data file paths 47 | arg_parser.add_argument('--train-path', type=str, required=True, 48 | help='Path to the training data file') 49 | arg_parser.add_argument('--dev-path', type=str, required=True, 50 | help='Path to the development data file') 51 | arg_parser.add_argument('--test-path', type=str, required=True, 52 | help='Path to the test data file') 53 | 54 | arg_parser.add_argument('--save-checkpoint-prefix', type=str, required=False, default=None, 55 | help='Prefix of model checkpoint file') 56 | 57 | # bert options 58 | arg_parser.add_argument('--bert-model', type=str, default='bert_12_768_12', 59 | help='Name of the BERT model') 60 | arg_parser.add_argument('--cased', type=str2bool, default=True, 61 | help='Path to the development data file') 62 | arg_parser.add_argument('--dropout-prob', type=float, default=0.1, 63 | help='Dropout probability for the last layer') 64 | 65 | # optimization parameters 66 | arg_parser.add_argument('--seed', type=int, default=13531, 67 | help='Random number seed.') 68 | arg_parser.add_argument('--seq-len', type=int, default=180, 69 | help='The length of the sequence input to BERT.' 70 | ' An exception will raised if this is not large enough.') 71 | arg_parser.add_argument('--gpu', type=int, 72 | help='Number (index) of GPU to run on, e.g. 0. ' 73 | 'If not specified, uses CPU.') 74 | arg_parser.add_argument('--batch-size', type=int, default=32, help='Batch size for training') 75 | arg_parser.add_argument('--num-epochs', type=int, default=4, help='Number of epochs to train') 76 | arg_parser.add_argument('--optimizer', type=str, default='bertadam', 77 | help='Optimization algorithm to use') 78 | arg_parser.add_argument('--learning-rate', type=float, default=5e-5, 79 | help='Learning rate for optimization') 80 | arg_parser.add_argument('--warmup-ratio', type=float, default=0.1, 81 | help='Warmup ratio for learning rate scheduling') 82 | args = arg_parser.parse_args() 83 | return args 84 | 85 | 86 | def main(config): 87 | """Main method for training BERT-based NER model.""" 88 | # provide random seed for every RNGs we use 89 | np.random.seed(config.seed) 90 | random.seed(config.seed) 91 | mx.random.seed(config.seed) 92 | 93 | ctx = get_context(config.gpu) 94 | 95 | logging.info('Loading BERT model...') 96 | bert_model, text_vocab = get_bert_model(config.bert_model, config.cased, ctx, 97 | config.dropout_prob) 98 | 99 | dataset = BERTTaggingDataset(text_vocab, config.train_path, config.dev_path, config.test_path, 100 | config.seq_len, config.cased) 101 | 102 | train_data_loader = dataset.get_train_data_loader(config.batch_size) 103 | dev_data_loader = dataset.get_dev_data_loader(config.batch_size) 104 | test_data_loader = dataset.get_test_data_loader(config.batch_size) 105 | 106 | net = BERTTagger(bert_model, dataset.num_tag_types, config.dropout_prob) 107 | net.tag_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx) 108 | net.hybridize(static_alloc=True) 109 | 110 | loss_function = mx.gluon.loss.SoftmaxCrossEntropyLoss() 111 | loss_function.hybridize(static_alloc=True) 112 | 113 | # step size adaptation, adopted from: https://github.com/dmlc/gluon-nlp/blob/ 114 | # 87d36e3cc7c615f93732d01048cf7ce3b3b09eb7/scripts/bert/finetune_classifier.py#L348-L351 115 | step_size = 
config.batch_size 116 | num_train_steps = int(len(dataset.train_inputs) / step_size * config.num_epochs) 117 | num_warmup_steps = int(num_train_steps * config.warmup_ratio) 118 | 119 | optimizer_params = {'learning_rate': config.learning_rate} 120 | try: 121 | trainer = mx.gluon.Trainer(net.collect_params(), config.optimizer, optimizer_params) 122 | except ValueError as e: 123 | print(e) 124 | logging.warning('AdamW optimizer is not found. Please consider upgrading to ' 125 | 'mxnet>=1.5.0. Now the original Adam optimizer is used instead.') 126 | trainer = mx.gluon.Trainer(net.collect_params(), 'adam', optimizer_params) 127 | 128 | # collect differentiable parameters 129 | logging.info('Collect params...') 130 | # do not apply weight decay on LayerNorm and bias terms 131 | for _, v in net.collect_params('.*beta|.*gamma|.*bias').items(): 132 | v.wd_mult = 0.0 133 | params = [p for p in net.collect_params().values() if p.grad_req != 'null'] 134 | 135 | if config.save_checkpoint_prefix is not None: 136 | logging.info('dumping metadata...') 137 | dump_metadata(config, tag_vocab=dataset.tag_vocab) 138 | 139 | def train(data_loader, start_step_num): 140 | """Training loop.""" 141 | step_num = start_step_num 142 | logging.info('current starting step num: %d', step_num) 143 | for batch_id, (_, _, _, tag_ids, flag_nonnull_tag, out) in \ 144 | enumerate(attach_prediction(data_loader, net, ctx, is_train=True)): 145 | logging.info('training on batch index: %d/%d', batch_id, len(data_loader)) 146 | 147 | # step size adjustments 148 | step_num += 1 149 | if step_num < num_warmup_steps: 150 | new_lr = config.learning_rate * step_num / num_warmup_steps 151 | else: 152 | offset = ((step_num - num_warmup_steps) * config.learning_rate / 153 | (num_train_steps - num_warmup_steps)) 154 | new_lr = config.learning_rate - offset 155 | trainer.set_learning_rate(new_lr) 156 | 157 | with mx.autograd.record(): 158 | loss_value = loss_function(out, tag_ids, 159 | flag_nonnull_tag.expand_dims(axis=2)).mean() 160 | 161 | loss_value.backward() 162 | nlp.utils.clip_grad_global_norm(params, 1) 163 | trainer.step(1) 164 | 165 | pred_tags = out.argmax(axis=-1) 166 | logging.info('loss_value: %6f', loss_value.asscalar()) 167 | 168 | num_tag_preds = flag_nonnull_tag.sum().asscalar() 169 | logging.info( 170 | 'accuracy: %6f', (((pred_tags == tag_ids) * flag_nonnull_tag).sum().asscalar() 171 | / num_tag_preds)) 172 | return step_num 173 | 174 | def evaluate(data_loader): 175 | """Eval loop.""" 176 | predictions = [] 177 | 178 | for batch_id, (text_ids, _, valid_length, tag_ids, _, out) in \ 179 | enumerate(attach_prediction(data_loader, net, ctx, is_train=False)): 180 | logging.info('evaluating on batch index: %d/%d', batch_id, len(data_loader)) 181 | 182 | # convert results to numpy arrays for easier access 183 | np_text_ids = text_ids.astype('int32').asnumpy() 184 | np_pred_tags = out.argmax(axis=-1).asnumpy() 185 | np_valid_length = valid_length.astype('int32').asnumpy() 186 | np_true_tags = tag_ids.asnumpy() 187 | 188 | predictions += convert_arrays_to_text(text_vocab, dataset.tag_vocab, np_text_ids, 189 | np_true_tags, np_pred_tags, np_valid_length) 190 | 191 | all_true_tags = [[entry.true_tag for entry in entries] for entries in predictions] 192 | all_pred_tags = [[entry.pred_tag for entry in entries] for entries in predictions] 193 | seqeval_f1 = seqeval.metrics.f1_score(all_true_tags, all_pred_tags) 194 | return seqeval_f1 195 | 196 | best_dev_f1 = 0.0 197 | last_test_f1 = 0.0 198 | best_epoch = -1 199 | 200 | 
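    # The schedule inside train() warms the learning rate up linearly over the first
    # num_warmup_steps updates and then decays it linearly towards zero. Worked example with
    # the default learning_rate=5e-5 and warmup_ratio=0.1 and a hypothetical
    # num_train_steps=1000 (so num_warmup_steps=100):
    #   step   50: lr = 5e-5 *  50 / 100                 = 2.5e-5  (warmup)
    #   step  550: lr = 5e-5 - (550 - 100) * 5e-5 / 900  = 2.5e-5  (linear decay)
    #   step 1000: lr = 5e-5 - (900)       * 5e-5 / 900  = 0.0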
last_epoch_step_num = 0 201 | for epoch_index in range(config.num_epochs): 202 | last_epoch_step_num = train(train_data_loader, last_epoch_step_num) 203 | train_f1 = evaluate(train_data_loader) 204 | logging.info('train f1: %3f', train_f1) 205 | dev_f1 = evaluate(dev_data_loader) 206 | logging.info('dev f1: %3f, previous best dev f1: %3f', dev_f1, best_dev_f1) 207 | if dev_f1 > best_dev_f1: 208 | best_dev_f1 = dev_f1 209 | best_epoch = epoch_index 210 | logging.info('update the best dev f1 to be: %3f', best_dev_f1) 211 | test_f1 = evaluate(test_data_loader) 212 | logging.info('test f1: %3f', test_f1) 213 | last_test_f1 = test_f1 214 | 215 | # save params 216 | params_file = config.save_checkpoint_prefix + '_{:03d}.params'.format(epoch_index) 217 | logging.info('saving current checkpoint to: %s', params_file) 218 | net.save_parameters(params_file) 219 | 220 | logging.info('current best epoch: %d', best_epoch) 221 | 222 | logging.info('best epoch: %d, best dev f1: %3f, test f1 at tha epoch: %3f', 223 | best_epoch, best_dev_f1, last_test_f1) 224 | 225 | 226 | if __name__ == '__main__': 227 | logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', 228 | level=logging.DEBUG, datefmt='%Y-%m-%d %I:%M:%S') 229 | logging.getLogger().setLevel(logging.INFO) 230 | main(parse_args()) 231 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/convert_tf_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # 'License'); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # pylint:disable=redefined-outer-name,logging-format-interpolation 20 | """ Script for converting TF Model to Gluon. 
""" 21 | 22 | import argparse 23 | import json 24 | import logging 25 | import os 26 | import sys 27 | 28 | import mxnet as mx 29 | import gluonnlp as nlp 30 | from gluonnlp.model import BERTEncoder, BERTModel 31 | from gluonnlp.model.bert import bert_hparams 32 | 33 | sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) 34 | 35 | from utils import (get_hash, load_text_vocab, read_tf_checkpoint, 36 | tf_vocab_to_gluon_vocab) 37 | 38 | 39 | parser = argparse.ArgumentParser( 40 | description='Conversion script for Tensorflow BERT model', 41 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 42 | parser.add_argument('--model', 43 | type=str, 44 | default='bert_12_768_12', 45 | choices=['bert_12_768_12', 'bert_24_1024_16'], 46 | help='BERT model name') 47 | parser.add_argument('--tf_checkpoint_dir', 48 | type=str, 49 | help='Path to Tensorflow checkpoint folder.') 50 | parser.add_argument('--tf_model_prefix', type=str, 51 | default='bert_model.ckpt', 52 | help='name of bert checkpoint file.') 53 | parser.add_argument('--tf_config_name', type=str, 54 | default='bert_config.json', 55 | help='Name of Bert config file') 56 | parser.add_argument('--out_dir', 57 | type=str, 58 | default=os.path.join('~', 'output'), 59 | help='Path to output folder. The folder must exist.') 60 | parser.add_argument('--debug', action='store_true', help='debugging mode') 61 | args = parser.parse_args() 62 | logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO) 63 | logging.info(args) 64 | 65 | # convert vocabulary 66 | vocab_path = os.path.join(args.tf_checkpoint_dir, 'vocab.txt') 67 | vocab = tf_vocab_to_gluon_vocab(load_text_vocab(vocab_path)) 68 | 69 | # vocab serialization 70 | out_dir = os.path.expanduser(args.out_dir) 71 | nlp.utils.mkdir(out_dir) 72 | tmp_file_path = os.path.join(out_dir, 'tmp') 73 | with open(tmp_file_path, 'w') as f: 74 | f.write(vocab.to_json()) 75 | hash_full, hash_short = get_hash(tmp_file_path) 76 | gluon_vocab_path = os.path.join(out_dir, hash_short + '.vocab') 77 | with open(gluon_vocab_path, 'w') as f: 78 | f.write(vocab.to_json()) 79 | logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full) 80 | 81 | # load tf model 82 | tf_checkpoint_file = os.path.expanduser( 83 | os.path.join(args.tf_checkpoint_dir, args.tf_model_prefix)) 84 | logging.info('loading Tensorflow checkpoint %s ...', tf_checkpoint_file) 85 | tf_tensors = read_tf_checkpoint(tf_checkpoint_file) 86 | tf_names = sorted(tf_tensors.keys()) 87 | 88 | tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names) 89 | tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names) 90 | tf_names = filter(lambda name: name != 'global_step', tf_names) 91 | tf_names = list(tf_names) 92 | if len(tf_tensors) != len(tf_names): 93 | logging.info('Tensorflow model was saved with Optimizer parameters. 
' 94 | 'Ignoring them.') 95 | 96 | for name in tf_names: 97 | logging.debug('%s: %s', name, tf_tensors[name].shape) 98 | 99 | # replace tensorflow parameter names with gluon parameter names 100 | NAME_MAP = [ 101 | ('bert/encoder/layer_', 'encoder.transformer_cells.'), 102 | ('/attention/self/', '.attention_cell.'), 103 | ('key', 'proj_key'), 104 | ('query', 'proj_query'), 105 | ('value', 'proj_value'), 106 | ('/attention/output/LayerNorm/', '.layer_norm.'), 107 | ('/attention/output/dense/', '.proj.'), 108 | ('cls/seq_relationship/output_weights', 'classifier.weight'), 109 | ('cls/seq_relationship/output_bias', 'classifier.bias'), 110 | ('cls/predictions/output_bias', 'decoder.3.bias'), 111 | ('cls/predictions/transform/dense/', 'decoder.0.'), 112 | ('cls/predictions/transform/LayerNorm/', 'decoder.2.'), 113 | ('kernel', 'weight'), 114 | ('/intermediate/dense/', '.ffn.ffn_1.'), 115 | ('/output/dense/', '.ffn.ffn_2.'), 116 | ('/output/LayerNorm/', '.ffn.layer_norm.'), 117 | ('bert/embeddings/LayerNorm/', 'encoder.layer_norm.'), 118 | ('bert/embeddings/position_embeddings', 'encoder.position_weight'), 119 | ('bert/embeddings/token_type_embeddings', 'token_type_embed.0.weight'), 120 | ('bert/embeddings/word_embeddings', 'word_embed.0.weight'), 121 | ('bert/pooler/dense/', 'pooler.'), 122 | ('/', '.'), 123 | ] 124 | 125 | # convert to gluon parameters 126 | mx_tensors = {} 127 | logging.info('converting to Gluon checkpoint ... ') 128 | for source_name in tf_names: 129 | # get the source tensor and its transpose 130 | source, source_t = tf_tensors[source_name], tf_tensors[source_name].T 131 | target, target_name = source, source_name 132 | for old, new in NAME_MAP: 133 | target_name = target_name.replace(old, new) 134 | # transpose kernel layer parameters 135 | if 'kernel' in source_name: 136 | target = source_t 137 | mx_tensors[target_name] = target 138 | if source_t.shape == source.shape and len(source.shape) > 1 and target is not source_t: 139 | logging.info('warning: %s has symmetric shape %s', target_name, target.shape) 140 | logging.debug('%s: %s', target_name, target.shape) 141 | 142 | # BERT config 143 | tf_config_names_to_gluon_config_names = { 144 | 'attention_probs_dropout_prob': 'embed_dropout', 145 | 'hidden_act': None, 146 | 'hidden_dropout_prob': 'dropout', 147 | 'hidden_size': 'units', 148 | 'initializer_range': None, 149 | 'intermediate_size': 'hidden_size', 150 | 'max_position_embeddings': 'max_length', 151 | 'num_attention_heads': 'num_heads', 152 | 'num_hidden_layers': 'num_layers', 153 | 'type_vocab_size': 'token_type_vocab_size', 154 | 'vocab_size': None 155 | } 156 | predefined_args = bert_hparams[args.model] 157 | with open(os.path.join(args.tf_checkpoint_dir, args.tf_config_name), 'r') as f: 158 | tf_config = json.load(f) 159 | assert len(tf_config) == len(tf_config_names_to_gluon_config_names) 160 | for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items(): 161 | if tf_name is None or gluon_name is None: 162 | continue 163 | assert tf_config[tf_name] == predefined_args[gluon_name] 164 | 165 | # BERT encoder 166 | encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'], 167 | num_layers=predefined_args['num_layers'], 168 | units=predefined_args['units'], 169 | hidden_size=predefined_args['hidden_size'], 170 | max_length=predefined_args['max_length'], 171 | num_heads=predefined_args['num_heads'], 172 | scaled=predefined_args['scaled'], 173 | dropout=predefined_args['dropout'], 174 | use_residual=predefined_args['use_residual']) 175 | 176 | 
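# --- Editor's illustrative aside (not part of convert_tf_model.py) --------------
# The conversion loop above renames each TF variable by applying the NAME_MAP
# string replacements in order. Tracing one typical attention weight (the input
# name here is an example for illustration, not read from any checkpoint):
def to_gluon_name(tf_name):
    for old, new in NAME_MAP:
        tf_name = tf_name.replace(old, new)
    return tf_name

# 'bert/encoder/layer_' -> 'encoder.transformer_cells.', '/attention/self/' ->
# '.attention_cell.', 'query' -> 'proj_query', 'kernel' -> 'weight', '/' -> '.'
assert (to_gluon_name('bert/encoder/layer_0/attention/self/query/kernel')
        == 'encoder.transformer_cells.0.attention_cell.proj_query.weight')
# Because the source name contains 'kernel', the loop above also stores the
# transposed tensor in mx_tensors for this parameter.
# --------------------------------------------------------------------------------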
# Infer enabled BERTModel components 177 | use_pooler = any('pooler' in n for n in mx_tensors) 178 | use_decoder = any('decoder.0' in n for n in mx_tensors) 179 | use_classifier = any('classifier.weight' in n for n in mx_tensors) 180 | 181 | logging.info('Inferred that the tensorflow model provides the following parameters:') 182 | logging.info('- use_pooler = {}'.format(use_pooler)) 183 | logging.info('- use_decoder = {}'.format(use_decoder)) 184 | logging.info('- use_classifier = {}'.format(use_classifier)) 185 | 186 | # post processings for parameters: 187 | # - handle tied decoder weight 188 | logging.info('total number of tf parameters = %d', len(tf_names)) 189 | if use_decoder: 190 | mx_tensors['decoder.3.weight'] = mx_tensors['word_embed.0.weight'] 191 | logging.info('total number of mx parameters = %d' 192 | ' (including decoder param for weight tying)', len(mx_tensors)) 193 | else: 194 | logging.info('total number of mx parameters = %d', len(mx_tensors)) 195 | 196 | # BERT model 197 | bert = BERTModel(encoder, len(vocab), 198 | token_type_vocab_size=predefined_args['token_type_vocab_size'], 199 | units=predefined_args['units'], 200 | embed_size=predefined_args['embed_size'], 201 | embed_dropout=predefined_args['embed_dropout'], 202 | word_embed=predefined_args['word_embed'], 203 | use_pooler=use_pooler, use_decoder=use_decoder, 204 | use_classifier=use_classifier) 205 | 206 | bert.initialize(init=mx.init.Normal(0.02)) 207 | 208 | ones = mx.nd.ones((2, 8)) 209 | out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]])) 210 | params = bert._collect_params_with_prefix() 211 | if len(params) != len(mx_tensors): 212 | raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, ' 213 | 'but {} have been extracted from the tf model. ' 214 | 'Most likely the BERTModel hyperparameters do not match ' 215 | 'the hyperparameters of the tf model.'.format(len(params), len(mx_tensors))) 216 | 217 | # set parameter data 218 | loaded_params = {} 219 | for name in params: 220 | try: 221 | arr = mx.nd.array(mx_tensors[name]) 222 | params[name].set_data(arr) 223 | loaded_params[name] = True 224 | # pylint: disable=broad-except 225 | except Exception: 226 | if name not in mx_tensors: 227 | raise RuntimeError('cannot initialize %s from tf checkpoint' % name) 228 | else: 229 | raise RuntimeError('cannot initialize %s. Expect shape = %s, but found %s' % 230 | (name, params[name].shape, arr.shape)) 231 | 232 | logging.info('num loaded params = %d, total num params = %d', 233 | len(loaded_params), len(mx_tensors)) 234 | for name in mx_tensors: 235 | if name not in loaded_params: 236 | logging.info('%s is not loaded', name) 237 | 238 | # param serialization 239 | bert.save_parameters(tmp_file_path) 240 | hash_full, hash_short = get_hash(tmp_file_path) 241 | gluon_param_path = os.path.join(out_dir, hash_short + '.params') 242 | logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full) 243 | bert.save_parameters(gluon_param_path) 244 | mx.nd.waitall() 245 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/fp16_utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership.
The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | """Trainer for mixed precision training.""" 20 | import warnings 21 | import collections 22 | import numpy as np 23 | import mxnet as mx 24 | from mxnet import nd 25 | 26 | def grad_global_norm(parameters, max_norm): 27 | """Calculate the 2-norm of gradients of parameters, and how much they should be scaled down 28 | such that their 2-norm does not exceed `max_norm`. 29 | 30 | If gradients exist for more than one context for a parameter, user needs to explicitly call 31 | ``trainer.allreduce_grads`` so that the gradients are summed first before calculating 32 | the 2-norm. 33 | 34 | .. note:: 35 | 36 | This function is only for use when `update_on_kvstore` is set to False in trainer. 37 | 38 | Example:: 39 | 40 | trainer = Trainer(net.collect_params(), update_on_kvstore=False, ...) 41 | for x, y in mx.gluon.utils.split_and_load(X, [mx.gpu(0), mx.gpu(1)]): 42 | with mx.autograd.record(): 43 | y = net(x) 44 | loss = loss_fn(y, label) 45 | loss.backward() 46 | trainer.allreduce_grads() 47 | norm, ratio = grad_global_norm(net.collect_params().values(), max_norm) 48 | trainer.update(batch_size * ratio) 49 | ... 50 | 51 | Parameters 52 | ---------- 53 | parameters : list of Parameters 54 | 55 | Returns 56 | ------- 57 | NDArray 58 | Total norm. Shape is (1,) 59 | NDArray 60 | Ratio for rescaling gradients based on max_norm s.t. grad = grad / ratio. 61 | If total norm is NaN, ratio will be NaN, too. Shape is (1,) 62 | NDArray 63 | Whether the total norm is finite. Shape is (1,) 64 | """ 65 | # collect gradient arrays 66 | arrays = [] 67 | idx = 0 68 | for p in parameters: 69 | if p.grad_req != 'null': 70 | p_grads = p.list_grad() 71 | arrays.append(p_grads[idx % len(p_grads)]) 72 | idx += 1 73 | assert len(arrays) > 0, 'No parameter found available for gradient norm.' 74 | 75 | # compute gradient norms 76 | def _norm(array): 77 | # TODO(haibin) norm operator does not support fp16 safe reduction. 78 | # Issue is tracked at: https://github.com/apache/incubator-mxnet/issues/14126 79 | x = array.reshape((-1,)).astype('float32', copy=False) 80 | return nd.dot(x, x) 81 | 82 | norm_arrays = [_norm(arr) for arr in arrays] 83 | 84 | # group norm arrays by ctx 85 | def group_by_ctx(arr_list): 86 | groups = collections.defaultdict(list) 87 | for arr in arr_list: 88 | ctx = arr.context 89 | groups[ctx].append(arr) 90 | return groups 91 | norm_groups = group_by_ctx(norm_arrays) 92 | 93 | # reduce 94 | ctx, dtype = arrays[0].context, 'float32' 95 | norms = [nd.add_n(*g).as_in_context(ctx) for g in norm_groups.values()] 96 | total_norm = nd.add_n(*norms).sqrt() 97 | scale = total_norm / max_norm 98 | # is_finite = 0 if NaN or Inf, 1 otherwise. 99 | is_finite = nd.contrib.isfinite(scale) 100 | # if scale is finite, nd.maximum selects the max between scale and 1. That is, 101 | # 1 is returned if total_norm does not exceed max_norm. 
102 | # if scale = NaN or Inf, the result of nd.maximum is undefined. Therefore, we use 103 | # choices.take to return NaN or Inf. 104 | scale_or_one = nd.maximum(nd.ones((1,), dtype=dtype, ctx=ctx), scale) 105 | choices = nd.concat(scale, scale_or_one, dim=0) 106 | chosen_scale = choices.take(is_finite) 107 | return total_norm, chosen_scale, is_finite 108 | 109 | 110 | class FP16Trainer(object): 111 | """ Trainer for mixed precision training. 112 | 113 | Parameters 114 | ---------- 115 | trainer: gluon.Trainer 116 | the original gluon Trainer object for fp32 training. 117 | dynamic_loss_scale: bool. Default is True 118 | whether to use dynamic loss scaling. This is recommended for optimizing model 119 | parameters using FP16. 120 | loss_scaler_params : dict 121 | Keyword arguments to be passed to the loss scaler constructor. For example, 122 | `{"init_scale" : 2.**15, "scale_window" : 2000, "tolerance" : 0.05}` 123 | for `DynamicLossScaler`. 124 | See each `LossScaler` for a list of supported arguments. 125 | """ 126 | def __init__(self, trainer, dynamic_loss_scale=True, loss_scaler_params=None): 127 | if trainer._kvstore_params['update_on_kvstore'] is not False and trainer._kvstore: 128 | err = 'Only gluon.Trainer created with update_on_kvstore=False is supported.' 129 | raise NotImplementedError(err) 130 | self.fp32_trainer = trainer 131 | loss_scaler_params = loss_scaler_params if loss_scaler_params else {} 132 | self._scaler = DynamicLossScaler(**loss_scaler_params) if dynamic_loss_scale \ 133 | else StaticLossScaler(**loss_scaler_params) 134 | # if the optimizer supports NaN check, we can always defer the NaN check to the optimizer 135 | # TODO(haibin) this should be added via registry 136 | self._support_nan_check = trainer._optimizer.__class__.__name__ == 'BERTAdam' 137 | 138 | def backward(self, loss): 139 | """backward propagation with loss""" 140 | with mx.autograd.record(): 141 | if isinstance(loss, (tuple, list)): 142 | ls = [l * self._scaler.loss_scale for l in loss] 143 | else: 144 | ls = loss * self._scaler.loss_scale 145 | mx.autograd.backward(ls) 146 | 147 | def step(self, batch_size, max_norm=None): 148 | """Makes one step of parameter update. Should be called after 149 | `fp16_optimizer.backward()`, and outside of `record()` scope. 150 | 151 | Parameters 152 | ---------- 153 | batch_size : int 154 | Batch size of data processed. Gradient will be normalized by `1/batch_size`. 155 | Set this to 1 if you normalized loss manually with `loss = mean(loss)`. 156 | max_norm : NDArray, optional, default is None 157 | max value for global 2-norm of gradients.
158 | """ 159 | self.fp32_trainer.allreduce_grads() 160 | step_size = batch_size * self._scaler.loss_scale 161 | if max_norm: 162 | norm, ratio, is_finite = grad_global_norm(self.fp32_trainer._params, 163 | max_norm * self._scaler.loss_scale) 164 | step_size = ratio * step_size 165 | if self._support_nan_check: 166 | self.fp32_trainer.update(step_size) 167 | overflow = is_finite.asscalar() < 1 168 | else: 169 | overflow = not np.isfinite(norm.asscalar()) 170 | if not overflow: 171 | self.fp32_trainer.update(step_size) 172 | else: 173 | # TODO(haibin) optimize the performance when max_norm is not present 174 | # sequentially adding isnan/isinf results may be slow 175 | if self._support_nan_check: 176 | self.fp32_trainer.update(step_size) 177 | overflow = self._scaler.has_overflow(self.fp32_trainer._params) 178 | else: 179 | overflow = self._scaler.has_overflow(self.fp32_trainer._params) 180 | if not overflow: 181 | self.fp32_trainer.update(step_size) 182 | # update scale based on overflow information 183 | self._scaler.update_scale(overflow) 184 | 185 | class LossScaler(object): 186 | """Abstract loss scaler""" 187 | def has_overflow(self, params): 188 | """ detect inf and nan """ 189 | is_not_finite = 0 190 | for param in params: 191 | if param.grad_req != 'null': 192 | grad = param.list_grad()[0] 193 | is_not_finite += mx.nd.contrib.isnan(grad).sum() 194 | is_not_finite += mx.nd.contrib.isinf(grad).sum() 195 | # NDArray is implicitly converted to bool 196 | if is_not_finite == 0: 197 | return False 198 | else: 199 | return True 200 | 201 | def update_scale(self, overflow): 202 | raise NotImplementedError() 203 | 204 | class StaticLossScaler(LossScaler): 205 | """Static loss scaler""" 206 | def __init__(self, init_scale=1): 207 | self.loss_scale = init_scale 208 | 209 | def update_scale(self, overflow): 210 | """update loss scale""" 211 | pass 212 | 213 | class DynamicLossScaler(LossScaler): 214 | """Class that manages dynamic loss scaling. 215 | 216 | There are two problems regarding gradient scale when fp16 is used for training. 217 | One is overflow: the fp16 gradient can become so large that it causes NaN. 218 | To combat such an issue, we need to scale down the gradient when such an event 219 | is detected. The other is underflow: the gradient is so small that 220 | precision suffers. This is hard to detect though. What the dynamic loss scaler does 221 | is start the scale at a relatively large value (e.g. 2**15). 222 | Every time a NaN is detected in the gradient, the scale is reduced (by default) 223 | by 2x. On the other hand, if a NaN is not detected for a long time 224 | (e.g. 
2000 steps), then the scale is increased (by default) by 2x.""" 225 | def __init__(self, init_scale=2.**15, scale_factor=2., scale_window=2000, 226 | tolerance=0.01): 227 | self.loss_scale = init_scale 228 | self.scale_factor = scale_factor 229 | self.scale_window = scale_window 230 | self.tolerance = tolerance 231 | self._num_steps = 0 232 | self._last_overflow_iter = -1 233 | self._last_rescale_iter = -1 234 | self._overflows_since_rescale = 0 235 | 236 | def update_scale(self, overflow): 237 | """dynamically update loss scale""" 238 | iter_since_rescale = self._num_steps - self._last_rescale_iter 239 | if overflow: 240 | self._last_overflow_iter = self._num_steps 241 | self._overflows_since_rescale += 1 242 | percentage = self._overflows_since_rescale / float(iter_since_rescale) 243 | # we tolerate a certain amount of NaNs before actually scaling it down 244 | if percentage >= self.tolerance: 245 | self.loss_scale /= self.scale_factor 246 | self._last_rescale_iter = self._num_steps 247 | self._overflows_since_rescale = 0 248 | if self.loss_scale < 1: 249 | warnings.warn('DynamicLossScaler: overflow detected. set loss_scale = %s'% 250 | self.loss_scale) 251 | elif (self._num_steps - self._last_overflow_iter) % self.scale_window == 0: 252 | self.loss_scale *= self.scale_factor 253 | self._last_rescale_iter = self._num_steps 254 | self._num_steps += 1 255 | -------------------------------------------------------------------------------- /gluon_basics/autograd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Automatic Differentiation with `autograd`" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "skip" 19 | } 20 | }, 21 | "source": [ 22 | "We train models to get better and better as a function of experience. Usually, getting better means minimizing a loss function. To achieve this goal, we often iteratively compute the gradient of the loss with respect to weights and then update the weights accordingly. While the gradient calculations are straightforward through the chain rule, for complex models, working them out by hand can be a pain.\n", 23 | "\n", 24 | "Before diving deep into the model training, let's go through how MXNet’s `autograd` package expedites this work by automatically calculating derivatives." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "slide" 32 | } 33 | }, 34 | "source": [ 35 | "## Basic usage\n", 36 | "\n", 37 | "Let's first import the `autograd` package." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": { 44 | "ExecuteTime": { 45 | "end_time": "2019-06-13T16:36:29.255811Z", 46 | "start_time": "2019-06-13T16:36:28.119160Z" 47 | }, 48 | "slideshow": { 49 | "slide_type": "fragment" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "from mxnet import nd\n", 55 | "from mxnet import autograd" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "slideshow": { 62 | "slide_type": "slide" 63 | } 64 | }, 65 | "source": [ 66 | "### Case Study: Autograd for $f(x) = 2 x^2$" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "slideshow": { 73 | "slide_type": "slide" 74 | } 75 | }, 76 | "source": [ 77 | "Let's start by assigning an initial value of $x$."
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 2, 83 | "metadata": { 84 | "ExecuteTime": { 85 | "end_time": "2019-06-13T16:36:29.262414Z", 86 | "start_time": "2019-06-13T16:36:29.258567Z" 87 | }, 88 | "attributes": { 89 | "classes": [], 90 | "id": "", 91 | "n": "3" 92 | }, 93 | "slideshow": { 94 | "slide_type": "fragment" 95 | } 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "x = nd.array([[1, 2], [3, 4]])" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "slideshow": { 106 | "slide_type": "skip" 107 | } 108 | }, 109 | "source": [ 110 | "In MXNet, we can tell an NDArray that we plan to calculate and store a gradient by invoking its `attach_grad` method." 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "slideshow": { 117 | "slide_type": "fragment" 118 | } 119 | }, 120 | "source": [ 121 | "#### Attach Gradient Storage\n", 122 | "\n", 123 | "Calculating gradients require extra computation, and we’ll need a place to store it." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 3, 129 | "metadata": { 130 | "ExecuteTime": { 131 | "end_time": "2019-06-13T16:36:29.267429Z", 132 | "start_time": "2019-06-13T16:36:29.264217Z" 133 | }, 134 | "attributes": { 135 | "classes": [], 136 | "id": "", 137 | "n": "6" 138 | }, 139 | "slideshow": { 140 | "slide_type": "fragment" 141 | } 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "x.attach_grad()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "slideshow": { 152 | "slide_type": "subslide" 153 | } 154 | }, 155 | "source": [ 156 | "#### Define and Record y = f(x)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": { 162 | "slideshow": { 163 | "slide_type": "skip" 164 | } 165 | }, 166 | "source": [ 167 | "Now we’re going to define the function $y=f(x)$. To let MXNet store $y$, so that we can compute gradients later, we need to put the definition inside a `autograd.record()` scope." 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 4, 173 | "metadata": { 174 | "ExecuteTime": { 175 | "end_time": "2019-06-13T16:36:29.274746Z", 176 | "start_time": "2019-06-13T16:36:29.269163Z" 177 | }, 178 | "attributes": { 179 | "classes": [], 180 | "id": "", 181 | "n": "7" 182 | }, 183 | "slideshow": { 184 | "slide_type": "fragment" 185 | } 186 | }, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "\n", 193 | "[[ 2. 8.]\n", 194 | " [18. 32.]]\n", 195 | "\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "with autograd.record():\n", 201 | " y = 2 * x**2\n", 202 | "print(y)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": { 208 | "slideshow": { 209 | "slide_type": "fragment" 210 | } 211 | }, 212 | "source": [ 213 | "#### Invoke Back Propagation" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": { 219 | "slideshow": { 220 | "slide_type": "skip" 221 | } 222 | }, 223 | "source": [ 224 | "Let’s invoke back propagation (backprop) by calling `y.backward()`." 
225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 5, 230 | "metadata": { 231 | "ExecuteTime": { 232 | "end_time": "2019-06-13T16:36:29.280657Z", 233 | "start_time": "2019-06-13T16:36:29.277604Z" 234 | }, 235 | "attributes": { 236 | "classes": [], 237 | "id": "", 238 | "n": "8" 239 | }, 240 | "slideshow": { 241 | "slide_type": "fragment" 242 | } 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "y.backward()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": { 252 | "slideshow": { 253 | "slide_type": "subslide" 254 | } 255 | }, 256 | "source": [ 257 | "#### Verify Computed Gradients\n", 258 | "\n", 259 | "Note that $y=2x^2$ and $\\frac{dy}{dx} = 4x$, which should be\n", 260 | "\n", 261 | "`[[4, 8],[12, 16]]`" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 6, 267 | "metadata": { 268 | "ExecuteTime": { 269 | "end_time": "2019-06-13T16:36:29.293743Z", 270 | "start_time": "2019-06-13T16:36:29.282783Z" 271 | }, 272 | "attributes": { 273 | "classes": [], 274 | "id": "", 275 | "n": "9" 276 | }, 277 | "slideshow": { 278 | "slide_type": "fragment" 279 | } 280 | }, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/plain": [ 285 | "\n", 286 | "[[ 4. 8.]\n", 287 | " [12. 16.]]\n", 288 | "" 289 | ] 290 | }, 291 | "execution_count": 6, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "x.grad" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": { 303 | "slideshow": { 304 | "slide_type": "slide" 305 | } 306 | }, 307 | "source": [ 308 | "## Using Python Control Flows" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": { 314 | "slideshow": { 315 | "slide_type": "notes" 316 | } 317 | }, 318 | "source": [ 319 | "Sometimes we want to write dynamic programs where the execution depends on some real-time values. MXNet will record the execution trace and compute the gradient as well." 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "slideshow": { 326 | "slide_type": "subslide" 327 | } 328 | }, 329 | "source": [ 330 | "### Cast Study: \n", 331 | "\n", 332 | "`f(a)`: it doubles `a` until `norm(a)` reaches 1000. Then it selects one element depending on the sum of its elements." 
333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 7, 338 | "metadata": { 339 | "ExecuteTime": { 340 | "end_time": "2019-06-13T16:36:29.301810Z", 341 | "start_time": "2019-06-13T16:36:29.296328Z" 342 | }, 343 | "slideshow": { 344 | "slide_type": "fragment" 345 | } 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "def f(a):\n", 350 | " # a is a vector of two elements\n", 351 | " b = a * 2\n", 352 | " while b.norm() < 1000:\n", 353 | " b = b * 2\n", 354 | " return b[0] if b.sum() >= 0 else b[1]" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "slideshow": { 361 | "slide_type": "subslide" 362 | } 363 | }, 364 | "source": [ 365 | "#### Feed in a Random Value and Record:" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 8, 371 | "metadata": { 372 | "ExecuteTime": { 373 | "end_time": "2019-06-13T16:36:29.316038Z", 374 | "start_time": "2019-06-13T16:36:29.303593Z" 375 | }, 376 | "slideshow": { 377 | "slide_type": "fragment" 378 | } 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "a = nd.random.uniform(shape=2)\n", 383 | "a.attach_grad()\n", 384 | "with autograd.record():\n", 385 | " c = f(a)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": { 391 | "slideshow": { 392 | "slide_type": "subslide" 393 | } 394 | }, 395 | "source": [ 396 | "#### Compute and Verify Gradients\n", 397 | "\n", 398 | "`b` is a linear function of `a`, and `c` is chosen from `b`. \n", 399 | "\n", 400 | "The gradient w.r.t. `a` be will be either `[c/a[0], 0]` or `[0, c/a[1]]`.\n", 401 | "\n", 402 | "```python\n", 403 | "def f(a):\n", 404 | " b = a * 2\n", 405 | " while b.norm() < 1000:\n", 406 | " b = b * 2\n", 407 | " return b[0] if b.sum() >= 0 else b[1]\n", 408 | "```" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 9, 414 | "metadata": { 415 | "ExecuteTime": { 416 | "end_time": "2019-06-13T16:36:29.323825Z", 417 | "start_time": "2019-06-13T16:36:29.317884Z" 418 | }, 419 | "slideshow": { 420 | "slide_type": "subslide" 421 | } 422 | }, 423 | "outputs": [ 424 | { 425 | "data": { 426 | "text/plain": [ 427 | "[\n", 428 | " [2048. 0.]\n", 429 | " , \n", 430 | " [2048. 
1895.8933]\n", 431 | " ]" 432 | ] 433 | }, 434 | "execution_count": 9, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "c.backward()\n", 441 | "[a.grad, c/a]" 442 | ] 443 | } 444 | ], 445 | "metadata": { 446 | "kernelspec": { 447 | "display_name": "conda_mxnet_p36", 448 | "language": "python", 449 | "name": "conda_mxnet_p36" 450 | }, 451 | "language_info": { 452 | "codemirror_mode": { 453 | "name": "ipython", 454 | "version": 3 455 | }, 456 | "file_extension": ".py", 457 | "mimetype": "text/x-python", 458 | "name": "python", 459 | "nbconvert_exporter": "python", 460 | "pygments_lexer": "ipython3", 461 | "version": "3.6.5" 462 | } 463 | }, 464 | "nbformat": 4, 465 | "nbformat_minor": 2 466 | } 467 | -------------------------------------------------------------------------------- /natural_language_understanding/bert/conversion_tools/convert_paddle_to_gluon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | from __future__ import unicode_literals 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | 7 | import collections 8 | import os 9 | import sys 10 | import numpy as np 11 | import argparse 12 | import logging 13 | import json 14 | import mxnet as mx 15 | import gluonnlp as nlp 16 | import paddle.fluid as fluid 17 | 18 | from gluonnlp.model import BERTEncoder, BERTModel 19 | from gluonnlp.model.bert import bert_hparams 20 | from utils import get_hash, tf_vocab_to_gluon_vocab, load_text_vocab 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("--gluon_bert_model_base", default='ernie_12_768_12', type=str, help=".") 24 | parser.add_argument("--init_pretraining_params", default='./ERNIE_stable-1.0.1/params', 25 | type=str, help=".") 26 | parser.add_argument("--ernie_config_path", default='./ERNIE_stable-1.0.1/ernie_config.json', 27 | type=str, help=".") 28 | parser.add_argument("--ernie_vocab_path", default='./ERNIE_stable-1.0.1/vocab.txt', 29 | type=str, help=".") 30 | parser.add_argument("--out_dir", default='./ernie_gluon_model2', type=str, help=".") 31 | parser.add_argument("--baidu_lark_repo_dir", default='../../../../LARK', type=str, 32 | help='path to the original baidu lark repository. ' 33 | 'The repo should be at f97e3c8581e36dc1979560d62f75df862acd9585.' 
34 | '(https://github.com/PaddlePaddle/LARK.git)') 35 | args = parser.parse_args() 36 | 37 | sys.path = [os.path.join(args.baidu_lark_repo_dir,'ERNIE')] + sys.path 38 | try: 39 | from model.ernie import ErnieConfig 40 | from finetune.classifier import create_model 41 | except: 42 | raise ImportError('Place clone ERNIE first') 43 | 44 | def if_exist(var): 45 | return os.path.exists(os.path.join(args.init_pretraining_params, var.name)) 46 | 47 | 48 | def build_weight_map(): 49 | weight_map = collections.OrderedDict({ 50 | 'word_embedding': 'word_embed.0.weight', 51 | 'pos_embedding': 'encoder.position_weight', 52 | 'sent_embedding': 'token_type_embed.0.weight', 53 | 'pre_encoder_layer_norm_scale': 'encoder.layer_norm.gamma', 54 | 'pre_encoder_layer_norm_bias': 'encoder.layer_norm.beta', 55 | }) 56 | 57 | def add_w_and_b(ernie_pre, gluon_pre): 58 | weight_map[ernie_pre + ".w_0"] = gluon_pre + ".weight" 59 | weight_map[ernie_pre + ".b_0"] = gluon_pre + ".bias" 60 | 61 | def add_one_encoder_layer(layer_number): 62 | # attention 63 | add_w_and_b("encoder_layer_{}_multi_head_att_query_fc".format(layer_number), 64 | "encoder.transformer_cells.{}.attention_cell.proj_query".format(layer_number)) 65 | add_w_and_b("encoder_layer_{}_multi_head_att_key_fc".format(layer_number), 66 | "encoder.transformer_cells.{}.attention_cell.proj_key".format(layer_number)) 67 | add_w_and_b("encoder_layer_{}_multi_head_att_value_fc".format(layer_number), 68 | "encoder.transformer_cells.{}.attention_cell.proj_value".format(layer_number)) 69 | add_w_and_b("encoder_layer_{}_multi_head_att_output_fc".format(layer_number), 70 | "encoder.transformer_cells.{}.proj".format(layer_number)) 71 | weight_map["encoder_layer_{}_post_att_layer_norm_bias".format(layer_number)] = \ 72 | "encoder.transformer_cells.{}.layer_norm.beta".format(layer_number) 73 | weight_map["encoder_layer_{}_post_att_layer_norm_scale".format(layer_number)] = \ 74 | "encoder.transformer_cells.{}.layer_norm.gamma".format(layer_number) 75 | # intermediate 76 | add_w_and_b("encoder_layer_{}_ffn_fc_0".format(layer_number), 77 | "encoder.transformer_cells.{}.ffn.ffn_1".format(layer_number)) 78 | # output 79 | add_w_and_b("encoder_layer_{}_ffn_fc_1".format(layer_number), 80 | "encoder.transformer_cells.{}.ffn.ffn_2".format(layer_number)) 81 | weight_map["encoder_layer_{}_post_ffn_layer_norm_bias".format(layer_number)] = \ 82 | "encoder.transformer_cells.{}.ffn.layer_norm.beta".format(layer_number) 83 | weight_map["encoder_layer_{}_post_ffn_layer_norm_scale".format(layer_number)] = \ 84 | "encoder.transformer_cells.{}.ffn.layer_norm.gamma".format(layer_number) 85 | 86 | for i in range(12): 87 | add_one_encoder_layer(i) 88 | add_w_and_b('pooled_fc', 'pooler') 89 | return weight_map 90 | 91 | 92 | def extract_weights(args): 93 | # add ERNIE to environment 94 | print('extract weights start'.center(60, '=')) 95 | startup_prog = fluid.Program() 96 | test_prog = fluid.Program() 97 | place = fluid.CPUPlace() 98 | exe = fluid.Executor(place) 99 | exe.run(startup_prog) 100 | args.max_seq_len = 512 101 | args.use_fp16 = False 102 | args.num_labels = 2 103 | args.loss_scaling = 1.0 104 | print('model config:') 105 | ernie_config = ErnieConfig(args.ernie_config_path) 106 | ernie_config.print_config() 107 | with fluid.program_guard(test_prog, startup_prog): 108 | with fluid.unique_name.guard(): 109 | _, _ = create_model( 110 | args, 111 | pyreader_name='train', 112 | ernie_config=ernie_config) 113 | fluid.io.load_vars(exe, args.init_pretraining_params, main_program=test_prog, 
predicate=if_exist) 114 | state_dict = collections.OrderedDict() 115 | weight_map = build_weight_map() 116 | for ernie_name, gluon_name in weight_map.items(): 117 | fluid_tensor = fluid.global_scope().find_var(ernie_name).get_tensor() 118 | fluid_array = np.array(fluid_tensor, dtype=np.float32) 119 | if 'w_0' in ernie_name: 120 | fluid_array = fluid_array.transpose() 121 | state_dict[gluon_name] = fluid_array 122 | print(f'{ernie_name} -> {gluon_name} {fluid_array.shape}') 123 | print('extract weights done!'.center(60, '=')) 124 | return state_dict 125 | 126 | 127 | def save_model(new_gluon_parameters, output_dir): 128 | print('save model start'.center(60, '=')) 129 | if not os.path.exists(output_dir): 130 | os.makedirs(output_dir) 131 | # save model 132 | # load vocab 133 | vocab_f = open(os.path.join(output_dir, "vocab.txt"), "wt", encoding='utf-8') 134 | with open(args.ernie_vocab_path, "rt", encoding='utf-8') as f: 135 | for line in f: 136 | data = line.strip().split("\t") 137 | vocab_f.writelines(data[0] + "\n") 138 | vocab_f.close() 139 | vocab = tf_vocab_to_gluon_vocab(load_text_vocab(os.path.join(output_dir, "vocab.txt"))) 140 | # vocab serialization 141 | tmp_file_path = os.path.expanduser(os.path.join(output_dir, 'tmp')) 142 | if not os.path.exists(os.path.join(args.out_dir)): 143 | os.makedirs(os.path.join(args.out_dir)) 144 | with open(tmp_file_path, 'w') as f: 145 | f.write(vocab.to_json()) 146 | hash_full, hash_short = get_hash(tmp_file_path) 147 | gluon_vocab_path = os.path.expanduser(os.path.join(output_dir, hash_short + '.vocab')) 148 | with open(gluon_vocab_path, 'w') as f: 149 | f.write(vocab.to_json()) 150 | logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full) 151 | 152 | # BERT config 153 | tf_config_names_to_gluon_config_names = { 154 | 'attention_probs_dropout_prob': 'embed_dropout', 155 | 'hidden_act': None, 156 | 'hidden_dropout_prob': 'dropout', 157 | 'hidden_size': 'units', 158 | 'initializer_range': None, 159 | # 'intermediate_size': 'hidden_size', 160 | 'max_position_embeddings': 'max_length', 161 | 'num_attention_heads': 'num_heads', 162 | 'num_hidden_layers': 'num_layers', 163 | 'type_vocab_size': 'token_type_vocab_size', 164 | 'vocab_size': None 165 | } 166 | predefined_args = bert_hparams[args.gluon_bert_model_base] 167 | with open(args.ernie_config_path, 'r') as f: 168 | tf_config = json.load(f) 169 | if 'layer_norm_eps' in tf_config: # ignore layer_norm_eps 170 | del tf_config['layer_norm_eps'] 171 | assert len(tf_config) == len(tf_config_names_to_gluon_config_names) 172 | for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items(): 173 | if tf_name is None or gluon_name is None: 174 | continue 175 | if gluon_name != 'max_length': 176 | assert tf_config[tf_name] == predefined_args[gluon_name] 177 | 178 | encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'], 179 | num_layers=predefined_args['num_layers'], units=predefined_args['units'], 180 | hidden_size=predefined_args['hidden_size'], 181 | max_length=predefined_args['max_length'], 182 | num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'], 183 | dropout=predefined_args['dropout'], 184 | use_residual=predefined_args['use_residual'], 185 | activation='relu') 186 | 187 | bert = BERTModel(encoder, len(vocab), 188 | token_type_vocab_size=predefined_args['token_type_vocab_size'], 189 | units=predefined_args['units'], embed_size=predefined_args['embed_size'], 190 | embed_dropout=predefined_args['embed_dropout'], 191 | 
word_embed=predefined_args['word_embed'], use_pooler=True, 192 | use_decoder=False, use_classifier=False) 193 | 194 | bert.initialize(init=mx.init.Normal(0.02)) 195 | 196 | ones = mx.nd.ones((2, 8)) 197 | out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]])) 198 | params = bert._collect_params_with_prefix() 199 | assert len(params) == len(new_gluon_parameters), "Gluon model does not match paddle model. " \ 200 | "Please fix the BERTModel hyperparameters" 201 | 202 | # post processings for parameters: 203 | # - handle tied decoder weight 204 | new_gluon_parameters['decoder.3.weight'] = new_gluon_parameters['word_embed.0.weight'] 205 | # set parameter data 206 | loaded_params = {} 207 | for name in params: 208 | if name == 'word_embed.0.weight': 209 | arr = mx.nd.array(new_gluon_parameters[name][:params[name].shape[0]]) 210 | else: 211 | arr = mx.nd.array(new_gluon_parameters[name]) 212 | try: 213 | assert arr.shape == params[name].shape 214 | except: 215 | print(name) 216 | params[name].set_data(arr) 217 | loaded_params[name] = True 218 | 219 | # post processings for parameters: 220 | # - handle tied decoder weight 221 | # - update word embedding for reserved tokens 222 | 223 | if len(params) != len(loaded_params): 224 | raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, ' 225 | 'but {} have been extracted from the paddle model. '.format( 226 | len(params), len(loaded_params))) 227 | 228 | # param serialization 229 | bert.save_parameters(tmp_file_path) 230 | hash_full, hash_short = get_hash(tmp_file_path) 231 | gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params')) 232 | logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full) 233 | bert.save_parameters(gluon_param_path) 234 | mx.nd.waitall() 235 | # save config 236 | print('finish save vocab') 237 | print('save model done!'.center(60, '=')) 238 | 239 | 240 | if __name__ == "__main__": 241 | state_dict = extract_weights(args) 242 | save_model(state_dict, args.out_dir) --------------------------------------------------------------------------------
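Editor's note: both conversion scripts above finish with the same load-and-verify pattern: run a dummy forward pass to trigger parameter creation, then copy each converted array into the matching Gluon parameter with a shape check. The helper below is a minimal sketch of that shared step; the function name and error messages are the editor's own, and it assumes `block` has already been initialized (e.g. via a dummy forward pass, as in the scripts).

```python
import mxnet as mx

def set_params_from_numpy(block, np_tensors):
    """Copy a dict of numpy arrays into an initialized Gluon block, name by name."""
    params = block._collect_params_with_prefix()
    for name, param in params.items():
        if name not in np_tensors:
            raise RuntimeError('no source tensor for parameter %s' % name)
        arr = mx.nd.array(np_tensors[name])
        if arr.shape != param.shape:
            # fail loudly on hyperparameter mismatches, as both scripts do
            raise RuntimeError('shape mismatch for %s: expected %s, but found %s'
                               % (name, param.shape, arr.shape))
        param.set_data(arr)
```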