├── .amlignore ├── .bumpversion.cfg ├── .flake8 ├── .github ├── ISSUE_TEMPLATE.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── general-ask.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── DatasetReferences.md ├── LICENSE ├── MANIFEST.in ├── NLP-Logo.png ├── NOTICE.txt ├── README.md ├── SETUP.md ├── VERSIONING.md ├── _config.yml ├── cgmanifest.json ├── docker └── Dockerfile ├── docs ├── Makefile ├── README.md ├── _config.yml └── source │ ├── azureml.rst │ ├── conf.py │ └── index.rst ├── examples ├── README.md ├── annotation │ ├── Doccano.md │ └── README.md ├── embeddings │ ├── README.md │ └── embedding_trainer.ipynb ├── entailment │ ├── README.md │ ├── entailment_multinli_transformers.ipynb │ └── entailment_xnli_bert_azureml.ipynb ├── model_explainability │ ├── README.md │ └── interpret_dnn_layers.ipynb ├── named_entity_recognition │ ├── README.md │ └── ner_wikigold_transformer.ipynb ├── question_answering │ ├── README.md │ ├── bert_run_squad_azureml.py │ ├── bidaf_aml_deep_dive.ipynb │ ├── bidaf_config.json │ ├── pretrained-BERT-SQuAD-deep-dive-aml.ipynb │ ├── question_answering_squad_transformers.ipynb │ └── question_answering_system_bidaf_quickstart.ipynb ├── sentence_similarity │ ├── README.md │ ├── automl_local_deployment_aci.ipynb │ ├── automl_with_pipelines_deployment_aks.ipynb │ ├── baseline_deep_dive.ipynb │ ├── bert_encoder.ipynb │ ├── bert_senteval.ipynb │ ├── gensen_aml_deep_dive.ipynb │ ├── gensen_config.json │ ├── gensen_local.ipynb │ ├── gensen_train.py │ └── gensen_wrapper.py ├── sentiment_analysis │ └── absa │ │ ├── README.md │ │ ├── absa.ipynb │ │ ├── absa_azureml.ipynb │ │ └── dataset │ │ └── data.md ├── text_classification │ ├── README.md │ ├── tc_bert_azureml.ipynb │ ├── tc_mnli_mtdnn.ipynb │ ├── tc_mnli_transformers.ipynb │ └── tc_multi_languages_transformers.ipynb └── text_summarization │ ├── abstractive_summarization_bertsum_cnndm_distributed_train.py │ ├── abstractive_summarization_bertsumabs_cnndm.ipynb │ ├── abstractive_summarization_minilm_cnndm.ipynb │ ├── abstractive_summarization_unilm_cnndm.ipynb │ ├── abstractive_summarization_unilm_cnndm.py │ ├── extractive_summarization_cnndm_aml_distributed.ipynb │ ├── extractive_summarization_cnndm_distributed_train.py │ ├── extractive_summarization_cnndm_transformer.ipynb │ └── summarization_evaluation.ipynb ├── pyproject.toml ├── setup.py ├── tests ├── README.md ├── __init__.py ├── ci │ ├── azureml_integration_tests.yml │ ├── component_governance.yml │ ├── cpu_integration_tests_linux.yml │ ├── cpu_unit_tests_linux.yml │ ├── gpu_integration_tests_linux.yml │ ├── gpu_unit_tests_linux.yml │ ├── notebooks_cpu_unit_tests_linux.yml │ └── notebooks_gpu_unit_tests_linux.yml ├── conftest.py ├── integration │ ├── test_ddp_summarization.py │ ├── test_gpu_utils.py │ ├── test_notebooks_abstractive_summarization_bertsumabs.py │ ├── test_notebooks_embeddings.py │ ├── test_notebooks_entailment.py │ ├── test_notebooks_extractive_summarization.py │ ├── test_notebooks_interpretability.py │ ├── test_notebooks_minilm_abstractive_summarization.py │ ├── test_notebooks_named_entity_recognition.py │ ├── test_notebooks_question_answering.py │ ├── test_notebooks_sentence_similarity.py │ ├── test_notebooks_text_classification.py │ └── test_notebooks_unilm_abstractive_summarization.py ├── notebooks_common.py ├── smoke │ ├── test_dataset.py │ ├── test_gpu_utils.py │ └── test_word_embeddings.py └── unit │ ├── test_abstractive_summarization_bertsum.py │ ├── 
test_abstractive_summarization_seq2seq.py │ ├── test_bert_common.py │ ├── test_bert_encoder.py │ ├── test_bert_sentence_encoding.py │ ├── test_common_pytorch_utils.py │ ├── test_data_loaders.py │ ├── test_dataset.py │ ├── test_dataset_pytorch.py │ ├── test_distributed_sampler.py │ ├── test_eval_classification.py │ ├── test_eval_compute_rouge.py │ ├── test_extractive_summarization.py │ ├── test_gensen_utils.py │ ├── test_interpreter.py │ ├── test_models_transformers_question_answering.py │ ├── test_notebooks_cpu.py │ ├── test_notebooks_gpu.py │ ├── test_preprocess.py │ ├── test_timer.py │ ├── test_transformers_sequence_classification.py │ └── test_transformers_token_classification.py ├── tools ├── README.md ├── __init__.py ├── generate_conda_file.py ├── generate_requirements_txt.py └── remove_pixelserver.py └── utils_nlp ├── README.md ├── __init__.py ├── azureml ├── README.md ├── __init__.py ├── azureml_bert_util.py └── azureml_utils.py ├── common ├── README.md ├── __init__.py ├── pytorch_utils.py └── timer.py ├── dataset ├── README.md ├── __init__.py ├── bbc_hindi.py ├── cnndm.py ├── dac.py ├── data_loaders.py ├── msrpc.py ├── multinli.py ├── ner_utils.py ├── preprocess.py ├── sentence_selection.py ├── snli.py ├── squad.py ├── stsbenchmark.py ├── url_utils.py ├── wikigold.py ├── xnli.py └── xnli_torch_dataset.py ├── eval ├── README.md ├── SentEval │ ├── .gitignore │ ├── LICENSE │ ├── README.md │ ├── data │ │ └── downstream │ │ │ ├── get_transfer_data.bash │ │ │ └── tokenizer.sed │ ├── senteval │ │ ├── __init__.py │ │ ├── binary.py │ │ ├── engine.py │ │ ├── mrpc.py │ │ ├── probing.py │ │ ├── rank.py │ │ ├── sick.py │ │ ├── snli.py │ │ ├── sst.py │ │ ├── sts.py │ │ ├── tools │ │ │ ├── __init__.py │ │ │ ├── classifier.py │ │ │ ├── ranking.py │ │ │ ├── relatedness.py │ │ │ └── validation.py │ │ ├── trec.py │ │ └── utils.py │ └── setup.py ├── __init__.py ├── classification.py ├── evaluate_squad.py ├── evaluate_summarization.py ├── question_answering.py ├── rouge │ ├── compute_rouge.py │ └── rouge_ext.py └── senteval.py ├── interpreter ├── Interpreter.py ├── README.md └── __init__.py ├── language_utils └── hi │ └── hindi_stemmer.py └── models ├── README.md ├── bert ├── README.md ├── __init__.py ├── common.py ├── sequence_classification.py ├── sequence_classification_distributed.py ├── sequence_encoding.py └── token_classification.py ├── gensen ├── README.md ├── __init__.py ├── create_gensen_model.py ├── gensen.py ├── multi_task_model.py ├── preprocess_utils.py └── utils.py ├── glove ├── Makefile ├── README.md ├── demo.sh └── src │ ├── README.md │ ├── cooccur.c │ ├── glove.c │ ├── shuffle.c │ └── vocab_count.c ├── pretrained_embeddings ├── README.md ├── __init__.py ├── fasttext.py ├── glove.py └── word2vec.py ├── pytorch_modules ├── README.md ├── __init__.py └── conditional_gru.py ├── transformers ├── abstractive_summarization_bertsum.py ├── abstractive_summarization_seq2seq.py ├── bertsum │ ├── __init__.py │ ├── adam.py │ ├── beam.py │ ├── data_loader.py │ ├── dataset.py │ ├── decoder.py │ ├── encoder.py │ ├── loss.py │ ├── model_builder.py │ ├── neural.py │ ├── optimizers.py │ ├── penalties.py │ └── predictor.py ├── common.py ├── datasets.py ├── extractive_summarization.py ├── named_entity_recognition.py ├── question_answering.py └── sequence_classification.py └── xlnet ├── README.md ├── common.py └── sequence_classification.py /.amlignore: -------------------------------------------------------------------------------- 1 | data/ 2 | examples/ 3 | 
-------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.0.0 3 | commit = True 4 | tag = True 5 | message = "Bump version: {current_version} -> {new_version}" 6 | 7 | [bumpversion:file:setup.py] 8 | search = version='{current_version}' 9 | replace = version='{new_version}' 10 | 11 | [bumpversion:file:utils_nlp/__init__.py] 12 | search = __version__ = '{current_version}' 13 | replace = __version__ = '{new_version}' -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # Intial set of rules 3 | # Feel Free to add any new rule here with description of what it does. 4 | 5 | # E203 Whitespace before ':' 6 | # E266 Too many leading '#' for block comment 7 | # E501 Line too long (82 > 79 characters) 8 | # W503 Line break occurred before a binary operator 9 | # F403 'from module import *' used; unable to detect undefined names 10 | # F405 '' may be undefined, or defined from star imports 11 | # E402 module level import not at top of file 12 | # E731 do not assign a lambda expression, use a def 13 | # F821 undefined name 'get_ipython' --> from generated python files using nbconvert 14 | # E722: do not use bare except 15 | # E231: missing white space after "," --> black generates autoformat [,] which fails flake8 16 | ignore = E203, E266, W503, F403, F405, E402, E731, F821, E722, E231 17 | 18 | max-line-length = 88 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Description 2 | 3 | 4 | 5 | ### In which platform does it happen? 6 | 7 | 8 | 9 | 10 | 11 | 12 | ### How do we replicate the issue? 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | ### Expected behavior (i.e. solution) 21 | 22 | 23 | 24 | 25 | ### Other Comments 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG] " 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Description 11 | 12 | 13 | 14 | ### How do we replicate the bug? 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | ### Expected behavior (i.e. 
solution) 23 | 24 | 25 | 26 | 27 | ### Other Comments 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE] " 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Description 11 | 12 | 13 | 14 | ### Expected behavior with the suggested feature 15 | 16 | 17 | 18 | 19 | ### Other Comments 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/general-ask.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: General ask 3 | about: Technical/non-technical asks about the repo 4 | title: "[ASK] " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Description 11 | 12 | 13 | 14 | ### Other Comments 15 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Description 2 | 3 | 4 | 5 | 6 | ### Related Issues 7 | 8 | 9 | 10 | ### Checklist: 11 | 12 | 13 | - [ ] My code follows the code style of this project, as detailed in our [contribution guidelines](https://github.com/microsoft/nlp-recipes/blob/master/CONTRIBUTING.md). 14 | - [ ] I have added tests. 15 | - [ ] I have updated the documentation accordingly. 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | pip-wheel-metadata/ 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | 108 | ########################## 109 | .DS_Store 110 | .~* 111 | Untitled*.ipynb 112 | *-Copy*.ipynb 113 | ~$* 114 | output.ipynb 115 | .idea/ 116 | *.npz 117 | *.data 118 | *.dat 119 | *.csv 120 | *.tsv 121 | *.zip 122 | .vscode/ 123 | tools/repo_metrics/config.py 124 | *.jar 125 | *.item 126 | *.pkl 127 | nlp_*.yaml 128 | nohup.out 129 | temp/ 130 | tmp/ 131 | logs/ 132 | score.py 133 | 134 | # Data 135 | data/ 136 | squad/ 137 | bidaf-question-answering/ 138 | */question_answering/bidaf.tar.gz 139 | */question_answering/bidafenv.yml 140 | */question_answering/config.json 141 | */question_answering/vocabulary/ 142 | */question_answering/weights.th 143 | 144 | # AML Config 145 | aml_config/ 146 | .azureml/ 147 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3.6 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v1.2.3 9 | hooks: 10 | - id: flake8 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft utils_nlp 2 | 3 | global-exclude *.py[cod] __pycache__ *.so *.dylib 4 | 5 | exclude README.md 6 | exclude SETUP.md 7 | exclude CONTRIBUTING.md 8 | 9 | -------------------------------------------------------------------------------- /NLP-Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/nlp-recipes/7db6d204e5116da07bb3c549df546e49cb7ab5a5/NLP-Logo.png -------------------------------------------------------------------------------- /VERSIONING.md: -------------------------------------------------------------------------------- 1 | # Semantic Versioning 2 | > NOTE: Support for `setuptools_scm` is currently removed due to a known [issue](https://github.com/pypa/setuptools_scm/issues/357) with the way pip installations restrict access to certain SCM metadata during package installation. Support will be restored when `setuptools_scm` and `pip` developers fix this with a patch. 3 | 4 | This library is configured to use 5 | [setuptools_scm](https://github.com/pypa/setuptools_scm/) to automatically get package version from git commit histories. 6 | 7 | **There shouldn't be any references to manually coded versions**. 8 | 9 | Verify what git tag to use by running: 10 | 11 | ```bash 12 | python setup.py --version 13 | ``` 14 | It should look something like `0.1.0.dev4+gdfedba7.d20190209` 15 | 16 | Using the information above the master branch, after a merge commit, can be _**Tagged**_ with the above semantic version `0.1.0` (ignoring the `dev4+gdfedba7.d20190209`) 17 | 18 | For example: 19 | 20 | git tag v0.1.0 21 | 22 | Now verify the semantic version for the package: 23 | 24 | python setup.py --version 25 | 26 | 27 | All new merged commit on master must have a 28 | [Semantic Versioning](https://semver.org/) release version with an 29 | accompanying tag. TL;DR: 30 | * `major.minor.patch` 31 | * Patch is for bugfix 32 | * Minor is for new features 33 | * Major is for backwards-incompatible changes 34 | * tags should be of the form `v0.1.2` 35 | 36 | Installing this library into another clean git repository with a tag version, you should get a nice version like `0.2.1`. 37 | 38 | However, if you inspect the `__version__` in this repo, 39 | you'll get a nice **'dirty'** version number like `'0.2.1.dev0+g850a76d.d20180908'`. 40 | 41 | This is useful for debugging, building sphinx docs in dev and so on. 
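Since `.bumpversion.cfg` keeps the version strings in `setup.py` and `utils_nlp/__init__.py` in sync, the current version can also be checked at runtime. A minimal sketch, assuming the package has been installed (e.g. with `pip install -e .`); note that with `setuptools_scm` support currently removed, this prints the manually bumped version rather than an scm-derived one:

```python
# Read the version string that .bumpversion.cfg maintains in utils_nlp/__init__.py.
import utils_nlp

print(utils_nlp.__version__)  # e.g. "1.0.0"
```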
42 | 43 | You should never have to specify a version manually except just tagging your commit from the tag calculation generated by running 44 | 45 | python setup.py --version 46 | 47 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /cgmanifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "Registrations": [ 3 | { 4 | "component": { 5 | "type": "git", 6 | "git": { 7 | "repositoryUrl": "https://github.com/facebookresearch/XLM", 8 | "commitHash": "" 9 | } 10 | }, 11 | "license": "CC BY-NC 4.0" 12 | }, 13 | { 14 | "component": { 15 | "type": "git", 16 | "git": { 17 | "repositoryUrl": "https://github.com/allenai/bi-att-flow", 18 | "commitHash": "e444acf13892cf62189b9eac3c7654bd83baf848" 19 | } 20 | }, 21 | "license": "Apache-2.0" 22 | }, 23 | { 24 | "component": { 25 | "type": "git", 26 | "git": { 27 | "repositoryUrl": "https://github.com/stanfordnlp/glove", 28 | "commitHash": "26f6e18eb117ca7b080d01acb453fd1c9742418d" 29 | } 30 | }, 31 | "license": "Apache-2.0" 32 | }, 33 | { 34 | "component": { 35 | "type": "git", 36 | "git": { 37 | "repositoryUrl": "https://github.com/nlpyang/PreSumm", 38 | "commitHash": "2df3312582a3a014aacbc1be810841705c67d06e" 39 | } 40 | }, 41 | "license": "MIT License" 42 | } 43 | ], 44 | "Version": 1 45 | } 46 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda 2 | 3 | # Install Anaconda 4 | # Non interactive installation instructions can be found 5 | # https://hub.docker.com/r/continuumio/anaconda/dockerfile 6 | # https://hub.docker.com/r/continuumio/miniconda/dockerfile 7 | ENV PATH /opt/conda/bin:$PATH 8 | SHELL ["/bin/bash", "-c"] 9 | 10 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 11 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 12 | git mercurial subversion 13 | 14 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda2-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ 15 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 16 | rm ~/miniconda.sh && \ 17 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 18 | echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 19 | echo "conda activate base" >> ~/.bashrc 20 | 21 | # Get the latest version repository 22 | WORKDIR /root 23 | RUN apt-get install -y zip && \ 24 | wget --quiet https://github.com/microsoft/nlp-recipes/archive/staging.zip -O staging.zip && \ 25 | unzip staging.zip && rm staging.zip 26 | 27 | # Install the packages 28 | WORKDIR /root/nlp-recipes-staging 29 | RUN python /root/nlp-recipes-staging/tools/generate_conda_file.py --gpu && \ 30 | conda env create -n nlp_gpu -f nlp_gpu.yaml 31 | RUN source activate nlp_gpu && \ 32 | pip install -e . && \ 33 | python -m ipykernel install --user --name nlp_gpu --display-name "Python (nlp_gpu)" 34 | 35 | # Run notebook 36 | EXPOSE 8888/tcp 37 | WORKDIR /root/nlp-recipes-staging 38 | CMD source activate nlp_gpu && \ 39 | jupyter notebook --allow-root --ip 0.0.0.0 --port 8888 --no-browser --notebook-dir . 
40 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | To setup the documentation, first you need to install the dependencies of the cpu environment. For it please follow the [SETUP.md](../SETUP.md). Then type: 4 | 5 | conda activate nlp_cpu 6 | pip install sphinx_rtd_theme 7 | 8 | 9 | To build the documentation as HTML: 10 | 11 | cd docs 12 | make html 13 | 14 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/source/azureml.rst: -------------------------------------------------------------------------------- 1 | .. _azureml: 2 | 3 | AzureML module 4 | ************************** 5 | 6 | AzureML module from NLP utilities. 7 | 8 | AzureML utils 9 | =============================== 10 | 11 | .. automodule:: utils_nlp.azureml.azureml_utils 12 | :members: 13 | 14 | 15 | AzureML utils for BERT 16 | =============================== 17 | 18 | .. automodule:: utils_nlp.azureml.azureml_bert_util 19 | :members: 20 | 21 | 22 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | 2 | NLP Utilities 3 | =================================================== 4 | 5 | The `NLP repository `_ provides examples and best practices for building NLP systems, provided as Jupyter notebooks. 6 | 7 | The module `utils_nlp `_ contains functions to simplify common tasks used when developing and 8 | evaluating NLP systems. 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | :caption: Contents: 13 | 14 | AzureML 15 | Common 16 | Dataset 17 | Evaluation 18 | NLP Algorithms 19 | NLP Interpretability 20 | 21 | 22 | Indices and tables 23 | ================== 24 | 25 | * :ref:`genindex` 26 | * :ref:`modindex` 27 | * :ref:`search` 28 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This folder contains examples and best practices, written in Jupyter notebooks, for building Natural Language Processing systems for the following scenarios. 
4 |
5 | |Category|Applications|Methods|Languages|
6 | |---| ------------------------ | ------------------- |---|
7 | |[Text Classification](text_classification)|Topic Classification|BERT, XLNet, RoBERTa, DistilBERT|en, hi, ar|
8 | |[Named Entity Recognition](named_entity_recognition) |Wikipedia NER|BERT|en|
9 | |[Text Summarization](text_summarization)|News Summarization, Headline Generation|Extractive: BERTSumExt<br>Abstractive: UniLM (s2s-ft)|en|
10 | |[Entailment](entailment)|MultiNLI Natural Language Inference|BERT|en|
11 | |[Question Answering](question_answering) |SQuAD|BiDAF, BERT, XLNet, DistilBERT|en|
12 | |[Sentence Similarity](sentence_similarity)|STS Benchmark|BERT, GenSen|en|
13 | |[Embeddings](embeddings)|Custom Embeddings Training|Word2Vec, fastText, GloVe|en|
14 | |[Annotation](annotation)|Text Annotation|Doccano|en|
15 | |[Model Explainability](model_explainability)|DNN Layer Explanation|DUUDNM (Guan et al.)|en|
16 |
17 | ## Data/Telemetry
18 | The Azure Machine Learning notebooks collect browser usage data and send it to Microsoft to help improve our products and services. Read Microsoft's [privacy statement](https://privacy.microsoft.com/en-US/privacystatement) to learn more.
19 |
20 | To opt out of tracking, a Python [script](../tools/remove_pixelserver.py) under the `tools` folder is also provided. Executing the script will check all notebooks under the `examples` folder and automatically remove the telemetry cell:
21 |
22 | ```sh
23 | python ../tools/remove_pixelserver.py
24 | ```
25 |
-------------------------------------------------------------------------------- /examples/annotation/README.md: --------------------------------------------------------------------------------
1 | # Text Annotation
2 |
3 | This folder contains a tutorial that walks through how to deploy a text annotation tool on Azure and how to collaboratively annotate text data for natural language processing tasks.
4 |
5 | - **[Doccano](Doccano.md)**
6 | Doccano is an open source tool that provides three main text annotation features. This tutorial only shows a Named Entity Recognition (NER) annotation task as an example.
7 |
8 |
-------------------------------------------------------------------------------- /examples/embeddings/README.md: --------------------------------------------------------------------------------
1 | # Word Embedding
2 |
3 | This folder contains examples and best practices, written in Jupyter notebooks, for training word embeddings on custom data from scratch.
4 | There are
5 | three typical ways of training word embeddings:
6 | [Word2Vec](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf),
7 | [GloVe](https://nlp.stanford.edu/pubs/glove.pdf), and [fastText](https://arxiv.org/abs/1607.01759).
8 | All three methods provide pretrained models ([pretrained model with
9 | Word2Vec](https://code.google.com/archive/p/word2vec/), [pretrained model with
10 | GloVe](https://github.com/stanfordnlp/GloVe), [pretrained model with
11 | fastText](https://fasttext.cc/docs/en/crawl-vectors.html)).
12 | These pretrained models are trained on
13 | general corpora such as Wikipedia data, Common Crawl data, etc., and may not serve well in situations
14 | where you have a domain-specific language learning problem or there is no pretrained model for the
15 | language you need to work with. In this folder, we provide examples of how to apply each of the
16 | three methods to train your own word embeddings.
17 |
18 | # What is Word Embedding?
19 |
20 | Word embedding is a technique to map words or phrases from a vocabulary to vectors of real numbers.
21 | The learned vector representations of words capture syntactic and semantic word relationships and
22 | can therefore be very useful for tasks like sentence similarity, text classification, etc.
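For a concrete picture of the "train your own embeddings" workflow described above, here is a minimal sketch using gensim's Word2Vec on a toy tokenized corpus. The use of gensim and its 4.x parameter names (`vector_size`, `epochs`) is an assumption for illustration; the notebook in this folder may use different tooling and settings:

```python
# Train a small Word2Vec model on a toy tokenized corpus and inspect the learned vectors.
from gensim.models import Word2Vec

corpus = [
    ["natural", "language", "processing", "is", "fun"],
    ["word", "embeddings", "map", "words", "to", "dense", "vectors"],
]
model = Word2Vec(sentences=corpus, vector_size=50, window=3, min_count=1, epochs=20)

print(model.wv["embeddings"].shape)            # (50,)
print(model.wv.most_similar("words", topn=3))  # nearest neighbors in the toy vocabulary
```

The same pattern scales to a real corpus: replace the toy sentences with an iterable of tokenized documents, such as the STS Benchmark sentences used in the notebook below.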
23 | 24 | 25 | ## Summary 26 | 27 | 28 | |Notebook|Environment|Description|Dataset| Language | 29 | |---|---|---|---|---| 30 | |[Developing Word Embeddings](embedding_trainer.ipynb)|Local| A notebook shows how to learn word representation with Word2Vec, fastText and Glove|[STS Benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset) | en | 31 | -------------------------------------------------------------------------------- /examples/entailment/README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Inference (NLI) 2 | 3 | This folder provides end-to-end examples of building Natural Language Inference (NLI) models. We 4 | demonstrate the best practices of data preprocessing and model building for NLI task and use the 5 | utility scripts in the [utils_nlp](../../utils_nlp) folder to speed up these processes. 6 | NLI is one of many NLP tasks that require robust compositional sentence understanding, but it's 7 | simpler compared to other tasks like question answering and machine translation. 8 | Currently, we focus on fine-tuning pre-trained BERT model. If you are interested in pre-training your own BERT model, you can view the [AzureML-BERT repo](https://github.com/microsoft/AzureML-BERT), which walks through the process in depth. We plan to continue adding state-of-the-art models as they come up and welcome community contributions. 9 | 10 | ## Natural Language Inference 11 | 12 | Natural Language Inference or Recognizing Textual Entailment (RTE) is the task of classifying 13 | a pair of premise and hypothesis sentences into three classes: contradiction, neutral, and 14 | entailment. For example, 15 | 16 | |Premise|Hypothesis|Label| 17 | |-------|----------|-----| 18 | |A man inspects the uniform of a figure in some East Asian country.|The man is sleeping.|contradiction| 19 | |An older and younger man smiling.|Two men are smiling and laughing at the cats playing on the floor.|neutral| 20 | |A soccer game with multiple males playing.|Some men are playing a sport.|entailment| 21 | 22 | ## Summary 23 | 24 | |Notebook|Environment|Description|Dataset| Language | 25 | |--------|:-----------:|-------|----------|---------| 26 | |[entailment_multinli_transformers.ipynb](entailment_multinli_transformers.ipynb)|Local|Fine-tuning of pre-trained BERT model for NLI|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)| en | 27 | |[entailment_xnli_bert_azureml.ipynb](entailment_xnli_bert_azureml.ipynb)|AzureML|**Distributed** fine-tuning of pre-trained BERT model for NLI|[XNLI](https://www.nyu.edu/projects/bowman/xnli/)| en 28 | -------------------------------------------------------------------------------- /examples/model_explainability/README.md: -------------------------------------------------------------------------------- 1 | # Model Explainability 2 | 3 | This folder contains examples and best practices, written in Jupyter notebooks, for explaining and 4 | interpreting models. Being able to explain and understand machine learning models not only helps 5 | guiding further model improvements, but more importantly, it's critical for gaining users' trust in the 6 | models and detecting biases caused by the training data. 
7 | 8 | ## Summary 9 | 10 | |Notebook|Environment|Description|Dataset| Language | 11 | |---|:---:|---|---|---| 12 | |[DUUDNM](interpret_dnn_layers.ipynb)|Local| Interpreting DNN Layers using Mutual Information.||en| 13 | -------------------------------------------------------------------------------- /examples/named_entity_recognition/README.md: -------------------------------------------------------------------------------- 1 | # Named Entity Recognition (NER) 2 | 3 | This folder contains examples and best practices, written in Jupyter notebooks, for building Named 4 | Entity Recognition models. We use the 5 | utility scripts in the [utils_nlp](../../utils_nlp) folder to speed up data preprocessing and model building for NER. 6 | The models can be used in a wide variety of applications, such as 7 | information extraction and filtering. It also plays an important role in other NLP tasks like 8 | question answering and text summarization. 9 | Currently, we focus on fine-tuning pre-trained BERT 10 | model. We plan to continue adding state-of-the-art models as they come up and welcome community 11 | contributions. 12 | 13 | ## What is Named Entity Recognition (NER) 14 | 15 | Named Entity Recognition (NER) is the task of detecting and classifying real-world objects mentioned 16 | in text. Common named entities include person names, locations, organizations, etc. The 17 | [state-of-the art](https://paperswithcode.com/task/named-entity-recognition-ner) NER methods include 18 | combining Long Short-Term Memory neural network with Conditional Random Field (LSTM-CRF) and 19 | pretrained language models like BERT. 20 | 21 | NER usually involves assigning an entity label to each word in a sentence as shown in the figure below. 22 |

23 | [Figure: Fine-tuned BERT for NER tasks]
24 |

25 | 26 | * O: Not an entity 27 | * I-LOC: Location 28 | * I-ORG: Organization 29 | * I-PER: Person 30 | 31 | There are a few standard labeling schemes and you can find the details 32 | [here](http://cs229.stanford.edu/proj2005/KrishnanGanapathy-NamedEntityRecognition.pdf). The data 33 | can also be labeled with custom entities as required by the use case. 34 | 35 | ## Summary 36 | 37 | |Notebook|Environment|Description|Dataset|Language| 38 | |---|:---:|---|---|---| 39 | |[BERT](ner_wikigold_transformer.ipynb)|Local| Fine-tune a pretrained BERT model for token classification.|[wikigold](https://www.aclweb.org/anthology/W09-3302)| English | 40 | -------------------------------------------------------------------------------- /examples/question_answering/README.md: -------------------------------------------------------------------------------- 1 | # Question Answering (QA) 2 | 3 | This folder contains examples and best practices, written in Jupyter notebooks, for building 4 | question answering models. These models can be used in a wide variety of applications, such as 5 | search engines, and virtual assistants. 6 | 7 | 8 | ## What is Question Answering? 9 | 10 | Question Answering is a classical NLP task which consists of determining the relevant "answer" 11 | (snippet of text out of a provided passage) that answers a user's "question". This task is a subset 12 | of Machine Comprehension, or measuring how well a machine comprehends a passage of text. The 13 | Stanford Question Answering Dataset ([SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)) 14 | leader board displays the state-of-the-art models in this space. Traditional QA models are variants 15 | of Bidirectional Recurrent Neural Networks (BRNN). 16 | 17 | ## Summary 18 | 19 | |Notebook|Environment|Description|Dataset | Language 20 | |---|---|---|---|----| 21 | |[Deployed QA System in Under 20 minutes](question_answering_system_bidaf_quickstart.ipynb)|Azure Container Instances| Learn how to deploy a QA system in under 20 minutes using Azure Container Instances (ACI) and a popular AllenNLP pre-trained model called BiDAF.|[SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)| English | 22 | |[BiDAF Deep Dive](bidaf_aml_deep_dive.ipynb)|Azure ML| Learn about the architecture of the BiDAF model and how to train it from scratch using the AllenNLP library on the AzureML platform.|[SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) | English | 23 | |[Pretrained BERT SQuAD Deep Dive](pretrained-BERT-SQuAD-deep-dive-aml.ipynb)|Azure ML| Learn about the mechanism of the BERT model in an end to end pipeline on the AzureML platform and how to fine tune it from scratch using the distributed training with Horovod. 
Show the improvement on the model performance using hyper-parameter tuning|[SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)| English | 24 | 25 | -------------------------------------------------------------------------------- /examples/question_answering/bidaf_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "type": "squad", 4 | "token_indexers": { 5 | "tokens": { 6 | "type": "single_id", 7 | "lowercase_tokens": true 8 | }, 9 | "token_characters": { 10 | "type": "characters", 11 | "character_tokenizer": { 12 | "byte_encoding": "utf-8", 13 | "start_tokens": [259], 14 | "end_tokens": [260] 15 | }, 16 | "min_padding_length": 5 17 | } 18 | } 19 | }, 20 | "train_data_path": "https://allennlp.s3.amazonaws.com/datasets/squad/squad-train-v1.1.json", 21 | "validation_data_path": "https://allennlp.s3.amazonaws.com/datasets/squad/squad-dev-v1.1.json", 22 | "evaluate_on_test": true, 23 | "model": { 24 | "type": "bidaf", 25 | "text_field_embedder": { 26 | "token_embedders": { 27 | "tokens": { 28 | "type": "embedding", 29 | "pretrained_file": "https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.100d.txt.gz", 30 | "embedding_dim": 100, 31 | "trainable": false 32 | }, 33 | "token_characters": { 34 | "type": "character_encoding", 35 | "embedding": { 36 | "num_embeddings": 262, 37 | "embedding_dim": 16 38 | }, 39 | "encoder": { 40 | "type": "cnn", 41 | "embedding_dim": 16, 42 | "num_filters": 100, 43 | "ngram_filter_sizes": [5] 44 | }, 45 | "dropout": 0.2 46 | } 47 | } 48 | }, 49 | "num_highway_layers": 2, 50 | "phrase_layer": { 51 | "type": "lstm", 52 | "bidirectional": true, 53 | "input_size": 200, 54 | "hidden_size": 100, 55 | "num_layers": 1 56 | }, 57 | "similarity_function": { 58 | "type": "linear", 59 | "combination": "x,y,x*y", 60 | "tensor_1_dim": 200, 61 | "tensor_2_dim": 200 62 | }, 63 | "modeling_layer": { 64 | "type": "lstm", 65 | "bidirectional": true, 66 | "input_size": 800, 67 | "hidden_size": 100, 68 | "num_layers": 2, 69 | "dropout": 0.2 70 | }, 71 | "span_end_encoder": { 72 | "type": "lstm", 73 | "bidirectional": true, 74 | "input_size": 1400, 75 | "hidden_size": 100, 76 | "num_layers": 1 77 | }, 78 | "dropout": 0.2 79 | }, 80 | "iterator": { 81 | "type": "bucket", 82 | "sorting_keys": [["passage", "num_tokens"], ["question", "num_tokens"]], 83 | "batch_size": 40 84 | }, 85 | 86 | "trainer": { 87 | "num_epochs": 20, 88 | "grad_norm": 5.0, 89 | "patience": 10, 90 | "validation_metric": "+em", 91 | "cuda_device": 0, 92 | "learning_rate_scheduler": { 93 | "type": "reduce_on_plateau", 94 | "factor": 0.5, 95 | "mode": "max", 96 | "patience": 2 97 | }, 98 | "optimizer": { 99 | "type": "adam", 100 | "betas": [0.9, 0.9] 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /examples/sentence_similarity/gensen_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "training": { 3 | "optimizer": "adam", 4 | "clip_c": 1, 5 | "lrate": 0.0001, 6 | "batch_size": 48, 7 | "n_gpus": 1, 8 | "stop_patience": 2 9 | }, 10 | "management": { 11 | "monitor_loss": 480, 12 | "print_samples": 12800, 13 | "checkpoint_freq": 480000, 14 | "eval_freq": 9600 15 | }, 16 | "data": {"paths": [ 17 | { 18 | "train_src": "snli_1.0_train.txt.s1.tok", 19 | "train_trg": "snli_1.0_train.txt.s2.tok", 20 | "val_src": "snli_1.0_dev.txt.s1.tok", 21 | "val_trg": "snli_1.0_dev.txt.s1.tok", 22 | "taskname": "snli" 23 | } 24 | ], 25 | "max_src_length": 
90,
26 | "max_trg_length": 90,
27 | "task": "multi-seq2seq-nli",
28 | "save_dir": "models/",
29 | "nli_train": "snli_1.0_train.txt.clean.noblank",
30 | "nli_dev": "snli_1.0_dev.txt.clean.noblank",
31 | "nli_test": "snli_1.0_test.txt.clean.noblank"
32 | },
33 | "model": {
34 | "dim_src": 2048,
35 | "dim_trg": 2048,
36 | "dim_word_src": 512,
37 | "dim_word_trg": 512,
38 | "n_words_src": 80000,
39 | "n_words_trg": 30000,
40 | "n_layers_src": 1,
41 | "bidirectional": true,
42 | "layernorm": false,
43 | "dropout": 0.8
44 | }
45 | }
46 |
-------------------------------------------------------------------------------- /examples/sentiment_analysis/absa/README.md: --------------------------------------------------------------------------------
1 | # Aspect Based Sentiment Analysis
2 |
3 | This folder contains examples and best practices, written in Jupyter notebooks, for training [Aspect Based Sentiment Analysis Models using Intel's NLP Architect](http://nlp_architect.nervanasys.com/absa.html)
4 | with the Azure Machine Learning service.
5 |
6 | # What is Aspect Based Sentiment Analysis?
7 |
8 | Aspect based sentiment analysis (ABSA) is an advanced sentiment analysis technique that identifies the aspects of a given text and provides corresponding sentiment scores for them. ABSA is a powerful tool for getting actionable insight from your text data.
9 |
10 | For example, consider the following restaurant review:
11 |
12 | ```
13 | The ambiance is charming. Uncharacteristically, the service was DREADFUL. When we wanted to pay our bill at the end of the evening, our waitress was nowhere to be found...
14 | ```
15 |
16 | While a traditional sentiment analysis model such as [Azure Text Analytics](https://azure.microsoft.com/en-us/services/cognitive-services/text-analytics/?WT.mc_id=absa-notebook-abornst) will correctly classify the overall sentiment of this review as negative, an aspect based model provides more granular insight by highlighting the fact that, while the **service** and **waitress** provided a negative experience, the restaurant's **ambiance** was indeed positive.
17 |
18 | ## Summary
19 |
20 | |Notebook|Environment|Description|Dataset|
21 | |---|---|---|---|
22 | |[Aspect based sentiment analysis](absa.ipynb)|Local| A notebook for training and deploying [Aspect Based Sentiment Analysis Models using Intel's NLP Architect](http://nlp_architect.nervanasys.com/absa.html) |
23 |
-------------------------------------------------------------------------------- /examples/sentiment_analysis/absa/dataset/data.md: --------------------------------------------------------------------------------
1 | # About the Dataset
2 |
3 | Review data for this demo is sourced from the text reviews of the [Women's E-Commerce Clothing Review](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews/) dataset. The dataset has a CC0: Public Domain License and has been reformatted to build validation and training sets for both standard sentiment analysis and ABSA models.
4 |
-------------------------------------------------------------------------------- /examples/text_classification/README.md: --------------------------------------------------------------------------------
1 | # Text Classification
2 | This folder contains examples and best practices, written in Jupyter notebooks, for building text classification models. We use the
3 | utility scripts in the [utils_nlp](../../utils_nlp) folder to speed up data preprocessing and model building for text classification.
4 | The models can be used in a wide variety of applications, such as
5 | sentiment analysis, document indexing in digital libraries, hate speech detection, and general-purpose categorization in medical, academic, legal, and many other domains.
6 | Currently, we focus on fine-tuning pre-trained BERT and XLNet models. We plan to continue adding state-of-the-art models as they come up and welcome community
7 | contributions.
8 |
9 | ## What is Text Classification?
10 | Text classification is a supervised learning method for predicting the category or the
11 | class of a document given its text content. The state-of-the-art methods are based on neural
12 | networks of different architectures as well as pre-trained language models or word embeddings.
13 |
14 |
15 | ## Summary
16 |
17 | The following summarizes each notebook for Text Classification. Each notebook provides more details and guiding principles for building state-of-the-art models.
18 |
19 | |Notebook|Environment|Description|Dataset|
20 | |---|---|---|---|
21 | |[BERT for text classification on AzureML](tc_bert_azureml.ipynb) |Azure ML|A notebook which walks through fine-tuning and evaluating a pre-trained BERT model on a distributed setup with AzureML. |[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)|
22 | |[Text Classification of MultiNLI Sentences using Multiple Transformer Models](tc_mnli_transformers.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a number of pre-trained transformer models|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)|
23 | |[Text Classification of Multi Language Datasets using Transformer Model](tc_multi_languages_transformers.ipynb)|Local|A notebook which walks through fine-tuning and evaluating a pre-trained transformer model for multiple datasets in different languages|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)<br>[BBC Hindi News](https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1)<br>
[DAC](https://data.mendeley.com/datasets/v524p5dhpj/2) 24 | -------------------------------------------------------------------------------- /examples/text_summarization/abstractive_summarization_unilm_cnndm.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import argparse 3 | import jsonlines 4 | 5 | import torch 6 | 7 | from utils_nlp.models.transformers.abstractive_summarization_seq2seq import ( 8 | S2SAbsSumProcessor, 9 | S2SAbstractiveSummarizer 10 | ) 11 | 12 | from utils_nlp.eval import compute_rouge_python 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument( 16 | "--local_rank", type=int, default=-1, help="For distributed training: local_rank" 17 | ) 18 | parser.add_argument("--fp16", type=bool, default=False) 19 | parser.add_argument("--fp16_opt_level", type=str, default="O2") 20 | args = parser.parse_args() 21 | 22 | 23 | QUICK_RUN = True 24 | OUTPUT_FILE = "./nlp_cnndm_finetuning_results.txt" 25 | 26 | # model parameters 27 | MODEL_NAME = "unilm-large-cased" 28 | MAX_SEQ_LENGTH = 768 29 | MAX_SOURCE_SEQ_LENGTH = 640 30 | MAX_TARGET_SEQ_LENGTH = 128 31 | 32 | # fine-tuning parameters 33 | TRAIN_PER_GPU_BATCH_SIZE = 1 34 | GRADIENT_ACCUMULATION_STEPS = 2 35 | LEARNING_RATE = 3e-5 36 | if QUICK_RUN: 37 | TOP_N = 100 38 | WARMUP_STEPS = 10 39 | MAX_STEPS = 100 40 | else: 41 | TOP_N = -1 42 | WARMUP_STEPS = 500 43 | MAX_STEPS = 5000 44 | 45 | # inference parameters 46 | TEST_PER_GPU_BATCH_SIZE = 8 47 | BEAM_SIZE = 5 48 | FORBID_IGNORE_WORD = "." 49 | 50 | train_ds = "train_ds.jsonl" 51 | test_ds = "test_ds.jsonl" 52 | 53 | 54 | def main(): 55 | torch.distributed.init_process_group( 56 | timeout=datetime.timedelta(0, 5400), backend="nccl", 57 | ) 58 | 59 | if args.local_rank not in [-1, 0]: 60 | torch.distributed.barrier() 61 | 62 | processor = S2SAbsSumProcessor(model_name=MODEL_NAME) 63 | 64 | abs_summarizer = S2SAbstractiveSummarizer( 65 | model_name=MODEL_NAME, 66 | max_seq_length=MAX_SEQ_LENGTH, 67 | max_source_seq_length=MAX_SOURCE_SEQ_LENGTH, 68 | max_target_seq_length=MAX_TARGET_SEQ_LENGTH, 69 | ) 70 | 71 | if args.local_rank == 0: 72 | torch.distributed.barrier() 73 | 74 | train_dataset = processor.s2s_dataset_from_json_or_file( 75 | train_ds, train_mode=True, local_rank=args.local_rank 76 | ) 77 | 78 | test_dataset = processor.s2s_dataset_from_json_or_file( 79 | test_ds, train_mode=False, local_rank=args.local_rank 80 | ) 81 | 82 | abs_summarizer.fit( 83 | train_dataset=train_dataset, 84 | per_gpu_batch_size=TRAIN_PER_GPU_BATCH_SIZE, 85 | gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, 86 | learning_rate=LEARNING_RATE, 87 | warmup_steps=WARMUP_STEPS, 88 | max_steps=MAX_STEPS, 89 | fp16=args.fp16, 90 | fp16_opt_level=args.fp16_opt_level, 91 | local_rank=args.local_rank, 92 | save_model_to_dir=".", 93 | ) 94 | 95 | torch.distributed.barrier() 96 | 97 | if args.local_rank in [-1, 0]: 98 | res = abs_summarizer.predict( 99 | test_dataset=test_dataset, 100 | per_gpu_batch_size=TEST_PER_GPU_BATCH_SIZE, 101 | beam_size=BEAM_SIZE, 102 | forbid_ignore_word=FORBID_IGNORE_WORD, 103 | fp16=args.fp16, 104 | ) 105 | 106 | for r in res[:5]: 107 | print(r) 108 | 109 | with open(OUTPUT_FILE, "w", encoding="utf-8") as f: 110 | for line in res: 111 | f.write(line + "\n") 112 | 113 | tgt = [] 114 | with jsonlines.open(test_ds) as reader: 115 | for item in reader: 116 | tgt.append(item["tgt"]) 117 | 118 | for t in tgt[:5]: 119 | print(t) 120 | 121 | print(compute_rouge_python(cand=res, ref=tgt)) 122 | 123 | 124 
| if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | #!/usr/bin/env python 5 | # -*- encoding: utf-8 -*- 6 | from __future__ import absolute_import 7 | from __future__ import print_function 8 | import io 9 | import re 10 | from os.path import dirname, join 11 | from setuptools import setup 12 | from utils_nlp import VERSION, AUTHOR, TITLE, LICENSE 13 | 14 | 15 | def read(*names, **kwargs): 16 | with io.open(join(dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8")) as fh: 17 | return fh.read() 18 | 19 | 20 | setup( 21 | name="utils_nlp", 22 | version=VERSION, 23 | license=LICENSE, 24 | description="NLP Utility functions that are used for best practices in building state-of-the-art NLP methods and scenarios. Developed by Microsoft AI CAT", 25 | long_description="%s\n%s" 26 | % ( 27 | re.compile("^.. start-badges.*^.. end-badges", re.M | re.S).sub("", read("README.md")), 28 | re.sub(":[a-z]+:`~?(.*?)`", r"``\1``", read("CONTRIBUTING.md")), 29 | ), 30 | author=AUTHOR, 31 | author_email="teamsharat@microsoft.com", 32 | url="https://github.com/microsoft/nlp-recipes", 33 | packages=["utils_nlp"], 34 | include_package_data=True, 35 | zip_safe=True, 36 | classifiers=[ 37 | # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers 38 | "Development Status :: 5 - Production/Stable", 39 | "Intended Audience :: Developers", 40 | "License :: OSI Approved :: MIT License", 41 | "Operating System :: Unix", 42 | "Operating System :: POSIX", 43 | "Operating System :: Microsoft :: Windows", 44 | "Programming Language :: Python :: 3.6", 45 | "Programming Language :: Python :: 3.7", 46 | "Programming Language :: Python :: Implementation :: CPython", 47 | "Programming Language :: Python :: Implementation :: PyPy", 48 | "Topic :: Text Processing :: Linguistic", 49 | "Topic :: Utilities", 50 | "Intended Audience :: Science/Research", 51 | "Intended Audience :: Developers", 52 | "Intended Audience :: Education", 53 | "Intended Audience :: Financial and Insurance Industry", 54 | "Intended Audience :: Healthcare Industry", 55 | "Intended Audience :: Information Technology", 56 | "Intended Audience :: Telecommunications Industry", 57 | ], 58 | project_urls={ 59 | "Documentation": "https://github.com/microsoft/nlp-recipes/", 60 | "Issue Tracker": "https://github.com/microsoft/nlp-recipes/issues", 61 | }, 62 | keywords=[ 63 | "Microsoft NLP", 64 | "NLP Recipes", 65 | "Natural Language Processing", 66 | "Text Processing", 67 | "Word Embedding", 68 | ], 69 | python_requires=">=3.6", 70 | install_requires=[], 71 | dependency_links=[], 72 | extras_require={}, 73 | use_scm_version=False, 74 | setup_requires=[], 75 | ) 76 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/nlp-recipes/7db6d204e5116da07bb3c549df546e49cb7ab5a5/tests/__init__.py -------------------------------------------------------------------------------- 
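The CI pipeline definitions that follow run the test suites with `pytest` and marker expressions such as `"not notebooks and not gpu and not azureml"`. As a minimal sketch of running the CPU-only unit tests locally with the same marker expression (assuming the repo's conda environment, e.g. `nlp_cpu`, is active and the package is installed):

```python
# Run the CPU-only unit tests with the marker expression used by the CI pipeline.
import sys

import pytest

exit_code = pytest.main(
    ["tests/unit", "-m", "not notebooks and not gpu and not azureml", "--durations=0"]
)
sys.exit(exit_code)
```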
/tests/ci/azureml_integration_tests.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | # More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers 5 | # Implementing the scheduler from the dashboard 6 | # Uncomment in case it wants to be done from using the yml 7 | #schedules: 8 | #- cron: "56 22 * * *" 9 | # displayName: Daily computation of nightly builds 10 | # branches: 11 | # include: 12 | # - master 13 | # always: true 14 | 15 | # no PR builds 16 | pr: none 17 | 18 | # no CI trigger 19 | trigger: none 20 | 21 | variables: 22 | - group: AzureMLConfig 23 | - name : 'resource_group' 24 | value : 'nlpbp_project_resources' 25 | - name : 'workspace_name' 26 | value : 'nlpazuremltestws' 27 | - name : 'workspace_region' 28 | value : 'eastus2' 29 | - name : 'junitxml' 30 | value : 'reports/test-azureml.xml' 31 | 32 | jobs: 33 | - job: AzureMLNotebookTest 34 | timeoutInMinutes: 300 # how long to run the job before automatically cancelling 35 | pool: 36 | vmImage: 'Ubuntu-16.04' 37 | steps: 38 | - bash: | 39 | echo "##vso[task.prependpath]/usr/share/miniconda/bin" 40 | displayName: Add Conda to PATH 41 | 42 | - bash: | 43 | python tools/generate_conda_file.py --gpu 44 | conda env create -n nlp_gpu -f nlp_gpu.yaml 45 | pip install paramiko==2.4.2 46 | source activate nlp_gpu 47 | conda env list 48 | echo Login Azure Account 49 | az login --service-principal -u $(spidentity) -p $(spsecret) --tenant $(sptenant) 50 | az account set --subscription $(subscriptionid) 51 | displayName: 'Create and activate conda environment' 52 | 53 | - bash: | 54 | source activate nlp_gpu 55 | pytest --durations=0 tests/integration -m "azureml" -q --subscription_id=$(subscriptionid) --resource_group=$(resource_group) --workspace_name=$(workspace_name) --workspace_region=$(workspace_region) --junitxml $(junitxml) 56 | displayName: 'Run AzureML notebook tests' 57 | 58 | - bash: | 59 | echo Ensure Resource Group Deletion $(resource_group) 60 | existResponse=$(az group exists -n $(resource_group)) 61 | if [ "$existResponse" == "true" ]; then 62 | echo Deleting project resource group 63 | az group delete --name $(resource_group) --yes 64 | else 65 | echo Project resource group did not exist 66 | fi 67 | echo Done Cleanup 68 | displayName: 'Cleanup Task' 69 | condition: always() 70 | 71 | - task: PublishTestResults@2 72 | displayName: 'Publish Test Results **/test-*.xml' 73 | inputs: 74 | testResultsFiles: '**/test-*.xml' 75 | failTaskOnFailedTests: true 76 | condition: succeededOrFailed() -------------------------------------------------------------------------------- /tests/ci/component_governance.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | 4 | # Pull request against these branches will trigger this build 5 | pr: 6 | - master 7 | 8 | # no CI trigger 9 | trigger: none 10 | 11 | jobs: 12 | - job: Component_governance 13 | timeoutInMinutes: 20 # how long to run the job before automatically cancelling 14 | pool: 15 | vmImage: 'ubuntu-16.04' 16 | 17 | steps: 18 | - bash: | 19 | python tools/generate_requirements_txt.py 20 | displayName: 'Generate requirements.txt file from generate_conda_file.py' 21 | 22 | - task: ComponentGovernanceComponentDetection@0 23 | inputs: 24 | scanType: 'Register' 25 | verbosity: 'Verbose' 26 | alertWarningLevel: 'High' 27 | -------------------------------------------------------------------------------- /tests/ci/cpu_integration_tests_linux.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | 5 | # More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers 6 | # Implementing the scheduler from the dashboard 7 | # Uncomment in case it wants to be done from using the yml 8 | #schedules: 9 | #- cron: "56 22 * * *" 10 | # displayName: Daily computation of nightly builds 11 | # branches: 12 | # include: 13 | # - master 14 | # always: true 15 | 16 | 17 | # no PR builds 18 | pr: none 19 | 20 | # no CI trigger 21 | trigger: none 22 | 23 | jobs: 24 | - job: nightly 25 | displayName : 'Nightly tests' 26 | timeoutInMinutes: 180 # how long to run the job before automatically cancelling 27 | pool: 28 | name: nlpagentpool 29 | 30 | steps: 31 | - bash: | 32 | echo "##vso[task.prependpath]/data/anaconda/bin" 33 | conda env list 34 | displayName: 'Add Conda to PATH' 35 | 36 | # Conda creation can take around 10min 37 | - bash: | 38 | python tools/generate_conda_file.py 39 | conda env create -n integration_cpu -f nlp_cpu.yaml 40 | displayName: 'Creating Conda Environment with dependencies' 41 | 42 | - bash: | 43 | source activate integration_cpu 44 | pytest --durations=0 tests/smoke -m "smoke and not gpu and not azureml" --junitxml=junit/test-smoke-test.xml 45 | displayName: 'Run smoke tests' 46 | 47 | - bash: | 48 | source activate integration_cpu 49 | pytest --durations=0 tests/integration -m "integration and not gpu and not azureml" --junitxml=junit/test-integration-test.xml 50 | displayName: 'Run integration tests' 51 | 52 | - bash: | 53 | echo Remove Conda Environment 54 | conda remove -n integration_cpu --all -q --force -y 55 | echo Done Cleanup 56 | displayName: 'Cleanup Task' 57 | condition: always() 58 | 59 | - task: PublishTestResults@2 60 | inputs: 61 | testResultsFiles: '**/test-*-test.xml' 62 | testRunTitle: 'Test results for PyTest' -------------------------------------------------------------------------------- /tests/ci/cpu_unit_tests_linux.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | # Pull request against these branches will trigger this build 5 | pr: 6 | - master 7 | - staging 8 | 9 | #Any commit to this branch will trigger the build. 
10 | trigger: 11 | - staging 12 | - master 13 | 14 | 15 | jobs: 16 | - job: cpu_unit_tests_linux 17 | timeoutInMinutes: 12 # how long to run the job before automatically cancelling 18 | pool: 19 | # vmImage: 'ubuntu-16.04' # hosted machine 20 | name: nlpagentpool 21 | 22 | steps: 23 | 24 | # Uncomment if hosted machine 25 | # - task: UsePythonVersion@0 26 | # inputs: 27 | # versionSpec: '3.6.8' 28 | # architecture: 'x64' 29 | # addToPath: true 30 | # displayName: 'Use Python 3.6.8' 31 | 32 | - bash: | 33 | echo "##vso[task.prependpath]/data/anaconda/bin" 34 | conda env list 35 | displayName: Add Conda to PATH 36 | 37 | # Uncomment if needed 38 | # Conda creation can take around 10min 39 | # - bash: | 40 | # python tools/generate_conda_file.py 41 | # conda env create -n nlp_cpu -f nlp_cpu.yaml 42 | # displayName: 'Creating Conda Environment with dependencies' 43 | 44 | - bash: | 45 | source activate nlp_cpu 46 | pytest --durations=0 tests/unit -m "not notebooks and not gpu and not azureml" --junitxml=junit/test-unitttest.xml 47 | displayName: 'Run Unit tests' 48 | 49 | # Uncomment if needed 50 | # - bash: | 51 | # echo Remove Conda Environment 52 | # conda remove -n nlp_cpu --all -q --force -y 53 | # echo Done Cleanup 54 | # displayName: 'Cleanup Task' 55 | # condition: always() 56 | 57 | - task: PublishTestResults@2 58 | inputs: 59 | testResultsFiles: '**/test-unitttest.xml' 60 | testRunTitle: 'Test results for PyTest' 61 | 62 | 63 | -------------------------------------------------------------------------------- /tests/ci/gpu_integration_tests_linux.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | 5 | # More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers 6 | # Implementing the scheduler from the dashboard 7 | # Uncomment in case it wants to be done from using the yml 8 | #schedules: 9 | #- cron: "56 11 * * *" 10 | # displayName: Daily computation of nightly builds 11 | # branches: 12 | # include: 13 | # - master 14 | # always: true 15 | 16 | 17 | # no PR builds 18 | pr: none 19 | 20 | # no CI trigger 21 | trigger: none 22 | 23 | jobs: 24 | - job: nightly 25 | displayName : 'Nightly tests' 26 | timeoutInMinutes: 180 # how long to run the job before automatically cancelling 27 | pool: 28 | name: nlpagentpool 29 | 30 | steps: 31 | - bash: | 32 | echo "##vso[task.prependpath]/data/anaconda/bin" 33 | conda env list 34 | displayName: 'Add Conda to PATH' 35 | 36 | # Conda creation can take around 10min 37 | - bash: | 38 | python tools/generate_conda_file.py --gpu 39 | conda env create -n integration_gpu -f nlp_gpu.yaml 40 | displayName: 'Creating Conda Environment with dependencies' 41 | 42 | - bash: | 43 | source activate integration_gpu 44 | pytest --durations=0 tests/smoke -m "smoke and gpu and not azureml" --junitxml=junit/test-smoke-test.xml 45 | displayName: 'Run smoke tests' 46 | 47 | - bash: | 48 | source activate integration_gpu 49 | pytest --durations=0 tests/integration -m "integration and gpu and not azureml" --junitxml=junit/test-integration-test.xml 50 | displayName: 'Run integration tests' 51 | 52 | - bash: | 53 | echo Remove Conda Environment 54 | conda remove -n integration_gpu --all -q --force -y 55 | echo Done Cleanup 56 | displayName: 'Cleanup Task' 57 | condition: always() 58 | 59 | - task: PublishTestResults@2 60 | inputs: 61 | 
testResultsFiles: '**/test-*-test.xml' 62 | testRunTitle: 'Test results for PyTest' 63 | -------------------------------------------------------------------------------- /tests/ci/gpu_unit_tests_linux.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | # Pull request against these branches will trigger this build 5 | pr: 6 | - master 7 | - staging 8 | 9 | #Any commit to this branch will trigger the build. 10 | trigger: 11 | - staging 12 | - master 13 | 14 | jobs: 15 | - job: gpu_unit_tests_linux 16 | timeoutInMinutes: 30 # how long to run the job before automatically cancelling 17 | pool: 18 | name: nlpagentpool 19 | 20 | steps: 21 | - bash: | 22 | echo "##vso[task.prependpath]/data/anaconda/bin" 23 | conda env list 24 | displayName: Add Conda to PATH 25 | 26 | # Uncomment if needed 27 | # Conda creation can take around 10min 28 | # - bash: | 29 | # python tools/generate_conda_file.py --gpu 30 | # conda env create -n nlp_gpu -f nlp_gpu.yaml 31 | # displayName: 'Creating Conda Environment with dependencies' 32 | 33 | - bash: | 34 | source activate nlp_gpu 35 | pytest --durations=0 tests/unit -m "not notebooks and gpu and not azureml" --junitxml=junit/test-unitttest.xml 36 | displayName: 'Run Unit tests' 37 | 38 | # Uncomment if needed 39 | # - bash: | 40 | # echo Remove Conda Environment 41 | # conda remove -n nlp_gpu --all -q --force -y 42 | # echo Done Cleanup 43 | # displayName: 'Cleanup Task' 44 | # condition: always() 45 | 46 | - task: PublishTestResults@2 47 | inputs: 48 | testResultsFiles: '**/test-unitttest.xml' 49 | testRunTitle: 'Test results for PyTest' 50 | 51 | 52 | -------------------------------------------------------------------------------- /tests/ci/notebooks_cpu_unit_tests_linux.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | # Pull request against these branches will trigger this build 5 | pr: 6 | - master 7 | - staging 8 | 9 | #Any commit to this branch will trigger the build. 
10 | trigger: 11 | - staging 12 | - master 13 | 14 | jobs: 15 | - job: notebooks_cpu_unit_tests_linux 16 | timeoutInMinutes: 10 # how long to run the job before automatically cancelling 17 | pool: 18 | name: nlpagentpool 19 | 20 | steps: 21 | - bash: | 22 | echo "##vso[task.prependpath]/data/anaconda/bin" 23 | conda env list 24 | displayName: Add Conda to PATH 25 | 26 | # Uncomment if needed 27 | # Conda creation can take around 10min 28 | # - bash: | 29 | # python tools/generate_conda_file.py 30 | # conda env create -n nlp_cpu -f nlp_cpu.yaml 31 | # displayName: 'Creating Conda Environment with dependencies' 32 | 33 | - bash: | 34 | source activate nlp_cpu 35 | pytest --durations=0 tests/unit -m "notebooks and not gpu and not azureml" --junitxml=junit/test-unitttest.xml 36 | displayName: 'Run Unit tests' 37 | 38 | # Uncomment if needed 39 | # - bash: | 40 | # echo Remove Conda Environment 41 | # conda remove -n nlp_cpu --all -q --force -y 42 | # echo Done Cleanup 43 | # displayName: 'Cleanup Task' 44 | # condition: always() 45 | 46 | - task: PublishTestResults@2 47 | inputs: 48 | testResultsFiles: '**/test-unitttest.xml' 49 | testRunTitle: 'Test results for PyTest' 50 | 51 | -------------------------------------------------------------------------------- /tests/ci/notebooks_gpu_unit_tests_linux.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | # Pull request against these branches will trigger this build 5 | pr: 6 | - master 7 | - staging 8 | 9 | #Any commit to this branch will trigger the build. 10 | trigger: 11 | - staging 12 | - master 13 | 14 | jobs: 15 | - job: notebooks_gpu_unit_tests_linux 16 | timeoutInMinutes: 10 # how long to run the job before automatically cancelling 17 | pool: 18 | name: nlpagentpool 19 | 20 | steps: 21 | - bash: | 22 | echo "##vso[task.prependpath]/data/anaconda/bin" 23 | conda env list 24 | displayName: Add Conda to PATH 25 | 26 | # Uncomment if needed 27 | # Conda creation can take around 10min 28 | # - bash: | 29 | # python tools/generate_conda_file.py --gpu 30 | # conda env create -n nlp_gpu -f nlp_gpu.yaml 31 | # displayName: 'Creating Conda Environment with dependencies' 32 | 33 | - bash: | 34 | source activate nlp_gpu 35 | pytest --durations=0 tests/unit -m "notebooks and gpu and not azureml" --junitxml=junit/test-unitttest.xml 36 | displayName: 'Run Unit tests' 37 | 38 | # Uncomment if needed 39 | # - bash: | 40 | # echo Remove Conda Environment 41 | # conda remove -n nlp_gpu --all -q --force -y 42 | # echo Done Cleanup 43 | # displayName: 'Cleanup Task' 44 | # condition: always() 45 | 46 | - task: PublishTestResults@2 47 | inputs: 48 | testResultsFiles: '**/test-unitttest.xml' 49 | testRunTitle: 'Test results for PyTest' 50 | 51 | -------------------------------------------------------------------------------- /tests/integration/test_ddp_summarization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | 4 | import os 5 | import pytest 6 | import torch 7 | 8 | @pytest.mark.gpu 9 | @pytest.mark.integration 10 | def test_ddp_extractive_summarization_cnndm_transformers(scripts, tmp): 11 | ddp_env = os.environ.copy() 12 | ddp_env["OMP_NUM_THREADS"] = str(torch.cuda.device_count()) 13 | ddp_env["KMP_AFFINITY"] = "verbose" 14 | script = scripts["ddp_bertsumext"] 15 | summary_filename = "bertsumext_prediction.txt" 16 | import subprocess 17 | 18 | process = subprocess.Popen( 19 | [ 20 | "python", 21 | script, 22 | "--data_dir", 23 | tmp, 24 | "--cache_dir", 25 | tmp, 26 | "--output_dir", 27 | tmp, 28 | "--quick_run", 29 | "true", 30 | "--summary_filename", 31 | summary_filename, 32 | ], 33 | env=ddp_env, 34 | stdout=subprocess.PIPE, 35 | stderr=subprocess.PIPE, 36 | ) 37 | stdout, stderr = process.communicate() 38 | print(stdout) 39 | if process.returncode: 40 | print(stdout) 41 | print(stderr) 42 | assert False 43 | assert os.path.exists(os.path.join(tmp, summary_filename)) 44 | 45 | 46 | @pytest.mark.skip( 47 | reason="""it takes too long; if the previous test works, 48 | and the notebook runs, this should also work.""" 49 | ) 50 | @pytest.mark.gpu 51 | @pytest.mark.integration 52 | def test_ddp_abstractive_summarization_cnndm_transformers(scripts, tmp): 53 | script = scripts["ddp_bertsumabs"] 54 | summary_filename = "bertsumabs_prediction.txt" 55 | import subprocess 56 | 57 | process = subprocess.Popen( 58 | [ 59 | "python", 60 | script, 61 | "--data_dir", 62 | tmp, 63 | "--cache_dir", 64 | tmp, 65 | "--output_dir", 66 | tmp, 67 | "--quick_run", 68 | "true", 69 | "--batch_size", 70 | "1", 71 | "--summary_filename", 72 | summary_filename, 73 | ], 74 | stdout=subprocess.PIPE, 75 | stderr=subprocess.PIPE, 76 | ) 77 | stdout, stderr = process.communicate() 78 | print(stdout) 79 | if process.returncode: 80 | print(stdout) 81 | print(stderr) 82 | assert False 83 | assert os.path.exists(os.path.join(tmp, summary_filename)) 84 | -------------------------------------------------------------------------------- /tests/integration/test_gpu_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import torch 6 | 7 | 8 | @pytest.mark.gpu 9 | @pytest.mark.integration 10 | def test_machine_is_gpu_machine(): 11 | assert torch.cuda.is_available() is True 12 | -------------------------------------------------------------------------------- /tests/integration/test_notebooks_abstractive_summarization_bertsumabs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
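The distributed-summarization test above launches the training script in a subprocess, copies the environment, and checks the return code by hand; keeping the explicit `Popen`/`communicate` pair makes stdout and stderr available for printing on failure, which is useful in CI logs. A slightly more compact equivalent with the same arguments is sketched below, purely for illustration.

```python
# Illustrative alternative to the manual Popen/communicate/returncode handling above.
import os
import subprocess
import sys


def run_ddp_script(script, tmp, summary_filename="bertsumext_prediction.txt"):
    # "1" is a placeholder; the test sets OMP_NUM_THREADS to the GPU count.
    env = dict(os.environ, OMP_NUM_THREADS="1", KMP_AFFINITY="verbose")
    subprocess.run(
        [sys.executable, script, "--data_dir", tmp, "--cache_dir", tmp,
         "--output_dir", tmp, "--quick_run", "true",
         "--summary_filename", summary_filename],
        env=env,
        check=True,  # raises CalledProcessError on a non-zero exit code
    )
```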
3 | 4 | import papermill as pm 5 | import pytest 6 | import scrapbook as sb 7 | from tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK 8 | import torch 9 | 10 | ABS_TOL = 0.02 11 | 12 | 13 | @pytest.mark.gpu 14 | @pytest.mark.integration 15 | def test_abstractive_summarization_bertsumabs_cnndm(notebooks, tmp): 16 | notebook_path = notebooks["abstractive_summarization_bertsumabs_cnndm"] 17 | pm.execute_notebook( 18 | notebook_path, 19 | OUTPUT_NOTEBOOK, 20 | kernel_name=KERNEL_NAME, 21 | parameters=dict( 22 | QUICK_RUN=True, 23 | TOP_N=1000, 24 | MAX_POS=512, 25 | DATA_FOLDER=tmp, 26 | CACHE_DIR=tmp, 27 | BATCH_SIZE_PER_GPU=3, 28 | REPORT_EVERY=50, 29 | MAX_STEPS=100, 30 | MODEL_NAME="bert-base-uncased", 31 | ), 32 | ) 33 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 34 | assert pytest.approx(result["rouge_2_f_score"], 0.01, abs=ABS_TOL) 35 | -------------------------------------------------------------------------------- /tests/integration/test_notebooks_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import papermill as pm 6 | 7 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME 8 | 9 | @pytest.mark.integration 10 | @pytest.mark.skip(reason="") 11 | @pytest.mark.notebooks 12 | def test_embedding_trainer_runs(notebooks): 13 | notebook_path = notebooks["embedding_trainer"] 14 | pm.execute_notebook( 15 | notebook_path, 16 | OUTPUT_NOTEBOOK, 17 | kernel_name=KERNEL_NAME, 18 | parameters=dict(NLP_REPO_PATH=".") 19 | ) 20 | -------------------------------------------------------------------------------- /tests/integration/test_notebooks_entailment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
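These notebook tests all follow the same papermill/scrapbook round trip: papermill executes a parameterized copy of the example notebook, the notebook records its metrics with `scrapbook.glue`, and the test reads them back from the executed copy. A minimal sketch of both halves follows; the metric name comes from the test above, the glued value is invented.

```python
# Notebook side (illustrative cell): record a metric on the executed notebook.
import scrapbook as sb

sb.glue("rouge_2_f_score", 0.17)  # value here is made up for illustration

# Test side: read every glued scrap back as a plain dict.
result = sb.read_notebook("output.ipynb").scraps.data_dict
print(result["rouge_2_f_score"])
```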
3 | 4 | import pytest 5 | import papermill as pm 6 | import scrapbook as sb 7 | import os 8 | import json 9 | import shutil 10 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME 11 | 12 | ABS_TOL = 0.1 13 | 14 | 15 | @pytest.mark.gpu 16 | @pytest.mark.integration 17 | def test_entailment_multinli_bert(notebooks, tmp): 18 | notebook_path = notebooks["entailment_multinli_transformers"] 19 | pm.execute_notebook( 20 | notebook_path, 21 | OUTPUT_NOTEBOOK, 22 | parameters={ 23 | "MODEL_NAME": "bert-base-uncased", 24 | "TO_LOWER": True, 25 | "TRAIN_DATA_USED_FRACTION": 0.05, 26 | "DEV_DATA_USED_FRACTION": 0.05, 27 | "NUM_EPOCHS": 1, 28 | "CACHE_DIR": tmp 29 | }, 30 | kernel_name=KERNEL_NAME, 31 | ) 32 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 33 | assert pytest.approx(result["matched_precision"], 0.76, abs=ABS_TOL) 34 | assert pytest.approx(result["matched_recall"], 0.76, abs=ABS_TOL) 35 | assert pytest.approx(result["matched_f1"], 0.76, abs=ABS_TOL) 36 | assert pytest.approx(result["mismatched_precision"], 0.76, abs=ABS_TOL) 37 | assert pytest.approx(result["mismatched_recall"], 0.76, abs=ABS_TOL) 38 | assert pytest.approx(result["mismatched_f1"], 0.76, abs=ABS_TOL) 39 | 40 | @pytest.mark.integration 41 | @pytest.mark.azureml 42 | def test_entailment_xnli_bert_azureml( 43 | notebooks, subscription_id, resource_group, workspace_name, workspace_region, cluster_name 44 | ): 45 | notebook_path = notebooks["entailment_xnli_bert_azureml"] 46 | pm.execute_notebook( 47 | notebook_path, 48 | OUTPUT_NOTEBOOK, 49 | parameters={ 50 | "DATA_PERCENT_USED": 0.0025, 51 | "subscription_id": subscription_id, 52 | "resource_group": resource_group, 53 | "workspace_name": workspace_name, 54 | "workspace_region": workspace_region, 55 | "cluster_name": cluster_name, 56 | }, 57 | kernel_name=KERNEL_NAME, 58 | ) 59 | 60 | with open("outputs/results.json", "r") as handle: 61 | result_dict = json.load(handle) 62 | assert result_dict["weighted avg"]["f1-score"] == pytest.approx(0.2, abs=ABS_TOL) 63 | 64 | if os.path.exists("outputs"): 65 | shutil.rmtree("outputs") 66 | -------------------------------------------------------------------------------- /tests/integration/test_notebooks_extractive_summarization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
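The metric checks in these notebook tests lean on `pytest.approx`, which wraps an *expected* value and is designed to sit on one side of an equality comparison. The comparison form the assertions above appear to intend is sketched below; the metric name and target are taken from the entailment test, and the stand-in result value is invented.

```python
import pytest

ABS_TOL = 0.1
result = {"matched_f1": 0.74}  # stand-in for the scrapbook output read above

# pytest.approx wraps the expected value; the assertion holds when the observed
# metric falls within the absolute tolerance of that target.
assert result["matched_f1"] == pytest.approx(0.76, abs=ABS_TOL)
```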
3 | 4 | import papermill as pm 5 | import pytest 6 | import scrapbook as sb 7 | from tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK 8 | 9 | ABS_TOL = 0.02 10 | 11 | 12 | @pytest.mark.gpu 13 | @pytest.mark.integration 14 | def test_extractive_summarization_cnndm_transformers(notebooks, tmp): 15 | notebook_path = notebooks["extractive_summarization_cnndm_transformer"] 16 | pm.execute_notebook( 17 | notebook_path, 18 | OUTPUT_NOTEBOOK, 19 | kernel_name=KERNEL_NAME, 20 | parameters=dict( 21 | QUICK_RUN=True, 22 | TOP_N=100, 23 | CHUNK_SIZE=200, 24 | USE_PREPROCESSED_DATA=False, 25 | DATA_PATH=tmp, 26 | CACHE_DIR=tmp, 27 | BATCH_SIZE=3000, 28 | REPORT_EVERY=50, 29 | MAX_STEPS=100, 30 | WARMUP_STEPS=5e2, 31 | MODEL_NAME="distilbert-base-uncased", 32 | ), 33 | ) 34 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 35 | assert pytest.approx(result["rouge_2_f_score"], 0.1, abs=ABS_TOL) 36 | 37 | 38 | @pytest.mark.skip(reason="no need to test") 39 | @pytest.mark.gpu 40 | @pytest.mark.integration 41 | def test_extractive_summarization_cnndm_transformers_processed(notebooks, tmp): 42 | notebook_path = notebooks["extractive_summarization_cnndm_transformer"] 43 | pm.execute_notebook( 44 | notebook_path, 45 | OUTPUT_NOTEBOOK, 46 | kernel_name=KERNEL_NAME, 47 | parameters=dict( 48 | QUICK_RUN=True, 49 | TOP_N=100, 50 | CHUNK_SIZE=200, 51 | USE_PREPROCESSED_DATA=True, 52 | DATA_PATH=tmp, 53 | CACHE_DIR=tmp, 54 | PROCESSED_DATA_PATH=tmp, 55 | BATCH_SIZE=3000, 56 | REPORT_EVERY=50, 57 | MAX_STEPS=100, 58 | WARMUP_STEPS=5e2, 59 | MODEL_NAME="distilbert-base-uncased", 60 | ), 61 | ) 62 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 63 | assert pytest.approx(result["rouge_2_f_score"], 0.1, abs=ABS_TOL) 64 | -------------------------------------------------------------------------------- /tests/integration/test_notebooks_interpretability.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import numpy as np 6 | import papermill as pm 7 | import scrapbook as sb 8 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME 9 | 10 | 11 | @pytest.mark.gpu 12 | @pytest.mark.integration 13 | def test_deep_and_unified_understanding(notebooks): 14 | notebook_path = notebooks["deep_and_unified_understanding"] 15 | pm.execute_notebook( 16 | notebook_path, 17 | OUTPUT_NOTEBOOK, 18 | kernel_name=KERNEL_NAME) 19 | 20 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 21 | sigma_numbers = [0.00317593, 0.00172284, 0.00634005, 0.00164305, 0.00317159] 22 | sigma_bert = [0.1735696 , 0.14028822, 0.14590865, 0.2263149 , 0.20640415, 23 | 0.21249843, 0.18685372, 0.14112663, 0.25824168, 0.22399105, 24 | 0.2393731 , 0.12868434, 0.27386534, 0.35876372] 25 | 26 | np.testing.assert_array_almost_equal(result["sigma_numbers"], sigma_numbers, decimal=3) 27 | np.testing.assert_array_almost_equal(result["sigma_bert"], sigma_bert, decimal=1) 28 | -------------------------------------------------------------------------------- /tests/integration/test_notebooks_minilm_abstractive_summarization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
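papermill applies the `parameters=dict(...)` values by injecting a new cell right after the notebook cell tagged `parameters`, so each notebook exercised here declares defaults for those names in such a cell. A sketch of what that cell might look like for the extractive-summarization notebook above is given below; the names come from the test parameters, the default values are assumptions.

```python
# Cell tagged "parameters" in the notebook (illustrative defaults only).
QUICK_RUN = False
TOP_N = -1
USE_PREPROCESSED_DATA = False
MAX_STEPS = 10000
WARMUP_STEPS = 500
MODEL_NAME = "distilbert-base-uncased"
DATA_PATH = "./temp_data"
CACHE_DIR = "./temp_cache"
```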
3 | 4 | import papermill as pm 5 | import pytest 6 | import scrapbook as sb 7 | from tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK 8 | import torch 9 | 10 | ABS_TOL = 0.02 11 | 12 | 13 | @pytest.mark.gpu 14 | @pytest.mark.integration 15 | def test_minilm_abstractive_summarization(notebooks, tmp): 16 | notebook_path = notebooks["minilm_abstractive_summarization"] 17 | pm.execute_notebook( 18 | notebook_path, 19 | OUTPUT_NOTEBOOK, 20 | kernel_name=KERNEL_NAME, 21 | parameters=dict( 22 | QUICK_RUN=True, 23 | NUM_GPUS=torch.cuda.device_count(), 24 | TOP_N=100, 25 | WARMUP_STEPS=5, 26 | MAX_STEPS=50, 27 | GRADIENT_ACCUMULATION_STEPS=1, 28 | TEST_PER_GPU_BATCH_SIZE=2, 29 | BEAM_SIZE=3, 30 | CLEANUP_RESULTS=True, 31 | ), 32 | ) 33 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 34 | assert result["rouge_1_f_score"] == pytest.approx(0.2, abs=ABS_TOL) 35 | assert result["rouge_2_f_score"] == pytest.approx(0.07, abs=ABS_TOL) 36 | assert result["rouge_l_f_score"] == pytest.approx(0.16, abs=ABS_TOL) 37 | 38 | @pytest.mark.cpu 39 | @pytest.mark.integration 40 | def test_minilm_abstractive_summarization_cpu(notebooks, tmp): 41 | notebook_path = notebooks["minilm_abstractive_summarization"] 42 | pm.execute_notebook( 43 | notebook_path, 44 | OUTPUT_NOTEBOOK, 45 | kernel_name=KERNEL_NAME, 46 | parameters=dict( 47 | QUICK_RUN=True, 48 | NUM_GPUS=0, 49 | TOP_N=2, 50 | WARMUP_STEPS=5, 51 | MAX_STEPS=50, 52 | GRADIENT_ACCUMULATION_STEPS=1, 53 | TEST_PER_GPU_BATCH_SIZE=2, 54 | BEAM_SIZE=3, 55 | CLEANUP_RESULTS=True, 56 | ), 57 | ) 58 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 59 | assert result["rouge_1_f_score"] == pytest.approx(0.1, abs=ABS_TOL) 60 | assert result["rouge_2_f_score"] == pytest.approx(0.05, abs=ABS_TOL) 61 | assert result["rouge_l_f_score"] == pytest.approx(0.1, abs=ABS_TOL) 62 | 63 | -------------------------------------------------------------------------------- /tests/integration/test_notebooks_named_entity_recognition.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import papermill as pm 6 | import scrapbook as sb 7 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME 8 | 9 | ABS_TOL = 0.05 10 | 11 | @pytest.mark.gpu 12 | @pytest.mark.integration 13 | def test_ner_wikigold_bert(notebooks, tmp): 14 | notebook_path = notebooks["ner_wikigold_transformer"] 15 | pm.execute_notebook( 16 | notebook_path, 17 | OUTPUT_NOTEBOOK, 18 | parameters={ 19 | "DATA_PATH": tmp, 20 | "CACHE_DIR": tmp 21 | }, 22 | kernel_name=KERNEL_NAME, 23 | ) 24 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 25 | assert result["precision"] == pytest.approx(0.80, abs=ABS_TOL) 26 | assert result["recall"] == pytest.approx(0.83, abs=ABS_TOL) 27 | assert result["f1"] == pytest.approx(0.83, abs=ABS_TOL) -------------------------------------------------------------------------------- /tests/integration/test_notebooks_text_classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License.
3 | 4 | import os 5 | import json 6 | import shutil 7 | import pytest 8 | import papermill as pm 9 | import scrapbook as sb 10 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME 11 | 12 | 13 | ABS_TOL = 0.1 14 | 15 | 16 | @pytest.mark.gpu 17 | @pytest.mark.integration 18 | def test_tc_mnli_transformers(notebooks, tmp): 19 | notebook_path = notebooks["tc_mnli_transformers"] 20 | pm.execute_notebook( 21 | notebook_path, 22 | OUTPUT_NOTEBOOK, 23 | kernel_name=KERNEL_NAME, 24 | parameters=dict( 25 | NUM_GPUS=1, 26 | DATA_FOLDER=tmp, 27 | CACHE_DIR=tmp, 28 | BATCH_SIZE=16, 29 | NUM_EPOCHS=1, 30 | TRAIN_DATA_FRACTION=0.05, 31 | TEST_DATA_FRACTION=0.05, 32 | MODEL_NAMES=["distilbert-base-uncased"], 33 | ), 34 | ) 35 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 36 | assert pytest.approx(result["accuracy"], 0.885, abs=ABS_TOL) 37 | assert pytest.approx(result["f1"], 0.885, abs=ABS_TOL) 38 | 39 | 40 | @pytest.mark.integration 41 | @pytest.mark.azureml 42 | @pytest.mark.gpu 43 | def test_tc_bert_azureml( 44 | notebooks, subscription_id, resource_group, workspace_name, workspace_region, tmp 45 | ): 46 | notebook_path = notebooks["tc_bert_azureml"] 47 | 48 | train_folder = os.path.join(tmp, "train") 49 | test_folder = os.path.join(tmp, "test") 50 | 51 | parameters = { 52 | "config_path": None, 53 | "subscription_id": subscription_id, 54 | "resource_group": resource_group, 55 | "workspace_name": workspace_name, 56 | "workspace_region": workspace_region, 57 | "cluster_name": "tc-bert-cluster", 58 | "DATA_FOLDER": tmp, 59 | "TRAIN_FOLDER": train_folder, 60 | "TEST_FOLDER": test_folder, 61 | "PROJECT_FOLDER": ".", 62 | "NUM_PARTITIONS": 1, 63 | "NODE_COUNT": 1, 64 | } 65 | 66 | pm.execute_notebook( 67 | notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=parameters 68 | ) 69 | 70 | with open("outputs/results.json", "r") as handle: 71 | result_dict = json.load(handle) 72 | assert result_dict["weighted avg"]["f1-score"] == pytest.approx(0.85, abs=ABS_TOL) 73 | 74 | if os.path.exists("outputs"): 75 | shutil.rmtree("outputs") 76 | 77 | 78 | @pytest.mark.gpu 79 | @pytest.mark.integration 80 | def test_multi_languages_transformer(notebooks, tmp): 81 | notebook_path = notebooks["tc_multi_languages_transformers"] 82 | pm.execute_notebook( 83 | notebook_path, 84 | OUTPUT_NOTEBOOK, 85 | kernel_name=KERNEL_NAME, 86 | parameters={"QUICK_RUN": True, "USE_DATASET": "dac"}, 87 | ) 88 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 89 | assert pytest.approx(result["precision"], 0.94, abs=ABS_TOL) 90 | assert pytest.approx(result["recall"], 0.94, abs=ABS_TOL) 91 | assert pytest.approx(result["f1"], 0.94, abs=ABS_TOL) 92 | -------------------------------------------------------------------------------- /tests/integration/test_notebooks_unilm_abstractive_summarization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | import papermill as pm 5 | import pytest 6 | import scrapbook as sb 7 | from tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK 8 | import torch 9 | 10 | ABS_TOL = 0.02 11 | 12 | 13 | @pytest.mark.gpu 14 | @pytest.mark.integration 15 | def test_unilm_abstractive_summarization(notebooks, tmp): 16 | notebook_path = notebooks["unilm_abstractive_summarization"] 17 | pm.execute_notebook( 18 | notebook_path, 19 | OUTPUT_NOTEBOOK, 20 | kernel_name=KERNEL_NAME, 21 | parameters=dict( 22 | QUICK_RUN=True, 23 | NUM_GPUS=torch.cuda.device_count(), 24 | TOP_N=100, 25 | WARMUP_STEPS=5, 26 | MAX_STEPS=50, 27 | GRADIENT_ACCUMULATION_STEPS=1, 28 | TEST_PER_GPU_BATCH_SIZE=2, 29 | BEAM_SIZE=3, 30 | MODEL_DIR=tmp, 31 | RESULT_DIR=tmp, 32 | ), 33 | ) 34 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 35 | assert result["rouge_1_f_score"] == pytest.approx(0.2, abs=ABS_TOL) 36 | assert result["rouge_2_f_score"] == pytest.approx(0.07, abs=ABS_TOL) 37 | assert result["rouge_l_f_score"] == pytest.approx(0.16, abs=ABS_TOL) 38 | 39 | @pytest.mark.cpu 40 | @pytest.mark.integration 41 | def test_unilm_abstractive_summarization_cpu(notebooks, tmp): 42 | notebook_path = notebooks["unilm_abstractive_summarization"] 43 | pm.execute_notebook( 44 | notebook_path, 45 | OUTPUT_NOTEBOOK, 46 | kernel_name=KERNEL_NAME, 47 | parameters=dict( 48 | QUICK_RUN=True, 49 | NUM_GPUS=0, 50 | TOP_N=2, 51 | WARMUP_STEPS=5, 52 | MAX_STEPS=50, 53 | GRADIENT_ACCUMULATION_STEPS=1, 54 | TEST_PER_GPU_BATCH_SIZE=2, 55 | BEAM_SIZE=3, 56 | MODEL_DIR=tmp, 57 | RESULT_DIR=tmp, 58 | ), 59 | ) 60 | result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict 61 | assert result["rouge_1_f_score"] == pytest.approx(0.1, abs=ABS_TOL) 62 | assert result["rouge_2_f_score"] == pytest.approx(0.05, abs=ABS_TOL) 63 | assert result["rouge_l_f_score"] == pytest.approx(0.1, abs=ABS_TOL) 64 | 65 | -------------------------------------------------------------------------------- /tests/notebooks_common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | 6 | # Unless manually modified, python3 should be the name of the Jupyter kernel 7 | # registered in the activated conda environment 8 | KERNEL_NAME = "python3" 9 | OUTPUT_NOTEBOOK = "output.ipynb" 10 | 11 | 12 | def path_notebooks(): 13 | """Returns the path of the notebooks folder""" 14 | return os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, "examples")) 15 | -------------------------------------------------------------------------------- /tests/smoke/test_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License.
3 | 4 | import os 5 | import pytest 6 | 7 | from utils_nlp.dataset import msrpc 8 | from utils_nlp.dataset import xnli 9 | 10 | 11 | @pytest.mark.smoke 12 | def test_msrpc_download(tmp_path): 13 | filepath = msrpc.download_msrpc(tmp_path) 14 | statinfo = os.stat(filepath) 15 | assert statinfo.st_size == 1359872 16 | 17 | 18 | @pytest.mark.skip(reason="Can't test it programmatically, needs input") 19 | @pytest.mark.smoke 20 | def test_msrpc_load_df(tmp_path): 21 | df_train = msrpc.load_pandas_df( 22 | local_cache_path=tmp_path, dataset_type="train" 23 | ) 24 | 25 | 26 | @pytest.mark.smoke 27 | def test_xnli(tmp_path): 28 | df_train = xnli.load_pandas_df( 29 | local_cache_path=tmp_path, file_split="train" 30 | ) 31 | assert df_train.shape == (392702, 2) 32 | -------------------------------------------------------------------------------- /tests/smoke/test_gpu_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import torch 6 | 7 | 8 | @pytest.mark.smoke 9 | @pytest.mark.gpu 10 | def test_machine_is_gpu_machine(): 11 | assert torch.cuda.is_available() is True 12 | -------------------------------------------------------------------------------- /tests/smoke/test_word_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | 6 | import pytest 7 | from gensim.models.fasttext import FastText 8 | from gensim.models.keyedvectors import Word2VecKeyedVectors 9 | 10 | from utils_nlp.models.pretrained_embeddings.fasttext import ( 11 | load_pretrained_vectors as load_fasttext, 12 | ) 13 | from utils_nlp.models.pretrained_embeddings.glove import ( 14 | load_pretrained_vectors as load_glove, 15 | ) 16 | from utils_nlp.models.pretrained_embeddings.word2vec import ( 17 | load_pretrained_vectors as load_word2vec, 18 | ) 19 | 20 | 21 | @pytest.mark.smoke 22 | def test_load_pretrained_vectors_word2vec(tmp_path): 23 | filename = "GoogleNews-vectors-negative300.bin" 24 | model = load_word2vec(tmp_path, limit=500000) 25 | filepath = os.path.join(os.path.join(tmp_path, "word2vec"), filename) 26 | statinfo = os.stat(filepath) 27 | assert statinfo.st_size == 3644258522 28 | assert isinstance(model, Word2VecKeyedVectors) 29 | assert len(model.vocab) == 500000 30 | 31 | 32 | @pytest.mark.smoke 33 | def test_load_pretrained_vectors_glove(tmp_path): 34 | filename = "glove.840B.300d.txt" 35 | model = load_glove(tmp_path, limit=50000) 36 | filepath = os.path.join(os.path.join(tmp_path, "gloVe"), filename) 37 | statinfo = os.stat(filepath) 38 | assert statinfo.st_size == 5646236541 39 | assert isinstance(model, Word2VecKeyedVectors) 40 | assert len(model.vocab) == 50000 41 | 42 | 43 | @pytest.mark.smoke 44 | def test_load_pretrained_vectors_fasttext(tmp_path): 45 | filename = "wiki.simple.bin" 46 | model = load_fasttext(tmp_path) 47 | filepath = os.path.join(os.path.join(tmp_path, "fastText"), filename) 48 | statinfo = os.stat(filepath) 49 | assert statinfo.st_size == 2668450750 50 | assert isinstance(model, FastText) 51 | 52 | 53 | -------------------------------------------------------------------------------- /tests/unit/test_bert_common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 
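The word-embedding smoke tests above download full pretrained vector files and cap them with `limit`. For reference, here is a short usage sketch of the word2vec loader they exercise; the cache path is an example, and the 300-dimensional shape is that of the GoogleNews vectors used by the test.

```python
# Illustrative use of the loader exercised by test_load_pretrained_vectors_word2vec.
from utils_nlp.models.pretrained_embeddings.word2vec import load_pretrained_vectors

model = load_pretrained_vectors("./data", limit=50_000)  # downloads on first call
vector = model["computer"]                               # 300-dimensional vector
print(model.most_similar("computer", topn=3))
```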
2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | 6 | from utils_nlp.models.bert.common import create_data_loader 7 | 8 | 9 | def test_tokenize(bert_english_tokenizer): 10 | text = ["Hello World.", "How you doing?", "greatttt"] 11 | tokens = bert_english_tokenizer.tokenize(text) 12 | assert len(tokens) == len(text) 13 | assert len(tokens[0]) == 3 14 | assert len(tokens[1]) == 4 15 | assert len(tokens[2]) == 3 16 | assert tokens[2][1].startswith("##") 17 | 18 | 19 | def test_tokenize_ner(ner_test_data, bert_english_tokenizer): 20 | seq_length = 20 21 | 22 | # test providing labels 23 | preprocessed_tokens = bert_english_tokenizer.tokenize_ner( 24 | text=ner_test_data["INPUT_TEXT"], 25 | labels=ner_test_data["INPUT_LABELS"], 26 | label_map=ner_test_data["LABEL_MAP"], 27 | max_len=seq_length, 28 | ) 29 | 30 | assert len(preprocessed_tokens[0][0]) == seq_length 31 | assert len(preprocessed_tokens[1][0]) == seq_length 32 | assert ( 33 | preprocessed_tokens[2] == ner_test_data["EXPECTED_TRAILING_TOKEN_MASK"] 34 | ) 35 | assert preprocessed_tokens[3] == ner_test_data["EXPECTED_LABEL_IDS"] 36 | 37 | # test when input is a single list 38 | preprocessed_tokens = bert_english_tokenizer.tokenize_ner( 39 | text=ner_test_data["INPUT_TEXT_SINGLE"], 40 | labels=ner_test_data["INPUT_LABELS_SINGLE"], 41 | label_map=ner_test_data["LABEL_MAP"], 42 | max_len=seq_length, 43 | ) 44 | 45 | assert len(preprocessed_tokens[0][0]) == seq_length 46 | assert len(preprocessed_tokens[1][0]) == seq_length 47 | assert ( 48 | preprocessed_tokens[2] == ner_test_data["EXPECTED_TRAILING_TOKEN_MASK"] 49 | ) 50 | assert preprocessed_tokens[3] == ner_test_data["EXPECTED_LABEL_IDS"] 51 | 52 | # test not providing labels 53 | preprocessed_tokens = bert_english_tokenizer.tokenize_ner( 54 | text=ner_test_data["INPUT_TEXT"], 55 | label_map=ner_test_data["LABEL_MAP"], 56 | max_len=20, 57 | ) 58 | assert ( 59 | preprocessed_tokens[2] == ner_test_data["EXPECTED_TRAILING_TOKEN_MASK"] 60 | ) 61 | 62 | # text exception when number of words and number of labels are different 63 | with pytest.raises(ValueError): 64 | preprocessed_tokens = bert_english_tokenizer.tokenize_ner( 65 | text=ner_test_data["INPUT_TEXT"], 66 | labels=ner_test_data["INPUT_LABELS_WRONG"], 67 | label_map=ner_test_data["LABEL_MAP"], 68 | max_len=seq_length, 69 | ) 70 | 71 | 72 | def test_create_data_loader(ner_test_data): 73 | with pytest.raises(ValueError): 74 | create_data_loader( 75 | input_ids=ner_test_data["INPUT_TOKEN_IDS"], 76 | input_mask=ner_test_data["INPUT_MASK"], 77 | label_ids=ner_test_data["INPUT_LABEL_IDS"], 78 | sample_method="dummy", 79 | ) 80 | 81 | create_data_loader( 82 | input_ids=ner_test_data["INPUT_TOKEN_IDS"], 83 | input_mask=ner_test_data["INPUT_MASK"], 84 | label_ids=ner_test_data["INPUT_LABEL_IDS"], 85 | sample_method="sequential", 86 | ) 87 | 88 | create_data_loader( 89 | input_ids=ner_test_data["INPUT_TOKEN_IDS"], 90 | input_mask=ner_test_data["INPUT_MASK"], 91 | label_ids=ner_test_data["INPUT_LABEL_IDS"], 92 | sample_method="random", 93 | ) 94 | -------------------------------------------------------------------------------- /tests/unit/test_bert_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
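The `startswith("##")` check in the tokenizer test above is a property of BERT's WordPiece vocabulary: an out-of-vocabulary word is split into a first piece plus continuation pieces prefixed with `##`. The same behaviour can be seen directly with the Hugging Face tokenizer, shown here only for illustration; the repo's own wrapper is what the test actually exercises.

```python
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")
print(tok.tokenize("greatttt"))  # e.g. ['great', '##tt', '##t'], continuation pieces carry '##'
```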
3 | 4 | import pytest 5 | 6 | from utils_nlp.models.bert.common import Language 7 | from utils_nlp.models.bert.sequence_encoding import BERTSentenceEncoder 8 | 9 | @pytest.fixture() 10 | def data(): 11 | return ["The quick brown fox jumps over the lazy dog", "the coffee is very acidic"] 12 | 13 | def test_encoder(tmp, data): 14 | se = BERTSentenceEncoder( 15 | language=Language.ENGLISH, 16 | num_gpus=0, 17 | cache_dir=tmp, 18 | ) 19 | embeddings = se.encode(data, as_numpy=True) 20 | assert len(embeddings) == 2 21 | assert len(embeddings[0]) == 768 -------------------------------------------------------------------------------- /tests/unit/test_bert_sentence_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | 6 | from utils_nlp.models.bert.common import Language 7 | from utils_nlp.models.bert.sequence_encoding import BERTSentenceEncoder, PoolingStrategy 8 | from sklearn.metrics.pairwise import cosine_similarity 9 | 10 | 11 | @pytest.fixture() 12 | def data(): 13 | return [ 14 | "how old are you?", 15 | "what's your age?", 16 | "my phone is good", 17 | "your cellphone looks great.", 18 | ] 19 | 20 | 21 | def test_sentence_encoding(tmp, data): 22 | se = BERTSentenceEncoder( 23 | language=Language.ENGLISH, 24 | num_gpus=0, 25 | to_lower=True, 26 | max_len=128, 27 | layer_index=-2, 28 | pooling_strategy=PoolingStrategy.MEAN, 29 | cache_dir=tmp, 30 | ) 31 | 32 | result = se.encode(data, as_numpy=False) 33 | similarity = cosine_similarity(result["values"].values.tolist()) 34 | assert similarity[0, 0] > similarity[1, 0] 35 | assert similarity[0, 1] > similarity[0, 2] 36 | -------------------------------------------------------------------------------- /tests/unit/test_data_loaders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | 4 | import random 5 | 6 | import numpy as np 7 | import pytest 8 | import json 9 | import os 10 | import io 11 | 12 | from utils_nlp.dataset.data_loaders import DaskCSVLoader 13 | from utils_nlp.dataset.data_loaders import DaskJSONLoader 14 | 15 | UNIF1 = {"a": 4, "b": 6, "n": 10000} # some uniform distribution 16 | row_size = 5 # "a,b\n (5 bytes)" 17 | json_row_size = 18 # "{"a": 1, "b": 5}\n (18 bytes)" 18 | 19 | 20 | @pytest.fixture() 21 | def csv_file(tmpdir): 22 | random.seed(0) 23 | f = tmpdir.mkdir("test_loaders").join("tl_data.csv") 24 | f.write( 25 | "\n".join( 26 | [ 27 | "{},{}".format( 28 | random.randint(0, 1), 29 | random.randint(UNIF1["a"], UNIF1["b"]), 30 | ) 31 | for x in range(UNIF1["n"]) 32 | ] 33 | ) 34 | ) 35 | return str(f) 36 | 37 | 38 | @pytest.fixture() 39 | def json_file(tmpdir): 40 | random.seed(0) 41 | json_path = os.path.join(tmpdir, "test.jsonl") 42 | with io.open(json_path, "w", encoding="utf8") as f: 43 | for _ in range(UNIF1["n"]): 44 | data_dict = { 45 | "a": random.randint(0, 1), 46 | "b": random.randint(UNIF1["a"], UNIF1["b"]), 47 | } 48 | json.dump(data_dict, f) 49 | f.write("\n") 50 | return json_path 51 | 52 | 53 | def test_dask_csv_rnd_loader(csv_file): 54 | num_batches = 500 55 | batch_size = 12 56 | num_partitions = 4 57 | 58 | loader = DaskCSVLoader( 59 | csv_file, 60 | header=None, 61 | block_size=row_size * int(UNIF1["n"] / num_partitions), 62 | random_seed=0, 63 | ) 64 | 65 | sample = [] 66 | for batch in loader.get_random_batches(num_batches, batch_size): 67 | sample.append(list(batch.iloc[:, 1])) 68 | sample = np.concatenate(sample) 69 | 70 | assert loader.df.npartitions == num_partitions 71 | assert sample.mean().round() == (UNIF1["a"] + UNIF1["b"]) / 2 72 | assert len(sample) <= num_batches * batch_size 73 | 74 | 75 | def test_dask_csv_seq_loader(csv_file): 76 | batch_size = 12 77 | num_partitions = 4 78 | 79 | loader = DaskCSVLoader( 80 | csv_file, 81 | header=None, 82 | block_size=row_size * int(UNIF1["n"] / num_partitions), 83 | ) 84 | 85 | sample = [] 86 | for batch in loader.get_sequential_batches(batch_size): 87 | sample.append(list(batch.iloc[:, 1])) 88 | sample = np.concatenate(sample) 89 | 90 | assert loader.df.npartitions == num_partitions 91 | assert sample.mean().round() == (UNIF1["a"] + UNIF1["b"]) / 2 92 | assert len(sample) == UNIF1["n"] 93 | 94 | 95 | def test_dask_json_rnd_loader(json_file): 96 | num_batches = 500 97 | batch_size = 12 98 | num_partitions = 4 99 | 100 | loader = DaskJSONLoader( 101 | json_file, 102 | block_size=json_row_size * int(UNIF1["n"] / num_partitions), 103 | random_seed=0, 104 | lines=True, 105 | ) 106 | 107 | sample = [] 108 | for batch in loader.get_random_batches(num_batches, batch_size): 109 | sample.append(list(batch.iloc[:, 1])) 110 | sample = np.concatenate(sample) 111 | 112 | assert loader.df.npartitions == num_partitions 113 | assert sample.mean().round() == (UNIF1["a"] + UNIF1["b"]) / 2 114 | assert len(sample) <= num_batches * batch_size 115 | 116 | 117 | def test_dask_json_seq_loader(json_file): 118 | batch_size = 12 119 | num_partitions = 4 120 | 121 | loader = DaskJSONLoader( 122 | json_file, 123 | block_size=json_row_size * int(UNIF1["n"] / num_partitions), 124 | random_seed=0, 125 | lines=True, 126 | ) 127 | 128 | sample = [] 129 | for batch in loader.get_sequential_batches(batch_size): 130 | sample.append(list(batch.iloc[:, 1])) 131 | sample = np.concatenate(sample) 132 | 133 | assert loader.df.npartitions == num_partitions 134 | assert sample.mean().round() == (UNIF1["a"] + 
UNIF1["b"]) / 2 135 | assert len(sample) == UNIF1["n"] 136 | -------------------------------------------------------------------------------- /tests/unit/test_dataset_pytorch.py: -------------------------------------------------------------------------------- 1 | from utils_nlp.models.transformers.datasets import QADataset 2 | 3 | 4 | def test_QADataset(qa_test_df): 5 | dataset = QADataset( 6 | df=qa_test_df["test_df"], 7 | doc_text_col=qa_test_df["doc_text_col"], 8 | question_text_col=qa_test_df["question_text_col"], 9 | answer_start_col=qa_test_df["answer_start_col"], 10 | answer_text_col=qa_test_df["answer_text_col"], 11 | qa_id_col=qa_test_df["qa_id_col"], 12 | is_impossible_col=qa_test_df["is_impossible_col"], 13 | ) 14 | 15 | for i in range(2): 16 | assert dataset[i].doc_text == qa_test_df["test_df"][qa_test_df["doc_text_col"]][i] 17 | assert dataset[i].question_text == qa_test_df["test_df"][qa_test_df["question_text_col"]][i] 18 | assert dataset[i].answer_start == qa_test_df["test_df"][qa_test_df["answer_start_col"]][i] 19 | assert dataset[i].answer_text == qa_test_df["test_df"][qa_test_df["answer_text_col"]][i] 20 | assert dataset[i].qa_id == qa_test_df["test_df"][qa_test_df["qa_id_col"]][i] 21 | assert dataset[i].is_impossible == qa_test_df["test_df"][qa_test_df["is_impossible_col"]][i] 22 | 23 | dataset_default = QADataset( 24 | df=qa_test_df["test_df"], 25 | doc_text_col=qa_test_df["doc_text_col"], 26 | question_text_col=qa_test_df["question_text_col"], 27 | ) 28 | 29 | for i in range(2): 30 | assert dataset_default[i].doc_text == qa_test_df["test_df"][qa_test_df["doc_text_col"]][i] 31 | assert ( 32 | dataset_default[i].question_text 33 | == qa_test_df["test_df"][qa_test_df["question_text_col"]][i] 34 | ) 35 | assert dataset_default[i].answer_start == -1 36 | assert dataset_default[i].answer_text == "" 37 | assert dataset_default[i].qa_id == i 38 | assert dataset_default[i].is_impossible == False 39 | -------------------------------------------------------------------------------- /tests/unit/test_distributed_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | from utils_nlp.models.transformers.extractive_summarization import IterableDistributedSampler 6 | 7 | @pytest.mark.cpu 8 | def test_sampler(): 9 | sampler = IterableDistributedSampler(1, 0, -1) 10 | samples = list(sampler.iter('abcdefg')) 11 | assert ''.join(samples) == 'abcdefg' 12 | 13 | sampler = IterableDistributedSampler(2, 0, -1) 14 | samples = list(sampler.iter('abcdefg')) 15 | assert ''.join(samples) == 'abcdefg' 16 | 17 | sampler = IterableDistributedSampler(4, 1, 1) 18 | samples = list(sampler.iter('abcdefg')) 19 | assert ''.join(samples) == 'bf' 20 | 21 | sampler = IterableDistributedSampler(4, 2, 2) 22 | samples = list(sampler.iter('abcdefg')) 23 | assert ''.join(samples) == 'cg' 24 | 25 | sampler = IterableDistributedSampler(4, 3, 3) 26 | samples = list(sampler.iter('abcdefg')) 27 | assert ''.join(samples) == 'd' 28 | 29 | sampler = IterableDistributedSampler(8, 7, 3) 30 | samples = list(sampler.iter('abcdefghijklmn')) 31 | assert ''.join(samples) == 'h' 32 | 33 | 34 | -------------------------------------------------------------------------------- /tests/unit/test_eval_classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 
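Judging from the assertions above, `IterableDistributedSampler(world_size, rank, local_rank)` slices an iterable round-robin by rank, and appears to yield everything when `local_rank` is -1. A plain-Python equivalent of the round-robin part, checked against the same expectations, is sketched below.

```python
# Round-robin slicing sketch matching the sampler assertions above; the
# local_rank == -1 "take everything" case is handled separately by the real class.
def round_robin(seq, world_size, rank):
    return [item for i, item in enumerate(seq) if i % world_size == rank]


assert "".join(round_robin("abcdefg", 4, 1)) == "bf"
assert "".join(round_robin("abcdefg", 4, 2)) == "cg"
assert "".join(round_robin("abcdefghijklmn", 8, 7)) == "h"
```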
2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | 6 | from utils_nlp.eval.classification import compute_correlation_coefficients 7 | 8 | 9 | def test_compute(): 10 | x = np.random.rand(2, 100) 11 | df = compute_correlation_coefficients(x) 12 | assert df.shape == (2, 2) 13 | 14 | y = np.random.rand(2, 100) 15 | df = compute_correlation_coefficients(x, y) 16 | assert df.shape == (4, 4) 17 | -------------------------------------------------------------------------------- /tests/unit/test_extractive_summarization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import nltk 5 | import pytest 6 | from nltk import tokenize 7 | 8 | from utils_nlp.models.transformers.datasets import SummarizationDataset 9 | from utils_nlp.models.transformers.extractive_summarization import ( 10 | ExtractiveSummarizer, 11 | ExtSumProcessor, 12 | ) 13 | 14 | nltk.download("punkt") 15 | 16 | 17 | # @pytest.fixture() 18 | def source_data(): 19 | return ( 20 | "Boston, MA welcome to Microsoft/nlp. Welcome to text summarization." 21 | "Welcome to Microsoft NERD." 22 | "Look outside, what a beautiful Charlse River fall view." 23 | ) 24 | 25 | 26 | # @pytest.fixture() 27 | def target_data(): 28 | return ( 29 | "welcome to microsoft/nlp." 30 | "Welcome to text summarization." 31 | "Welcome to Microsoft NERD." 32 | ) 33 | 34 | 35 | MODEL_NAME = "distilbert-base-uncased" 36 | 37 | @pytest.fixture(scope="module") 38 | def data(tmp_module): 39 | source = source_data() 40 | target = target_data() 41 | train_dataset = SummarizationDataset( 42 | None, 43 | source=[source], 44 | target=[target], 45 | source_preprocessing=[tokenize.sent_tokenize], 46 | target_preprocessing=[tokenize.sent_tokenize], 47 | word_tokenize=nltk.word_tokenize, 48 | ) 49 | test_dataset = SummarizationDataset( 50 | None, 51 | source=[source], 52 | source_preprocessing=[tokenize.sent_tokenize], 53 | word_tokenize=nltk.word_tokenize, 54 | ) 55 | 56 | processor = ExtSumProcessor( 57 | model_name=MODEL_NAME, 58 | cache_dir=tmp_module, 59 | max_nsents=200, 60 | max_src_ntokens=2000, 61 | min_nsents=0, 62 | min_src_ntokens=1, 63 | ) 64 | ext_sum_train = processor.preprocess(train_dataset, oracle_mode="greedy") 65 | ext_sum_test = processor.preprocess(test_dataset, oracle_mode="greedy") 66 | return processor, ext_sum_train, ext_sum_test 67 | 68 | 69 | @pytest.mark.gpu 70 | def test_bert_training(data, tmp_module): 71 | 72 | CACHE_DIR = tmp_module 73 | ENCODER = "transformer" 74 | MAX_POS = 768 75 | BATCH_SIZE = 128 76 | LEARNING_RATE = 2e-3 77 | REPORT_EVERY = 50 78 | MAX_STEPS = 20 79 | WARMUP_STEPS = 1e2 80 | 81 | processor, train_dataset, test_dataset = data 82 | summarizer = ExtractiveSummarizer( 83 | processor, MODEL_NAME, ENCODER, MAX_POS, CACHE_DIR 84 | ) 85 | summarizer.fit( 86 | train_dataset, 87 | num_gpus=None, 88 | batch_size=BATCH_SIZE, 89 | gradient_accumulation_steps=1, 90 | max_steps=MAX_STEPS, 91 | lr=LEARNING_RATE, 92 | warmup_steps=WARMUP_STEPS, 93 | verbose=True, 94 | report_every=REPORT_EVERY, 95 | clip_grad_norm=False, 96 | ) 97 | 98 | prediction = summarizer.predict(test_dataset, num_gpus=None, batch_size=128) 99 | assert len(prediction) == 1 100 | -------------------------------------------------------------------------------- /tests/unit/test_gensen_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft 
Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | 6 | import pandas as pd 7 | 8 | from utils_nlp.models.gensen.preprocess_utils import gensen_preprocess 9 | from utils_nlp.models.gensen.utils import DataIterator 10 | 11 | 12 | def test_gensen_preprocess(tmp_path): 13 | data = [ 14 | [ 15 | "neutral", 16 | "it is a lovely day", 17 | "the weather is great outside.", 18 | ["it", "is", "lovely", "day"], 19 | ["the", "weather", "is", "great", "outside"], 20 | ] 21 | ] 22 | 23 | df = pd.DataFrame(data) 24 | df.columns = [ 25 | "score", 26 | "sentence1", 27 | "sentence2", 28 | "sentence1_tokens", 29 | "sentence2_tokens", 30 | ] 31 | 32 | expected_files = [ 33 | "snli_1.0_test.txt.lab", 34 | "snli_1.0_test.txt.s1.tok", 35 | "snli_1.0_dev.txt.clean.noblank", 36 | "snli_1.0_train.txt.s1.tok", 37 | "snli_1.0_train.txt.lab", 38 | "snli_1.0_dev.txt.s1.tok", 39 | "snli_1.0_dev.txt.s2.tok", 40 | "snli_1.0_test.txt.s2.tok", 41 | "snli_1.0_train.txt.clean", 42 | "snli_1.0_train.txt.s2.tok", 43 | "snli_1.0_test.txt.clean.noblank", 44 | "snli_1.0_test.txt.clean", 45 | "snli_1.0_train.txt.clean.noblank", 46 | "snli_1.0_dev.txt.lab", 47 | "snli_1.0_dev.txt.clean", 48 | ] 49 | path = gensen_preprocess(df, df, df, tmp_path) 50 | assert os.path.isdir(path) is True 51 | assert set(os.listdir(path)) == set(expected_files) 52 | 53 | 54 | def test_data_iterator(): 55 | sentences = ["it is a lovely day", "the weather is great outside.", ] 56 | expected_vocab = ["it", "is", "a", "lovely", "day", "the", "weather", "is", "great", "outside."] 57 | 58 | vocab_size = 10 59 | di = DataIterator() 60 | word2id, id2word = di.construct_vocab(sentences, vocab_size) 61 | assert set(expected_vocab).issubset(word2id.keys()) 62 | assert set(expected_vocab).issubset(id2word.values()) 63 | -------------------------------------------------------------------------------- /tests/unit/test_interpreter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 
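The `construct_vocab` check above only asserts that every word lands in both mappings; the purpose of the two dictionaries is the usual id/word round trip. A self-contained sketch of that round trip follows; the real `DataIterator` may reserve special tokens such as `<pad>`/`<unk>`, which this toy version omits.

```python
# Toy vocabulary round trip illustrating what the word2id/id2word pair is for.
sentences = ["it is a lovely day", "the weather is great outside."]
words = [w for s in sentences for w in s.split()]
word2id = {w: i for i, w in enumerate(dict.fromkeys(words))}  # deduplicated, insertion-ordered
id2word = {i: w for w, i in word2id.items()}

ids = [word2id[w] for w in "the weather is great outside.".split()]
assert " ".join(id2word[i] for i in ids) == "the weather is great outside."
```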
2 | # Licensed under the MIT License 3 | 4 | import random 5 | 6 | import pytest 7 | 8 | import numpy as np 9 | import torch 10 | from torch import nn 11 | 12 | from utils_nlp.interpreter.Interpreter import ( 13 | Interpreter, 14 | calculate_regularization, 15 | ) 16 | 17 | 18 | def fixed_length_Phi(x): 19 | return x[0] * 10 + x[1] * 20 - x[2] * 20 - x[3] * 10 20 | 21 | 22 | def variable_length_Phi(function): 23 | return lambda x: (function(x.unsqueeze(0))[0][0]) 24 | 25 | 26 | @pytest.fixture 27 | def fixed_length_interp(): 28 | x = torch.randn(4, 10) 29 | regular = torch.randn(10) 30 | return Interpreter(x, fixed_length_Phi, regularization=regular) 31 | 32 | 33 | @pytest.fixture 34 | def variable_length_interp(): 35 | function = nn.LSTM(10, 10) 36 | x = torch.randn(4, 10) 37 | regular = torch.randn(1, 10) 38 | return Interpreter( 39 | x, variable_length_Phi(function), regularization=regular 40 | ) 41 | 42 | 43 | def test_fixed_length_regularization(): 44 | dataset = torch.randn(10, 4, 10) 45 | # calculate all hidden states 46 | hidden = [fixed_length_Phi(x).tolist() for x in dataset] 47 | # calculate the standard deviation 48 | hidden = np.array(hidden) 49 | regular_gt = np.std(hidden, axis=0) 50 | regular = calculate_regularization(dataset, fixed_length_Phi) 51 | assert np.sum(np.abs(regular - regular_gt)) < 1e-5 52 | 53 | 54 | def test_variable_length_regularization(): 55 | function = nn.LSTM(10, 10) 56 | dataset = [torch.randn(random.randint(5, 9), 10) for _ in range(10)] 57 | # calculate all hidden states 58 | hidden = [ 59 | np.mean( 60 | variable_length_Phi(function)(x).tolist(), axis=0, keepdims=True 61 | ) 62 | for x in dataset 63 | ] 64 | # calculate the standard deviation 65 | hidden = np.array(hidden) 66 | regular_gt = np.std(hidden, axis=0) 67 | regular = calculate_regularization( 68 | dataset, variable_length_Phi(function), reduced_axes=[0] 69 | ) 70 | assert np.sum(np.abs(regular - regular_gt)) < 1e-5 71 | 72 | 73 | def test_initialize_interpreter(): 74 | x = torch.randn(4, 10) 75 | regular = torch.randn(10) 76 | interpreter = Interpreter(x, fixed_length_Phi, regularization=regular) 77 | assert interpreter.s == 4 78 | assert interpreter.d == 10 79 | assert interpreter.regular.tolist() == regular.tolist() 80 | 81 | 82 | def test_train_fixed_length_interp(fixed_length_interp): 83 | init_ratio = fixed_length_interp.ratio + 0.0 # make a copy 84 | init_regular = fixed_length_interp.regular + 0.0 85 | fixed_length_interp.optimize(iteration=10) 86 | after_ratio = fixed_length_interp.ratio + 0.0 87 | after_regular = fixed_length_interp.regular + 0.0 88 | # make sure the ratio is changed when optimizing 89 | assert torch.sum(torch.abs(after_ratio - init_ratio)) > 1e-5 90 | # make sure the regular is not changed when optimizing 91 | assert torch.sum(torch.abs(after_regular - init_regular)) < 1e-5 92 | 93 | 94 | def test_train_variable_length_interp(variable_length_interp): 95 | init_ratio = variable_length_interp.ratio + 0.0 # make a copy 96 | init_regular = variable_length_interp.regular + 0.0 97 | variable_length_interp.optimize(iteration=10) 98 | after_ratio = variable_length_interp.ratio + 0.0 99 | after_regular = variable_length_interp.regular + 0.0 100 | # make sure the ratio is changed when optimizing 101 | assert torch.sum(torch.abs(after_ratio - init_ratio)) > 1e-5 102 | # make sure the regular is not changed when optimizing 103 | assert torch.sum(torch.abs(after_regular - init_regular)) < 1e-5 104 | 105 | 106 | def test_interpreter_get_simga(fixed_length_interp): 107 | 
sigma = fixed_length_interp.get_sigma() 108 | assert sigma.shape == (4,) 109 | -------------------------------------------------------------------------------- /tests/unit/test_notebooks_cpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | import pytest 6 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME 7 | import papermill as pm 8 | from utils_nlp.models.bert.common import Language 9 | 10 | 11 | @pytest.mark.notebooks 12 | def test_bert_encoder(notebooks, tmp): 13 | notebook_path = notebooks["bert_encoder"] 14 | pm.execute_notebook( 15 | notebook_path, 16 | OUTPUT_NOTEBOOK, 17 | kernel_name=KERNEL_NAME, 18 | parameters=dict( 19 | NUM_GPUS=0, LANGUAGE=Language.ENGLISH, TO_LOWER=True, MAX_SEQ_LENGTH=128, CACHE_DIR=tmp 20 | ), 21 | ) 22 | -------------------------------------------------------------------------------- /tests/unit/test_notebooks_gpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | import pytest 6 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME 7 | import papermill as pm 8 | from utils_nlp.models.bert.common import Language 9 | 10 | 11 | @pytest.mark.notebooks 12 | @pytest.mark.gpu 13 | def test_bert_encoder(notebooks, tmp): 14 | notebook_path = notebooks["bert_encoder"] 15 | pm.execute_notebook( 16 | notebook_path, 17 | OUTPUT_NOTEBOOK, 18 | kernel_name=KERNEL_NAME, 19 | parameters=dict( 20 | NUM_GPUS=1, LANGUAGE=Language.ENGLISH, TO_LOWER=True, MAX_SEQ_LENGTH=128, CACHE_DIR=tmp 21 | ), 22 | ) 23 | -------------------------------------------------------------------------------- /tests/unit/test_preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | 4 | import pytest 5 | import pandas as pd 6 | import numpy as np 7 | 8 | import utils_nlp.dataset.preprocess as preprocess 9 | 10 | 11 | @pytest.fixture(scope="module") 12 | def df_sentences(): 13 | sentences = np.array( 14 | [ 15 | "The man is playing the piano.", 16 | "Some men are fighting.", 17 | "A man is spreading shreded cheese on a pizza.", 18 | "A man is playing the cello.", 19 | "A man is spreading shreded cheese on a pizza.", 20 | "A man is playing a large flute.", 21 | "A man is playing the cello.", 22 | "A man is playing on a guitar and singing.", 23 | "The man is playing the piano.", 24 | "Some men are fighting.", 25 | ] 26 | ).reshape(2, 5) 27 | 28 | return pd.DataFrame(sentences, columns=["s1", "s2", "s3", "s4", "s5"]) 29 | 30 | 31 | def test_to_lowercase_all(df_sentences): 32 | ldf = preprocess.to_lowercase_all(df_sentences) 33 | assert sum(map(lambda x: x.islower(), ldf.values.flatten())) == len( 34 | ldf.values.flatten() 35 | ) 36 | 37 | 38 | def test_to_lowercase_subset(df_sentences): 39 | ldf = preprocess.to_lowercase(df_sentences, column_names=["s4"]) 40 | assert sum(map(lambda x: x.islower(), ldf.s4.values.flatten())) == len( 41 | ldf.s4.values.flatten() 42 | ) 43 | 44 | 45 | def test_to_spacy_tokens(df_sentences): 46 | sentence_cols = ["s1", "s2"] 47 | token_cols = ["t1", "t2"] 48 | token_df = preprocess.to_spacy_tokens( 49 | df_sentences, sentence_cols=sentence_cols, token_cols=token_cols 50 | ) 51 | assert token_df.shape[1] == df_sentences.shape[1] + len( 52 | token_cols 53 | ) and sum( 54 | list( 55 | map(lambda x: (token_df[x].apply(type) == list).all(), token_cols) 56 | ) 57 | ) == len( 58 | token_cols 59 | ) 60 | 61 | 62 | def test_rm_spacy_stopwords(df_sentences): 63 | sentence_cols = ["s1", "s2"] 64 | stop_cols = ["stop1", "stop2"] 65 | stop_df = preprocess.rm_spacy_stopwords( 66 | df_sentences, sentence_cols=sentence_cols, stop_cols=stop_cols 67 | ) 68 | assert stop_df.shape[1] == df_sentences.shape[1] + len(stop_cols) and sum( 69 | list(map(lambda x: (stop_df[x].apply(type) == list).all(), stop_cols)) 70 | ) == len(stop_cols) 71 | 72 | 73 | def test_to_nltk_tokens(df_sentences): 74 | sentence_cols = ["s1", "s2"] 75 | token_cols = ["t1", "t2"] 76 | token_df = preprocess.to_nltk_tokens( 77 | df_sentences, sentence_cols=sentence_cols, token_cols=token_cols 78 | ) 79 | assert token_df.shape[1] == df_sentences.shape[1] + len( 80 | token_cols 81 | ) and sum( 82 | list( 83 | map(lambda x: (token_df[x].apply(type) == list).all(), token_cols) 84 | ) 85 | ) == len( 86 | token_cols 87 | ) 88 | 89 | 90 | def test_rm_nltk_stopwords(df_sentences): 91 | sentence_cols = ["s1", "s2"] 92 | stop_cols = ["stop1", "stop2"] 93 | stop_df = preprocess.rm_nltk_stopwords( 94 | df_sentences, sentence_cols=sentence_cols, stop_cols=stop_cols 95 | ) 96 | assert stop_df.shape[1] == df_sentences.shape[1] + len(stop_cols) and sum( 97 | list(map(lambda x: (stop_df[x].apply(type) == list).all(), stop_cols)) 98 | ) == len(stop_cols) 99 | 100 | 101 | def test_convert_to_unicode(): 102 | test_str = "test" 103 | test_byte = test_str.encode("utf-8") 104 | 105 | assert isinstance(preprocess.convert_to_unicode(test_str), str) 106 | assert isinstance(preprocess.convert_to_unicode(test_byte), str) 107 | -------------------------------------------------------------------------------- /tests/unit/test_timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
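The preprocessing tests above only assert the column contract: each helper appends new list-valued token (or stop-word-filtered) columns next to the original sentence columns. A concrete illustration of that contract for the NLTK tokenizer helper is sketched below; the keyword names are taken from the tests, and running it requires the repo plus NLTK's `punkt` data.

```python
# Concrete example of the column contract asserted by test_to_nltk_tokens above.
import pandas as pd

from utils_nlp.dataset import preprocess

df = pd.DataFrame({"s1": ["The man is playing the piano."]})
tokens = preprocess.to_nltk_tokens(df, sentence_cols=["s1"], token_cols=["t1"])
print(tokens["t1"][0])  # e.g. ['The', 'man', 'is', 'playing', 'the', 'piano', '.']
```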
3 | 4 | 5 | import pytest 6 | import time 7 | from utils_nlp.common.timer import Timer 8 | 9 | 10 | TOL = 0.01 11 | 12 | 13 | @pytest.fixture(scope="function") 14 | def t(): 15 | return Timer() 16 | 17 | 18 | def test_no_time(t): 19 | assert t.interval == 0 20 | assert t.running == False 21 | 22 | 23 | def test_stop_before_start(t): 24 | with pytest.raises(ValueError): 25 | t.stop() 26 | 27 | 28 | def test_interval_before_stop(t): 29 | t.start() 30 | with pytest.raises(ValueError): 31 | t.interval 32 | 33 | 34 | def test_timer(t): 35 | t.start() 36 | assert t.running == True 37 | time.sleep(1) 38 | t.stop() 39 | assert t.running == False 40 | assert t.interval == pytest.approx(1, abs=TOL) 41 | with Timer() as t2: 42 | assert t2.running == True 43 | time.sleep(1) 44 | assert t2.interval == pytest.approx(1, abs=TOL) 45 | assert t2.running == False 46 | 47 | 48 | def test_timer_format(t): 49 | assert str(t) == "0.0000" 50 | assert str(t.interval) == "0" 51 | -------------------------------------------------------------------------------- /tests/unit/test_transformers_sequence_classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import pandas as pd 6 | 7 | from utils_nlp.models.transformers.sequence_classification import ( 8 | SequenceClassifier, 9 | Processor, 10 | ) 11 | from utils_nlp.common.pytorch_utils import dataloader_from_dataset 12 | 13 | 14 | @pytest.fixture() 15 | def data(): 16 | return (["hi", "hello", "what's wrong with us", "can I leave?"], [0, 0, 1, 2]) 17 | 18 | 19 | @pytest.mark.cpu 20 | def test_classifier(data, tmpdir): 21 | 22 | df = pd.DataFrame({"text": data[0], "label": data[1]}) 23 | num_labels = len(pd.unique(data[1])) 24 | model_name = "bert-base-uncased" 25 | processor = Processor(model_name=model_name, cache_dir=tmpdir) 26 | ds = processor.dataset_from_dataframe(df, "text", "label") 27 | dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True) 28 | classifier = SequenceClassifier( 29 | model_name=model_name, num_labels=num_labels, cache_dir=tmpdir 30 | ) 31 | classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=0, verbose=False) 32 | preds = classifier.predict(dl, num_gpus=0, verbose=False) 33 | assert len(preds) == len(data[1]) 34 | 35 | 36 | @pytest.mark.gpu 37 | def test_classifier_gpu_train_cpu_predict(data, tmpdir): 38 | 39 | df = pd.DataFrame({"text": data[0], "label": data[1]}) 40 | num_labels = len(pd.unique(data[1])) 41 | model_name = "bert-base-uncased" 42 | processor = Processor(model_name=model_name, cache_dir=tmpdir) 43 | ds = processor.dataset_from_dataframe(df, "text", "label") 44 | dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True) 45 | classifier = SequenceClassifier( 46 | model_name=model_name, num_labels=num_labels, cache_dir=tmpdir 47 | ) 48 | classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=1, verbose=False) 49 | 50 | # gpu prediction, no model move 51 | preds = classifier.predict(dl, num_gpus=1, verbose=False) 52 | assert len(preds) == len(data[1]) 53 | # cpu prediction, need model move 54 | assert next(classifier.model.parameters()).is_cuda is True 55 | preds = classifier.predict(dl, num_gpus=0, verbose=False) 56 | assert next(classifier.model.parameters()).is_cuda is False 57 | -------------------------------------------------------------------------------- /tests/unit/test_transformers_token_classification.py: 
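The CPU and GPU classifier tests above differ only in `num_gpus`. A sketch of how they could be folded into one parametrized test with device marks; this is illustrative only, not how the suite is actually organized:

```python
# Hypothetical parametrized variant of the two classifier tests above.
import pandas as pd
import pytest

from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.sequence_classification import Processor, SequenceClassifier


@pytest.mark.parametrize(
    "num_gpus",
    [pytest.param(0, marks=pytest.mark.cpu), pytest.param(1, marks=pytest.mark.gpu)],
)
def test_classifier_any_device(data, tmpdir, num_gpus):
    df = pd.DataFrame({"text": data[0], "label": data[1]})
    processor = Processor(model_name="bert-base-uncased", cache_dir=tmpdir)
    ds = processor.dataset_from_dataframe(df, "text", "label")
    dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=num_gpus, shuffle=True)
    classifier = SequenceClassifier(
        model_name="bert-base-uncased", num_labels=len(pd.unique(data[1])), cache_dir=tmpdir
    )
    classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=num_gpus, verbose=False)
    assert len(classifier.predict(dl, num_gpus=num_gpus, verbose=False)) == len(data[1])
```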
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | 6 | from utils_nlp.common.pytorch_utils import dataloader_from_dataset 7 | from utils_nlp.models.transformers.named_entity_recognition import ( 8 | TokenClassificationProcessor, 9 | TokenClassifier, 10 | ) 11 | from utils_nlp.models.transformers.common import MAX_SEQ_LEN 12 | 13 | 14 | @pytest.mark.cpu 15 | def test_token_classifier_fit_predict(tmpdir, ner_test_data): 16 | num_labels = 6 17 | max_seq_len = MAX_SEQ_LEN 18 | token_classifier = TokenClassifier( 19 | model_name="bert-base-uncased", num_labels=num_labels, cache_dir=tmpdir 20 | ) 21 | processor = TokenClassificationProcessor( 22 | model_name="bert-base-uncased", cache_dir=tmpdir 23 | ) 24 | 25 | # test fit, no warmup 26 | train_dataset = processor.preprocess( 27 | text=ner_test_data["INPUT_TEXT"], 28 | max_len=max_seq_len, 29 | labels=ner_test_data["INPUT_LABELS"], 30 | label_map=ner_test_data["LABEL_MAP"], 31 | ) 32 | train_dataloader = dataloader_from_dataset(train_dataset) 33 | token_classifier.fit(train_dataloader) 34 | 35 | # test predict, no labels 36 | preds = token_classifier.predict(train_dataloader, verbose=False) 37 | assert preds.shape == (len(train_dataloader), MAX_SEQ_LEN, num_labels) 38 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Tools 2 | 3 | This submodule includes: 4 | 1. A [script](generate_conda_file.py) to generate the Conda environment file for running Python scripts and notebooks in this Git repo 5 | 2. Python [script](remove_pixelserver.py) to remove pixelserver tracking from all example notebooks. 6 | 7 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/nlp-recipes/7db6d204e5116da07bb3c549df546e49cb7ab5a5/tools/__init__.py -------------------------------------------------------------------------------- /tools/generate_requirements_txt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
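The fixture-driven token-classification test above does not show what `LABEL_MAP` looks like or how the `(rows, MAX_SEQ_LEN, num_labels)` scores from `predict` are decoded. A sketch under those assumptions; the tag set is hypothetical and padding positions are not masked here:

```python
# Hypothetical label map plus argmax decoding of TokenClassifier prediction scores.
import numpy as np

labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC"]  # illustrative tag set
label_map = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label_map.items()}

# preds: float scores shaped (num_rows, MAX_SEQ_LEN, num_labels), as asserted above
preds = np.random.rand(2, 8, len(labels))
pred_tags = [[id2label[i] for i in row] for row in preds.argmax(axis=-1)]
print(pred_tags[0][:5])
```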
3 | 4 | # This file outputs a requirements.txt based on the libraries defined in generate_conda_file.py 5 | from generate_conda_file import ( 6 | CONDA_BASE, 7 | CONDA_GPU, 8 | PIP_BASE, 9 | PIP_GPU, 10 | PIP_DARWIN, 11 | PIP_LINUX, 12 | PIP_WIN32, 13 | CONDA_DARWIN, 14 | CONDA_LINUX, 15 | CONDA_WIN32, 16 | PIP_DARWIN_GPU, 17 | PIP_LINUX_GPU, 18 | PIP_WIN32_GPU, 19 | CONDA_DARWIN_GPU, 20 | CONDA_LINUX_GPU, 21 | CONDA_WIN32_GPU, 22 | ) 23 | 24 | 25 | if __name__ == "__main__": 26 | deps = list(CONDA_BASE.values()) 27 | deps += list(CONDA_GPU.values()) 28 | deps += list(PIP_BASE.values()) 29 | deps += list(PIP_GPU.values()) 30 | deps += list(PIP_DARWIN.values()) 31 | deps += list(PIP_LINUX.values()) 32 | deps += list(PIP_WIN32.values()) 33 | deps += list(CONDA_DARWIN.values()) 34 | deps += list(CONDA_LINUX.values()) 35 | deps += list(CONDA_WIN32.values()) 36 | deps += list(PIP_DARWIN_GPU.values()) 37 | deps += list(PIP_LINUX_GPU.values()) 38 | deps += list(PIP_WIN32_GPU.values()) 39 | deps += list(CONDA_DARWIN_GPU.values()) 40 | deps += list(CONDA_LINUX_GPU.values()) 41 | deps += list(CONDA_WIN32_GPU.values()) 42 | with open("requirements.txt", "w") as f: 43 | f.write("\n".join(set(deps))) 44 | 45 | -------------------------------------------------------------------------------- /tools/remove_pixelserver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT License. 5 | 6 | import json 7 | import os 8 | import sys 9 | import glob 10 | 11 | 12 | SIGNATURE = "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions" 13 | 14 | 15 | def remove_pixelserver_from_notebook(file_path): 16 | """ 17 | Remove pixelserver tracking from a notebook. If the pixcelserver signature found in 18 | the notebook, the pixelserver cell will be removed from the notebook file. File will 19 | be modified only when the pixelserver signature is found in it. 20 | 21 | Args: 22 | file_path (str): The notebook file path. 23 | """ 24 | 25 | with open(file_path, encoding='utf-8') as fd: 26 | raw_json = json.load(fd) 27 | 28 | if 'cells' not in raw_json: 29 | return 30 | 31 | cells = raw_json['cells'] 32 | pixel_cells = [] 33 | 34 | for idx, cell in enumerate(cells): 35 | if cell['cell_type'] != 'markdown': 36 | continue 37 | 38 | source = cell['source'] 39 | for row in source: 40 | if row.startswith(SIGNATURE): 41 | pixel_cells.append(idx) 42 | print("Found pixelserver in file: \"{}\", cell {}".format(file_path, idx)) 43 | 44 | for cell_id in pixel_cells[::-1]: 45 | cells.pop(cell_id) 46 | 47 | if pixel_cells: 48 | with open(file_path, 'w', encoding='utf-8') as fd: 49 | json.dump(raw_json, fd, indent=1) 50 | 51 | 52 | def get_all_notebook_files(): 53 | """ 54 | Get all example notebook files' path and return them as a list. 55 | 56 | Returns: 57 | list of str. A list of notebook file paths. 58 | """ 59 | 60 | root_path = os.path.dirname(sys.path[0]) 61 | examples_path = os.path.join(root_path, "examples") 62 | if not os.path.exists(examples_path): 63 | raise ValueError("Cannot find examples file path: {}".format(examples_path)) 64 | 65 | files = [f for f in glob.glob(os.path.join(examples_path, "*/*.ipynb"), recursive=True)] 66 | return files 67 | 68 | 69 | def main(): 70 | """ 71 | Remove pixelserver from all example notebooks. 
72 | """ 73 | 74 | notebooks = get_all_notebook_files() 75 | for notebook in notebooks: 76 | remove_pixelserver_from_notebook(notebook) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /utils_nlp/README.md: -------------------------------------------------------------------------------- 1 | # NLP Utilities 2 | 3 | Modern NLP research and development can involve tedious tasks ranging from data loading, dataset understanding, model development, model evaluation to productionize a trained NLP model. Recognizing the need of simplying these tedious tasks, we developed this module (**utils_nlp**) to provide a wide spectrum of classes, functions and utilities. Adoption of this module can greately speed up the development work and sample notebooks in [Examples](../examples) folder can demonstrate this. The following provides a short description of the sub-modules. For more details about what functions/classes/utitilies are available and how to use them, please review the doc-strings provided with the code and see the sample notebooks in [Examples](../examples) folder. 4 | 5 | ## Submodules 6 | 7 | ### [AzureML](azureml) 8 | 9 | The AzureML submodule contains utilities to connect to an Azure Machine Learning workspace, train, tune and operationalize NLP systems at scale using AzureML. 10 | 11 | ```python 12 | from utils_nlp.azureml.azureml_utils import get_or_create_workspace 13 | 14 | ###Note: you do not need to fill in these values if you have a config.json in the same folder as this notebook 15 | ws = get_or_create_workspace( 16 | config_path=config_path, 17 | subscription_id=subscription_id, 18 | resource_group=resource_group, 19 | workspace_name=workspace_name, 20 | workspace_region=workspace_region, 21 | ) 22 | ``` 23 | 24 | ### [Common](common) 25 | 26 | This submodule contains high-level utilities that are commonly used in multiple algorithms as well as helper functions for managing frameworks like pytorch. 27 | 28 | ### [Dataset](dataset) 29 | This submodule includes helper functions for interacting with well-known datasets, utility functions to process datasets for different NLP tasks, as well as utilities for splitting data for training/testing. For example, the [snli module](snli.py) will allow you to load a dataframe in pandas from the Stanford Natural Language Inference (SNLI) Corpus dataset, with the option to set the number of rows to load in order to test algorithms and evaluate performance benchmarks. Information on the datasets used in the repo can be found [here](https://github.com/microsoft/nlp-recipes/tree/staging/utils_nlp/dataset#datasets). 30 | 31 | Most datasets may be split into `train`, `dev`, and `test`. 32 | 33 | ```python 34 | from utils_nlp.dataset.snli import load_pandas_df 35 | 36 | df = load_pandas_df(DATA_FOLDER, file_split ="train", nrows = 1000) 37 | ``` 38 | 39 | ### [Evaluation](eval) 40 | The *eval* submodule includes functionalities for computing common classification evaluation metrics like accuracy, precision, recall, and f1 scores for classification scenarios. It also includes metric utitlities for normalizing and finding f1_scores for [The Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/), and utilities to log the means and other coefficients in evaluating the quality of sentence embedding. 
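The Evaluation section above mentions standard classification metrics; the concrete helper lives in `utils_nlp/eval/classification.py` (shown further down in this listing), and a minimal call looks like:

```python
# Quick look at the classification metrics helper referenced above.
from utils_nlp.eval.classification import eval_classification

actual = [0, 1, 1, 2, 0, 2]
predicted = [0, 1, 0, 2, 0, 2]
metrics = eval_classification(actual, predicted)
print(metrics["accuracy"])  # overall accuracy
print(metrics["f1"])        # per-class F1 scores
```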
41 | 42 | ### [Models](models) 43 | The models submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new natural language processing systems. A description of which algorithms are used in each scenario can be found on [this table](../README.md#content). 44 | 45 | A few highlights are 46 | * BERT 47 | * GenSen 48 | * XLNet 49 | 50 | 51 | ### [Model Explainability](interpreter) 52 | The interpreter submodule contains utils that help explain or diagnose models, such as interpreting layers of a neural network. 53 | -------------------------------------------------------------------------------- /utils_nlp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | __title__ = "Microsoft NLP" 5 | __author__ = "AI CAT at Microsoft" 6 | __license__ = "MIT" 7 | __copyright__ = "Copyright 2018-present Microsoft Corporation" 8 | __version__ = "2.0.0" 9 | 10 | # Synonyms 11 | TITLE = __title__ 12 | AUTHOR = __author__ 13 | LICENSE = __license__ 14 | COPYRIGHT = __copyright__ 15 | VERSION = __version__ 16 | -------------------------------------------------------------------------------- /utils_nlp/azureml/README.md: -------------------------------------------------------------------------------- 1 | ## [AzureML](.) 2 | 3 | The AzureML submodule contains utilities to connect to a 4 | [workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-workspace), 5 | train, tune and operationalize NLP systems at scale using AzureML. 6 | For example, the `DistributedCommunicator` class defined in 7 | [azureml_bert_util.py](./azureml_bert_util.py) assists in making communication with multiple nodes 8 | for distributed training possible. [azureml_utils.py](./azureml_utils.py) contains a few helper functions that make it easy to authenticate, create, or retrieve an AzureML resource. 9 | -------------------------------------------------------------------------------- /utils_nlp/azureml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/nlp-recipes/7db6d204e5116da07bb3c549df546e49cb7ab5a5/utils_nlp/azureml/__init__.py -------------------------------------------------------------------------------- /utils_nlp/common/README.md: -------------------------------------------------------------------------------- 1 | ## [Common](.) 2 | 3 | This submodule contains high-level common utilities used across multiple algorithms and 4 | frameworks as well as helper functions for managing aspects of different frameworks like pytorch. 5 | For example, [pytorch_utils.py](./pytorch_utils.py) contains utilities to interact with PyTorch 6 | like getting a device architecture (cpu or gpu), moving a model to a specific device, and handling 7 | parallelism when multiple gpus are present. 
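The Common README above describes device selection, moving models to a device, and multi-GPU handling. A generic sketch of that pattern in plain PyTorch; it illustrates what the helpers wrap and does not reproduce the exact function names inside `pytorch_utils.py`:

```python
# Plain-PyTorch illustration of the device handling the Common utilities wrap.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.nn.Linear(768, 2).to(device)   # move a model to the chosen device
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)     # simple multi-GPU data parallelism
```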
8 | -------------------------------------------------------------------------------- /utils_nlp/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/nlp-recipes/7db6d204e5116da07bb3c549df546e49cb7ab5a5/utils_nlp/common/__init__.py -------------------------------------------------------------------------------- /utils_nlp/common/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | """Timer utilities for benchmarking running times of executions.""" 5 | 6 | from timeit import default_timer 7 | 8 | 9 | class Timer(object): 10 | """Timer class. 11 | Original code: https://github.com/miguelgfierro/codebase 12 | 13 | Examples: 14 | >>> import time 15 | >>> t = Timer() 16 | >>> t.start() 17 | >>> time.sleep(1) 18 | >>> t.stop() 19 | >>> t.interval < 1 20 | True 21 | >>> with Timer() as t: 22 | ... time.sleep(1) 23 | >>> t.interval < 1 24 | True 25 | >>> "Time elapsed {}".format(t) #doctest: +ELLIPSIS 26 | 'Time elapsed 1...' 27 | """ 28 | 29 | def __init__(self): 30 | self._timer = default_timer 31 | self._interval = 0 32 | self.running = False 33 | 34 | def __enter__(self): 35 | self.start() 36 | return self 37 | 38 | def __exit__(self, *args): 39 | self.stop() 40 | 41 | def __str__(self): 42 | return "{:0.4f}".format(self.interval) 43 | 44 | def start(self): 45 | """Start the timer.""" 46 | self.init = self._timer() 47 | self.running = True 48 | 49 | def stop(self): 50 | """Stop the timer. Calculate the interval in seconds.""" 51 | self.end = self._timer() 52 | try: 53 | self._interval = self.end - self.init 54 | self.running = False 55 | except AttributeError: 56 | raise ValueError( 57 | "Timer has not been initialized: use start() or the contextual form with Timer() " 58 | "as t:" 59 | ) 60 | 61 | @property 62 | def interval(self): 63 | if self.running: 64 | raise ValueError("Timer has not been stopped, please use stop().") 65 | else: 66 | return self._interval 67 | -------------------------------------------------------------------------------- /utils_nlp/dataset/README.md: -------------------------------------------------------------------------------- 1 | ## [Dataset](.) 2 | This submodule includes helper functions for downloading datasets and formatting them appropriately as well as utilities for splitting data for training / testing. 3 | 4 | ## Data Loading 5 | There are dataloaders for several datasets. For example, the snli module will allow you to load a dataframe in pandas from the SNLI dataset, with the option to set the number of rows to load in order to test algorithms and evaluate performance benchmarks. 
6 | Most datasets may be split into `train`, `dev`, and `test`, for example: 7 | 8 | ```python 9 | from utils_nlp.dataset.snli import load_pandas_df 10 | 11 | df = load_pandas_df(DATA_FOLDER, file_split ="train", nrows = 1000) 12 | ``` 13 | ## Dataset List 14 | |Dataset|Dataloader script| 15 | |-------|-----------------| 16 | |[Microsoft Research Paraphrase Corpus](https://www.microsoft.com/en-us/download/details.aspx?id=52398)|[msrpc.py](./msrpc.py)| 17 | |[The Multi-Genre NLI (MultiNLI) Corpus](https://www.nyu.edu/projects/bowman/multinli/)|[multinli.py](./multinli.py)| 18 | |[The Stanford Natural Language Inference (SNLI) Corpus](https://nlp.stanford.edu/projects/snli/)|[snli.py](./snli.py)| 19 | |[Wikigold NER](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold/CONLL-format/data)|[wikigold.py](./wikigold.py)| 20 | |[The Cross-Lingual NLI (XNLI) Corpus](https://www.nyu.edu/projects/bowman/xnli/)|[xnli.py](./xnli.py)| 21 | |[The STSbenchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark)|[stsbenchmark.py](./stsbenchmark.py)| 22 | |[The Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/)|[squad.py](./squad.py)| 23 | |[CNN/Daily Mail(CNN/DM) Dataset](https://github.com/harvardnlp/sent-summary)|[cnndm.py](./cnndm.py)| 24 | |[Preprocessed CNN/Daily Mail(CNN/DM) Dataset for Extractive Summarization](https://github.com/nlpyang/BertSum)|[cnndm.py](./cnndm.py)| 25 | 26 | ## Dataset References 27 | Please see [Dataset References](../../DatasetReferences.md) for notice and information regarding datasets used. 28 | -------------------------------------------------------------------------------- /utils_nlp/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | from enum import Enum 5 | import nltk 6 | 7 | nltk.download("punkt", quiet=True) 8 | nltk.download("stopwords", quiet=True) 9 | 10 | 11 | class Split(str, Enum): 12 | TRAIN: str = "train" 13 | DEV: str = "dev" 14 | TEST: str = "test" 15 | -------------------------------------------------------------------------------- /utils_nlp/dataset/msrpc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | """ 5 | Utility functions for downloading, extracting, and reading the Microsoft 6 | Research Paraphrase Corpus (MSRPC) dataset. 7 | https://www.microsoft.com/en-us/download/details.aspx?id=52398 8 | """ 9 | 10 | import os 11 | import pathlib 12 | 13 | import pandas as pd 14 | 15 | from utils_nlp.dataset.url_utils import maybe_download, download_path 16 | 17 | DATASET_DICT = { 18 | "train": "msr_paraphrase_train.txt", 19 | "test": "msr_paraphrase_test.txt", 20 | "all": "msr_paraphrase_data.txt", 21 | } 22 | 23 | 24 | def download_msrpc(download_dir): 25 | """Downloads Windows Installer for Microsoft Paraphrase Corpus. 26 | 27 | Args: 28 | download_dir (str): File path for the downloaded file 29 | 30 | Returns: 31 | str: file_path to the downloaded dataset. 
32 | """ 33 | 34 | url = ( 35 | "https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B" 36 | "-3604ED519838/MSRParaphraseCorpus.msi" 37 | ) 38 | return maybe_download(url, work_directory=download_dir) 39 | 40 | 41 | def load_pandas_df(local_cache_path=None, dataset_type="train"): 42 | """Load pandas dataframe and clean the data from the downloaded dataset 43 | 44 | Args: 45 | the dataset is already downloaded. 46 | dataset_type (str): Key to the DATASET_DICT item. Loads the dataset specified. 47 | Could be train or test. 48 | local_cache_path (str): Path to download the dataset installer. 49 | 50 | Returns: 51 | pd.DataFrame: A pandas dataframe with 3 columns, Sentence 1, Sentence 2 and 52 | score. 53 | 54 | """ 55 | 56 | if dataset_type not in DATASET_DICT.keys(): 57 | raise Exception("Dataset type not found!") 58 | 59 | with download_path(local_cache_path) as path: 60 | path = pathlib.Path(path) 61 | installer_datapath = download_msrpc(path) 62 | 63 | print( 64 | "The Windows Installer for Mircosoft Paraphrase Corpus has been " "downloaded at ", 65 | installer_datapath, 66 | "\n", 67 | ) 68 | data_directory = input("Please install and provide the installed directory. Thanks! \n") 69 | 70 | data_directory = pathlib.Path(data_directory) 71 | assert os.path.exists(data_directory) 72 | 73 | fields = ["Quality", "#1 String", "#2 String"] 74 | file_path = os.path.join(data_directory, DATASET_DICT[dataset_type]) 75 | df = ( 76 | pd.read_csv(file_path, delimiter="\t", error_bad_lines=False, usecols=fields) 77 | .dropna() 78 | .rename( 79 | index=str, 80 | columns={"Quality": "score", "#1 String": "sentence1", "#2 String": "sentence2"}, 81 | ) 82 | ) 83 | return df 84 | -------------------------------------------------------------------------------- /utils_nlp/dataset/ner_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | """Common helper functions for preprocessing Named Entity Recognition (NER) datasets.""" 5 | 6 | 7 | def preprocess_conll(text, sep="\t"): 8 | """ 9 | Converts data in CoNLL format to word and label lists. 10 | 11 | Args: 12 | text (str): Text string in conll format, e.g. 13 | "Amy B-PER 14 | ADAMS I-PER 15 | works O 16 | at O 17 | the O 18 | University B-ORG 19 | of I-ORG 20 | Minnesota I-ORG 21 | . O" 22 | sep (str, optional): Column separator 23 | Defaults to \t 24 | Returns: 25 | tuple: 26 | (list of word lists, list of token label lists) 27 | """ 28 | text_list = text.split("\n\n") 29 | if text_list[-1] in (" ", ""): 30 | text_list = text_list[:-1] 31 | 32 | max_seq_len = 0 33 | sentence_list = [] 34 | labels_list = [] 35 | for s in text_list: 36 | # split each sentence string into "word label" pairs 37 | s_split = s.split("\n") 38 | # split "word label" pairs 39 | s_split_split = [t.split(sep) for t in s_split] 40 | sentence_list.append([t[0] for t in s_split_split if len(t) > 1]) 41 | labels_list.append([t[1] for t in s_split_split if len(t) > 1]) 42 | if len(s_split_split) > max_seq_len: 43 | max_seq_len = len(s_split_split) 44 | print("Maximum sequence length is: {0}".format(max_seq_len)) 45 | return sentence_list, labels_list 46 | 47 | 48 | def read_conll_file(file_path, sep="\t", encoding=None): 49 | """ 50 | Reads a data file in CoNLL format and returns word and label lists. 51 | 52 | Args: 53 | file_path (str): Data file path. 54 | sep (str, optional): Column separator. Defaults to "\t". 
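`preprocess_conll` above turns CoNLL-style text into parallel word and label lists; a tiny example following the format given in its docstring:

```python
# Example input/output for preprocess_conll, mirroring its docstring format.
from utils_nlp.dataset.ner_utils import preprocess_conll

text = "Amy\tB-PER\nADAMS\tI-PER\nworks\tO\nat\tO\nMicrosoft\tB-ORG"
sentences, labels = preprocess_conll(text)   # also prints the maximum sequence length
print(sentences)  # [['Amy', 'ADAMS', 'works', 'at', 'Microsoft']]
print(labels)     # [['B-PER', 'I-PER', 'O', 'O', 'B-ORG']]
```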
55 | encoding (str): File encoding used when reading the file. 56 | Defaults to None. 57 | 58 | Returns: 59 | (list, list): A tuple of word and label lists (list of lists). 60 | """ 61 | with open(file_path, encoding=encoding) as f: 62 | data = f.read() 63 | return preprocess_conll(data, sep=sep) 64 | -------------------------------------------------------------------------------- /utils_nlp/dataset/xnli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | """ 5 | Utility functions for downloading, extracting, and reading the 6 | Cross-Lingual NLI Corpus (XNLI). 7 | https://www.nyu.edu/projects/bowman/xnli/ 8 | """ 9 | 10 | 11 | import os 12 | import pandas as pd 13 | 14 | from utils_nlp.dataset.url_utils import extract_zip, maybe_download 15 | from utils_nlp.dataset.preprocess import convert_to_unicode 16 | 17 | URL_XNLI = "https://cims.nyu.edu/~sbowman/xnli/XNLI-1.0.zip" 18 | URL_XNLI_MT = "https://cims.nyu.edu/~sbowman/xnli/XNLI-MT-1.0.zip" 19 | 20 | 21 | def load_pandas_df(local_cache_path=".", file_split="dev", language="zh"): 22 | """Downloads and extracts the dataset files. 23 | 24 | Utilities information can be found `on this link `_. 25 | 26 | Args: 27 | local_cache_path (str, optional): Path to store the data. 28 | Defaults to "./". 29 | file_split (str, optional): The subset to load. 30 | One of: {"train", "dev", "test"} 31 | Defaults to "dev". 32 | language (str, optional): language subset to read. 33 | One of: {"en", "fr", "es", "de", "el", "bg", "ru", 34 | "tr", "ar", "vi", "th", "zh", "hi", "sw", "ur"} 35 | Defaults to "zh" (Chinese). 36 | Returns: 37 | pd.DataFrame: pandas DataFrame containing the specified 38 | XNLI subset. 
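Given the signature and return value described above, loading a specific XNLI split is a one-liner; the cache path below is a placeholder:

```python
# Load the French dev subset of XNLI into a pandas DataFrame.
from utils_nlp.dataset.xnli import load_pandas_df

xnli_df = load_pandas_df(local_cache_path="./temp", file_split="dev", language="fr")
print(xnli_df.columns.tolist())  # ['text', 'label']
print(xnli_df.shape)
```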
39 | """ 40 | 41 | if file_split in ("dev", "test"): 42 | url = URL_XNLI 43 | sentence_1_index = 6 44 | sentence_2_index = 7 45 | label_index = 1 46 | 47 | zip_file_name = url.split("/")[-1] 48 | folder_name = ".".join(zip_file_name.split(".")[:-1]) 49 | file_name = folder_name + "/" + ".".join(["xnli", file_split, "tsv"]) 50 | elif file_split == "train": 51 | url = URL_XNLI_MT 52 | sentence_1_index = 0 53 | sentence_2_index = 1 54 | label_index = 2 55 | 56 | zip_file_name = url.split("/")[-1] 57 | folder_name = ".".join(zip_file_name.split(".")[:-1]) 58 | file_name = folder_name + "/multinli/" + ".".join(["multinli", file_split, language, "tsv"]) 59 | 60 | maybe_download(url, zip_file_name, local_cache_path) 61 | 62 | if not os.path.exists(os.path.join(local_cache_path, folder_name)): 63 | extract_zip(os.path.join(local_cache_path, zip_file_name), local_cache_path) 64 | 65 | with open(os.path.join(local_cache_path, file_name), "r", encoding="utf-8") as f: 66 | lines = f.read().splitlines() 67 | 68 | line_list = [line.split("\t") for line in lines] 69 | 70 | # Remove the column name row 71 | line_list.pop(0) 72 | if file_split != "train": 73 | line_list = [line for line in line_list if line[0] == language] 74 | 75 | valid_lines = [ 76 | True if line[sentence_1_index] and line[sentence_2_index] else False for line in line_list 77 | ] 78 | total_line_count = len(line_list) 79 | line_list = [line for line, valid in zip(line_list, valid_lines) if valid] 80 | valid_line_count = len(line_list) 81 | 82 | if valid_line_count != total_line_count: 83 | print("{} invalid lines removed.".format(total_line_count - valid_line_count)) 84 | 85 | label_list = [convert_to_unicode(line[label_index]) for line in line_list] 86 | old_contradict_label = convert_to_unicode("contradictory") 87 | new_contradict_label = convert_to_unicode("contradiction") 88 | label_list = [ 89 | new_contradict_label if label == old_contradict_label else label for label in label_list 90 | ] 91 | text_list = [ 92 | (convert_to_unicode(line[sentence_1_index]), convert_to_unicode(line[sentence_2_index])) 93 | for line in line_list 94 | ] 95 | 96 | df = pd.DataFrame({"text": text_list, "label": label_list}) 97 | 98 | return df 99 | -------------------------------------------------------------------------------- /utils_nlp/eval/README.md: -------------------------------------------------------------------------------- 1 | ## [Evaluation](.) 2 | The evaluation (eval) submodule includes functionalities for computing metrics for evaluating NLP model performance. There are general evaluation metrics like accuracy, precision, recall, and f1 scores for classification scenarios. In addition, we also include evaluation utilities for specialized tasks like question answering and sentence embedding. 3 | -------------------------------------------------------------------------------- /utils_nlp/eval/SentEval/.gitignore: -------------------------------------------------------------------------------- 1 | # SentEval data and .pyc files 2 | 3 | 4 | 5 | # python 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # log files 11 | *.log 12 | *.txt 13 | 14 | # data files 15 | data/senteval_data* 16 | -------------------------------------------------------------------------------- /utils_nlp/eval/SentEval/LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For SentEval software 4 | 5 | Copyright (c) 2017-present, Facebook, Inc. All rights reserved. 
6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /utils_nlp/eval/SentEval/data/downstream/get_transfer_data.bash: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | # 9 | # Download and tokenize data with MOSES tokenizer 10 | # 11 | 12 | 13 | """ adapted the original script to only download the STSBenchmark data""" 14 | data_path=. 15 | preprocess_exec=./tokenizer.sed 16 | 17 | # Get MOSES 18 | echo 'Cloning Moses github repository (for tokenization scripts)...' 19 | git clone https://github.com/moses-smt/mosesdecoder.git 20 | SCRIPTS=mosesdecoder/scripts 21 | MTOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl 22 | LOWER=$SCRIPTS/tokenizer/lowercase.perl 23 | 24 | if [ ! -d "$SCRIPTS" ]; then 25 | echo "Please set SCRIPTS variable correctly to point to Moses scripts." 
26 | exit 27 | fi 28 | 29 | PTBTOKENIZER="sed -f tokenizer.sed" 30 | 31 | mkdir $data_path 32 | 33 | STSBenchmark='http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz' 34 | 35 | 36 | # STS 2012, 2013, 2014, 2015, 2016 37 | declare -A STS_tasks 38 | declare -A STS_paths 39 | declare -A STS_subdirs 40 | 41 | STS_tasks=(["STS12"]="MSRpar MSRvid SMTeuroparl surprise.OnWN surprise.SMTnews" ["STS13"]="FNWN headlines OnWN" ["STS14"]="deft-forum deft-news headlines OnWN images tweet-news" ["STS15"]="answers-forums answers-students belief headlines images" ["STS16"]="answer-answer headlines plagiarism postediting question-question") 42 | 43 | STS_paths=(["STS12"]="http://ixa2.si.ehu.es/stswiki/images/4/40/STS2012-en-test.zip" ["STS13"]="http://ixa2.si.ehu.es/stswiki/images/2/2f/STS2013-en-test.zip" ["STS14"]="http://ixa2.si.ehu.es/stswiki/images/8/8c/STS2014-en-test.zip" ["STS15"]="http://ixa2.si.ehu.es/stswiki/images/d/da/STS2015-en-test.zip" 44 | ["STS16"]="http://ixa2.si.ehu.es/stswiki/images/9/98/STS2016-en-test.zip") 45 | 46 | STS_subdirs=(["STS12"]="test-gold" ["STS13"]="test-gs" ["STS14"]="sts-en-test-gs-2014" ["STS15"]="test_evaluation_task2a" ["STS16"]="sts2016-english-with-gs-v1.0") 47 | 48 | 49 | 50 | 51 | ### STS datasets 52 | 53 | # STS12, STS13, STS14, STS15, STS16 54 | mkdir $data_path/STS 55 | 56 | for task in "${!STS_tasks[@]}"; #"${!STS_tasks[@]}"; 57 | do 58 | fpath=${STS_paths[$task]} 59 | echo $fpath 60 | curl -Lo $data_path/STS/data_$task.zip $fpath 61 | unzip $data_path/STS/data_$task.zip -d $data_path/STS 62 | mv $data_path/STS/${STS_subdirs[$task]} $data_path/STS/$task-en-test 63 | rm $data_path/STS/data_$task.zip 64 | 65 | for sts_task in ${STS_tasks[$task]} 66 | do 67 | fname=STS.input.$sts_task.txt 68 | task_path=$data_path/STS/$task-en-test/ 69 | 70 | if [ "$task" = "STS16" ] ; then 71 | echo 'Handling STS2016' 72 | mv $task_path/STS2016.input.$sts_task.txt $task_path/$fname 73 | mv $task_path/STS2016.gs.$sts_task.txt $task_path/STS.gs.$sts_task.txt 74 | fi 75 | 76 | 77 | 78 | cut -f1 $task_path/$fname | $MTOKENIZER -threads 8 -l en -no-escape | $LOWER > $task_path/tmp1 79 | cut -f2 $task_path/$fname | $MTOKENIZER -threads 8 -l en -no-escape | $LOWER > $task_path/tmp2 80 | paste $task_path/tmp1 $task_path/tmp2 > $task_path/$fname 81 | rm $task_path/tmp1 $task_path/tmp2 82 | done 83 | 84 | done 85 | 86 | 87 | # STSBenchmark (http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) 88 | 89 | curl -Lo $data_path/Stsbenchmark.tar.gz $STSBenchmark 90 | tar -zxvf $data_path/Stsbenchmark.tar.gz -C $data_path 91 | rm $data_path/Stsbenchmark.tar.gz 92 | mv $data_path/stsbenchmark $data_path/STS/STSBenchmark 93 | 94 | for split in train dev test 95 | do 96 | fname=sts-$split.csv 97 | fdir=$data_path/STS/STSBenchmark 98 | cut -f1,2,3,4,5 $fdir/$fname > $fdir/tmp1 99 | cut -f6 $fdir/$fname | $MTOKENIZER -threads 8 -l en -no-escape | $LOWER > $fdir/tmp2 100 | cut -f7 $fdir/$fname | $MTOKENIZER -threads 8 -l en -no-escape | $LOWER > $fdir/tmp3 101 | paste $fdir/tmp1 $fdir/tmp2 $fdir/tmp3 > $fdir/$fname 102 | rm $fdir/tmp1 $fdir/tmp2 $fdir/tmp3 103 | done 104 | 105 | -------------------------------------------------------------------------------- /utils_nlp/eval/SentEval/data/downstream/tokenizer.sed: -------------------------------------------------------------------------------- 1 | # Sed script to produce Penn Treebank tokenization on arbitrary raw text. 2 | # Yeah, sure. 
3 | 4 | # expected input: raw text with ONE SENTENCE TOKEN PER LINE 5 | 6 | # by Robert MacIntyre, University of Pennsylvania, late 1995. 7 | 8 | # If this wasn't such a trivial program, I'd include all that stuff about 9 | # no warrantee, free use, etc. from the GNU General Public License. If you 10 | # want to be picky, assume that all of its terms apply. Okay? 11 | 12 | # attempt to get correct directional quotes 13 | s=^"=`` =g 14 | s=\([ ([{<]\)"=\1 `` =g 15 | # close quotes handled at end 16 | 17 | s=\.\.\.= ... =g 18 | s=[,;:@#$%&]= & =g 19 | 20 | # Assume sentence tokenization has been done first, so split FINAL periods 21 | # only. 22 | s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g 23 | # however, we may as well split ALL question marks and exclamation points, 24 | # since they shouldn't have the abbrev.-marker ambiguity problem 25 | s=[?!]= & =g 26 | 27 | # parentheses, brackets, etc. 28 | s=[][(){}<>]= & =g 29 | # Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file 30 | # version of these symbols. 31 | # UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST. 32 | # s/(/-LRB-/g 33 | # s/)/-RRB-/g 34 | # s/\[/-LSB-/g 35 | # s/\]/-RSB-/g 36 | # s/{/-LCB-/g 37 | # s/}/-RCB-/g 38 | 39 | s=--= -- =g 40 | 41 | # NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great, since 42 | # you might someday want to know how the words originally fit together -- 43 | # but it's too late to make a better system now, given the millions of 44 | # words we've already done "wrong". 45 | 46 | # First off, add a space to the beginning and end of each line, to reduce 47 | # necessary number of regexps. 48 | s=$= = 49 | s=^= = 50 | 51 | s="= '' =g 52 | # possessive or close-single-quote 53 | s=\([^']\)' =\1 ' =g 54 | # as in it's, I'm, we'd 55 | s='\([sSmMdD]\) = '\1 =g 56 | s='ll = 'll =g 57 | s='re = 're =g 58 | s='ve = 've =g 59 | s=n't = n't =g 60 | s='LL = 'LL =g 61 | s='RE = 'RE =g 62 | s='VE = 'VE =g 63 | s=N'T = N'T =g 64 | 65 | s= \([Cc]\)annot = \1an not =g 66 | s= \([Dd]\)'ye = \1' ye =g 67 | s= \([Gg]\)imme = \1im me =g 68 | s= \([Gg]\)onna = \1on na =g 69 | s= \([Gg]\)otta = \1ot ta =g 70 | s= \([Ll]\)emme = \1em me =g 71 | s= \([Mm]\)ore'n = \1ore 'n =g 72 | s= '\([Tt]\)is = '\1 is =g 73 | s= '\([Tt]\)was = '\1 was =g 74 | s= \([Ww]\)anna = \1an na =g 75 | # s= \([Ww]\)haddya = \1ha dd ya =g 76 | # s= \([Ww]\)hatcha = \1ha t cha =g 77 | 78 | # clean out extra spaces 79 | s= *= =g 80 | s=^ *==g 81 | -------------------------------------------------------------------------------- /utils_nlp/eval/SentEval/senteval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import 9 | 10 | from senteval.engine import SE 11 | -------------------------------------------------------------------------------- /utils_nlp/eval/SentEval/senteval/binary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | Binary classifier and corresponding datasets : MR, CR, SUBJ, MPQA 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import io 14 | import os 15 | import numpy as np 16 | import logging 17 | 18 | from senteval.tools.validation import InnerKFoldClassifier 19 | 20 | 21 | class BinaryClassifierEval(object): 22 | def __init__(self, pos, neg, seed=1111): 23 | self.seed = seed 24 | self.samples, self.labels = pos + neg, [1] * len(pos) + [0] * len(neg) 25 | self.n_samples = len(self.samples) 26 | 27 | def do_prepare(self, params, prepare): 28 | # prepare is given the whole text 29 | return prepare(params, self.samples) 30 | # prepare puts everything it outputs in "params" : params.word2id etc 31 | # Those output will be further used by "batcher". 32 | 33 | def loadFile(self, fpath): 34 | with io.open(fpath, 'r', encoding='latin-1') as f: 35 | return [line.split() for line in f.read().splitlines()] 36 | 37 | def run(self, params, batcher): 38 | enc_input = [] 39 | # Sort to reduce padding 40 | sorted_corpus = sorted(zip(self.samples, self.labels), 41 | key=lambda z: (len(z[0]), z[1])) 42 | sorted_samples = [x for (x, y) in sorted_corpus] 43 | sorted_labels = [y for (x, y) in sorted_corpus] 44 | logging.info('Generating sentence embeddings') 45 | for ii in range(0, self.n_samples, params.batch_size): 46 | batch = sorted_samples[ii:ii + params.batch_size] 47 | embeddings = batcher(params, batch) 48 | enc_input.append(embeddings) 49 | enc_input = np.vstack(enc_input) 50 | logging.info('Generated sentence embeddings') 51 | 52 | config = {'nclasses': 2, 'seed': self.seed, 53 | 'usepytorch': params.usepytorch, 54 | 'classifier': params.classifier, 55 | 'nhid': params.nhid, 'kfold': params.kfold} 56 | clf = InnerKFoldClassifier(enc_input, np.array(sorted_labels), config) 57 | devacc, testacc = clf.run() 58 | logging.debug('Dev acc : {0} Test acc : {1}\n'.format(devacc, testacc)) 59 | return {'devacc': devacc, 'acc': testacc, 'ndev': self.n_samples, 60 | 'ntest': self.n_samples} 61 | 62 | 63 | class CREval(BinaryClassifierEval): 64 | def __init__(self, task_path, seed=1111): 65 | logging.debug('***** Transfer task : CR *****\n\n') 66 | pos = self.loadFile(os.path.join(task_path, 'custrev.pos')) 67 | neg = self.loadFile(os.path.join(task_path, 'custrev.neg')) 68 | super(self.__class__, self).__init__(pos, neg, seed) 69 | 70 | 71 | class MREval(BinaryClassifierEval): 72 | def __init__(self, task_path, seed=1111): 73 | logging.debug('***** Transfer task : MR *****\n\n') 74 | pos = self.loadFile(os.path.join(task_path, 'rt-polarity.pos')) 75 | neg = self.loadFile(os.path.join(task_path, 'rt-polarity.neg')) 76 | super(self.__class__, self).__init__(pos, neg, seed) 77 | 78 | 79 | class SUBJEval(BinaryClassifierEval): 80 | def __init__(self, task_path, seed=1111): 81 | logging.debug('***** Transfer task : SUBJ *****\n\n') 82 | obj = self.loadFile(os.path.join(task_path, 'subj.objective')) 83 | subj = self.loadFile(os.path.join(task_path, 'subj.subjective')) 84 | super(self.__class__, self).__init__(obj, subj, seed) 85 | 86 | 87 | class MPQAEval(BinaryClassifierEval): 88 | def __init__(self, task_path, seed=1111): 89 | logging.debug('***** Transfer task : MPQA *****\n\n') 90 | pos = self.loadFile(os.path.join(task_path, 'mpqa.pos')) 91 | neg = self.loadFile(os.path.join(task_path, 'mpqa.neg')) 92 | super(self.__class__, self).__init__(pos, neg, seed) 93 | -------------------------------------------------------------------------------- 
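The evaluation classes above all consume a `prepare(params, samples)` hook and a `batcher(params, batch)` that returns one embedding per sentence; the standard SentEval driver wires them together. A minimal sketch with a random-vector batcher, where the task path is a placeholder and the classifier settings follow SentEval's usual defaults:

```python
# Minimal SentEval driver; the batcher returns random vectors purely to show the interface.
import numpy as np
import senteval


def prepare(params, samples):
    return


def batcher(params, batch):
    return np.random.rand(len(batch), 64)


params = {"task_path": "data/downstream", "usepytorch": False, "kfold": 5}
params["classifier"] = {"nhid": 0, "optim": "adam", "batch_size": 64,
                        "tenacity": 5, "epoch_size": 4}
se = senteval.engine.SE(params, batcher, prepare)
results = se.eval(["MR", "CR"])  # tasks implemented by the classes above
```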
/utils_nlp/eval/SentEval/senteval/sst.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | SST - binary classification 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import logging 17 | import numpy as np 18 | 19 | from senteval.tools.validation import SplitClassifier 20 | 21 | 22 | class SSTEval(object): 23 | def __init__(self, task_path, nclasses=2, seed=1111): 24 | self.seed = seed 25 | 26 | # binary of fine-grained 27 | assert nclasses in [2, 5] 28 | self.nclasses = nclasses 29 | self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained' 30 | logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name) 31 | 32 | train = self.loadFile(os.path.join(task_path, 'sentiment-train')) 33 | dev = self.loadFile(os.path.join(task_path, 'sentiment-dev')) 34 | test = self.loadFile(os.path.join(task_path, 'sentiment-test')) 35 | self.sst_data = {'train': train, 'dev': dev, 'test': test} 36 | 37 | def do_prepare(self, params, prepare): 38 | samples = self.sst_data['train']['X'] + self.sst_data['dev']['X'] + \ 39 | self.sst_data['test']['X'] 40 | return prepare(params, samples) 41 | 42 | def loadFile(self, fpath): 43 | sst_data = {'X': [], 'y': []} 44 | with io.open(fpath, 'r', encoding='utf-8') as f: 45 | for line in f: 46 | if self.nclasses == 2: 47 | sample = line.strip().split('\t') 48 | sst_data['y'].append(int(sample[1])) 49 | sst_data['X'].append(sample[0].split()) 50 | elif self.nclasses == 5: 51 | sample = line.strip().split(' ', 1) 52 | sst_data['y'].append(int(sample[0])) 53 | sst_data['X'].append(sample[1].split()) 54 | assert max(sst_data['y']) == self.nclasses - 1 55 | return sst_data 56 | 57 | def run(self, params, batcher): 58 | sst_embed = {'train': {}, 'dev': {}, 'test': {}} 59 | bsize = params.batch_size 60 | 61 | for key in self.sst_data: 62 | logging.info('Computing embedding for {0}'.format(key)) 63 | # Sort to reduce padding 64 | sorted_data = sorted(zip(self.sst_data[key]['X'], 65 | self.sst_data[key]['y']), 66 | key=lambda z: (len(z[0]), z[1])) 67 | self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data)) 68 | 69 | sst_embed[key]['X'] = [] 70 | for ii in range(0, len(self.sst_data[key]['y']), bsize): 71 | batch = self.sst_data[key]['X'][ii:ii + bsize] 72 | embeddings = batcher(params, batch) 73 | sst_embed[key]['X'].append(embeddings) 74 | sst_embed[key]['X'] = np.vstack(sst_embed[key]['X']) 75 | sst_embed[key]['y'] = np.array(self.sst_data[key]['y']) 76 | logging.info('Computed {0} embeddings'.format(key)) 77 | 78 | config_classifier = {'nclasses': self.nclasses, 'seed': self.seed, 79 | 'usepytorch': params.usepytorch, 80 | 'classifier': params.classifier} 81 | 82 | clf = SplitClassifier(X={'train': sst_embed['train']['X'], 83 | 'valid': sst_embed['dev']['X'], 84 | 'test': sst_embed['test']['X']}, 85 | y={'train': sst_embed['train']['y'], 86 | 'valid': sst_embed['dev']['y'], 87 | 'test': sst_embed['test']['y']}, 88 | config=config_classifier) 89 | 90 | devacc, testacc = clf.run() 91 | logging.debug('\nDev acc : {0} Test acc : {1} for \ 92 | SST {2} classification\n'.format(devacc, testacc, self.task_name)) 93 | 94 | return {'devacc': devacc, 'acc': testacc, 95 | 'ndev': 
len(sst_embed['dev']['X']), 96 | 'ntest': len(sst_embed['test']['X'])} 97 | -------------------------------------------------------------------------------- /utils_nlp/eval/SentEval/senteval/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/nlp-recipes/7db6d204e5116da07bb3c549df546e49cb7ab5a5/utils_nlp/eval/SentEval/senteval/tools/__init__.py -------------------------------------------------------------------------------- /utils_nlp/eval/SentEval/senteval/trec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | TREC question-type classification 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import logging 17 | import numpy as np 18 | 19 | from senteval.tools.validation import KFoldClassifier 20 | 21 | 22 | class TRECEval(object): 23 | def __init__(self, task_path, seed=1111): 24 | logging.info('***** Transfer task : TREC *****\n\n') 25 | self.seed = seed 26 | self.train = self.loadFile(os.path.join(task_path, 'train_5500.label')) 27 | self.test = self.loadFile(os.path.join(task_path, 'TREC_10.label')) 28 | 29 | def do_prepare(self, params, prepare): 30 | samples = self.train['X'] + self.test['X'] 31 | return prepare(params, samples) 32 | 33 | def loadFile(self, fpath): 34 | trec_data = {'X': [], 'y': []} 35 | tgt2idx = {'ABBR': 0, 'DESC': 1, 'ENTY': 2, 36 | 'HUM': 3, 'LOC': 4, 'NUM': 5} 37 | with io.open(fpath, 'r', encoding='latin-1') as f: 38 | for line in f: 39 | target, sample = line.strip().split(':', 1) 40 | sample = sample.split(' ', 1)[1].split() 41 | assert target in tgt2idx, target 42 | trec_data['X'].append(sample) 43 | trec_data['y'].append(tgt2idx[target]) 44 | return trec_data 45 | 46 | def run(self, params, batcher): 47 | train_embeddings, test_embeddings = [], [] 48 | 49 | # Sort to reduce padding 50 | sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']), 51 | key=lambda z: (len(z[0]), z[1])) 52 | train_samples = [x for (x, y) in sorted_corpus_train] 53 | train_labels = [y for (x, y) in sorted_corpus_train] 54 | 55 | sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']), 56 | key=lambda z: (len(z[0]), z[1])) 57 | test_samples = [x for (x, y) in sorted_corpus_test] 58 | test_labels = [y for (x, y) in sorted_corpus_test] 59 | 60 | # Get train embeddings 61 | for ii in range(0, len(train_labels), params.batch_size): 62 | batch = train_samples[ii:ii + params.batch_size] 63 | embeddings = batcher(params, batch) 64 | train_embeddings.append(embeddings) 65 | train_embeddings = np.vstack(train_embeddings) 66 | logging.info('Computed train embeddings') 67 | 68 | # Get test embeddings 69 | for ii in range(0, len(test_labels), params.batch_size): 70 | batch = test_samples[ii:ii + params.batch_size] 71 | embeddings = batcher(params, batch) 72 | test_embeddings.append(embeddings) 73 | test_embeddings = np.vstack(test_embeddings) 74 | logging.info('Computed test embeddings') 75 | 76 | config_classifier = {'nclasses': 6, 'seed': self.seed, 77 | 'usepytorch': params.usepytorch, 78 | 'classifier': params.classifier, 79 | 'kfold': params.kfold} 80 | clf = KFoldClassifier({'X': train_embeddings, 81 | 'y': np.array(train_labels)}, 82 | 
{'X': test_embeddings, 83 | 'y': np.array(test_labels)}, 84 | config_classifier) 85 | devacc, testacc, _ = clf.run() 86 | logging.debug('\nDev acc : {0} Test acc : {1} \ 87 | for TREC\n'.format(devacc, testacc)) 88 | return {'devacc': devacc, 'acc': testacc, 89 | 'ndev': len(self.train['X']), 'ntest': len(self.test['X'])} 90 | -------------------------------------------------------------------------------- /utils_nlp/eval/SentEval/senteval/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | import numpy as np 11 | import re 12 | import inspect 13 | from torch import optim 14 | 15 | 16 | def create_dictionary(sentences): 17 | words = {} 18 | for s in sentences: 19 | for word in s: 20 | if word in words: 21 | words[word] += 1 22 | else: 23 | words[word] = 1 24 | words[''] = 1e9 + 4 25 | words[''] = 1e9 + 3 26 | words['
<p>
'] = 1e9 + 2 27 | # words[''] = 1e9 + 1 28 | sorted_words = sorted(words.items(), key=lambda x: -x[1]) # inverse sort 29 | id2word = [] 30 | word2id = {} 31 | for i, (w, _) in enumerate(sorted_words): 32 | id2word.append(w) 33 | word2id[w] = i 34 | 35 | return id2word, word2id 36 | 37 | 38 | def cosine(u, v): 39 | return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)) 40 | 41 | 42 | class dotdict(dict): 43 | """ dot.notation access to dictionary attributes """ 44 | __getattr__ = dict.get 45 | __setattr__ = dict.__setitem__ 46 | __delattr__ = dict.__delitem__ 47 | 48 | 49 | def get_optimizer(s): 50 | """ 51 | Parse optimizer parameters. 52 | Input should be of the form: 53 | - "sgd,lr=0.01" 54 | - "adagrad,lr=0.1,lr_decay=0.05" 55 | """ 56 | if "," in s: 57 | method = s[:s.find(',')] 58 | optim_params = {} 59 | for x in s[s.find(',') + 1:].split(','): 60 | split = x.split('=') 61 | assert len(split) == 2 62 | assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None 63 | optim_params[split[0]] = float(split[1]) 64 | else: 65 | method = s 66 | optim_params = {} 67 | 68 | if method == 'adadelta': 69 | optim_fn = optim.Adadelta 70 | elif method == 'adagrad': 71 | optim_fn = optim.Adagrad 72 | elif method == 'adam': 73 | optim_fn = optim.Adam 74 | elif method == 'adamax': 75 | optim_fn = optim.Adamax 76 | elif method == 'asgd': 77 | optim_fn = optim.ASGD 78 | elif method == 'rmsprop': 79 | optim_fn = optim.RMSprop 80 | elif method == 'rprop': 81 | optim_fn = optim.Rprop 82 | elif method == 'sgd': 83 | optim_fn = optim.SGD 84 | assert 'lr' in optim_params 85 | else: 86 | raise Exception('Unknown optimization method: "%s"' % method) 87 | 88 | # check that we give good parameters to the optimizer 89 | expected_args = inspect.getfullargspec(optim_fn.__init__)[0] 90 | assert expected_args[:2] == ['self', 'params'] 91 | if not all(k in expected_args[2:] for k in optim_params.keys()): 92 | raise Exception('Unexpected parameters: expected "%s", got "%s"' % ( 93 | str(expected_args[2:]), str(optim_params.keys()))) 94 | 95 | return optim_fn, optim_params 96 | -------------------------------------------------------------------------------- /utils_nlp/eval/SentEval/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import io 9 | from setuptools import setup, find_packages 10 | 11 | with io.open('./README.md', encoding='utf-8') as f: 12 | readme = f.read() 13 | 14 | setup( 15 | name='SentEval', 16 | version='0.1.0', 17 | url='https://github.com/facebookresearch/SentEval', 18 | packages=find_packages(exclude=['examples']), 19 | license='Attribution-NonCommercial 4.0 International', 20 | long_description=readme, 21 | ) 22 | -------------------------------------------------------------------------------- /utils_nlp/eval/__init__.py: -------------------------------------------------------------------------------- 1 | from .rouge.compute_rouge import compute_rouge_perl, compute_rouge_python 2 | -------------------------------------------------------------------------------- /utils_nlp/eval/classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
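`get_optimizer` above maps a spec string such as "adam,lr=0.001" to a torch optimizer class plus keyword arguments. A short usage sketch; the import assumes the vendored SentEval package is importable as `senteval`:

```python
# Build a torch optimizer from a SentEval-style spec string.
import torch
from senteval.utils import get_optimizer

model = torch.nn.Linear(8, 2)
optim_fn, optim_params = get_optimizer("adam,lr=0.001")
optimizer = optim_fn(model.parameters(), **optim_params)
print(optimizer)
```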
3 | 4 | """Utilities functions for computing general model evaluation metrics.""" 5 | 6 | from sklearn.metrics import ( 7 | accuracy_score, 8 | precision_score, 9 | recall_score, 10 | f1_score, 11 | confusion_matrix, 12 | ) 13 | from numpy import corrcoef 14 | 15 | from matplotlib import pyplot 16 | import seaborn as sn 17 | import numpy as np 18 | import pandas as pd 19 | 20 | 21 | def eval_classification(actual, predicted, round_decimals=4): 22 | """Returns common classification evaluation metrics. 23 | Args: 24 | actual (1d array-like): Array of actual values. 25 | predicted (1d array-like): Array of predicted values. 26 | round_decimals (int, optional): Number of decimal places. Defaults to 4. 27 | Returns: 28 | dict: A dictionary of evaluation metrics. 29 | """ 30 | return { 31 | "accuracy": accuracy_score(actual, predicted).round(round_decimals), 32 | "precision": list(precision_score(actual, predicted, average=None).round(round_decimals)), 33 | "recall": list(recall_score(actual, predicted, average=None).round(round_decimals)), 34 | "f1": list(f1_score(actual, predicted, average=None).round(round_decimals)), 35 | } 36 | 37 | 38 | def compute_correlation_coefficients(x, y=None): 39 | """ 40 | Compute Pearson product-moment correlation coefficients. 41 | 42 | Args: 43 | x: array_like 44 | A 1-D or 2-D array containing multiple variables and observations. 45 | Each row of `x` represents a variable, and each column a single 46 | observation of all those variables. 47 | 48 | y: array_like, optional 49 | An additional set of variables and observations. `y` has the same 50 | shape as `x`. 51 | 52 | Returns: 53 | pd.DataFrame : A pandas dataframe from the correlation coefficient matrix of the variables. 54 | """ 55 | return pd.DataFrame(corrcoef(x, y)) 56 | 57 | 58 | def plot_confusion_matrix( 59 | y_true, 60 | y_pred, 61 | labels, 62 | normalize=False, 63 | title="Confusion matrix", 64 | plot_size=(8, 5), 65 | font_scale=1.1, 66 | ): 67 | """Function that prints out a graphical representation of confusion matrix using Seaborn Heatmap 68 | 69 | Args: 70 | y_true (1d array-like): True labels from dataset 71 | y_pred (1d array-like): Predicted labels from the models 72 | labels: A list of labels 73 | normalize (Bool, optional): Boolean to Set Row Normalization for Confusion Matrix 74 | title (String, optional): String that is the title of the plot 75 | plot_size (tuple, optional): Tuple of Plot Dimensions Default "(8, 5)" 76 | font_scale (float, optional): float type scale factor for font within plot 77 | """ 78 | conf_matrix = np.array(confusion_matrix(y_true, y_pred)) 79 | if normalize: 80 | conf_matrix = np.round( 81 | conf_matrix.astype("float") / conf_matrix.sum(axis=1)[:, np.newaxis], 3 82 | ) 83 | conf_dataframe = pd.DataFrame(conf_matrix, labels, labels) 84 | fig, ax = pyplot.subplots(figsize=plot_size) 85 | sn.set(font_scale=font_scale) 86 | ax.set_title(title) 87 | ax = sn.heatmap(conf_dataframe, cmap="Blues", annot=True, annot_kws={"size": 16}, fmt="g") 88 | ax.set(xlabel="Predicted Labels", ylabel="True Labels") 89 | -------------------------------------------------------------------------------- /utils_nlp/eval/evaluate_squad.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. 
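`plot_confusion_matrix` above wraps sklearn's confusion matrix in a seaborn heatmap; a quick call with integer labels (kept in sorted order so they line up with sklearn's label ordering):

```python
# Render a row-normalized confusion matrix with the helper above.
from utils_nlp.eval.classification import plot_confusion_matrix

y_true = [0, 1, 1, 0, 2, 2]
y_pred = [0, 1, 0, 0, 2, 1]
plot_confusion_matrix(y_true, y_pred, labels=[0, 1, 2], normalize=True, title="Demo run")
```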
""" 2 | 3 | # Original source: 4 | # https://github.com/allenai/bi-att-flow/blob/498c8026d92a8bcf0286e2d216d092d444d02d76/squad/evaluate-v1.1.py 5 | 6 | from __future__ import print_function 7 | from collections import Counter 8 | import string 9 | import re 10 | import argparse 11 | import json 12 | import sys 13 | 14 | 15 | def normalize_answer(s): 16 | """Lower text and remove punctuation, articles and extra whitespace.""" 17 | 18 | def remove_articles(text): 19 | return re.sub(r"\b(a|an|the)\b", " ", text) 20 | 21 | def white_space_fix(text): 22 | return " ".join(text.split()) 23 | 24 | def remove_punc(text): 25 | exclude = set(string.punctuation) 26 | return "".join(ch for ch in text if ch not in exclude) 27 | 28 | def lower(text): 29 | return text.lower() 30 | 31 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 32 | 33 | 34 | def f1_score(prediction, ground_truth): 35 | prediction_tokens = normalize_answer(prediction).split() 36 | ground_truth_tokens = normalize_answer(ground_truth).split() 37 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 38 | num_same = sum(common.values()) 39 | if num_same == 0: 40 | return 0 41 | precision = 1.0 * num_same / len(prediction_tokens) 42 | recall = 1.0 * num_same / len(ground_truth_tokens) 43 | f1 = (2 * precision * recall) / (precision + recall) 44 | return f1 45 | 46 | 47 | def exact_match_score(prediction, ground_truth): 48 | return normalize_answer(prediction) == normalize_answer(ground_truth) 49 | 50 | 51 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 52 | scores_for_ground_truths = [] 53 | for ground_truth in ground_truths: 54 | score = metric_fn(prediction, ground_truth) 55 | scores_for_ground_truths.append(score) 56 | return max(scores_for_ground_truths) 57 | 58 | 59 | def evaluate(dataset, predictions): 60 | f1 = exact_match = total = 0 61 | for article in dataset: 62 | for paragraph in article["paragraphs"]: 63 | for qa in paragraph["qas"]: 64 | total += 1 65 | if qa["id"] not in predictions: 66 | message = "Unanswered question " + qa["id"] + " will receive score 0." 
67 | print(message, file=sys.stderr) 68 | continue 69 | ground_truths = list(map(lambda x: x["text"], qa["answers"])) 70 | prediction = predictions[qa["id"]] 71 | exact_match += metric_max_over_ground_truths( 72 | exact_match_score, prediction, ground_truths 73 | ) 74 | f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths) 75 | 76 | exact_match = 100.0 * exact_match / total 77 | f1 = 100.0 * f1 / total 78 | 79 | return {"exact_match": exact_match, "f1": f1} 80 | 81 | 82 | if __name__ == "__main__": 83 | expected_version = "1.1" 84 | parser = argparse.ArgumentParser(description="Evaluation for SQuAD " + expected_version) 85 | parser.add_argument("dataset_file", help="Dataset file") 86 | parser.add_argument("prediction_file", help="Prediction File") 87 | args = parser.parse_args() 88 | with open(args.dataset_file) as dataset_file: 89 | dataset_json = json.load(dataset_file) 90 | if dataset_json["version"] != expected_version: 91 | print( 92 | "Evaluation expects v-" 93 | + expected_version 94 | + ", but got dataset with v-" 95 | + dataset_json["version"], 96 | file=sys.stderr, 97 | ) 98 | dataset = dataset_json["data"] 99 | with open(args.prediction_file) as prediction_file: 100 | predictions = json.load(prediction_file) 101 | print(json.dumps(evaluate(dataset, predictions))) 102 | -------------------------------------------------------------------------------- /utils_nlp/eval/evaluate_summarization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | from random import random, seed 6 | 7 | from bertsum.others.utils import test_rouge 8 | 9 | 10 | def get_rouge(predictions, targets, temp_dir, random_seed=42): 11 | """ 12 | function to get the rouge metric for the prediction and the reference. 13 | 14 | Args: 15 | predictions (list of strings): Predictions to be compared. 16 | target (list of strings): References 17 | temp_dir (str): Path where temporary folders are created to host the files 18 | generated by ROUGE application. 19 | seed (int, optional): Random seed. Defaults to 42. 20 | 21 | Return: 22 | dictionary: rouge metric 23 | 24 | """ 25 | 26 | def _write_list_to_file(list_items, filename): 27 | with open(filename, "w") as filehandle: 28 | # for cnt, line in enumerate(filehandle): 29 | for item in list_items: 30 | filehandle.write("%s\n" % item) 31 | 32 | seed(random_seed) 33 | random_number = random() 34 | os.makedirs(temp_dir, exist_ok=True) 35 | candidate_path = os.path.join(temp_dir, "candidate" + str(random_number)) 36 | gold_path = os.path.join(temp_dir, "gold" + str(random_number)) 37 | _write_list_to_file(predictions, candidate_path) 38 | _write_list_to_file(targets, gold_path) 39 | rouge = test_rouge(temp_dir, candidate_path, gold_path) 40 | return rouge 41 | -------------------------------------------------------------------------------- /utils_nlp/eval/senteval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
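# Example: a minimal sketch of how SentEvalConfig (defined below in this module)
# might be constructed. The parameter values follow SentEval's usual conventions
# but are illustrative assumptions, not values taken from this repository.
from utils_nlp.eval.senteval import SentEvalConfig

model_params = {"batch_size": 64}  # assumed model settings
senteval_params = {
    "usepytorch": True,
    "kfold": 10,
    # Classification transfer tasks expect a "classifier" block with exactly
    # these keys (see the validation logic further down in this module).
    "classifier": {
        "nhid": 0,
        "optim": "adam",
        "batch_size": 64,
        "tenacity": 5,
        "epoch_size": 4,
    },
}

config = SentEvalConfig(model_params=model_params, senteval_params=senteval_params)
print(config.model_params)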
3 | 4 | """Utilities for evaluating sentence embeddings.""" 5 | 6 | 7 | class SentEvalConfig: 8 | """Object to store static properties of senteval experiments 9 | 10 | Attributes: 11 | model_params (dict): model parameters that stay consistent across all runs 12 | senteval_params (dict): senteval parameters that stay consistent across all runs 13 | 14 | """ 15 | 16 | def __init__(self, model_params, senteval_params): 17 | """Summary 18 | 19 | Args: 20 | model_params (dict): model parameters that stay consistent across all runs 21 | senteval_params (dict): senteval parameters that stay consistent across all runs 22 | """ 23 | self.model_params = model_params 24 | self.senteval_params = senteval_params 25 | 26 | @property 27 | def model_params(self): 28 | return self._model_params 29 | 30 | @model_params.setter 31 | def model_params(self, model_params): 32 | self._model_params = model_params 33 | 34 | def append_senteval_params(self, params): 35 | """Util to append any params to senteval_params after initialization""" 36 | self.senteval_params = dict(self.senteval_params, **params) 37 | 38 | classifying_tasks = { 39 | "MR", 40 | "CR", 41 | "SUBJ", 42 | "MPQA", 43 | "SST2", 44 | "SST5", 45 | "TREC", 46 | "SICKEntailment", 47 | "SNLI", 48 | "MRPC", 49 | } 50 | 51 | if any(t in classifying_tasks for t in self.transfer_tasks): 52 | try: 53 | a = "classifier" in self.senteval_params 54 | if not a: 55 | raise ValueError("Include param['classifier'] to run task {}".format(t)) 56 | else: 57 | b = ( 58 | set("nhid", "optim", "batch_size", "tenacity", "epoch_size") 59 | in self.senteval_params["classifier"].keys() 60 | ) 61 | if not b: 62 | raise ValueError( 63 | "Include nhid, optim, batch_size, tenacity, and epoch_size params to " 64 | "run task {}".format(t) 65 | ) 66 | except ValueError as ve: 67 | print(ve) 68 | -------------------------------------------------------------------------------- /utils_nlp/interpreter/README.md: -------------------------------------------------------------------------------- 1 | # Towards a Deep and Unified Understanding of Deep Neural Models in NLP 2 | 3 | This submodule contains a tool for explaining hidden states of models. It is an implementation of the paper [*Towards a Deep and Unified Understanding of Deep Neural Models in NLP*](http://proceedings.mlr.press/v97/guan19a/guan19a.pdf) 4 | 5 | 6 | ## How to use 7 | 8 | We provide a notebook tutorial [here](../../examples/interpret_NLP_models/understand_models.ipynb) to help you get started quickly. The main class needed is the `Interpreter` in [Interpreter.py](Interpreter.py). Given any input word embeddings and a forward function $\Phi$ that transforms the word embeddings $\bf x$ to a hidden state $\bf s$, the Interpreter helps understand how much each input word contributes to the hidden state. Suppose the $\Phi$, the input $\bf x$ and the input words are defined as: 9 | ``` 10 | import torch 11 | 12 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 13 | 14 | x = torch.randn(5,256) / 100 15 | x = x.to(device) 16 | words = ['1','2','3','4','5'] 17 | 18 | def Phi(x): 19 | W = torch.tensor([10., 20., 5., -20., -10.]).to(device) 20 | return W @ x 21 | ``` 22 | 23 | To explain a certain hidden state, we also need to get its variance for regularization. We provide a simple tool in `Interpreter.py` for calculating regularization. You just need to provide your sampled x as a list and your Phi. 
as shown below: 24 | 25 | ``` 26 | from Interpreter import calculate_regularization 27 | 28 | # here we sample input x using random for simplicity 29 | sampled_x = [torch.randn(5,256) / 100 for _ in range(100)] 30 | 31 | regularization = calculate_regularization(sampled_x, Phi, device=device) 32 | ``` 33 | 34 | To explain this case, we need to initialize an `Interpreter` instance and pass $\bf x$, the regularization and $\Phi$ to it (we also need to set the hyper-parameter scale to a reasonable value: 10 * Std[embedding] is recommended): 35 | ``` 36 | from Interpreter import Interpreter 37 | 38 | interpreter = Interpreter(x=x, Phi=Phi, regularization=regularization, scale=10 * 0.1, words=words).to(device) 39 | ``` 40 | Then, we let the interpreter optimize itself by minimizing the loss function in the paper. 41 | ``` 42 | interpreter.optimize(iteration=5000, lr=0.5, show_progress=True) 43 | ``` 44 | After optimizing, we can get the best sigma: 45 | ``` 46 | interpreter.get_sigma() 47 | ``` 48 | The result will be something like: 49 | ``` 50 | array([0.00315634, 0.00181308, 0.00633237, 0.00174878, 0.0030807 ], dtype=float32) 51 | ``` 52 | Each sigma is the amount by which the corresponding input can change without changing the hidden state too much. The smaller the sigma, the more that input word contributes to the hidden state. 53 | 54 | Now, we can get the explanation by calling the visualize function: 55 | ``` 56 | interpreter.visualize() 57 | ``` 58 | Then, we get the result below: 59 | 60 | ![](https://nlpbp.blob.core.windows.net/images/result.png) 61 | 62 | which means that the second and fourth words are most important to $\Phi$, which is reasonable because their weights are larger. 63 | 64 | ## Explain a certain layer in any saved PyTorch model 65 | 66 | We provide an example of how to use our method to explain a saved PyTorch model (*a pre-trained BERT model in our case*) [here](../../examples/interpret_NLP_models/understand_models.ipynb). 67 | > NOTE: This result may not be consistent with the result in the paper because we use the pre-trained BERT model directly for simplicity, while the BERT model we use in the paper is fine-tuned on a specific dataset like SST-2. 68 | -------------------------------------------------------------------------------- /utils_nlp/interpreter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/nlp-recipes/7db6d204e5116da07bb3c549df546e49cb7ab5a5/utils_nlp/interpreter/__init__.py -------------------------------------------------------------------------------- /utils_nlp/language_utils/hi/hindi_stemmer.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3.1 2 | # Script was downloaded from https://research.variancia.com/hindi_stemmer/ 3 | """ Lightweight Hindi stemmer 4 | Copyright © 2010 Luís Gomes . 5 | 6 | Implementation of algorithm described in 7 | 8 | A Lightweight Stemmer for Hindi 9 | Ananthakrishnan Ramanathan and Durgesh D Rao 10 | http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf 11 | 12 | @conference{ramanathan2003lightweight, 13 | title={{A lightweight stemmer for Hindi}}, 14 | author={Ramanathan, A. and Rao, D.}, 15 | booktitle={Workshop on Computational Linguistics for South-Asian Languages, EACL}, 16 | year={2003} 17 | } 18 | 19 | Ported from HindiStemmer.java, part of Lucene.
20 | """ 21 | 22 | suffixes = { 23 | 1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"], 24 | 2: [ 25 | "कर", 26 | "ाओ", 27 | "िए", 28 | "ाई", 29 | "ाए", 30 | "ने", 31 | "नी", 32 | "ना", 33 | "ते", 34 | "ीं", 35 | "ती", 36 | "ता", 37 | "ाँ", 38 | "ां", 39 | "ों", 40 | "ें", 41 | ], 42 | 3: [ 43 | "ाकर", 44 | "ाइए", 45 | "ाईं", 46 | "ाया", 47 | "ेगी", 48 | "ेगा", 49 | "ोगी", 50 | "ोगे", 51 | "ाने", 52 | "ाना", 53 | "ाते", 54 | "ाती", 55 | "ाता", 56 | "तीं", 57 | "ाओं", 58 | "ाएं", 59 | "ुओं", 60 | "ुएं", 61 | "ुआं", 62 | ], 63 | 4: [ 64 | "ाएगी", 65 | "ाएगा", 66 | "ाओगी", 67 | "ाओगे", 68 | "एंगी", 69 | "ेंगी", 70 | "एंगे", 71 | "ेंगे", 72 | "ूंगी", 73 | "ूंगा", 74 | "ातीं", 75 | "नाओं", 76 | "नाएं", 77 | "ताओं", 78 | "ताएं", 79 | "ियाँ", 80 | "ियों", 81 | "ियां", 82 | ], 83 | 5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"], 84 | } 85 | 86 | 87 | def hi_stem(word): 88 | for L in 5, 4, 3, 2, 1: 89 | if len(word) > L + 1: 90 | for suf in suffixes[L]: 91 | if word.endswith(suf): 92 | return word[:-L] 93 | return word 94 | 95 | 96 | if __name__ == "__main__": 97 | import sys 98 | 99 | if len(sys.argv) != 1: 100 | sys.exit("{} takes no arguments".format(sys.argv[0])) 101 | for line in sys.stdin: 102 | print(*[hi_stem(word) for word in line.split()]) 103 | -------------------------------------------------------------------------------- /utils_nlp/models/README.md: -------------------------------------------------------------------------------- 1 | # Models 2 | The models submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new natural language processing systems. A description of which algorithms are used in each scenario can be found on [this table](../../README.md#content) 3 | 4 | ## Summary 5 | 6 | The following table summarizes each submodule. 7 | 8 | |Submodule|Description| 9 | |---|---| 10 | |[bert](./bert/README.md)| This submodule includes the BERT-based models for sequence classification, token classification, and sequence encoding.| 11 | |[gensen](./gensen/README.md)| This submodule includes a distributed Pytorch implementation based on [Horovod](https://github.com/horovod/horovod) of [learning general purpose distributed sentence representations via large scale multi-task learning](https://arxiv.org/abs/1804.00079) by refactoring https://github.com/Maluuba/gensen| 12 | |[pretrained embeddings](./pretrained_embeddings) | This submodule provides utilities to download and extract pretrained word embeddings trained with Word2Vec, GloVe, fastText methods.| 13 | |[pytorch_modules](./pytorch_modules/README.md)| This submodule provides Pytorch modules like Gated Recurrent Unit with peepholes. | 14 | |[xlnet](./xlnet/README.md)| This submodule includes the XLNet-based model for sequence classification.| 15 | -------------------------------------------------------------------------------- /utils_nlp/models/bert/README.md: -------------------------------------------------------------------------------- 1 | # BERT-based Classes 2 | 3 | This folder contains utility functions and classes based on the implementation of [Transformers](https://github.com/huggingface/transformers). 4 | 5 | ## Summary 6 | 7 | The following table summarizes each Python scripts. 8 | 9 | |Script|Description| 10 | |---|---| 11 | |[common.py](common.py)| This script includes

<ul><li>the languages supported by BERT-based classes</li><li>tokenization for text classification, named entity recognition, and encoding</li><li>utilities to load data, etc.</li></ul>
| 12 | |[sequence_classification.py](sequence_classification.py)| An implementation of sequence classification based on fine-tuning BERT. It is commonly used for text classification.| 13 | |[sequence_classification_distributed.py](sequence_classification_distributed.py) | A distributed implementation of sequence classification based on fine-tuning BERT. [Horovod](https://github.com/horovod/horovod) is the underlying distributed training framework.| 14 | |[sequence_encoding.py](sequence_encoding.py)| An implementation of sequence encoding based on BERT. Both pretrained and fine-tuned BERT models can be used. The hidden states from the loaded BERT model for the input sequence are used in the computation of the encoding. It provides mean, max and class pooling strategies. It is commonly used in upstream tasks for sentence similarity. | 15 | |[token_classification.py](token_classification.py) | An implementation of token classification based on fine-tuning BERT. It is commonly used for named entity recognition. | 16 | -------------------------------------------------------------------------------- /utils_nlp/models/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/nlp-recipes/7db6d204e5116da07bb3c549df546e49cb7ab5a5/utils_nlp/models/bert/__init__.py -------------------------------------------------------------------------------- /utils_nlp/models/gensen/README.md: -------------------------------------------------------------------------------- 1 | # GenSen 2 | 3 | Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning 4 | 5 | Sandeep Subramanian, Adam Trischler, Yoshua Bengio & Christopher Pal 6 | 7 | ICLR 2018 8 | 9 | 10 | ### About 11 | 12 | GenSen is a technique to learn general purpose, fixed-length representations of sentences via multi-task training. These representations are useful for transfer and low-resource learning. For details please refer to the ICLR [paper](https://openreview.net/forum?id=B18WgG-CZ&noteId=B18WgG-CZ). 13 | 14 | ### Code 15 | 16 | We provide a distributed PyTorch implementation of the paper using Horovod, along with pre-trained models and code to evaluate these models on a variety of transfer learning benchmarks. 17 | This code is based on the GitHub codebase from [Maluuba](https://github.com/Maluuba/gensen), but we have refactored the code in the following aspects: 18 | 1. Support distributed PyTorch training with Horovod 19 | 2. Clean and refactor the original code into a more structured form 20 | 3. Change the training file (`train.py`) to stop when the validation loss reaches a local minimum, instead of training indefinitely 21 | 4. Update the code from Python 2.7 to 3+ and PyTorch from 0.2 or 0.3 to 1.0.1 22 | 5. Add some necessary comments 23 | 6. Add some code for training on the AzureML platform 24 | 7.
Fix the bug on when setting the batch size to 1, the training raises an error 25 | ### Requirements 26 | 27 | - Python 3+ 28 | - PyTorch 1.0.1 29 | - nltk 30 | - h5py 31 | - numpy 32 | - scikit-learn 33 | 34 | ### Reference 35 | 36 | ``` 37 | @article{subramanian2018learning, 38 | title={Learning general purpose distributed sentence representations via large scale multi-task learning}, 39 | author={Subramanian, Sandeep and Trischler, Adam and Bengio, Yoshua and Pal, Christopher J}, 40 | journal={arXiv preprint arXiv:1804.00079}, 41 | year={2018} 42 | } 43 | ``` 44 | -------------------------------------------------------------------------------- /utils_nlp/models/gensen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | SNLI_CLEAN_PATH = "clean/snli_1.0" 5 | -------------------------------------------------------------------------------- /utils_nlp/models/gensen/create_gensen_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT License. 5 | 6 | """Creates a GenSen model from a MultiSeq2Seq model.""" 7 | import os 8 | import pickle 9 | 10 | import torch 11 | 12 | 13 | def create_multiseq2seq_model( 14 | trained_model_folder, 15 | save_folder, 16 | save_name, 17 | trained_model_name="best_model.model", 18 | ): 19 | 20 | """ 21 | Method that creates a GenSen model from a MultiSeq2Seq model. 22 | 23 | Args: 24 | trained_model_folder (str): Path to the folder containing a saved model 25 | save_folder (str): Path to save the encoder 26 | save_name (str): Name of the model 27 | trained_model_name (str, optional): Loaded model as the input 28 | 29 | Returns: None 30 | 31 | """ 32 | 33 | model = torch.load( 34 | open(os.path.join(trained_model_folder, trained_model_name), "rb") 35 | ) 36 | # model.copy() prevents raising the error. 37 | for item in model.copy().keys(): 38 | if not item.startswith("module.encoder") and not item.startswith( 39 | "module.src_embedding" 40 | ): 41 | model.pop(item) 42 | 43 | for item in model.copy().keys(): 44 | model[item.replace("module.", "")] = model[item] 45 | 46 | for item in model.copy().keys(): 47 | if item.startswith("module."): 48 | del model[item] 49 | 50 | torch.save(model, os.path.join(save_folder, "%s.model" % save_name)) 51 | # Add 'rb'. 
52 | model_vocab = pickle.load( 53 | open(os.path.join(trained_model_folder, "src_vocab.pkl"), "rb") 54 | ) 55 | pickle.dump( 56 | model_vocab, 57 | open(os.path.join(save_folder, "%s_vocab.pkl" % save_name), "wb"), 58 | ) 59 | 60 | 61 | # Original source: https://github.com/Maluuba/gensen 62 | -------------------------------------------------------------------------------- /utils_nlp/models/glove/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #For older gcc, use -O3 or -O2 instead of -Ofast 3 | # CFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wno-unused-result 4 | CFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic 5 | BUILDDIR := build 6 | SRCDIR := src 7 | 8 | all: dir glove shuffle cooccur vocab_count 9 | 10 | dir : 11 | mkdir -p $(BUILDDIR) 12 | glove : $(SRCDIR)/glove.c 13 | $(CC) $(SRCDIR)/glove.c -o $(BUILDDIR)/glove $(CFLAGS) 14 | shuffle : $(SRCDIR)/shuffle.c 15 | $(CC) $(SRCDIR)/shuffle.c -o $(BUILDDIR)/shuffle $(CFLAGS) 16 | cooccur : $(SRCDIR)/cooccur.c 17 | $(CC) $(SRCDIR)/cooccur.c -o $(BUILDDIR)/cooccur $(CFLAGS) 18 | vocab_count : $(SRCDIR)/vocab_count.c 19 | $(CC) $(SRCDIR)/vocab_count.c -o $(BUILDDIR)/vocab_count $(CFLAGS) 20 | 21 | clean: 22 | rm -rf glove shuffle cooccur vocab_count build 23 | -------------------------------------------------------------------------------- /utils_nlp/models/glove/README.md: -------------------------------------------------------------------------------- 1 | ## GloVe: Global Vectors for Word Representation 2 | 3 | 4 | | nearest neighbors of
frog | Litoria | Leptodactylidae | Rana | Eleutherodactylus | 5 | | --- | ------------------------------- | ------------------- | ---------------- | ------------------- | 6 | | Pictures | | | | | 7 | 8 | | Comparisons | man -> woman | city -> zip | comparative -> superlative | 9 | | --- | ------------------------|-------------------------|-------------------------| 10 | | GloVe Geometry | | | | 11 | 12 | We provide an implementation of the GloVe model for learning word representations, and describe how to download web-dataset vectors or train your own. See the [project page](http://nlp.stanford.edu/projects/glove/) or the [paper](http://nlp.stanford.edu/pubs/glove.pdf) for more information on glove vectors. 13 | 14 | ## Download pre-trained word vectors 15 | The links below contain word vectors obtained from the respective corpora. If you want word vectors trained on massive web datasets, you need only download one of these text files! Pre-trained word vectors are made available under the Public Domain Dedication and License. 16 |
17 | <ul> 18 | <li> Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download): glove.42B.300d.zip </li> 19 | <li> Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): glove.840B.300d.zip </li> 20 | <li> Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 300d vectors, 822 MB download): glove.6B.zip </li> 21 | <li> Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors, 1.42 GB download): glove.twitter.27B.zip </li> 22 | </ul>
23 |
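The snippet below is a small illustrative sketch (not part of the original GloVe project) of one way to load an extracted GloVe text file in Python with gensim; it mirrors what the `load_pretrained_vectors` helper in `utils_nlp/models/pretrained_embeddings/glove.py` does. The file names are assumed to come from the `glove.6B.zip` download above.

```python
# Convert the headerless GloVe text format to word2vec format, then load it.
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove2word2vec("glove.6B.300d.txt", "glove.6B.300d.w2v.txt")  # adds the word2vec header line
vectors = KeyedVectors.load_word2vec_format("glove.6B.300d.w2v.txt")
print(vectors.most_similar("frog", topn=5))
```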
24 | 25 | ## Train word vectors on a new corpus 26 | 27 | 28 | 29 | If the web datasets above don't match the semantics of your end use case, you can train word vectors on your own corpus. 30 | 31 | $ git clone http://github.com/stanfordnlp/glove 32 | $ cd glove && make 33 | $ ./demo.sh 34 | 35 | The demo.sh script downloads a small corpus, consisting of the first 100M characters of Wikipedia. It collects unigram counts, constructs and shuffles cooccurrence data, and trains a simple version of the GloVe model. It also runs a word analogy evaluation script in python to verify word vector quality. More details about training on your own corpus can be found by reading [demo.sh](https://github.com/stanfordnlp/GloVe/blob/master/demo.sh) or the [src/README.md](https://github.com/stanfordnlp/GloVe/tree/master/src) 36 | 37 | ### License 38 | All work contained in this package is licensed under the Apache License, Version 2.0. See the include LICENSE file. 39 | -------------------------------------------------------------------------------- /utils_nlp/models/glove/demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Makes programs, downloads sample data, trains a GloVe model, and then evaluates it. 5 | # One optional argument can specify the language used for eval script: matlab, octave or [default] python 6 | 7 | make 8 | if [ ! -e text8 ]; then 9 | if hash wget 2>/dev/null; then 10 | wget http://mattmahoney.net/dc/text8.zip 11 | else 12 | curl -O http://mattmahoney.net/dc/text8.zip 13 | fi 14 | unzip text8.zip 15 | rm text8.zip 16 | fi 17 | 18 | CORPUS=text8 19 | VOCAB_FILE=vocab.txt 20 | COOCCURRENCE_FILE=cooccurrence.bin 21 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin 22 | BUILDDIR=build 23 | SAVE_FILE=vectors 24 | VERBOSE=2 25 | MEMORY=4.0 26 | VOCAB_MIN_COUNT=5 27 | VECTOR_SIZE=50 28 | MAX_ITER=15 29 | WINDOW_SIZE=15 30 | BINARY=2 31 | NUM_THREADS=8 32 | X_MAX=10 33 | 34 | echo 35 | echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE" 36 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE 37 | echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE" 38 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE 39 | echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE" 40 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE 41 | echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE" 42 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE 43 | if [ "$CORPUS" = 'text8' ]; then 44 | if [ "$1" = 'matlab' ]; then 45 | matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2 46 | elif [ "$1" = 'octave' ]; then 47 | octave < ./eval/octave/read_and_evaluate_octave.m 1>&2 48 | else 49 | echo "$ python eval/python/evaluate.py" 50 | python eval/python/evaluate.py 51 | fi 52 | fi 53 | 
-------------------------------------------------------------------------------- /utils_nlp/models/glove/src/README.md: -------------------------------------------------------------------------------- 1 | ### Package Contents 2 | 3 | To train your own GloVe vectors, first you'll need to prepare your corpus as a single text file with all words separated by one or more spaces or tabs. If your corpus has multiple documents, the documents (only) should be separated by new line characters. Cooccurrence contexts for words do not extend past newline characters. Once you create your corpus, you can train GloVe vectors using the following 4 tools. An example is included in `demo.sh`, which you can modify as necessary. 4 | 5 | The four main tools in this package are: 6 | 7 | #### 1) vocab_count 8 | This tool requires an input corpus that should already consist of whitespace-separated tokens. Use something like the [Stanford Tokenizer](https://nlp.stanford.edu/software/tokenizer.html) first on raw text. From the corpus, it constructs unigram counts from a corpus, and optionally thresholds the resulting vocabulary based on total vocabulary size or minimum frequency count. 9 | 10 | #### 2) cooccur 11 | Constructs word-word cooccurrence statistics from a corpus. The user should supply a vocabulary file, as produced by `vocab_count`, and may specify a variety of parameters, as described by running `./build/cooccur`. 12 | 13 | #### 3) shuffle 14 | Shuffles the binary file of cooccurrence statistics produced by `cooccur`. For large files, the file is automatically split into chunks, each of which is shuffled and stored on disk before being merged and shuffled together. The user may specify a number of parameters, as described by running `./build/shuffle`. 15 | 16 | #### 4) glove 17 | Train the GloVe model on the specified cooccurrence data, which typically will be the output of the `shuffle` tool. The user should supply a vocabulary file, as given by `vocab_count`, and may specify a number of other parameters, which are described by running `./build/glove`. 18 | -------------------------------------------------------------------------------- /utils_nlp/models/pretrained_embeddings/README.md: -------------------------------------------------------------------------------- 1 | # Pretrained Embeddings 2 | The pretrained embeddings submodule contains utility functions that help users quickly load and extract various types of pretrained embeddings such as fastText, GloVe, Word2Vec, etc. 3 | -------------------------------------------------------------------------------- /utils_nlp/models/pretrained_embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | WORD2VEC_URL = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz" 5 | FASTTEXT_EN_URL = ( 6 | "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip" 7 | ) 8 | GLOVE_URL = "http://nlp.stanford.edu/data/glove.840B.300d.zip" 9 | -------------------------------------------------------------------------------- /utils_nlp/models/pretrained_embeddings/fasttext.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
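# Example: a minimal, illustrative sketch of how the loader defined in this
# module might be used. "./pretrained" is an assumed destination directory;
# the wiki.simple fastText vectors are downloaded and extracted on first use.
from utils_nlp.models.pretrained_embeddings.fasttext import load_pretrained_vectors

fasttext_model = load_pretrained_vectors("./pretrained")
# Look up a word vector; fastText can also build vectors for out-of-vocabulary
# words from character n-grams.
print(fasttext_model.wv["computer"].shape)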
3 | 4 | """Functions to help users load and extract fastText pretrained embeddings.""" 5 | 6 | import os 7 | import zipfile 8 | 9 | from gensim.models.fasttext import load_facebook_model 10 | 11 | from utils_nlp.dataset.url_utils import maybe_download 12 | from utils_nlp.models.pretrained_embeddings import FASTTEXT_EN_URL 13 | 14 | 15 | def _extract_fasttext_vectors(zip_path, dest_path="."): 16 | """ Extracts fastText embeddings from zip file. 17 | 18 | Args: 19 | zip_path(str): Path to the downloaded compressed zip file. 20 | dest_path(str): Final destination directory path to the extracted zip file. 21 | Picks the current working directory by default. 22 | 23 | Returns: 24 | str: Returns the absolute path to the extracted folder. 25 | """ 26 | 27 | if os.path.exists(zip_path): 28 | with zipfile.ZipFile(zip_path, "r") as zip_ref: 29 | zip_ref.extractall(path=dest_path) 30 | else: 31 | raise Exception("Zipped file not found!") 32 | 33 | os.remove(zip_path) 34 | return dest_path 35 | 36 | 37 | def _download_fasttext_vectors(download_dir, file_name="wiki.simple.zip"): 38 | """ Downloads pre-trained word vectors for English, trained on Wikipedia using 39 | fastText. You can directly download the vectors from here: 40 | https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip 41 | 42 | For the full version of pre-trained word vectors, change the url for 43 | FASTTEXT_EN_URL to https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip 44 | in __init__.py 45 | 46 | Args: 47 | download_dir (str): File path to download the file 48 | file_name (str) : File name given by default but can be changed by the user. 49 | 50 | Returns: 51 | str: file_path to the downloaded vectors. 52 | """ 53 | 54 | return maybe_download( 55 | FASTTEXT_EN_URL, filename=file_name, work_directory=download_dir 56 | ) 57 | 58 | 59 | def _maybe_download_and_extract(dest_path, file_name): 60 | """ Downloads and extracts fastText vectors if they don’t already exist 61 | 62 | Args: 63 | dest_path(str): Final path where the vectors will be extracted. 64 | file_name(str): File name of the fastText vector file. 65 | 66 | Returns: 67 | str: File path to the fastText vector file. 68 | """ 69 | 70 | dir_path = os.path.join(dest_path, "fastText") 71 | file_path = os.path.join(dir_path, file_name) 72 | 73 | if not os.path.exists(file_path): 74 | if not os.path.exists(dir_path): 75 | os.makedirs(dir_path) 76 | zip_path = _download_fasttext_vectors(dir_path) 77 | _extract_fasttext_vectors(zip_path, dir_path) 78 | else: 79 | print("Vector file already exists. No changes made.") 80 | 81 | return file_path 82 | 83 | 84 | def load_pretrained_vectors(dest_path, file_name="wiki.simple.bin"): 85 | """ Method that loads fastText vectors. Downloads if it doesn't exist. 86 | 87 | Args: 88 | file_name(str): Name of the fastText file. 89 | dest_path(str): Path to the directory where fastText vectors exist or will be 90 | downloaded. 91 | 92 | Returns: 93 | gensim.models.fasttext.load_facebook_model: Loaded word2vectors 94 | 95 | """ 96 | 97 | file_path = _maybe_download_and_extract(dest_path, file_name) 98 | model = load_facebook_model(file_path) 99 | return model 100 | -------------------------------------------------------------------------------- /utils_nlp/models/pretrained_embeddings/glove.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | 4 | """Functions to help users load and extract GloVe pretrained embeddings.""" 5 | 6 | import os 7 | import zipfile 8 | 9 | from gensim.models import KeyedVectors 10 | from gensim.scripts.glove2word2vec import glove2word2vec 11 | from gensim.test.utils import get_tmpfile 12 | 13 | from utils_nlp.dataset.url_utils import maybe_download 14 | from utils_nlp.models.pretrained_embeddings import GLOVE_URL 15 | 16 | 17 | def _extract_glove_vectors(zip_path, dest_path="."): 18 | """ Extracts gloVe embeddings from zip file. 19 | 20 | Args: 21 | zip_path(str): Path to the downloaded compressed zip file. 22 | dest_path(str): Final destination directory path to the extracted zip file. 23 | Picks the current working directory by default. 24 | 25 | Returns: 26 | str: Returns the absolute path to the extracted folder. 27 | """ 28 | 29 | if os.path.exists(zip_path): 30 | with zipfile.ZipFile(zip_path, "r") as zip_ref: 31 | zip_ref.extractall(path=dest_path) 32 | else: 33 | raise Exception("Zipped file not found!") 34 | 35 | os.remove(zip_path) 36 | return dest_path 37 | 38 | 39 | def _download_glove_vectors(download_dir, file_name="glove.840B.300d.zip"): 40 | """ Downloads gloVe word vectors trained on Common Crawl corpus. You can 41 | directly download the vectors from here: 42 | http://nlp.stanford.edu/data/glove.840B.300d.zip 43 | 44 | Args: 45 | download_dir (str): File path to download the file 46 | file_name (str) : File name given by default but can be changed by the user. 47 | 48 | Returns: 49 | str: file_path to the downloaded vectors. 50 | """ 51 | 52 | return maybe_download( 53 | GLOVE_URL, filename=file_name, work_directory=download_dir 54 | ) 55 | 56 | 57 | def _maybe_download_and_extract(dest_path, file_name): 58 | """ Downloads and extracts gloVe vectors if they don’t already exist 59 | 60 | Args: 61 | dest_path(str): Final path where the vectors will be extracted. 62 | file_name(str): File name of the gloVe vector file. 63 | 64 | Returns: 65 | str: File path to the gloVe vector file. 66 | """ 67 | 68 | dir_path = os.path.join(dest_path, "gloVe") 69 | file_path = os.path.join(dir_path, file_name) 70 | 71 | if not os.path.exists(file_path): 72 | if not os.path.exists(dir_path): 73 | os.makedirs(dir_path) 74 | filepath = _download_glove_vectors(dir_path) 75 | _extract_glove_vectors(filepath, dir_path) 76 | else: 77 | print("Vector file already exists. No changes made.") 78 | 79 | return file_path 80 | 81 | 82 | def download_and_extract(dir_path, file_name="glove.840B.300d.txt"): 83 | """ Downloads and extracts gloVe vectors if they don’t already exist 84 | 85 | Args: 86 | dir_path(str): Final path where the vectors will be extracted. 87 | file_name(str): File name of the gloVe vector file. 88 | 89 | Returns: 90 | str: File path to the gloVe vector file. 91 | """ 92 | 93 | return _maybe_download_and_extract(dir_path, file_name) 94 | 95 | 96 | def load_pretrained_vectors( 97 | dir_path, file_name="glove.840B.300d.txt", limit=None 98 | ): 99 | """ Method that loads gloVe vectors. Downloads if it doesn't exist. 100 | 101 | Args: 102 | file_name(str): Name of the gloVe file. 103 | dir_path(str): Path to the directory where gloVe vectors exist or will be 104 | downloaded. 105 | limit(int): Number of word vectors that is loaded from gensim. This option 106 | allows us to save RAM space and avoid memory errors. 
107 | 108 | Returns: 109 | gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors 110 | """ 111 | 112 | file_path = _maybe_download_and_extract(dir_path, file_name) 113 | tmp_file = get_tmpfile("test_word2vec.txt") 114 | 115 | # Convert GloVe format to word2vec 116 | _ = glove2word2vec(file_path, tmp_file) 117 | 118 | model = KeyedVectors.load_word2vec_format(tmp_file, limit=limit) 119 | os.remove(tmp_file) 120 | 121 | return model 122 | -------------------------------------------------------------------------------- /utils_nlp/models/pretrained_embeddings/word2vec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | """Functions to help users load and extract Word2Vec pretrained embeddings.""" 5 | 6 | import gzip 7 | import os 8 | 9 | from gensim.models.keyedvectors import KeyedVectors 10 | 11 | from utils_nlp.dataset.url_utils import maybe_download 12 | from utils_nlp.models.pretrained_embeddings import WORD2VEC_URL 13 | 14 | 15 | def _extract_word2vec_vectors(zip_path, dest_filepath): 16 | """ Extracts word2vec embeddings from bin.gz archive 17 | 18 | Args: 19 | zip_path: Path to the downloaded compressed file. 20 | dest_filepath: Final destination file path to the extracted zip file. 21 | """ 22 | 23 | if os.path.exists(zip_path): 24 | with gzip.GzipFile(zip_path, "rb") as f_in, open( 25 | dest_filepath, "wb" 26 | ) as f_out: 27 | f_out.writelines(f_in) 28 | else: 29 | raise Exception("Zipped file not found!") 30 | 31 | os.remove(zip_path) 32 | 33 | 34 | def _download_word2vec_vectors( 35 | download_dir, file_name="GoogleNews-vectors-negative300.bin.gz" 36 | ): 37 | """ Downloads pretrained word vectors trained on GoogleNews corpus. You can 38 | directly download the vectors from here: 39 | https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz 40 | 41 | Args: 42 | download_dir (str): File path to download the file 43 | file_name (str) : File name given by default but can be changed by the user. 44 | 45 | Returns: 46 | str: file_path to the downloaded vectors. 47 | """ 48 | 49 | return maybe_download( 50 | WORD2VEC_URL, filename=file_name, work_directory=download_dir 51 | ) 52 | 53 | 54 | def _maybe_download_and_extract(dest_path, file_name): 55 | """ Downloads and extracts Word2vec vectors if they don’t already exist 56 | 57 | Args: 58 | dest_path: Path to the directory where the vectors will be extracted. 59 | file_name: File name of the word2vec vector file. 60 | 61 | Returns: 62 | str: File path to the word2vec vector file. 63 | """ 64 | 65 | dir_path = os.path.join(dest_path, "word2vec") 66 | file_path = os.path.join(dir_path, file_name) 67 | 68 | if not os.path.exists(file_path): 69 | if not os.path.exists(dir_path): 70 | os.makedirs(dir_path) 71 | filepath = _download_word2vec_vectors(dir_path) 72 | _extract_word2vec_vectors(filepath, file_path) 73 | else: 74 | print("Vector file already exists. No changes made.") 75 | 76 | return file_path 77 | 78 | 79 | def load_pretrained_vectors( 80 | dir_path, file_name="GoogleNews-vectors-negative300.bin", limit=None 81 | ): 82 | """ Method that loads word2vec vectors. Downloads if it doesn't exist. 83 | 84 | Args: 85 | file_name(str): Name of the word2vec file. 86 | dir_path(str): Path to the directory where word2vec vectors exist or will be 87 | downloaded. 88 | limit(int): Number of word vectors that is loaded from gensim. 
This option 89 | allows us to save RAM space and avoid memory errors. 90 | 91 | Returns: 92 | gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors 93 | 94 | """ 95 | file_path = _maybe_download_and_extract(dir_path, file_name) 96 | word2vec_vectors = KeyedVectors.load_word2vec_format( 97 | file_path, binary=True, limit=limit 98 | ) 99 | 100 | return word2vec_vectors 101 | -------------------------------------------------------------------------------- /utils_nlp/models/pytorch_modules/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Modules 2 | 3 | This folder contains the PyTorch modules that are used across the Git repository. 4 | 5 | ## Summary 6 | 7 | The following table summarizes each module. 8 | 9 | |Module|Description| 10 | |---|---| 11 | |[ConditionalGRU](conditional_gru.py)| An implemention of Gated Recurrent Unit (GRU) with peepholes, which was proposed in [Learning Precise Timing with LSTM Recurrent Networks](http://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf) by Gers, F. A., Schraudolph, N. N., and Schmidhuber, J.| 12 | -------------------------------------------------------------------------------- /utils_nlp/models/pytorch_modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | -------------------------------------------------------------------------------- /utils_nlp/models/pytorch_modules/conditional_gru.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | """A Gated Recurrent Unit (GRU) cell with peepholes.""" 5 | import math 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | 11 | class ConditionalGRU(nn.Module): 12 | """A Gated Recurrent Unit (GRU) cell with peepholes.""" 13 | 14 | def __init__(self, input_dim, hidden_dim, dropout=0.0): 15 | """Initialize params. 16 | 17 | Args: 18 | input_dim: Dimension of the input vector. 19 | hidden_dim: Dimension of the hidden layer. 20 | dropout: Dropout of the network. 21 | """ 22 | 23 | super(ConditionalGRU, self).__init__() 24 | self.input_dim = input_dim 25 | self.hidden_dim = hidden_dim 26 | 27 | self.input_weights = nn.Linear(self.input_dim, 3 * self.hidden_dim) 28 | self.hidden_weights = nn.Linear(self.hidden_dim, 3 * self.hidden_dim) 29 | self.peep_weights = nn.Linear(self.hidden_dim, 3 * self.hidden_dim) 30 | 31 | self.reset_parameters() 32 | 33 | def reset_parameters(self): 34 | """Set params. """ 35 | stdv = 1.0 / math.sqrt(self.hidden_dim) 36 | for weight in self.parameters(): 37 | weight.data.uniform_(-stdv, stdv) 38 | 39 | def forward(self, input, hidden, ctx): 40 | """Propogate input through the layer. 41 | 42 | Args: 43 | input: batch size x target sequence length x embedding dimension. 44 | hidden: batch size x hidden dimension. 45 | ctx: batch size x source sequence length x hidden dimension. 
46 | 47 | Returns: 48 | output(torch.Tensor) - batch size x target sequence length x 49 | hidden dimension 50 | hidden(torch.Tensor) - (batch size x hidden dimension, batch size x hidden 51 | dimension) 52 | 53 | """ 54 | 55 | def recurrence(input, hidden, ctx): 56 | """Recurrence helper.""" 57 | input_gate = self.input_weights(input) 58 | hidden_gate = self.hidden_weights(hidden) 59 | peep_gate = self.peep_weights(ctx) 60 | i_r, i_i, i_n = input_gate.chunk(3, 1) 61 | h_r, h_i, h_n = hidden_gate.chunk(3, 1) 62 | p_r, p_i, p_n = peep_gate.chunk(3, 1) 63 | resetgate = F.sigmoid(i_r + h_r + p_r) 64 | inputgate = F.sigmoid(i_i + h_i + p_i) 65 | newgate = F.tanh(i_n + resetgate * h_n + p_n) 66 | hy = newgate + inputgate * (hidden - newgate) 67 | 68 | return hy 69 | 70 | input = input.transpose(0, 1) 71 | 72 | output = [] 73 | steps = range(input.size(0)) 74 | for i in steps: 75 | hidden = recurrence(input[i], hidden, ctx) 76 | if isinstance(hidden, tuple): 77 | output.append(hidden[0]) 78 | else: 79 | output.append(hidden) 80 | 81 | output = torch.cat(output, 0).view(input.size(0), *output[0].size()) 82 | output = output.transpose(0, 1) 83 | 84 | return output, hidden 85 | 86 | 87 | # Original source: https://github.com/Maluuba/gensen 88 | -------------------------------------------------------------------------------- /utils_nlp/models/transformers/bertsum/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/nlp-recipes/7db6d204e5116da07bb3c549df546e49cb7ab5a5/utils_nlp/models/transformers/bertsum/__init__.py -------------------------------------------------------------------------------- /utils_nlp/models/transformers/bertsum/dataset.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import torch 3 | from torch.utils.data import ( 4 | Dataset, 5 | IterableDataset, 6 | ) 7 | 8 | 9 | def get_dataset(file): 10 | yield torch.load(file) 11 | 12 | 13 | class ExtSumProcessedIterableDataset(IterableDataset): 14 | """Iterable dataset for extractive summarization preprocessed data 15 | """ 16 | 17 | def __init__(self, file_list, is_shuffle=False): 18 | """ Initiation function for iterable dataset for extractive summarization 19 | preprocessed data. 20 | 21 | Args: 22 | file_list (list of strings): List of files that the dataset is loaded from. 23 | is_shuffle (bool, optional): A boolean value specifies whether the list of 24 | files is shuffled when the dataset is loaded. Defaults to False. 25 | """ 26 | 27 | self.file_list = file_list 28 | self.is_shuffle = is_shuffle 29 | 30 | def get_stream(self): 31 | """ get a stream of cycled data from the dataset""" 32 | 33 | if self.is_shuffle: 34 | return itertools.chain.from_iterable( 35 | map(get_dataset, itertools.cycle(self.file_list)) 36 | ) 37 | else: 38 | return itertools.chain.from_iterable( 39 | map(get_dataset, itertools.cycle(random.shuffle(self.file_list))) 40 | ) 41 | 42 | def __iter__(self): 43 | return self.get_stream() 44 | 45 | 46 | class ExtSumProcessedDataset(Dataset): 47 | """Dataset for extractive summarization preprocessed data 48 | """ 49 | 50 | def __init__(self, file_list, is_shuffle=False): 51 | """ Initiation function for dataset for extractive summarization preprocessed data. 52 | 53 | Args: 54 | file_list (list of strings): List of files that the dataset is loaded from. 
55 | is_shuffle (bool, optional): A boolean value specifies whether the list of 56 | files is shuffled when the dataset is loaded. Defaults to False. 57 | """ 58 | 59 | self.file_list = sorted(file_list) 60 | if is_shuffle: 61 | random.shuffle(self.file_list) 62 | self.data = [] 63 | for f in self.file_list: 64 | self.data.extend(torch.load(f)) 65 | 66 | def __len__(self): 67 | return len(self.data) 68 | 69 | def __getitem__(self, idx): 70 | return self.data[idx] 71 | -------------------------------------------------------------------------------- /utils_nlp/models/transformers/bertsum/penalties.py: -------------------------------------------------------------------------------- 1 | # Licensed under the MIT License. 2 | # This script reuses code from https://github.com/nlpyang/Presumm 3 | 4 | """ PenaltyBuilder Class used in prediction/translation """ 5 | 6 | from __future__ import division 7 | import torch 8 | 9 | 10 | class PenaltyBuilder(object): 11 | """ 12 | Returns the Length and Coverage Penalty function for Beam Search. 13 | 14 | Args: 15 | length_pen (str): option name of length pen 16 | cov_pen (str): option name of cov pen 17 | """ 18 | 19 | def __init__(self, length_pen): 20 | self.length_pen = length_pen 21 | 22 | def length_penalty(self): 23 | if self.length_pen == "wu": 24 | return self.length_wu 25 | elif self.length_pen == "avg": 26 | return self.length_average 27 | else: 28 | return self.length_none 29 | 30 | """ 31 | Below are all the different penalty terms implemented so far 32 | """ 33 | 34 | def length_wu(self, beam, logprobs, alpha=0.0): 35 | """ 36 | NMT length re-ranking score from 37 | "Google's Neural Machine Translation System" :cite:`wu2016google`. 38 | """ 39 | 40 | modifier = ((5 + len(beam.next_ys)) ** alpha) / ((5 + 1) ** alpha) 41 | return logprobs / modifier 42 | 43 | def length_average(self, beam, logprobs, alpha=0.0): 44 | """ 45 | Returns the average probability of tokens in a sequence. 46 | """ 47 | return logprobs / len(beam.next_ys) 48 | 49 | def length_none(self, beam, logprobs, alpha=0.0, beta=0.0): 50 | """ 51 | Returns unmodified scores. 52 | """ 53 | return logprobs 54 | -------------------------------------------------------------------------------- /utils_nlp/models/xlnet/README.md: -------------------------------------------------------------------------------- 1 | # XLNet-based Classes 2 | 3 | This folder contains utility functions and classes based on the implementation of [Transformers](https://github.com/huggingface/transformers). 4 | 5 | ## Summary 6 | 7 | The following table summarizes each Python script. 8 | 9 | |Script|Description| 10 | |---|---| 11 | |[common.py](common.py)| This script includes
<ul><li>the languages supported by XLNet-based classes</li><li>tokenization for text classification</li><li>utilities to load data, etc.</li></ul>
| 12 | |[sequence_classification.py](sequence_classification.py)| An implementation of sequence classification based on fine-tuning XLNet. It is commonly used for text classification. The module includes logging functionality using MLFlow.| 13 | |[utils.py](utils.py)| This script includes a function to visualize a confusion matrix.| 14 | --------------------------------------------------------------------------------