├── .github └── workflows │ └── build.yml ├── .gitignore ├── LICENSE ├── NEWS.md ├── README.md ├── admin ├── deploy_models.py ├── sandbox.py └── tuning.py ├── dev-requirements.txt ├── docs ├── datasets.md ├── index.md ├── nerda_models.md ├── networks.md ├── performance.md ├── preamble.py ├── precooked_models.md ├── predictions.md └── workflow.ipynb ├── logo.png ├── mkdocs.yml ├── pytest.ini ├── setup.cfg ├── setup.py ├── src └── NERDA │ ├── __init__.py │ ├── datasets.py │ ├── models.py │ ├── networks.py │ ├── performance.py │ ├── precooked.py │ ├── predictions.py │ ├── preprocessing.py │ ├── training.py │ └── utils.py └── tests └── unit_tests ├── test_aaaNERDA.py ├── test_performance.py ├── test_precooked.py ├── test_predictions.py └── test_training.py /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------------- 2 | # Build, Test and Publish Package 3 | #--------------------------------------------------------------------- 4 | name: build 5 | 6 | on: [push] 7 | 8 | jobs: 9 | 10 | Build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Cache Python packages 18 | uses: actions/cache@v2 19 | with: 20 | path: ~/.cache/pip 21 | key: ${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }} 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: '3.7' 27 | 28 | - name: Display Python version 29 | run: python -c "import sys; print(sys.version)" 30 | 31 | - name: Install pip and dev requirements 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install -r dev-requirements.txt 35 | 36 | - name: Lint 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude=.git,__pycache__,docs/source/conf.py,old,build,dist,admin 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --exclude=.git,__pycache__,docs/source/conf.py,old,build,dist,admin --statistics --format=html --htmldir=flake-report 42 | 43 | - name: Upload Lint results 44 | uses: actions/upload-artifact@v2 45 | with: 46 | name: lint-results 47 | path: flake-report/ 48 | 49 | - name: Run tests 50 | run: | 51 | python setup.py test 52 | 53 | - name: Publish test results 54 | uses: EnricoMi/publish-unit-test-result-action@v1.6 55 | if: always() 56 | with: 57 | github_token: ${{ secrets.GITHUB_TOKEN }} 58 | files: test-results/**/*.xml 59 | 60 | - name: Upload coverage to Codecov 61 | uses: codecov/codecov-action@v1 62 | with: 63 | token: ${{ secrets.CODECOV_TOKEN }} 64 | file: coverage.xml 65 | flags: unittests 66 | 67 | - name: Deploy docs 68 | if: github.ref == 'refs/heads/main' 69 | run: | 70 | pip install . 71 | mkdocs gh-deploy --force 72 | 73 | - name: Build and publish to TEST PyPI 74 | if: github.ref != 'refs/heads/main' 75 | env: 76 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 77 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 78 | run: | 79 | python setup.py sdist bdist_wheel 80 | twine upload -r testpypi dist/* 81 | 82 | - name: Publish to PyPI 83 | env: 84 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 85 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 86 | if: github.ref == 'refs/heads/main' 87 | run: | 88 | python setup.py sdist bdist_wheel 89 | twine upload dist/* 90 | 91 | 92 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Data and models 7 | /data 8 | /daNER 9 | src/*.pickle 10 | src/*.pkl 11 | *.csv 12 | *.pickle 13 | 14 | # playground 15 | tester.py 16 | tester2.py 17 | src/playground.py 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | pip-wheel-metadata/ 37 | share/python-wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | test-results/ 43 | 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | flake-report/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | db.sqlite3 78 | db.sqlite3-journal 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | .python-version 102 | 103 | # pipenv 104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 111 | __pypackages__/ 112 | 113 | # Celery stuff 114 | celerybeat-schedule 115 | celerybeat.pid 116 | 117 | # SageMath parsed files 118 | *.sage.py 119 | 120 | # Environments 121 | .env 122 | .venv 123 | env/ 124 | venv/ 125 | ENV/ 126 | env.bak/ 127 | venv.bak/ 128 | 129 | # Spyder project settings 130 | .spyderproject 131 | .spyproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ 146 | 147 | # User generated folders 148 | runs/ 149 | 150 | # User created models 151 | *.bin 152 | 153 | # tensor board results 154 | .DS_Store 155 | .vscode -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Ekstra Bladet, PIN 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # NERDA 1.0.0 2 | 3 | * NERDA model class is now equipped with functions for saving (loading) weights for a fine-tuned NERDA Network to (from) file. See functions model.save_network() and model.load_network_from_file() 4 | 5 | # NERDA 0.9.7 6 | 7 | * return confidence scores for predictions of all tokens, e.g. model.predict(x, return_confidence=True). 8 | 9 | # NERDA 0.9.6 10 | 11 | * compute Precision, Recall and Accuracy (optional) with evaluate_performance(). 12 | * improve relative imports inside package. 13 | 14 | # NERDA 0.9.5 15 | 16 | * ... bugfixes. 17 | 18 | # NERDA 0.9.4 19 | 20 | * functionality for dynamic quantization, fp32 to fp16, padding parametrized. 21 | 22 | # NERDA 0.9.2 23 | 24 | * remove precooked DA_BERT_ML_16BIT, include precooked DA_DISTILBERT_ML. 25 | 26 | # NERDA 0.9.1 27 | 28 | * include 16 bit FP precooked DA_BERT_ML_16BIT. 29 | 30 | # NERDA 0.9.0 31 | 32 | * Support new versions of `transformers` (4.x) and `torch` 33 | 34 | # NERDA 0.8.7 35 | 36 | * BUGFIX: Restrict torch version. 37 | * Do not import datasets as part of Precooked Models. 38 | * Do not load datasets if not provided by user. 39 | 40 | # NERDA 0.8.6 41 | 42 | * First official release. 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NERDA 2 | 3 | ![Build status](https://github.com/ebanalyse/NERDA/workflows/build/badge.svg) 4 | [![codecov](https://codecov.io/gh/ebanalyse/NERDA/branch/main/graph/badge.svg?token=OB6LGFQZYX)](https://codecov.io/gh/ebanalyse/NERDA) 5 | ![PyPI](https://img.shields.io/pypi/v/NERDA.svg) 6 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/NERDA?color=green) 7 | ![License](https://img.shields.io/badge/license-MIT-blue.svg) 8 | 9 | Not only is `NERDA` a mesmerizing muppet-like character. `NERDA` is also 10 | a python package, that offers a slick easy-to-use interface for fine-tuning 11 | pretrained transformers for Named Entity Recognition 12 | (=NER) tasks. 13 | 14 | You can also utilize `NERDA` to access a selection of *precooked* `NERDA` models, 15 | that you can use right off the shelf for NER tasks. 16 | 17 | `NERDA` is built on `huggingface` `transformers` and the popular `pytorch` 18 | framework. 19 | 20 | ## Installation guide 21 | `NERDA` can be installed from [PyPI](https://pypi.org/project/NERDA/) with 22 | 23 | ``` 24 | pip install NERDA 25 | ``` 26 | 27 | If you want the development version then install directly from [GitHub](https://github.com/ebanalyse/NERDA). 28 | 29 | ## Named-Entity Recogntion tasks 30 | Named-entity recognition (NER) (also known as (named) entity identification, 31 | entity chunking, and entity extraction) is a subtask of information extraction 32 | that seeks to locate and classify named entities mentioned in unstructured 33 | text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.[1] 34 | 35 | [1]: https://en.wikipedia.org/wiki/Named-entity_recognition 36 | 37 | ### Example Task: 38 | 39 | **Task** 40 | 41 | Identify person names and organizations in text: 42 | 43 | *Jim bought 300 shares of Acme Corp.* 44 | 45 | **Solution** 46 | 47 | | **Named Entity** | **Type** | 48 | |--------------------|-----------------------| 49 | | 'Jim' | Person | 50 | | 'Acme Corp.' | Organization | 51 | 52 | Read more about NER on [Wikipedia](https://en.wikipedia.org/wiki/Named-entity_recognition). 53 | 54 | ## Train Your Own `NERDA` Model 55 | 56 | Say, we want to fine-tune a pretrained [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) transformer for NER in English. 57 | 58 | Load package. 59 | 60 | ```python 61 | from NERDA.models import NERDA 62 | ``` 63 | 64 | Instantiate a `NERDA` model (*with default settings*) for the 65 | [`CoNLL-2003`](https://www.clips.uantwerpen.be/conll2003/ner/) 66 | English NER data set. 67 | 68 | ```python 69 | from NERDA.datasets import get_conll_data 70 | model = NERDA(dataset_training = get_conll_data('train'), 71 | dataset_validation = get_conll_data('valid'), 72 | transformer = 'bert-base-multilingual-uncased') 73 | ``` 74 | 75 | By default the network architecture is analogous to that of the models in [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). 76 | 77 | The model can then be trained/fine-tuned by invoking the `train` method, e.g. 78 | 79 | ```python 80 | model.train() 81 | ``` 82 | 83 | **Note**: this will take some time depending on the dimensions of your machine 84 | (if you want to skip training, you can go ahead and use one of the models, 85 | that we have already precooked for you in stead). 86 | 87 | After the model has been trained, the model can be used for predicting 88 | named entities in new texts. 89 | 90 | ```python 91 | # text to identify named entities in. 92 | text = 'Old MacDonald had a farm' 93 | model.predict_text(text) 94 | ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']]) 95 | ``` 96 | This means, that the model identified 'Old MacDonald' as a *PER*son. 97 | 98 | Please note, that the `NERDA` model configuration above was instantiated 99 | with all default settings. You can however customize your `NERDA` model 100 | in a lot of ways: 101 | 102 | - Use your own data set (finetune a transformer for any given language) 103 | - Choose whatever transformer you like 104 | - Set all of the hyperparameters for the model 105 | - You can even apply your own Network Architecture 106 | 107 | Read more about advanced usage of `NERDA` in the [detailed documentation](https://ebanalyse.github.io/NERDA/workflow). 108 | 109 | ## Use a Precooked `NERDA` model 110 | 111 | We have precooked a number of `NERDA` models for Danish and English, that you can download 112 | and use right off the shelf. 113 | 114 | Here is an example. 115 | 116 | Instantiate a multilingual BERT model, that has been finetuned for NER in Danish, 117 | `DA_BERT_ML`. 118 | 119 | ```python 120 | from NERDA.precooked import DA_BERT_ML 121 | model = DA_BERT_ML() 122 | ``` 123 | 124 | Down(load) network from web: 125 | 126 | ```python 127 | model.download_network() 128 | model.load_network() 129 | ``` 130 | 131 | You can now predict named entities in new (Danish) texts 132 | 133 | ```python 134 | # (Danish) text to identify named entities in: 135 | # 'Jens Hansen har en bondegård' = 'Old MacDonald had a farm' 136 | text = 'Jens Hansen har en bondegård' 137 | model.predict_text(text) 138 | ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']]) 139 | ``` 140 | 141 | ### List of Precooked Models 142 | 143 | The table below shows the precooked `NERDA` models publicly available for download. 144 | 145 | | **Model** | **Language** | **Transformer** | **Dataset** | **F1-score** | 146 | |-----------------|--------------|-------------------|---------|-----| 147 | | `DA_BERT_ML` | Danish | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 82.8 | 148 | `DA_ELECTRA_DA` | Danish | [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 79.8 | 149 | | `EN_BERT_ML` | English | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)| [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 90.4 | 150 | | `EN_ELECTRA_EN` | English | [English ELECTRA](https://huggingface.co/google/electra-small-discriminator) | [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 89.1 | 151 | 152 | **F1-score** is the micro-averaged F1-score across entity tags and is 153 | evaluated on the respective test sets (that have not been used for training nor 154 | validation of the models). 155 | 156 | Note, that we have not spent a lot of time on actually fine-tuning the models, 157 | so there could be room for improvement. If you are able to improve the models, 158 | we will be happy to hear from you and include your `NERDA` model. 159 | 160 | ### Model Performance 161 | 162 | The table below summarizes the performance (F1-scores) of the precooked `NERDA` models. 163 | 164 | | **Level** | `DA_BERT_ML` | `DA_ELECTRA_DA` | `EN_BERT_ML` | `EN_ELECTRA_EN` | 165 | |---------------|--------------|-----------------|--------------|-----------------| 166 | | B-PER | 93.8 | 92.0 | 96.0 | 95.1 | 167 | | I-PER | 97.8 | 97.1 | 98.5 | 97.9 | 168 | | B-ORG | 69.5 | 66.9 | 88.4 | 86.2 | 169 | | I-ORG | 69.9 | 70.7 | 85.7 | 83.1 | 170 | | B-LOC | 82.5 | 79.0 | 92.3 | 91.1 | 171 | | I-LOC | 31.6 | 44.4 | 83.9 | 80.5 | 172 | | B-MISC | 73.4 | 68.6 | 81.8 | 80.1 | 173 | | I-MISC | 86.1 | 63.6 | 63.4 | 68.4 | 174 | | **AVG_MICRO** | 82.8 | 79.8 | 90.4 | 89.1 | 175 | | **AVG_MACRO** | 75.6 | 72.8 | 86.3 | 85.3 | 176 | 177 | ## 'NERDA'? 178 | '`NERDA`' originally stands for *'Named Entity Recognition for DAnish'*. However, this 179 | is somewhat misleading, since the functionality is no longer limited to Danish. 180 | On the contrary it generalizes to all other languages, i.e. `NERDA` supports 181 | fine-tuning of transformers for NER tasks for any arbitrary 182 | language. 183 | 184 | ## Background 185 | `NERDA` is developed as a part of [Ekstra Bladet](https://ekstrabladet.dk/)’s activities on Platform Intelligence in News (PIN). PIN is an industrial research project that is carried out in collaboration between the [Technical University of Denmark](https://www.dtu.dk/), [University of Copenhagen](https://www.ku.dk/) and [Copenhagen Business School](https://www.cbs.dk/) with funding from [Innovation Fund Denmark](https://innovationsfonden.dk/). The project runs from 2020-2023 and develops recommender systems and natural language processing systems geared for news publishing, some of which are open sourced like `NERDA`. 186 | 187 | ## Shout-outs 188 | - Thanks to [Alexandra Institute](https://alexandra.dk/) for with the [`danlp`](https://github.com/alexandrainst/danlp) package to have encouraged us to develop this package. 189 | - Thanks to [Malte Højmark-Bertelsen](https://www.linkedin.com/in/malte-h%C3%B8jmark-bertelsen-9a618017b/) and [Kasper Junge](https://www.linkedin.com/in/kasper-juunge/?originalSubdomain=dk) for giving feedback on `NERDA`. 190 | 191 | ## Read more 192 | The detailed documentation for `NERDA` including code references and 193 | extended workflow examples can be accessed [here](https://ebanalyse.github.io/NERDA/). 194 | 195 | ## Cite this work 196 | 197 | ``` 198 | @inproceedings{nerda, 199 | title = {NERDA}, 200 | author = {Kjeldgaard, Lars and Nielsen, Lukas}, 201 | year = {2021}, 202 | publisher = {{GitHub}}, 203 | url = {https://github.com/ebanalyse/NERDA} 204 | } 205 | ``` 206 | 207 | ## Contact 208 | We hope, that you will find `NERDA` useful. 209 | 210 | Please direct any questions and feedbacks to 211 | [us](mailto:lars.kjeldgaard@eb.dk)! 212 | 213 | If you want to contribute (which we encourage you to), open a 214 | [PR](https://github.com/ebanalyse/NERDA/pulls). 215 | 216 | If you encounter a bug or want to suggest an enhancement, please 217 | [open an issue](https://github.com/ebanalyse/NERDA/issues). 218 | 219 | -------------------------------------------------------------------------------- /admin/deploy_models.py: -------------------------------------------------------------------------------- 1 | from NERDA.datasets import get_conll_data, get_dane_data 2 | import pandas as pd 3 | import torch 4 | import boto3 5 | 6 | def deploy_model_to_s3(model, test_set = get_dane_data('test')): 7 | """Deploy Model to S3 8 | 9 | Args: 10 | model: NERDA model. 11 | test_set: Test set for evaluating performance. 12 | 13 | Returns: 14 | str: message saying, if model was uploaded successfully. 15 | Model and text file with performance numbers uploaded 16 | as side-effects. 17 | """ 18 | 19 | model_name = type(model).__name__ 20 | 21 | file_model = f'{model_name}.bin' 22 | torch.save(model.network.state_dict(), file_model) 23 | 24 | # compute performance on test set and save. 25 | performance = model.evaluate_performance(test_set) 26 | 27 | # write to csv. 28 | file_performance = f'{model_name}_performance.csv' 29 | performance.to_csv(file_performance, index = False) 30 | 31 | # upload to S3 bucket. 32 | s3 = boto3.resource('s3') 33 | s3.Bucket('nerda').upload_file( 34 | Filename=file_model, 35 | Key = file_model) 36 | s3.Bucket('nerda').upload_file( 37 | Filename=file_performance, 38 | Key = file_performance) 39 | 40 | return "Model deployed to S3 successfully." 41 | 42 | if __name__ == '__main__': 43 | from NERDA.precooked import EN_ELECTRA_EN 44 | model = EN_ELECTRA_EN() 45 | model.train() 46 | 47 | deploy_model_to_s3(model) 48 | 49 | -------------------------------------------------------------------------------- /admin/sandbox.py: -------------------------------------------------------------------------------- 1 | from NERDA.models import NERDA 2 | from NERDA.datasets import get_conll_data, get_dane_data 3 | from transformers import AutoTokenizer 4 | trans = 'bert-base-multilingual-uncased' 5 | tokenizer = AutoTokenizer.from_pretrained(trans, do_lower_case = True) 6 | data = get_dane_data('train') 7 | 8 | sents = data.get('sentences') 9 | 10 | out = [] 11 | 12 | for sent in sents: 13 | sent = sents[3595] 14 | tokens = [] 15 | for word in sent: 16 | tokens.extend(tokenizer.tokenize(word)) 17 | out.append(tokens) 18 | 19 | lens = [len(x) for x in out] 20 | 21 | max(lens) 22 | 23 | sents[3595] 24 | 25 | 26 | from transformers import AutoTokenizer, AutoModel, AutoConfig 27 | t = 'google/electra-small-discriminator' 28 | cfg = AutoModel.from_pretrained(t) 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | #trn = get_conll_data('train') 41 | #idx_min = 3110 42 | #idx_max = 3115 43 | #valid = get_conll_data('valid') 44 | #valid['sentences'] = valid['sentences'][idx_min:idx_max+1] 45 | #valid['tags'] = valid['tags'][idx_min:idx_max+1] 46 | #trn['sentences'] = trn['sentences'][idx_min:idx_max+1] 47 | #trn['tags'] = trn['tags'][idx_min:idx_max+1] 48 | # model = NERDA(dataset_training=trn, 49 | # dataset_validation = valid) 50 | #model.train() 51 | #k=0 52 | #trn['sentences'][3111] 53 | #from transformers import AutoTokenizer 54 | #t = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') 55 | #valid = get_conll_data('valid') 56 | 57 | filename = 'en_bert_ml.pkl' 58 | # pickle.dump(model, open(filename, 'wb')) 59 | import pickle 60 | file = open(filename,'rb') 61 | model = pickle.load(file) 62 | test = get_conll_data('test') 63 | model.evaluate_performance(test, batch_size = 10) 64 | #for entry in range(3120,3140): 65 | # print(entry) 66 | # sent = trn['sentences'][entry] 67 | # [t.tokenize(word) for word in sent] 68 | 69 | test = get_conll_data('test') 70 | idx_min = 202 71 | idx_max = 202 72 | # valid = get_conll_data('valid') 73 | #valid['sentences'] = valid['sentences'][idx_min:idx_max+1] 74 | #valid['tags'] = valid['tags'][idx_min:idx_max+1] 75 | test['sentences'] = test['sentences'][idx_min:idx_max+1] 76 | test['tags'] = test['tags'][idx_min:idx_max+1] 77 | model.evaluate_performance(test) 78 | # model = NERDA(dataset_training=trn, 79 | # dataset_validation = valid) 80 | #model.train() 81 | #k=0 82 | #trn['sentences'][3111] 83 | #from transformers import AutoTokenizer 84 | #t = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') 85 | #valid = get_conll_data('valid') 86 | 87 | <<<<<<< HEAD:admin/sandbox.py 88 | 89 | transformer = "google/electra-small-discriminator" 90 | from transformers import AutoTokenizer, AutoModel, AutoConfig 91 | trans = AutoConfig.from_pretrained(transformer) 92 | 93 | def tester(): 94 | 95 | try: 96 | model = AutoModel.from_pretrained('google/electra-small-discriminator') 97 | except: 98 | print("Oops!", sys.exc_info()[0], "occurred.") 99 | 100 | return model 101 | ======= 102 | from NERDA.datasets import get_dane_data 103 | trn = get_conll_data('train', 5) 104 | valid = get_conll_data('dev', 5) 105 | transformer = 'bert-base-multilingual-uncased', 106 | model = NERDA(transformer = transformer, 107 | dataset_training = trn, 108 | dataset_validation = valid) 109 | >>>>>>> b5eea087ece5f61ec70aa3f99cd4c99b418ebb92:sandbox.py 110 | -------------------------------------------------------------------------------- /admin/tuning.py: -------------------------------------------------------------------------------- 1 | from sys import getdefaultencoding 2 | from NERDA.models import NERDA 3 | from NERDA.datasets import get_dane_data 4 | from hyperopt import fmin, hp, tpe, space_eval 5 | from hyperopt.pyll import scope 6 | import numpy as np 7 | 8 | def objective(params): 9 | 10 | print(params) 11 | 12 | model = NERDA(dataset_training = get_dane_data('train', 20), 13 | dataset_validation = get_dane_data('dev', 20), 14 | hyperparameters = params) 15 | 16 | model.train() 17 | 18 | return model.valid_loss 19 | 20 | def run_parameter_optimization(objective, number_of_evals = 3): 21 | 22 | hpspace = { 23 | 'learning_rate': hp.loguniform('lr', np.log(0.00005), np.log(0.01)), 24 | 'train_batch_size': scope.int(hp.uniform('batch_size', 8, 16)), 25 | 'epochs': scope.int(hp.uniform('epochs', 1, 3)), 26 | 'warmup_steps': hp.choice('warmup_steps', [0, 250, 500]), 27 | } 28 | 29 | print('Running hyperparameter optimization...') 30 | 31 | best_params = fmin(objective, space = hpspace, algo = tpe.suggest, max_evals= number_of_evals) 32 | 33 | return best_params 34 | 35 | # best_params = run_parameter_optimization(objective = objective, number_of_evals=3) 36 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | flake8-html 2 | wheel 3 | twine 4 | mkdocs-material 5 | mkdocstrings 6 | mknotebooks 7 | jupyter -------------------------------------------------------------------------------- /docs/datasets.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | ::: NERDA.datasets -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # NERDA 2 | 3 | ![Build status](https://github.com/ebanalyse/NERDA/workflows/build/badge.svg) 4 | [![codecov](https://codecov.io/gh/ebanalyse/NERDA/branch/main/graph/badge.svg?token=OB6LGFQZYX)](https://codecov.io/gh/ebanalyse/NERDA) 5 | ![PyPI](https://img.shields.io/pypi/v/NERDA.svg) 6 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/NERDA?color=green) 7 | ![License](https://img.shields.io/badge/license-MIT-blue.svg) 8 | 9 | Not only is `NERDA` a mesmerizing muppet-like character. `NERDA` is also 10 | a python package, that offers a slick easy-to-use interface for fine-tuning 11 | pretrained transformers for Named Entity Recognition 12 | (=NER) tasks. 13 | 14 | You can also utilize `NERDA` to access a selection of *precooked* `NERDA` models, 15 | that you can use right off the shelf for NER tasks. 16 | 17 | `NERDA` is built on `huggingface` `transformers` and the popular `pytorch` 18 | framework. 19 | 20 | ## Installation guide 21 | `NERDA` can be installed from [PyPI](https://pypi.org/project/NERDA/) with 22 | 23 | ``` 24 | pip install NERDA 25 | ``` 26 | 27 | If you want the development version then install directly from [GitHub](https://github.com/ebanalyse/NERDA). 28 | 29 | ## Named-Entity Recogntion tasks 30 | Named-entity recognition (NER) (also known as (named) entity identification, 31 | entity chunking, and entity extraction) is a subtask of information extraction 32 | that seeks to locate and classify named entities mentioned in unstructured 33 | text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.[1] 34 | 35 | [1]: https://en.wikipedia.org/wiki/Named-entity_recognition 36 | 37 | ### Example Task: 38 | 39 | **Task** 40 | 41 | Identify person names and organizations in text: 42 | 43 | *Jim bought 300 shares of Acme Corp.* 44 | 45 | **Solution** 46 | 47 | | **Named Entity** | **Type** | 48 | |--------------------|-----------------------| 49 | | 'Jim' | Person | 50 | | 'Acme Corp.' | Organization | 51 | 52 | Read more about NER on [Wikipedia](https://en.wikipedia.org/wiki/Named-entity_recognition). 53 | 54 | ## Train Your Own `NERDA` Model 55 | 56 | Say, we want to fine-tune a pretrained [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) transformer for NER in English. 57 | 58 | Load package. 59 | 60 | ```python 61 | from NERDA.models import NERDA 62 | ``` 63 | 64 | Instantiate a `NERDA` model (*with default settings*) for the 65 | [`CoNLL-2003`](https://www.clips.uantwerpen.be/conll2003/ner/) 66 | English NER data set. 67 | 68 | ```python 69 | from NERDA.datasets import get_conll_data 70 | model = NERDA(dataset_training = get_conll_data('train'), 71 | dataset_validation = get_conll_data('valid'), 72 | transformer = 'bert-base-multilingual-uncased') 73 | ``` 74 | 75 | By default the network architecture is analogous to that of the models in [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). 76 | 77 | The model can then be trained/fine-tuned by invoking the `train` method, e.g. 78 | 79 | ```python 80 | model.train() 81 | ``` 82 | 83 | **Note**: this will take some time depending on the dimensions of your machine 84 | (if you want to skip training, you can go ahead and use one of the models, 85 | that we have already precooked for you in stead). 86 | 87 | After the model has been trained, the model can be used for predicting 88 | named entities in new texts. 89 | 90 | ```python 91 | # text to identify named entities in. 92 | text = 'Old MacDonald had a farm' 93 | model.predict_text(text) 94 | ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']]) 95 | ``` 96 | This means, that the model identified 'Old MacDonald' as a *PER*son. 97 | 98 | Please note, that the `NERDA` model configuration above was instantiated 99 | with all default settings. You can however customize your `NERDA` model 100 | in a lot of ways: 101 | 102 | - Use your own data set (finetune a transformer for any given language) 103 | - Choose whatever transformer you like 104 | - Set all of the hyperparameters for the model 105 | - You can even apply your own Network Architecture 106 | 107 | Read more about advanced usage of `NERDA` in the [detailed documentation](https://ebanalyse.github.io/NERDA/workflow). 108 | 109 | ## Use a Precooked `NERDA` model 110 | 111 | We have precooked a number of `NERDA` models for Danish and English, that you can download 112 | and use right off the shelf. 113 | 114 | Here is an example. 115 | 116 | Instantiate a multilingual BERT model, that has been finetuned for NER in Danish, 117 | `DA_BERT_ML`. 118 | 119 | ```python 120 | from NERDA.precooked import DA_BERT_ML() 121 | model = DA_BERT_ML() 122 | ``` 123 | 124 | Down(load) network from web: 125 | 126 | ```python 127 | model.download_network() 128 | model.load_network() 129 | ``` 130 | 131 | You can now predict named entities in new (Danish) texts 132 | 133 | ```python 134 | # (Danish) text to identify named entities in: 135 | # 'Jens Hansen har en bondegård' = 'Old MacDonald had a farm' 136 | text = 'Jens Hansen har en bondegård' 137 | model.predict_text(text) 138 | ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']]) 139 | ``` 140 | 141 | ### List of Precooked Models 142 | 143 | The table below shows the precooked `NERDA` models publicly available for download. 144 | 145 | | **Model** | **Language** | **Transformer** | **Dataset** | **F1-score** | 146 | |-----------------|--------------|-------------------|---------|-----| 147 | | `DA_BERT_ML` | Danish | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 82.8 | 148 | `DA_ELECTRA_DA` | Danish | [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 79.8 | 149 | | `EN_BERT_ML` | English | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)| [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 90.4 | 150 | | `EN_ELECTRA_EN` | English | [English ELECTRA](https://huggingface.co/google/electra-small-discriminator) | [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 89.1 | 151 | 152 | **F1-score** is the micro-averaged F1-score across entity tags and is 153 | evaluated on the respective test sets (that have not been used for training nor 154 | validation of the models). 155 | 156 | Note, that we have not spent a lot of time on actually fine-tuning the models, 157 | so there could be room for improvement. If you are able to improve the models, 158 | we will be happy to hear from you and include your `NERDA` model. 159 | 160 | ### Model Performance 161 | 162 | The table below summarizes the performance (F1-scores) of the precooked `NERDA` models. 163 | 164 | | **Level** | `DA_BERT_ML` | `DA_ELECTRA_DA` | `EN_BERT_ML` | `EN_ELECTRA_EN` | 165 | |---------------|--------------|-----------------|--------------|-----------------| 166 | | B-PER | 93.8 | 92.0 | 96.0 | 95.1 | 167 | | I-PER | 97.8 | 97.1 | 98.5 | 97.9 | 168 | | B-ORG | 69.5 | 66.9 | 88.4 | 86.2 | 169 | | I-ORG | 69.9 | 70.7 | 85.7 | 83.1 | 170 | | B-LOC | 82.5 | 79.0 | 92.3 | 91.1 | 171 | | I-LOC | 31.6 | 44.4 | 83.9 | 80.5 | 172 | | B-MISC | 73.4 | 68.6 | 81.8 | 80.1 | 173 | | I-MISC | 86.1 | 63.6 | 63.4 | 68.4 | 174 | | **AVG_MICRO** | 82.8 | 79.8 | 90.4 | 89.1 | 175 | | **AVG_MACRO** | 75.6 | 72.8 | 86.3 | 85.3 | 176 | 177 | ## 'NERDA'? 178 | '`NERDA`' originally stands for *'Named Entity Recognition for DAnish'*. However, this 179 | is somewhat misleading, since the functionality is no longer limited to Danish. 180 | On the contrary it generalizes to all other languages, i.e. `NERDA` supports 181 | fine-tuning of transformers for NER tasks for any arbitrary 182 | language. 183 | 184 | ## Background 185 | `NERDA` is developed as a part of [Ekstra Bladet](https://ekstrabladet.dk/)’s activities on Platform Intelligence in News (PIN). PIN is an industrial research project that is carried out in collaboration between the [Technical University of Denmark](https://www.dtu.dk/), [University of Copenhagen](https://www.ku.dk/) and [Copenhagen Business School](https://www.cbs.dk/) with funding from [Innovation Fund Denmark](https://innovationsfonden.dk/). The project runs from 2020-2023 and develops recommender systems and natural language processing systems geared for news publishing, some of which are open sourced like `NERDA`. 186 | 187 | ## Shout-outs 188 | - Thanks to [Alexandra Institute](https://alexandra.dk/) for with the [`danlp`](https://github.com/alexandrainst/danlp) package to have encouraged us to develop this package. 189 | - Thanks to [Malte Højmark-Bertelsen](https://www.linkedin.com/in/malte-h%C3%B8jmark-bertelsen-9a618017b/) and [Kasper Junge](https://www.linkedin.com/in/kasper-juunge/?originalSubdomain=dk) for giving feedback on `NERDA`. 190 | 191 | ## Contact 192 | We hope, that you will find `NERDA` useful. 193 | 194 | Please direct any questions and feedbacks to 195 | [us](mailto:lars.kjeldgaard@eb.dk)! 196 | 197 | If you want to contribute (which we encourage you to), open a 198 | [PR](https://github.com/ebanalyse/NERDA/pulls). 199 | 200 | If you encounter a bug or want to suggest an enhancement, please 201 | [open an issue](https://github.com/ebanalyse/NERDA/issues). 202 | 203 | -------------------------------------------------------------------------------- /docs/nerda_models.md: -------------------------------------------------------------------------------- 1 | # NERDA Models 2 | ::: NERDA.models -------------------------------------------------------------------------------- /docs/networks.md: -------------------------------------------------------------------------------- 1 | # Networks 2 | ::: NERDA.networks -------------------------------------------------------------------------------- /docs/performance.md: -------------------------------------------------------------------------------- 1 | # Performance 2 | ::: NERDA.performance -------------------------------------------------------------------------------- /docs/preamble.py: -------------------------------------------------------------------------------- 1 | # suppress warnings for notebook 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # download nltk 'punkt' in order to use nltk word/sent-tokenize 5 | import nltk 6 | nltk.download('punkt') -------------------------------------------------------------------------------- /docs/precooked_models.md: -------------------------------------------------------------------------------- 1 | # Precooked NERDA models 2 | ::: NERDA.precooked -------------------------------------------------------------------------------- /docs/predictions.md: -------------------------------------------------------------------------------- 1 | # Predictions 2 | ::: NERDA.predictions -------------------------------------------------------------------------------- /docs/workflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | "version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": "3.9.0-final" 14 | }, 15 | "orig_nbformat": 2, 16 | "kernelspec": { 17 | "name": "python3", 18 | "display_name": "Python 3.9.0 64-bit ('3.9.0')", 19 | "metadata": { 20 | "interpreter": { 21 | "hash": "36071112a161297f2fd106003050184fbdff34ed057f375faa6d2f5f0cad40eb" 22 | } 23 | } 24 | } 25 | }, 26 | "nbformat": 4, 27 | "nbformat_minor": 2, 28 | "cells": [ 29 | { 30 | "source": [ 31 | "# Workflow Examples" 32 | ], 33 | "cell_type": "markdown", 34 | "metadata": {} 35 | }, 36 | { 37 | "source": [ 38 | "`NERDA` offers a simple easy-to-use interface for fine-tuning transformers for Named-Entity Recognition (=NER). We call this family of models `NERDA` models.\n", 39 | "\n", 40 | "`NERDA` can be used in two ways. You can either (1) train your own customized `NERDA` model or (2) download and use one of our precooked `NERDA` models for inference i.e. identifying named entities in new texts." 41 | ], 42 | "cell_type": "markdown", 43 | "metadata": {} 44 | }, 45 | { 46 | "source": [ 47 | "## Train Your Own `NERDA` model" 48 | ], 49 | "cell_type": "markdown", 50 | "metadata": {} 51 | }, 52 | { 53 | "source": [ 54 | "We want to fine-tune a transformer for English. \n", 55 | "\n", 56 | "First, we download an English NER dataset [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) with annotated Named Entities, that we will use for training and evaluation of our model." 57 | ], 58 | "cell_type": "markdown", 59 | "metadata": {} 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "output_type": "error", 68 | "ename": "ModuleNotFoundError", 69 | "evalue": "No module named 'NERDA'", 70 | "traceback": [ 71 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 72 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 73 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# don't print warnings for this session\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mNERDA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mget_dane_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_dane_data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdownload_dane_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 74 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'NERDA'" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "from NERDA.datasets import get_conll_data, download_conll_data\n", 80 | "download_conll_data()" 81 | ] 82 | }, 83 | { 84 | "source": [ 85 | "CoNLL-2003 operates with the following types of named entities:\n", 86 | "\n", 87 | "1. **PER**sons \n", 88 | "2. **ORG**anizations \n", 89 | "3. **LOC**ations \n", 90 | "4. **MISC**ellaneous \n", 91 | "5. **O**utside (Not a named Entity)\n", 92 | "\n", 93 | "An observation from the CoNLL-2003 data set looks like this." 94 | ], 95 | "cell_type": "markdown", 96 | "metadata": {} 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 3, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "output_type": "error", 105 | "ename": "NameError", 106 | "evalue": "name 'get_dane_data' is not defined", 107 | "traceback": [ 108 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 109 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 110 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtraining\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_dane_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'train'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mvalidation\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_dane_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'dev'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# example\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraining\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'sentences'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mtags\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraining\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'tags'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 111 | "\u001b[0;31mNameError\u001b[0m: name 'get_dane_data' is not defined" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "# extract the first _5_ rows from the training and validation data splits.\n", 117 | "training = get_conll_data('train', 5)\n", 118 | "validation = get_conll_data('valid', 5)\n", 119 | "# example\n", 120 | "sentence = training.get('sentences')[0]\n", 121 | "tags = training.get('tags')[0]\n", 122 | "print(\"\\n\".join([\"{}/{}\".format(word, tag) for word, tag in zip(sentence, tags)]))" 123 | ] 124 | }, 125 | { 126 | "source": [ 127 | "If you provide your own dataset, it must have the same structure:\n", 128 | "\n", 129 | "- It must be a dictionary\n", 130 | "- The dictionary must contain\n", 131 | " - 'sentences': a list of word-tokenized sentences with one sentence per entry \n", 132 | " - 'tags': a list with the corresponding named-entity tags.\n", 133 | "\n", 134 | "The data set does however *not* have to follow the Inside-Outside-Beginning (IOB) tagging scheme[1].\n", 135 | "\n", 136 | "[1]: https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)\n", 137 | "\n", 138 | "The IOB tagging scheme implies, that words that are beginning of named entities are tagged with *'B-'* and words 'inside' (=continuations of) named entities are tagged with *'I-'*. That means that 'Joe Biden' should be tagged as `Joe(B-PER) Biden(I-PER)`.\n", 139 | "\n", 140 | "Now, instantiate a `NERDA` model for finetuning an [ELECTRA](https://huggingface.co/google/electra-small-discriminator) transformer for NER. " 141 | ], 142 | "cell_type": "markdown", 143 | "metadata": {} 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "from NERDA.models import NERDA\n", 152 | "tag_scheme = ['B-PER',\n", 153 | " 'I-PER', \n", 154 | " 'B-ORG', \n", 155 | " 'I-ORG', \n", 156 | " 'B-LOC', \n", 157 | " 'I-LOC', \n", 158 | " 'B-MISC', \n", 159 | " 'I-MISC']\n", 160 | "model = NERDA(dataset_training = training,\n", 161 | " dataset_validation = validation,\n", 162 | " tag_scheme = tag_scheme,\n", 163 | " tag_outside = 'O',\n", 164 | " transformer = 'google/electra-small-discriminator',\n", 165 | " hyperparameters = {'epochs' : 1,\n", 166 | " 'warmup_steps' : 10,\n", 167 | " 'train_batch_size': 5,\n", 168 | " 'learning_rate': 0.0001},)" 169 | ] 170 | }, 171 | { 172 | "source": [ 173 | "Note, this model configuration only uses 5 sentences for model training to minimize execution time. Also the hyperparameters for the model have been chosen in order to minimize execution time. Therefore this example only serves to illustrate the functionality i.e. the resulting model will suck.\n", 174 | "\n", 175 | "By default the network architecture is analogous that of the models in [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). \n", 176 | "\n", 177 | "The model can be trained right away by invoking the `train` method." 178 | ], 179 | "cell_type": "markdown", 180 | "metadata": {} 181 | }, 182 | { 183 | "source": [ 184 | "model.train()" 185 | ], 186 | "cell_type": "code", 187 | "metadata": {}, 188 | "execution_count": null, 189 | "outputs": [] 190 | }, 191 | { 192 | "source": [ 193 | "We can compute the performance of the model on a test set (limited to 5 sentences):" 194 | ], 195 | "cell_type": "markdown", 196 | "metadata": {} 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "test = get_conll_data('test', 5)\n", 205 | "model.evaluate_performance(test)" 206 | ] 207 | }, 208 | { 209 | "source": [ 210 | "Unsurprisingly, the model sucks in this case due to the ludicrous specification.\n", 211 | "\n", 212 | "Named Entities in new texts can be predicted with `predict` functions." 213 | ], 214 | "cell_type": "markdown", 215 | "metadata": {} 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 2, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "text = \"Old MacDonald had a farm\"\n", 224 | "model.predict_text(text)" 225 | ] 226 | }, 227 | { 228 | "source": [ 229 | "Needless to say the predicted entities for this model are nonsensical.\n", 230 | "\n", 231 | "To get a more reasonable model, provide more data and a more meaningful model specification.\n", 232 | "\n", 233 | "In general `NERDA` has the following handles, that you use.\n", 234 | "\n", 235 | "1. provide your own data set \n", 236 | "2. choose whatever pretrained transformer you would like to fine-tune\n", 237 | "3. provide your own set of hyperparameters and lastly\n", 238 | "4. provide your own `torch` network (architecture). You can do this by instantiating a `NERDA` model with the parameter 'network' set to your own network (torch.nn.Module)." 239 | ], 240 | "cell_type": "markdown", 241 | "metadata": {} 242 | }, 243 | { 244 | "source": [ 245 | "## Use a Precooked `NERDA` model" 246 | ], 247 | "cell_type": "markdown", 248 | "metadata": {} 249 | }, 250 | { 251 | "source": [ 252 | "We have precooked a number of `NERDA` models, that you can download \n", 253 | "and use right off the shelf. \n", 254 | "\n", 255 | "Here is an example.\n", 256 | "\n", 257 | "Instantiate a `NERDA` model based on the English [ELECTRA](https://huggingface.co/google/electra-small-discriminator) transformer, that has been finetuned for NER in English,\n", 258 | "`EN_ELECTRA_EN`." 259 | ], 260 | "cell_type": "markdown", 261 | "metadata": {} 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "\n", 270 | "from NERDA.precooked import EN_ELECTRA_EN\n", 271 | "model = EN_ELECTRA_EN()\n", 272 | "\n" 273 | ] 274 | }, 275 | { 276 | "source": [ 277 | "(Down)load network:" 278 | ], 279 | "cell_type": "markdown", 280 | "metadata": {} 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "\n", 289 | "model.download_network()\n", 290 | "model.load_network()\n" 291 | ] 292 | }, 293 | { 294 | "source": [ 295 | "This model performs much better:" 296 | ], 297 | "cell_type": "markdown", 298 | "metadata": {} 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "model.evaluate_performance(get_conll_data('test', 100))" 307 | ] 308 | }, 309 | { 310 | "source": [ 311 | "Predict named entities in new texts" 312 | ], 313 | "cell_type": "markdown", 314 | "metadata": {} 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "text = 'Old MacDonald had a farm'\n", 323 | "model.predict_text(text)\n" 324 | ] 325 | }, 326 | { 327 | "source": [ 328 | "### List of Precooked Models" 329 | ], 330 | "cell_type": "markdown", 331 | "metadata": {} 332 | }, 333 | { 334 | "source": [ 335 | "The table below shows the precooked `NERDA` models publicly available for download. We have trained models for Danish and English.\n", 336 | "\n", 337 | "\n", 338 | "| **Model** | **Language** | **Transformer** | **Dataset** | **F1-score** | \n", 339 | "|-----------------|--------------|-------------------|---------|-----|\n", 340 | "| `DA_BERT_ML` | Danish | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 82.8 | \n", 341 | "`DA_ELECTRA_DA` | Danish | [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 79.8 |\n", 342 | "| `EN_BERT_ML` | English | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)| [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 90.4 |\n", 343 | "| `EN_ELECTRA_EN` | English | [English ELECTRA](https://huggingface.co/google/electra-small-discriminator) | [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 89.1 |\n", 344 | "\n", 345 | "**F1-score** is the micro-averaged F1-score across entity tags and is \n", 346 | "evaluated on the respective test sets (that have not been used for training nor\n", 347 | "validation of the models).\n", 348 | "\n", 349 | "Note, that we have not spent a lot of time on actually fine-tuning the models,\n", 350 | "so there could be room for improvement. If you are able to improve the models,\n", 351 | "we will be happy to hear from you and include your `NERDA` model.\n", 352 | "\n", 353 | "#### Performance of Precooked Models\n", 354 | "\n", 355 | "The table below summarizes the performance as measured by F1-scores of the model\n", 356 | " configurations, that `NERDA` ships with. \n", 357 | "\n", 358 | "| **Level** | `DA_BERT_ML` | `DA_ELECTRA_DA` | `EN_BERT_ML` | `EN_ELECTRA_EN` |\n", 359 | "|---------------|-----------|------------|-------------|----------------|\n", 360 | "| B-PER | 93.8 | 92.0 | 96.0 | 95.1 | \n", 361 | "| I-PER | 97.8 | 97.1 | 98.5 | 97.9 | \n", 362 | "| B-ORG | 69.5 | 66.9 | 88.4 | 86.2 | \n", 363 | "| I-ORG | 69.9 | 70.7 | 85.7 | 83.1 | \n", 364 | "| B-LOC | 82.5 | 79.0 | 92.3 | 91.1 | \n", 365 | "| I-LOC | 31.6 | 44.4 | 83.9 | 80.5 | \n", 366 | "| B-MISC | 73.4 | 68.6 | 81.8 | 80.1 | \n", 367 | "| I-MISC | 86.1 | 63.6 | 63.4 | 68.4 | \n", 368 | "| **AVG_MICRO** | 82.8 | 79.8 | 90.4 | 89.1 | \n", 369 | "| **AVG_MACRO** | 75.6 | 72.8 | 86.3 | 85.3 |" 370 | ], 371 | "cell_type": "markdown", 372 | "metadata": {} 373 | }, 374 | { 375 | "source": [ 376 | "This concludes our walkthrough of `NERDA`. If you have any questions, please do not hesitate to [contact us](mailto:lars.kjeldgaard@eb.dk)!" 377 | ], 378 | "cell_type": "markdown", 379 | "metadata": {} 380 | } 381 | ] 382 | } -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/NERDA/ae45d7e5368059721d1073384201433ea7a6e820/logo.png -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: NERDA 2 | theme: 3 | name: "material" 4 | 5 | plugins: 6 | - search 7 | - mkdocstrings: 8 | handlers: 9 | python: 10 | setup_commands: 11 | - import sys 12 | - sys.path.append("src") 13 | - mknotebooks: 14 | execute: True 15 | preamble: "docs/preamble.py" 16 | 17 | nav: 18 | - Home: index.md 19 | - Workflow Examples: workflow.ipynb 20 | - Code Reference: 21 | - NERDA Models: nerda_models.md 22 | - Precooked NERDA Models: precooked_models.md 23 | - Datasets: datasets.md 24 | - Predictions: predictions.md 25 | - Networks: networks.md 26 | - Performance: performance.md 27 | 28 | 29 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -v -s --junitxml=test-results/tests.xml --cov=./ --cov-report=xml -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="NERDA", 8 | version="1.0.0", 9 | author="Lars Kjeldgaard, Lukas Christian Nielsen", 10 | author_email="lars.kjeldgaard@eb.dk", 11 | description="A Framework for Finetuning Transformers for Named-Entity Recognition", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/ebanalyse/NERDA", 15 | packages=setuptools.find_packages(where='src'), 16 | package_dir={'': 'src'}, 17 | python_requires='>=3.6', 18 | install_requires=[ 19 | 'torch', 20 | 'transformers', 21 | 'sklearn', 22 | 'nltk', 23 | 'pandas', 24 | 'progressbar', 25 | 'pyconll' 26 | ], 27 | setup_requires=['pytest-runner'], 28 | tests_require=['pytest', 29 | 'pytest-cov'], 30 | classifiers=[ 31 | "Programming Language :: Python :: 3", 32 | "License :: OSI Approved :: MIT License", 33 | ], 34 | include_package_data=True 35 | ) 36 | -------------------------------------------------------------------------------- /src/NERDA/__init__.py: -------------------------------------------------------------------------------- 1 | import NERDA -------------------------------------------------------------------------------- /src/NERDA/datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | This section covers functionality for (down)loading Named Entity 3 | Recognition data sets. 4 | """ 5 | 6 | import csv 7 | import os 8 | import pyconll 9 | from io import BytesIO 10 | from itertools import compress 11 | from pathlib import Path 12 | from typing import Union, List, Dict 13 | from urllib.request import urlopen 14 | from zipfile import ZipFile 15 | import ssl 16 | 17 | def download_unzip(url_zip: str, 18 | dir_extract: str) -> str: 19 | """Download and unzip a ZIP archive to folder. 20 | 21 | Loads a ZIP file from URL and extracts all of the files to a 22 | given folder. Does not save the ZIP file itself. 23 | 24 | Args: 25 | url_zip (str): URL to ZIP file. 26 | dir_extract (str): Directory where files are extracted. 27 | 28 | Returns: 29 | str: a message telling, if the archive was succesfully 30 | extracted. Obviously the files in the ZIP archive are 31 | extracted to the desired directory as a side-effect. 32 | """ 33 | 34 | # suppress ssl certification 35 | ctx = ssl.create_default_context() 36 | ctx.check_hostname = False 37 | ctx.verify_mode = ssl.CERT_NONE 38 | 39 | print(f'Reading {url_zip}') 40 | with urlopen(url_zip, context=ctx) as zipresp: 41 | with ZipFile(BytesIO(zipresp.read())) as zfile: 42 | zfile.extractall(dir_extract) 43 | 44 | return f'archive extracted to {dir_extract}' 45 | 46 | def download_dane_data(dir: str = None) -> str: 47 | """Download DaNE data set. 48 | 49 | Downloads the 'DaNE' data set annotated for Named Entity 50 | Recognition developed and hosted by 51 | [Alexandra Institute](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane). 52 | 53 | Args: 54 | dir (str, optional): Directory where DaNE datasets will be saved. If no directory is provided, data will be saved to a hidden folder '.dane' in your home directory. 55 | 56 | Returns: 57 | str: a message telling, if the archive was in fact 58 | succesfully extracted. Obviously the DaNE datasets are 59 | extracted to the desired directory as a side-effect. 60 | 61 | Examples: 62 | >>> download_dane_data() 63 | >>> download_dane_data(dir = 'DaNE') 64 | 65 | """ 66 | # set to default directory if nothing else has been provided by user. 67 | if dir is None: 68 | dir = os.path.join(str(Path.home()), '.dane') 69 | 70 | return download_unzip(url_zip = 'http://danlp-downloads.alexandra.dk/datasets/ddt.zip', 71 | dir_extract = dir) 72 | 73 | def get_dane_data(split: str = 'train', 74 | limit: int = None, 75 | dir: str = None) -> dict: 76 | """Load DaNE data split. 77 | 78 | Loads a single data split from the DaNE data set kindly hosted 79 | by [Alexandra Institute](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane). 80 | 81 | Args: 82 | split (str, optional): Choose which split to load. Choose 83 | from 'train', 'dev' and 'test'. Defaults to 'train'. 84 | limit (int, optional): Limit the number of observations to be 85 | returned from a given split. Defaults to None, which implies 86 | that the entire data split is returned. 87 | dir (str, optional): Directory where data is cached. If set to 88 | None, the function will try to look for files in '.dane' folder in home directory. 89 | 90 | Returns: 91 | dict: Dictionary with word-tokenized 'sentences' and named 92 | entity 'tags' in IOB format. 93 | 94 | Examples: 95 | Get test split 96 | >>> get_dane_data('test') 97 | 98 | Get first 5 observations from training split 99 | >>> get_dane_data('train', limit = 5) 100 | 101 | """ 102 | assert isinstance(split, str) 103 | splits = ['train', 'dev', 'test'] 104 | assert split in splits, f'Choose between the following splits: {splits}' 105 | 106 | # set to default directory if nothing else has been provided by user. 107 | if dir is None: 108 | dir = os.path.join(str(Path.home()), '.dane') 109 | assert os.path.isdir(dir), f'Directory {dir} does not exist. Try downloading DaNE data with download_dane_data()' 110 | 111 | file_path = os.path.join(dir, f'ddt.{split}.conllu') 112 | assert os.path.isfile(file_path), f'File {file_path} does not exist. Try downloading DaNE data with download_dane_data()' 113 | 114 | split = pyconll.load_from_file(file_path) 115 | 116 | sentences = [] 117 | entities = [] 118 | 119 | for sent in split: 120 | sentences.append([token.form for token in sent._tokens]) 121 | entities.append([token.misc['name'].pop() for token in sent._tokens]) 122 | 123 | if limit is not None: 124 | sentences = sentences[:limit] 125 | entities = entities[:limit] 126 | 127 | return {'sentences': sentences, 'tags': entities} 128 | 129 | 130 | 131 | def download_conll_data(dir: str = None) -> str: 132 | """Download CoNLL-2003 English data set. 133 | 134 | Downloads the [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) 135 | English data set annotated for Named Entity Recognition. 136 | 137 | Args: 138 | dir (str, optional): Directory where CoNLL-2003 datasets will be saved. If no directory is provided, data will be saved to a hidden folder '.dane' in your home directory. 139 | 140 | Returns: 141 | str: a message telling, if the archive was in fact 142 | succesfully extracted. Obviously the CoNLL datasets are 143 | extracted to the desired directory as a side-effect. 144 | 145 | Examples: 146 | >>> download_conll_data() 147 | >>> download_conll_data(dir = 'conll') 148 | 149 | """ 150 | # set to default directory if nothing else has been provided by user. 151 | if dir is None: 152 | dir = os.path.join(str(Path.home()), '.conll') 153 | 154 | return download_unzip(url_zip = 'https://data.deepai.org/conll2003.zip', 155 | dir_extract = dir) 156 | 157 | def get_conll_data(split: str = 'train', 158 | limit: int = None, 159 | dir: str = None) -> dict: 160 | """Load CoNLL-2003 (English) data split. 161 | 162 | Loads a single data split from the 163 | [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) 164 | (English) data set. 165 | 166 | Args: 167 | split (str, optional): Choose which split to load. Choose 168 | from 'train', 'valid' and 'test'. Defaults to 'train'. 169 | limit (int, optional): Limit the number of observations to be 170 | returned from a given split. Defaults to None, which implies 171 | that the entire data split is returned. 172 | dir (str, optional): Directory where data is cached. If set to 173 | None, the function will try to look for files in '.conll' folder in home directory. 174 | 175 | Returns: 176 | dict: Dictionary with word-tokenized 'sentences' and named 177 | entity 'tags' in IOB format. 178 | 179 | Examples: 180 | Get test split 181 | >>> get_conll_data('test') 182 | 183 | Get first 5 observations from training split 184 | >>> get_conll_data('train', limit = 5) 185 | 186 | """ 187 | assert isinstance(split, str) 188 | splits = ['train', 'valid', 'test'] 189 | assert split in splits, f'Choose between the following splits: {splits}' 190 | 191 | # set to default directory if nothing else has been provided by user. 192 | if dir is None: 193 | dir = os.path.join(str(Path.home()), '.conll') 194 | assert os.path.isdir(dir), f'Directory {dir} does not exist. Try downloading CoNLL-2003 data with download_conll_data()' 195 | 196 | file_path = os.path.join(dir, f'{split}.txt') 197 | assert os.path.isfile(file_path), f'File {file_path} does not exist. Try downloading CoNLL-2003 data with download_conll_data()' 198 | 199 | # read data from file. 200 | data = [] 201 | with open(file_path, 'r') as file: 202 | reader = csv.reader(file, delimiter = ' ') 203 | for row in reader: 204 | data.append([row]) 205 | 206 | sentences = [] 207 | sentence = [] 208 | entities = [] 209 | tags = [] 210 | 211 | for row in data: 212 | # extract first element of list. 213 | row = row[0] 214 | # TO DO: move to data reader. 215 | if len(row) > 0 and row[0] != '-DOCSTART-': 216 | sentence.append(row[0]) 217 | tags.append(row[-1]) 218 | if len(row) == 0 and len(sentence) > 0: 219 | # clean up sentence/tags. 220 | # remove white spaces. 221 | selector = [word != ' ' for word in sentence] 222 | sentence = list(compress(sentence, selector)) 223 | tags = list(compress(tags, selector)) 224 | # append if sentence length is still greater than zero.. 225 | if len(sentence) > 0: 226 | sentences.append(sentence) 227 | entities.append(tags) 228 | sentence = [] 229 | tags = [] 230 | 231 | 232 | if limit is not None: 233 | sentences = sentences[:limit] 234 | entities = entities[:limit] 235 | 236 | return {'sentences': sentences, 'tags': entities} 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | -------------------------------------------------------------------------------- /src/NERDA/models.py: -------------------------------------------------------------------------------- 1 | """ 2 | This section covers the interface for `NERDA` models, that is 3 | implemented as its own Python class [NERDA.models.NERDA][]. 4 | 5 | The interface enables you to easily 6 | 7 | - specify your own [NERDA.models.NERDA][] model 8 | - train it 9 | - evaluate it 10 | - use it to predict entities in new texts. 11 | """ 12 | from NERDA.datasets import get_conll_data 13 | from NERDA.networks import NERDANetwork 14 | from NERDA.predictions import predict, predict_text 15 | from NERDA.performance import compute_f1_scores, flatten 16 | from NERDA.training import train_model 17 | import pandas as pd 18 | import numpy as np 19 | import torch 20 | import os 21 | import sys 22 | import sklearn.preprocessing 23 | from sklearn.metrics import accuracy_score 24 | from transformers import AutoModel, AutoTokenizer, AutoConfig 25 | from typing import List 26 | 27 | class NERDA: 28 | """NERDA model 29 | 30 | A NERDA model object containing a complete model configuration. 31 | The model can be trained with the `train` method. Afterwards 32 | new observations can be predicted with the `predict` and 33 | `predict_text` methods. The performance of the model can be 34 | evaluated on a set of new observations with the 35 | `evaluate_performance` method. 36 | 37 | Examples: 38 | Model for a VERY small subset (5 observations) of English NER data 39 | >>> from NERDA.datasets import get_conll_data 40 | >>> trn = get_conll_data('train', 5) 41 | >>> valid = get_conll_data('valid', 5) 42 | >>> tag_scheme = ['B-PER', 'I-PER', 'B-LOC', 'I-LOC', 43 | 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC'] 44 | >>> tag_outside = 'O' 45 | >>> transformer = 'bert-base-multilingual-uncased' 46 | >>> model = NERDA(transformer = transformer, 47 | tag_scheme = tag_scheme, 48 | tag_outside = tag_outside, 49 | dataset_training = trn, 50 | dataset_validation = valid) 51 | 52 | Model for complete English NER data set CoNLL-2003 with modified hyperparameters 53 | >>> trn = get_conll_data('train') 54 | >>> valid = get_conll_data('valid') 55 | >>> transformer = 'bert-base-multilingual-uncased' 56 | >>> hyperparameters = {'epochs' : 3, 57 | 'warmup_steps' : 400, 58 | 'train_batch_size': 16, 59 | 'learning_rate': 0.0001}, 60 | >>> model = NERDA(transformer = transformer, 61 | dataset_training = trn, 62 | dataset_validation = valid, 63 | tag_scheme = tag_scheme, 64 | tag_outside = tag_outside, 65 | dropout = 0.1, 66 | hyperparameters = hyperparameters) 67 | 68 | Attributes: 69 | network (torch.nn.Module): network for Named Entity 70 | Recognition task. 71 | tag_encoder (sklearn.preprocessing.LabelEncoder): encoder for the 72 | NER labels/tags. 73 | transformer_model (transformers.PreTrainedModel): (Auto)Model derived from the 74 | transformer. 75 | transformer_tokenizer (transformers.PretrainedTokenizer): (Auto)Tokenizer 76 | derived from the transformer. 77 | transformer_config (transformers.PretrainedConfig): (Auto)Config derived from 78 | the transformer. 79 | train_losses (list): holds training losses, once the model has been 80 | trained. 81 | valid_loss (float): holds validation loss, once the model has been trained. 82 | """ 83 | def __init__(self, 84 | transformer: str = 'bert-base-multilingual-uncased', 85 | device: str = None, 86 | tag_scheme: List[str] = [ 87 | 'B-PER', 88 | 'I-PER', 89 | 'B-ORG', 90 | 'I-ORG', 91 | 'B-LOC', 92 | 'I-LOC', 93 | 'B-MISC', 94 | 'I-MISC' 95 | ], 96 | tag_outside: str = 'O', 97 | dataset_training: dict = None, 98 | dataset_validation: dict = None, 99 | max_len: int = 128, 100 | network: torch.nn.Module = NERDANetwork, 101 | dropout: float = 0.1, 102 | hyperparameters: dict = {'epochs' : 4, 103 | 'warmup_steps' : 500, 104 | 'train_batch_size': 13, 105 | 'learning_rate': 0.0001}, 106 | tokenizer_parameters: dict = {'do_lower_case' : True}, 107 | validation_batch_size: int = 8, 108 | num_workers: int = 1) -> None: 109 | """Initialize NERDA model 110 | 111 | Args: 112 | transformer (str, optional): which pretrained 'huggingface' 113 | transformer to use. 114 | device (str, optional): the desired device to use for computation. 115 | If not provided by the user, we take a guess. 116 | tag_scheme (List[str], optional): All available NER 117 | tags for the given data set EXCLUDING the special outside tag, 118 | that is handled separately. 119 | tag_outside (str, optional): the value of the special outside tag. 120 | Defaults to 'O'. 121 | dataset_training (dict, optional): the training data. Must consist 122 | of 'sentences': word-tokenized sentences and 'tags': corresponding 123 | NER tags. You can look at examples of, how the dataset should 124 | look like by invoking functions get_dane_data() or get_conll_data(). 125 | Defaults to None, in which case the English CoNLL-2003 data set is used. 126 | dataset_validation (dict, optional): the validation data. Must consist 127 | of 'sentences': word-tokenized sentences and 'tags': corresponding 128 | NER tags. You can look at examples of, how the dataset should 129 | look like by invoking functions get_dane_data() or get_conll_data(). 130 | Defaults to None, in which case the English CoNLL-2003 data set 131 | is used. 132 | max_len (int, optional): the maximum sentence length (number of 133 | tokens after applying the transformer tokenizer) for the transformer. 134 | Sentences are truncated accordingly. Look at your data to get an 135 | impression of, what could be a meaningful setting. Also be aware 136 | that many transformers have a maximum accepted length. Defaults 137 | to 128. 138 | network (torch.nn.module, optional): network to be trained. Defaults 139 | to a default generic `NERDANetwork`. Can be replaced with your own 140 | customized network architecture. It must however take the same 141 | arguments as `NERDANetwork`. 142 | dropout (float, optional): dropout probability. Defaults to 0.1. 143 | hyperparameters (dict, optional): Hyperparameters for the model. Defaults 144 | to {'epochs' : 3, 'warmup_steps' : 500, 'train_batch_size': 16, 145 | 'learning_rate': 0.0001}. 146 | tokenizer_parameters (dict, optional): parameters for the transformer 147 | tokenizer. Defaults to {'do_lower_case' : True}. 148 | validation_batch_size (int, optional): batch size for validation. Defaults 149 | to 8. 150 | num_workers (int, optional): number of workers for data loader. 151 | """ 152 | 153 | # set device automatically if not provided by user. 154 | if device is None: 155 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 156 | print("Device automatically set to:", self.device) 157 | else: 158 | self.device = device 159 | print("Device set to:", self.device) 160 | self.tag_scheme = tag_scheme 161 | self.tag_outside = tag_outside 162 | self.transformer = transformer 163 | self.dataset_training = dataset_training 164 | self.dataset_validation = dataset_validation 165 | self.hyperparameters = hyperparameters 166 | self.tag_outside = tag_outside 167 | self.tag_scheme = tag_scheme 168 | tag_complete = [tag_outside] + tag_scheme 169 | # fit encoder to _all_ possible tags. 170 | self.max_len = max_len 171 | self.tag_encoder = sklearn.preprocessing.LabelEncoder() 172 | self.tag_encoder.fit(tag_complete) 173 | self.transformer_model = AutoModel.from_pretrained(transformer) 174 | self.transformer_tokenizer = AutoTokenizer.from_pretrained(transformer, **tokenizer_parameters) 175 | self.transformer_config = AutoConfig.from_pretrained(transformer) 176 | self.network = NERDANetwork(self.transformer_model, self.device, len(tag_complete), dropout = dropout) 177 | self.network.to(self.device) 178 | self.validation_batch_size = validation_batch_size 179 | self.num_workers = num_workers 180 | self.train_losses = [] 181 | self.valid_loss = np.nan 182 | self.quantized = False 183 | self.halved = False 184 | 185 | def train(self) -> str: 186 | """Train Network 187 | 188 | Trains the network from the NERDA model specification. 189 | 190 | Returns: 191 | str: a message saying if the model was trained succesfully. 192 | The network in the 'network' attribute is trained as a 193 | side-effect. Training losses and validation loss are saved 194 | in 'training_losses' and 'valid_loss' 195 | attributes respectively as side-effects. 196 | """ 197 | network, train_losses, valid_loss = train_model(network = self.network, 198 | tag_encoder = self.tag_encoder, 199 | tag_outside = self.tag_outside, 200 | transformer_tokenizer = self.transformer_tokenizer, 201 | transformer_config = self.transformer_config, 202 | dataset_training = self.dataset_training, 203 | dataset_validation = self.dataset_validation, 204 | validation_batch_size = self.validation_batch_size, 205 | max_len = self.max_len, 206 | device = self.device, 207 | num_workers = self.num_workers, 208 | **self.hyperparameters) 209 | 210 | # attach as attributes to class 211 | setattr(self, "network", network) 212 | setattr(self, "train_losses", train_losses) 213 | setattr(self, "valid_loss", valid_loss) 214 | 215 | return "Model trained successfully" 216 | 217 | def load_network_from_file(self, model_path = "model.bin") -> str: 218 | """Load Pretrained NERDA Network from file 219 | 220 | Loads weights for a pretrained NERDA Network from file. 221 | 222 | Args: 223 | model_path (str, optional): Path for model file. 224 | Defaults to "model.bin". 225 | 226 | Returns: 227 | str: message telling if weights for network were 228 | loaded succesfully. 229 | """ 230 | # TODO: change assert to Raise. 231 | assert os.path.exists(model_path), "File does not exist. You can download network with download_network()" 232 | self.network.load_state_dict(torch.load(model_path, map_location = torch.device(self.device))) 233 | self.network.device = self.device 234 | return f'Weights for network loaded from {model_path}' 235 | 236 | def save_network(self, model_path:str = "model.bin") -> None: 237 | """Save Weights of NERDA Network 238 | 239 | Saves weights for a fine-tuned NERDA Network to file. 240 | 241 | Args: 242 | model_path (str, optional): Path for model file. 243 | Defaults to "model.bin". 244 | 245 | Returns: 246 | Nothing. Saves model to file as a side-effect. 247 | """ 248 | torch.save(self.network.state_dict(), model_path) 249 | print(f"Network written to file {model_path}") 250 | 251 | def quantize(self): 252 | """Apply dynamic quantization to increase performance. 253 | 254 | Quantization and half precision inference are mutually exclusive. 255 | 256 | Read more: https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html 257 | 258 | Returns: 259 | Nothing. Applies dynamic quantization to Network as a side-effect. 260 | """ 261 | assert not (self.quantized), "Dynamic quantization already applied" 262 | assert not (self.halved), "Can't run both quantization and half precision" 263 | 264 | self.network = torch.quantization.quantize_dynamic( 265 | self.network, {torch.nn.Linear}, dtype=torch.qint8 266 | ) 267 | self.quantized = True 268 | 269 | def half(self): 270 | """Convert weights from Float32 to Float16 to increase performance 271 | 272 | Quantization and half precision inference are mutually exclusive. 273 | 274 | Read more: https://pytorch.org/docs/master/generated/torch.nn.Module.html?highlight=half#torch.nn.Module.half 275 | 276 | Returns: 277 | Nothing. Model is "halved" as a side-effect. 278 | """ 279 | assert not (self.halved), "Half precision already applied" 280 | assert not (self.quantized), "Can't run both quantization and half precision" 281 | 282 | self.network.half() 283 | self.halved = True 284 | 285 | def predict(self, sentences: List[List[str]], 286 | return_confidence: bool = False, 287 | **kwargs) -> List[List[str]]: 288 | """Predict Named Entities in Word-Tokenized Sentences 289 | 290 | Predicts word-tokenized sentences with trained model. 291 | 292 | Args: 293 | sentences (List[List[str]]): word-tokenized sentences. 294 | kwargs: arbitrary keyword arguments. For instance 295 | 'batch_size' and 'num_workers'. 296 | return_confidence (bool, optional): if True, return 297 | confidence scores for all predicted tokens. Defaults 298 | to False. 299 | 300 | Returns: 301 | List[List[str]]: Predicted tags for sentences - one 302 | predicted tag/entity per word token. 303 | """ 304 | return predict(network = self.network, 305 | sentences = sentences, 306 | transformer_tokenizer = self.transformer_tokenizer, 307 | transformer_config = self.transformer_config, 308 | max_len = self.max_len, 309 | device = self.device, 310 | tag_encoder = self.tag_encoder, 311 | tag_outside = self.tag_outside, 312 | return_confidence = return_confidence, 313 | **kwargs) 314 | 315 | def predict_text(self, text: str, 316 | return_confidence:bool = False, **kwargs) -> list: 317 | """Predict Named Entities in a Text 318 | 319 | Args: 320 | text (str): text to predict entities in. 321 | kwargs: arbitrary keyword arguments. For instance 322 | 'batch_size' and 'num_workers'. 323 | return_confidence (bool, optional): if True, return 324 | confidence scores for all predicted tokens. Defaults 325 | to False. 326 | 327 | Returns: 328 | tuple: word-tokenized sentences and predicted 329 | tags/entities. 330 | """ 331 | return predict_text(network = self.network, 332 | text = text, 333 | transformer_tokenizer = self.transformer_tokenizer, 334 | transformer_config = self.transformer_config, 335 | max_len = self.max_len, 336 | device = self.device, 337 | tag_encoder = self.tag_encoder, 338 | tag_outside = self.tag_outside, 339 | return_confidence=return_confidence, 340 | **kwargs) 341 | 342 | def evaluate_performance(self, dataset: dict, 343 | return_accuracy: bool=False, 344 | **kwargs) -> pd.DataFrame: 345 | """Evaluate Performance 346 | 347 | Evaluates the performance of the model on an arbitrary 348 | data set. 349 | 350 | Args: 351 | dataset (dict): Data set that must consist of 352 | 'sentences' and NER'tags'. You can look at examples 353 | of, how the dataset should look like by invoking functions 354 | get_dane_data() or get_conll_data(). 355 | kwargs: arbitrary keyword arguments for predict. For 356 | instance 'batch_size' and 'num_workers'. 357 | return_accuracy (bool): Return accuracy 358 | as well? Defaults to False. 359 | 360 | 361 | Returns: 362 | DataFrame with performance numbers, F1-scores, 363 | Precision and Recall. Returns dictionary with 364 | this AND accuracy, if return_accuracy is set to 365 | True. 366 | """ 367 | 368 | tags_predicted = self.predict(dataset.get('sentences'), 369 | **kwargs) 370 | 371 | # compute F1 scores by entity type 372 | f1 = compute_f1_scores(y_pred = tags_predicted, 373 | y_true = dataset.get('tags'), 374 | labels = self.tag_scheme, 375 | average = None) 376 | 377 | # create DataFrame with performance scores (=F1) 378 | df = list(zip(self.tag_scheme, f1[2], f1[0], f1[1])) 379 | df = pd.DataFrame(df, columns = ['Level', 'F1-Score', 'Precision', 'Recall']) 380 | 381 | # compute MICRO-averaged F1-scores and add to table. 382 | f1_micro = compute_f1_scores(y_pred = tags_predicted, 383 | y_true = dataset.get('tags'), 384 | labels = self.tag_scheme, 385 | average = 'micro') 386 | f1_micro = pd.DataFrame({'Level' : ['AVG_MICRO'], 387 | 'F1-Score': [f1_micro[2]], 388 | 'Precision': [np.nan], 389 | 'Recall': [np.nan]}) 390 | df = df.append(f1_micro) 391 | 392 | # compute MACRO-averaged F1-scores and add to table. 393 | f1_macro = compute_f1_scores(y_pred = tags_predicted, 394 | y_true = dataset.get('tags'), 395 | labels = self.tag_scheme, 396 | average = 'macro') 397 | f1_macro = pd.DataFrame({'Level' : ['AVG_MICRO'], 398 | 'F1-Score': [f1_macro[2]], 399 | 'Precision': [np.nan], 400 | 'Recall': [np.nan]}) 401 | df = df.append(f1_macro) 402 | 403 | # compute and return accuracy if desired 404 | if return_accuracy: 405 | accuracy = accuracy_score(y_pred = flatten(tags_predicted), 406 | y_true = flatten(dataset.get('tags'))) 407 | return {'f1':df, 'accuracy': accuracy} 408 | 409 | return df 410 | 411 | -------------------------------------------------------------------------------- /src/NERDA/networks.py: -------------------------------------------------------------------------------- 1 | """This section covers `torch` networks for `NERDA`""" 2 | import torch 3 | import torch.nn as nn 4 | from transformers import AutoConfig 5 | from NERDA.utils import match_kwargs 6 | 7 | class NERDANetwork(nn.Module): 8 | """A Generic Network for NERDA models. 9 | 10 | The network has an analogous architecture to the models in 11 | [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). 12 | 13 | Can be replaced with a custom user-defined network with 14 | the restriction, that it must take the same arguments. 15 | """ 16 | 17 | def __init__(self, transformer: nn.Module, device: str, n_tags: int, dropout: float = 0.1) -> None: 18 | """Initialize a NERDA Network 19 | 20 | Args: 21 | transformer (nn.Module): huggingface `torch` transformer. 22 | device (str): Computational device. 23 | n_tags (int): Number of unique entity tags (incl. outside tag) 24 | dropout (float, optional): Dropout probability. Defaults to 0.1. 25 | """ 26 | super(NERDANetwork, self).__init__() 27 | 28 | # extract transformer name 29 | transformer_name = transformer.name_or_path 30 | # extract AutoConfig, from which relevant parameters can be extracted. 31 | transformer_config = AutoConfig.from_pretrained(transformer_name) 32 | 33 | self.transformer = transformer 34 | self.dropout = nn.Dropout(dropout) 35 | self.tags = nn.Linear(transformer_config.hidden_size, n_tags) 36 | self.device = device 37 | 38 | # NOTE: 'offsets 'are not used in model as-is, but they are expected as output 39 | # down-stream. So _DON'T_ remove! :) 40 | def forward(self, 41 | input_ids: torch.Tensor, 42 | masks: torch.Tensor, 43 | token_type_ids: torch.Tensor, 44 | target_tags: torch.Tensor, 45 | offsets: torch.Tensor) -> torch.Tensor: 46 | """Model Forward Iteration 47 | 48 | Args: 49 | input_ids (torch.Tensor): Input IDs. 50 | masks (torch.Tensor): Attention Masks. 51 | token_type_ids (torch.Tensor): Token Type IDs. 52 | target_tags (torch.Tensor): Target tags. Are not used 53 | in model as-is, but they are expected downstream, 54 | so they can not be left out. 55 | offsets (torch.Tensor): Offsets to keep track of original 56 | words. Are not used in model as-is, but they are 57 | expected as down-stream, so they can not be left out. 58 | 59 | Returns: 60 | torch.Tensor: predicted values. 61 | """ 62 | 63 | # TODO: can be improved with ** and move everything to device in a 64 | # single step. 65 | transformer_inputs = { 66 | 'input_ids': input_ids.to(self.device), 67 | 'masks': masks.to(self.device), 68 | 'token_type_ids': token_type_ids.to(self.device) 69 | } 70 | 71 | # match args with transformer 72 | transformer_inputs = match_kwargs(self.transformer.forward, **transformer_inputs) 73 | 74 | outputs = self.transformer(**transformer_inputs)[0] 75 | 76 | # apply drop-out 77 | outputs = self.dropout(outputs) 78 | 79 | # outputs for all labels/tags 80 | outputs = self.tags(outputs) 81 | 82 | return outputs 83 | 84 | -------------------------------------------------------------------------------- /src/NERDA/performance.py: -------------------------------------------------------------------------------- 1 | """ 2 | This section covers functionality for computing performance 3 | for [NERDA.models.NERDA][] models. 4 | """ 5 | 6 | from typing import List 7 | from sklearn.metrics import precision_recall_fscore_support 8 | import warnings 9 | 10 | def flatten(l: list): 11 | """Flattens list""" 12 | return [item for sublist in l for item in sublist] 13 | 14 | 15 | def compute_f1_scores(y_pred: List[List[str]], 16 | y_true: List[List[str]], 17 | labels: List[str], 18 | **kwargs) -> list: 19 | """Compute F1 scores. 20 | 21 | Computes F1 Scores 22 | 23 | Args: 24 | y_pred (List): predicted values. 25 | y_true (List): observed/true values. 26 | labels (List): all possible tags. 27 | kwargs: all optional arguments for precision/recall function. 28 | 29 | Returns: 30 | list: resulting F1 scores. 31 | 32 | """ 33 | # check inputs. 34 | assert sum([len(t) < len(p) for t, p in zip(y_true, y_pred)]) == 0, "Length of predictions must not exceed length of observed values" 35 | 36 | # check, if some lengths of observed values exceed predicted values. 37 | n_exceeds = sum([len(t) > len(p) for t, p in zip(y_true, y_pred)]) 38 | if n_exceeds > 0: 39 | warnings.warn(f'length of observed values exceeded lengths of predicted values in {n_exceeds} cases and were truncated. _Consider_ increasing max_len parameter for your model.') 40 | 41 | # truncate observed values dimensions to match predicted values, 42 | # this is needed if predictions have been truncated earlier in 43 | # the flow. 44 | y_true = [t[:len(p)] for t, p in zip(y_true, y_pred)] 45 | 46 | y_pred = flatten(y_pred) 47 | y_true = flatten(y_true) 48 | 49 | f1_scores = precision_recall_fscore_support(y_true = y_true, 50 | y_pred = y_pred, 51 | labels = labels, 52 | **kwargs) 53 | 54 | return f1_scores 55 | -------------------------------------------------------------------------------- /src/NERDA/precooked.py: -------------------------------------------------------------------------------- 1 | """ 2 | This sections covers NERDA Models that have been 'precooked' by 3 | Ekstra Bladet and are publicly available for download. 4 | """ 5 | from NERDA.datasets import get_dane_data, get_conll_data 6 | from NERDA.models import NERDA 7 | import os 8 | import urllib 9 | from pathlib import Path 10 | from progressbar import ProgressBar 11 | 12 | pbar = None 13 | 14 | # helper function to show progressbar 15 | def show_progress(block_num, block_size, total_size): 16 | global pbar 17 | if pbar is None: 18 | pbar = ProgressBar(maxval=total_size) 19 | 20 | downloaded = block_num * block_size 21 | pbar.start() 22 | if downloaded < total_size: 23 | pbar.update(downloaded) 24 | else: 25 | pbar.finish() 26 | pbar = None 27 | 28 | class Precooked(NERDA): 29 | """Precooked NERDA Model 30 | 31 | NERDA model specification that has been precooked/pretrained 32 | and is available for download. 33 | 34 | Inherits from [NERDA.models.NERDA][]. 35 | """ 36 | def __init__(self, **kwargs) -> None: 37 | """Initialize Precooked NERDA Model 38 | 39 | Args: 40 | kwargs: all arguments for NERDA Model. 41 | """ 42 | super().__init__(**kwargs) 43 | 44 | def download_network(self, dir = None) -> None: 45 | """Download Precooked Network from Web 46 | 47 | Args: 48 | dir (str, optional): Directory where the model file 49 | will be saved. Defaults to None, in which case 50 | the model will be saved in a folder '.nerda' in 51 | your home directory. 52 | 53 | Returns: 54 | str: Message saying if the download was successfull. 55 | Model is downloaded as a side-effect. 56 | """ 57 | 58 | model_name = type(self).__name__ 59 | 60 | # url for public S3 bucket with NERDA models. 61 | url_s3 = 'https://nerda.s3-eu-west-1.amazonaws.com' 62 | url_model = f'{url_s3}/{model_name}.bin' 63 | 64 | if dir is None: 65 | dir = os.path.join(str(Path.home()), '.nerda') 66 | 67 | if not os.path.exists(dir): 68 | os.mkdir(dir) 69 | 70 | file_path = os.path.join(dir, f'{model_name}.bin') 71 | 72 | print( 73 | """ 74 | Please make sure, that you're running the latest version of 'NERDA' 75 | otherwise the model is not guaranteed to work. 76 | """ 77 | ) 78 | print(f'Downloading {url_model} to {file_path}') 79 | urllib.request.urlretrieve(url_model, file_path, show_progress) 80 | 81 | return "Network downloaded successfully. Load network with 'load_network'." 82 | 83 | def load_network(self, file_path: str = None) -> None: 84 | """Load Pretrained Network 85 | 86 | Loads pretrained network from file. 87 | 88 | Args: 89 | file_path (str, optional): Path to model file. Defaults to None, 90 | in which case, the function points to the '.nerda' folder 91 | the home directory. 92 | """ 93 | 94 | model_name = type(self).__name__ 95 | 96 | if file_path is None: 97 | file_path = os.path.join(str(Path.home()), '.nerda', f'{model_name}.bin') 98 | 99 | assert os.path.exists(file_path), "File does not exist! You can download network with download_network()" 100 | print( 101 | """ 102 | Model loaded. Please make sure, that you're running the latest version 103 | of 'NERDA' otherwise the model is not guaranteed to work. 104 | """ 105 | ) 106 | self.load_network_from_file(file_path) 107 | 108 | class DA_BERT_ML(Precooked): 109 | """NERDA [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) 110 | for Danish Finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane). 111 | 112 | Inherits from [NERDA.precooked.Precooked][]. 113 | 114 | Examples: 115 | >>> from NERDA.precooked import DA_BERT_ML() 116 | >>> model = DA_BERT_ML() 117 | >>> model.download_network() 118 | >>> model.load_network() 119 | >>> text = 'Jens Hansen har en bondegård' 120 | >>> model.predict_text(text) 121 | ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']]) 122 | 123 | """ 124 | def __init__(self, device: str = None) -> None: 125 | """Initialize model""" 126 | super().__init__(transformer = 'bert-base-multilingual-uncased', 127 | device = device, 128 | tag_scheme = [ 129 | 'B-PER', 130 | 'I-PER', 131 | 'B-ORG', 132 | 'I-ORG', 133 | 'B-LOC', 134 | 'I-LOC', 135 | 'B-MISC', 136 | 'I-MISC' 137 | ], 138 | tag_outside = 'O', 139 | max_len = 128, 140 | dropout = 0.1, 141 | hyperparameters = {'epochs' : 4, 142 | 'warmup_steps' : 500, 143 | 'train_batch_size': 13, 144 | 'learning_rate': 0.0001}, 145 | tokenizer_parameters = {'do_lower_case' : True}) 146 | 147 | class DA_DISTILBERT_ML(Precooked): 148 | """NERDA [Multilingual BERT](https://huggingface.co/distilbert-base-multilingual-cased) 149 | for Danish Finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane). 150 | 151 | Inherits from [NERDA.precooked.Precooked][]. 152 | 153 | Examples: 154 | >>> from NERDA.precooked import DA_DISTILBERT_ML() 155 | >>> model = DA_DISTILBERT_ML() 156 | >>> model.download_network() 157 | >>> model.load_network() 158 | >>> text = 'Jens Hansen har en bondegård' 159 | >>> model.predict_text(text) 160 | ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']]) 161 | 162 | """ 163 | def __init__(self, device: str = None) -> None: 164 | """Initialize model""" 165 | super().__init__(transformer = 'distilbert-base-multilingual-cased', 166 | device = device, 167 | tag_scheme = [ 168 | 'B-PER', 169 | 'I-PER', 170 | 'B-ORG', 171 | 'I-ORG', 172 | 'B-LOC', 173 | 'I-LOC', 174 | 'B-MISC', 175 | 'I-MISC' 176 | ], 177 | tag_outside = 'O', 178 | max_len = 128, 179 | dropout = 0.1, 180 | hyperparameters = {'epochs' : 4, 181 | 'warmup_steps' : 500, 182 | 'train_batch_size': 13, 183 | 'learning_rate': 0.0001}, 184 | tokenizer_parameters = {'do_lower_case' : False}) 185 | 186 | class DA_ELECTRA_DA(Precooked): 187 | """NERDA [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased) 188 | for Danish finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane). 189 | 190 | Inherits from [NERDA.precooked.Precooked][]. 191 | 192 | Examples: 193 | >>> from NERDA.precooked import DA_ELECTRA_DA() 194 | >>> model = DA_ELECTRA_DA() 195 | >>> model.download_network() 196 | >>> model.load_network() 197 | >>> text = 'Jens Hansen har en bondegård' 198 | >>> model.predict_text(text) 199 | ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']]) 200 | 201 | """ 202 | def __init__(self, device: str = None) -> None: 203 | """Initialize model""" 204 | super().__init__(transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased', 205 | device = device, 206 | tag_scheme = [ 207 | 'B-PER', 208 | 'I-PER', 209 | 'B-ORG', 210 | 'I-ORG', 211 | 'B-LOC', 212 | 'I-LOC', 213 | 'B-MISC', 214 | 'I-MISC' 215 | ], 216 | tag_outside = 'O', 217 | max_len = 128, 218 | dropout = 0.1, 219 | hyperparameters = {'epochs' : 5, 220 | 'warmup_steps' : 500, 221 | 'train_batch_size': 13, 222 | 'learning_rate': 0.0001}, 223 | tokenizer_parameters = {'do_lower_case' : True}) 224 | 225 | class EN_ELECTRA_EN(Precooked): 226 | """NERDA [English ELECTRA](https://huggingface.co/google/electra-small-discriminator) 227 | for English finetuned on [CoNLL-2003 data set](https://www.clips.uantwerpen.be/conll2003/ner/). 228 | 229 | Inherits from [NERDA.precooked.Precooked][]. 230 | 231 | Examples: 232 | >>> from NERDA.precooked import EN_ELECTRA_EN() 233 | >>> model = EN_ELECTRA_EN() 234 | >>> model.download_network() 235 | >>> model.load_network() 236 | >>> text = 'Old MacDonald had a farm' 237 | >>> model.predict_text(text) 238 | ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']]) 239 | 240 | """ 241 | def __init__(self, device: str = None) -> None: 242 | """Initialize model""" 243 | super().__init__(transformer = 'google/electra-small-discriminator', 244 | device = device, 245 | tag_scheme = [ 246 | 'B-PER', 247 | 'I-PER', 248 | 'B-ORG', 249 | 'I-ORG', 250 | 'B-LOC', 251 | 'I-LOC', 252 | 'B-MISC', 253 | 'I-MISC' 254 | ], 255 | tag_outside = 'O', 256 | max_len = 128, 257 | dropout = 0.1, 258 | hyperparameters = {'epochs' : 4, 259 | 'warmup_steps' : 250, 260 | 'train_batch_size': 13, 261 | 'learning_rate': 8e-05}, 262 | tokenizer_parameters = {'do_lower_case' : True}) 263 | 264 | 265 | class EN_BERT_ML(Precooked): 266 | """NERDA [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) 267 | for English finetuned on [CoNLL-2003 data set](https://www.clips.uantwerpen.be/conll2003/ner/). 268 | 269 | Inherits from [NERDA.precooked.Precooked][]. 270 | 271 | Examples: 272 | >>> from NERDA.precooked import EN_BERT_ML() 273 | >>> model = EN_BERT_ML() 274 | >>> model.download_network() 275 | >>> model.load_network() 276 | >>> text = 'Old MacDonald had a farm' 277 | >>> model.predict_text(text) 278 | ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']]) 279 | 280 | """ 281 | def __init__(self, device: str = None) -> None: 282 | """Initialize model""" 283 | super().__init__(transformer = 'bert-base-multilingual-uncased', 284 | device = device, 285 | tag_scheme = [ 286 | 'B-PER', 287 | 'I-PER', 288 | 'B-ORG', 289 | 'I-ORG', 290 | 'B-LOC', 291 | 'I-LOC', 292 | 'B-MISC', 293 | 'I-MISC' 294 | ], 295 | tag_outside = 'O', 296 | max_len = 128, 297 | dropout = 0.1, 298 | hyperparameters = {'epochs' : 4, 299 | 'warmup_steps' : 500, 300 | 'train_batch_size': 13, 301 | 'learning_rate': 0.0001}, 302 | tokenizer_parameters = {'do_lower_case' : True}) 303 | 304 | 305 | 306 | 307 | -------------------------------------------------------------------------------- /src/NERDA/predictions.py: -------------------------------------------------------------------------------- 1 | """ 2 | This section covers functionality for computing predictions 3 | with a [NERDA.models.NERDA][] model. 4 | """ 5 | 6 | from NERDA.preprocessing import create_dataloader 7 | import torch 8 | import numpy as np 9 | from tqdm import tqdm 10 | from nltk.tokenize import sent_tokenize, word_tokenize 11 | from typing import List, Callable 12 | import transformers 13 | import sklearn.preprocessing 14 | 15 | def sigmoid_transform(x): 16 | prob = 1/(1 + np.exp(-x)) 17 | return prob 18 | 19 | def predict(network: torch.nn.Module, 20 | sentences: List[List[str]], 21 | transformer_tokenizer: transformers.PreTrainedTokenizer, 22 | transformer_config: transformers.PretrainedConfig, 23 | max_len: int, 24 | device: str, 25 | tag_encoder: sklearn.preprocessing.LabelEncoder, 26 | tag_outside: str, 27 | batch_size: int = 8, 28 | num_workers: int = 1, 29 | return_tensors: bool = False, 30 | return_confidence: bool = False, 31 | pad_sequences: bool = True) -> List[List[str]]: 32 | """Compute predictions. 33 | 34 | Computes predictions for a list with word-tokenized sentences 35 | with a `NERDA` model. 36 | 37 | Args: 38 | network (torch.nn.Module): Network. 39 | sentences (List[List[str]]): List of lists with word-tokenized 40 | sentences. 41 | transformer_tokenizer (transformers.PreTrainedTokenizer): 42 | tokenizer for transformer model. 43 | transformer_config (transformers.PretrainedConfig): config 44 | for transformer model. 45 | max_len (int): Maximum length of sentence after applying 46 | transformer tokenizer. 47 | device (str): Computational device. 48 | tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder 49 | for Named-Entity tags. 50 | tag_outside (str): Special 'outside' NER tag. 51 | batch_size (int, optional): Batch Size for DataLoader. 52 | Defaults to 8. 53 | num_workers (int, optional): Number of workers. Defaults 54 | to 1. 55 | return_tensors (bool, optional): if True, return tensors. 56 | return_confidence (bool, optional): if True, return 57 | confidence scores for all predicted tokens. Defaults 58 | to False. 59 | pad_sequences (bool, optional): if True, pad sequences. 60 | Defaults to True. 61 | 62 | Returns: 63 | List[List[str]]: List of lists with predicted Entity 64 | tags. 65 | """ 66 | # make sure, that input has the correct format. 67 | assert isinstance(sentences, list), "'sentences' must be a list of list of word-tokens" 68 | assert isinstance(sentences[0], list), "'sentences' must be a list of list of word-tokens" 69 | assert isinstance(sentences[0][0], str), "'sentences' must be a list of list of word-tokens" 70 | 71 | # set network to appropriate mode. 72 | network.eval() 73 | 74 | # fill 'dummy' tags (expected input for dataloader). 75 | tag_fill = [tag_encoder.classes_[0]] 76 | tags_dummy = [tag_fill * len(sent) for sent in sentences] 77 | 78 | dl = create_dataloader(sentences = sentences, 79 | tags = tags_dummy, 80 | transformer_tokenizer = transformer_tokenizer, 81 | transformer_config = transformer_config, 82 | max_len = max_len, 83 | batch_size = batch_size, 84 | tag_encoder = tag_encoder, 85 | tag_outside = tag_outside, 86 | num_workers = num_workers, 87 | pad_sequences = pad_sequences) 88 | 89 | predictions = [] 90 | probabilities = [] 91 | tensors = [] 92 | 93 | with torch.no_grad(): 94 | for _, dl in enumerate(dl): 95 | 96 | outputs = network(**dl) 97 | 98 | # conduct operations on sentence level. 99 | for i in range(outputs.shape[0]): 100 | 101 | # extract prediction and transform. 102 | 103 | # find max by row. 104 | values, indices = outputs[i].max(dim=1) 105 | 106 | preds = tag_encoder.inverse_transform(indices.cpu().numpy()) 107 | probs = values.cpu().numpy() 108 | 109 | if return_tensors: 110 | tensors.append(outputs) 111 | 112 | # subset predictions for original word tokens. 113 | preds = [prediction for prediction, offset in zip(preds.tolist(), dl.get('offsets')[i]) if offset] 114 | if return_confidence: 115 | probs = [prob for prob, offset in zip(probs.tolist(), dl.get('offsets')[i]) if offset] 116 | 117 | # Remove special tokens ('CLS' + 'SEP'). 118 | preds = preds[1:-1] 119 | if return_confidence: 120 | probs = probs[1:-1] 121 | 122 | # make sure resulting predictions have same length as 123 | # original sentence. 124 | 125 | # TODO: Move assert statement to unit tests. Does not work 126 | # in boundary. 127 | # assert len(preds) == len(sentences[i]) 128 | predictions.append(preds) 129 | if return_confidence: 130 | probabilities.append(probs) 131 | 132 | if return_confidence: 133 | return predictions, probabilities 134 | 135 | if return_tensors: 136 | return tensors 137 | 138 | return predictions 139 | 140 | def predict_text(network: torch.nn.Module, 141 | text: str, 142 | transformer_tokenizer: transformers.PreTrainedTokenizer, 143 | transformer_config: transformers.PretrainedConfig, 144 | max_len: int, 145 | device: str, 146 | tag_encoder: sklearn.preprocessing.LabelEncoder, 147 | tag_outside: str, 148 | batch_size: int = 8, 149 | num_workers: int = 1, 150 | pad_sequences: bool = True, 151 | return_confidence: bool = False, 152 | sent_tokenize: Callable = sent_tokenize, 153 | word_tokenize: Callable = word_tokenize) -> tuple: 154 | """Compute Predictions for Text. 155 | 156 | Computes predictions for a text with `NERDA` model. 157 | Text is tokenized into sentences before computing predictions. 158 | 159 | Args: 160 | network (torch.nn.Module): Network. 161 | text (str): text to predict entities in. 162 | transformer_tokenizer (transformers.PreTrainedTokenizer): 163 | tokenizer for transformer model. 164 | transformer_config (transformers.PretrainedConfig): config 165 | for transformer model. 166 | max_len (int): Maximum length of sentence after applying 167 | transformer tokenizer. 168 | device (str): Computational device. 169 | tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder 170 | for Named-Entity tags. 171 | tag_outside (str): Special 'outside' NER tag. 172 | batch_size (int, optional): Batch Size for DataLoader. 173 | Defaults to 8. 174 | num_workers (int, optional): Number of workers. Defaults 175 | to 1. 176 | pad_sequences (bool, optional): if True, pad sequences. 177 | Defaults to True. 178 | return_confidence (bool, optional): if True, return 179 | confidence scores for predicted tokens. Defaults 180 | to False. 181 | 182 | Returns: 183 | tuple: sentence- and word-tokenized text with corresponding 184 | predicted named-entity tags. 185 | """ 186 | assert isinstance(text, str), "'text' must be a string." 187 | sentences = sent_tokenize(text) 188 | 189 | sentences = [word_tokenize(sentence) for sentence in sentences] 190 | 191 | predictions = predict(network = network, 192 | sentences = sentences, 193 | transformer_tokenizer = transformer_tokenizer, 194 | transformer_config = transformer_config, 195 | max_len = max_len, 196 | device = device, 197 | return_confidence = return_confidence, 198 | batch_size = batch_size, 199 | num_workers = num_workers, 200 | pad_sequences = pad_sequences, 201 | tag_encoder = tag_encoder, 202 | tag_outside = tag_outside) 203 | 204 | return sentences, predictions 205 | 206 | -------------------------------------------------------------------------------- /src/NERDA/preprocessing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import warnings 3 | import transformers 4 | import sklearn.preprocessing 5 | 6 | class NERDADataSetReader(): 7 | """Generic NERDA DataSetReader""" 8 | 9 | def __init__(self, 10 | sentences: list, 11 | tags: list, 12 | transformer_tokenizer: transformers.PreTrainedTokenizer, 13 | transformer_config: transformers.PretrainedConfig, 14 | max_len: int, 15 | tag_encoder: sklearn.preprocessing.LabelEncoder, 16 | tag_outside: str, 17 | pad_sequences : bool = True) -> None: 18 | """Initialize DataSetReader 19 | 20 | Initializes DataSetReader that prepares and preprocesses 21 | DataSet for Named-Entity Recognition Task and training. 22 | 23 | Args: 24 | sentences (list): Sentences. 25 | tags (list): Named-Entity tags. 26 | transformer_tokenizer (transformers.PreTrainedTokenizer): 27 | tokenizer for transformer. 28 | transformer_config (transformers.PretrainedConfig): Config 29 | for transformer model. 30 | max_len (int): Maximum length of sentences after applying 31 | transformer tokenizer. 32 | tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder 33 | for Named-Entity tags. 34 | tag_outside (str): Special Outside tag. 35 | pad_sequences (bool): Pad sequences to max_len. Defaults 36 | to True. 37 | """ 38 | self.sentences = sentences 39 | self.tags = tags 40 | self.transformer_tokenizer = transformer_tokenizer 41 | self.max_len = max_len 42 | self.tag_encoder = tag_encoder 43 | self.pad_token_id = transformer_config.pad_token_id 44 | self.tag_outside_transformed = tag_encoder.transform([tag_outside])[0] 45 | self.pad_sequences = pad_sequences 46 | 47 | def __len__(self): 48 | return len(self.sentences) 49 | 50 | def __getitem__(self, item): 51 | sentence = self.sentences[item] 52 | tags = self.tags[item] 53 | # encode tags 54 | tags = self.tag_encoder.transform(tags) 55 | 56 | # check inputs for consistancy 57 | assert len(sentence) == len(tags) 58 | 59 | input_ids = [] 60 | target_tags = [] 61 | tokens = [] 62 | offsets = [] 63 | 64 | # for debugging purposes 65 | # print(item) 66 | for i, word in enumerate(sentence): 67 | # bert tokenization 68 | wordpieces = self.transformer_tokenizer.tokenize(word) 69 | tokens.extend(wordpieces) 70 | # make room for CLS if there is an identified word piece 71 | if len(wordpieces)>0: 72 | offsets.extend([1]+[0]*(len(wordpieces)-1)) 73 | # Extends the ner_tag if the word has been split by the wordpiece tokenizer 74 | target_tags.extend([tags[i]] * len(wordpieces)) 75 | 76 | # Make room for adding special tokens (one for both 'CLS' and 'SEP' special tokens) 77 | # max_len includes _all_ tokens. 78 | if len(tokens) > self.max_len-2: 79 | msg = f'Sentence #{item} length {len(tokens)} exceeds max_len {self.max_len} and has been truncated' 80 | warnings.warn(msg) 81 | tokens = tokens[:self.max_len-2] 82 | target_tags = target_tags[:self.max_len-2] 83 | offsets = offsets[:self.max_len-2] 84 | 85 | # encode tokens for BERT 86 | # TO DO: prettify this. 87 | input_ids = self.transformer_tokenizer.convert_tokens_to_ids(tokens) 88 | input_ids = [self.transformer_tokenizer.cls_token_id] + input_ids + [self.transformer_tokenizer.sep_token_id] 89 | 90 | # fill out other inputs for model. 91 | target_tags = [self.tag_outside_transformed] + target_tags + [self.tag_outside_transformed] 92 | masks = [1] * len(input_ids) 93 | # set to 0, because we are not doing NSP or QA type task (across multiple sentences) 94 | # token_type_ids distinguishes sentences. 95 | token_type_ids = [0] * len(input_ids) 96 | offsets = [1] + offsets + [1] 97 | 98 | # Padding to max length 99 | # compute padding length 100 | if self.pad_sequences: 101 | padding_len = self.max_len - len(input_ids) 102 | input_ids = input_ids + ([self.pad_token_id] * padding_len) 103 | masks = masks + ([0] * padding_len) 104 | offsets = offsets + ([0] * padding_len) 105 | token_type_ids = token_type_ids + ([0] * padding_len) 106 | target_tags = target_tags + ([self.tag_outside_transformed] * padding_len) 107 | 108 | return {'input_ids' : torch.tensor(input_ids, dtype = torch.long), 109 | 'masks' : torch.tensor(masks, dtype = torch.long), 110 | 'token_type_ids' : torch.tensor(token_type_ids, dtype = torch.long), 111 | 'target_tags' : torch.tensor(target_tags, dtype = torch.long), 112 | 'offsets': torch.tensor(offsets, dtype = torch.long)} 113 | 114 | def create_dataloader(sentences, 115 | tags, 116 | transformer_tokenizer, 117 | transformer_config, 118 | max_len, 119 | tag_encoder, 120 | tag_outside, 121 | batch_size = 1, 122 | num_workers = 1, 123 | pad_sequences = True): 124 | 125 | if not pad_sequences and batch_size > 1: 126 | print("setting pad_sequences to True, because batch_size is more than one.") 127 | pad_sequences = True 128 | 129 | data_reader = NERDADataSetReader( 130 | sentences = sentences, 131 | tags = tags, 132 | transformer_tokenizer = transformer_tokenizer, 133 | transformer_config = transformer_config, 134 | max_len = max_len, 135 | tag_encoder = tag_encoder, 136 | tag_outside = tag_outside, 137 | pad_sequences = pad_sequences) 138 | # Don't pad sequences if batch size == 1. This improves performance. 139 | 140 | data_loader = torch.utils.data.DataLoader( 141 | data_reader, batch_size = batch_size, num_workers = num_workers 142 | ) 143 | 144 | return data_loader 145 | 146 | -------------------------------------------------------------------------------- /src/NERDA/training.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .preprocessing import create_dataloader 3 | from sklearn import preprocessing 4 | from transformers import AdamW, get_linear_schedule_with_warmup 5 | import random 6 | import torch 7 | from tqdm import tqdm 8 | 9 | def train(model, data_loader, optimizer, device, scheduler, n_tags): 10 | """One Iteration of Training""" 11 | 12 | model.train() 13 | final_loss = 0.0 14 | 15 | for dl in tqdm(data_loader, total=len(data_loader)): 16 | 17 | optimizer.zero_grad() 18 | outputs = model(**dl) 19 | loss = compute_loss(outputs, 20 | dl.get('target_tags'), 21 | dl.get('masks'), 22 | device, 23 | n_tags) 24 | loss.backward() 25 | optimizer.step() 26 | scheduler.step() 27 | final_loss += loss.item() 28 | 29 | # Return average loss 30 | return final_loss / len(data_loader) 31 | 32 | def validate(model, data_loader, device, n_tags): 33 | """One Iteration of Validation""" 34 | 35 | model.eval() 36 | final_loss = 0.0 37 | 38 | for dl in tqdm(data_loader, total=len(data_loader)): 39 | 40 | outputs = model(**dl) 41 | loss = compute_loss(outputs, 42 | dl.get('target_tags'), 43 | dl.get('masks'), 44 | device, 45 | n_tags) 46 | final_loss += loss.item() 47 | 48 | # Return average loss. 49 | return final_loss / len(data_loader) 50 | 51 | def compute_loss(preds, target_tags, masks, device, n_tags): 52 | 53 | # initialize loss function. 54 | lfn = torch.nn.CrossEntropyLoss() 55 | 56 | # Compute active loss to not compute loss of paddings 57 | active_loss = masks.view(-1) == 1 58 | 59 | active_logits = preds.view(-1, n_tags) 60 | active_labels = torch.where( 61 | active_loss, 62 | target_tags.view(-1), 63 | torch.tensor(lfn.ignore_index).type_as(target_tags) 64 | ) 65 | 66 | active_labels = torch.as_tensor(active_labels, device = torch.device(device), dtype = torch.long) 67 | 68 | # Only compute loss on actual token predictions 69 | loss = lfn(active_logits, active_labels) 70 | 71 | return loss 72 | 73 | def enforce_reproducibility(seed = 42) -> None: 74 | """Enforce Reproducibity 75 | 76 | Enforces reproducibility of models to the furthest 77 | possible extent. This is done by setting fixed seeds for 78 | random number generation etcetera. 79 | 80 | For atomic operations there is currently no simple way to 81 | enforce determinism, as the order of parallel operations 82 | is not known. 83 | 84 | Args: 85 | seed (int, optional): Fixed seed. Defaults to 42. 86 | """ 87 | # Sets seed manually for both CPU and CUDA 88 | torch.manual_seed(seed) 89 | torch.cuda.manual_seed_all(seed) 90 | # CUDNN 91 | torch.backends.cudnn.deterministic = True 92 | torch.backends.cudnn.benchmark = False 93 | # System based 94 | random.seed(seed) 95 | np.random.seed(seed) 96 | 97 | def train_model(network, 98 | tag_encoder, 99 | tag_outside, 100 | transformer_tokenizer, 101 | transformer_config, 102 | dataset_training, 103 | dataset_validation, 104 | max_len = 128, 105 | train_batch_size = 16, 106 | validation_batch_size = 8, 107 | epochs = 5, 108 | warmup_steps = 0, 109 | learning_rate = 5e-5, 110 | device = None, 111 | fixed_seed = 42, 112 | num_workers = 1): 113 | 114 | if fixed_seed is not None: 115 | enforce_reproducibility(fixed_seed) 116 | 117 | # compute number of unique tags from encoder. 118 | n_tags = tag_encoder.classes_.shape[0] 119 | 120 | # prepare datasets for modelling by creating data readers and loaders 121 | dl_train = create_dataloader(sentences = dataset_training.get('sentences'), 122 | tags = dataset_training.get('tags'), 123 | transformer_tokenizer = transformer_tokenizer, 124 | transformer_config = transformer_config, 125 | max_len = max_len, 126 | batch_size = train_batch_size, 127 | tag_encoder = tag_encoder, 128 | tag_outside = tag_outside, 129 | num_workers = num_workers) 130 | dl_validate = create_dataloader(sentences = dataset_validation.get('sentences'), 131 | tags = dataset_validation.get('tags'), 132 | transformer_tokenizer = transformer_tokenizer, 133 | transformer_config = transformer_config, 134 | max_len = max_len, 135 | batch_size = validation_batch_size, 136 | tag_encoder = tag_encoder, 137 | tag_outside = tag_outside, 138 | num_workers = num_workers) 139 | 140 | optimizer_parameters = network.parameters() 141 | 142 | num_train_steps = int(len(dataset_training.get('sentences')) / train_batch_size * epochs) 143 | 144 | optimizer = AdamW(optimizer_parameters, lr = learning_rate) 145 | scheduler = get_linear_schedule_with_warmup( 146 | optimizer, num_warmup_steps = warmup_steps, num_training_steps = num_train_steps 147 | ) 148 | 149 | train_losses = [] 150 | best_valid_loss = np.inf 151 | 152 | for epoch in range(epochs): 153 | 154 | print('\n Epoch {:} / {:}'.format(epoch + 1, epochs)) 155 | 156 | train_loss = train(network, dl_train, optimizer, device, scheduler, n_tags) 157 | train_losses.append(train_loss) 158 | valid_loss = validate(network, dl_validate, device, n_tags) 159 | 160 | print(f"Train Loss = {train_loss} Valid Loss = {valid_loss}") 161 | 162 | if valid_loss < best_valid_loss: 163 | best_parameters = network.state_dict() 164 | best_valid_loss = valid_loss 165 | 166 | # return best model 167 | network.load_state_dict(best_parameters) 168 | 169 | return network, train_losses, best_valid_loss 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /src/NERDA/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | def match_kwargs(function: Callable, **kwargs) -> dict: 4 | """Matches Arguments with Function 5 | 6 | Match keywords arguments with the arguments of a function. 7 | 8 | Args: 9 | function (function): Function to match arguments for. 10 | kwargs: keyword arguments to match against. 11 | 12 | Returns: 13 | dict: dictionary with matching arguments and their 14 | respective values. 15 | 16 | """ 17 | arg_count = function.__code__.co_argcount 18 | args = function.__code__.co_varnames[:arg_count] 19 | 20 | args_dict = {} 21 | for k, v in kwargs.items(): 22 | if k in args: 23 | args_dict[k] = v 24 | 25 | return args_dict 26 | -------------------------------------------------------------------------------- /tests/unit_tests/test_aaaNERDA.py: -------------------------------------------------------------------------------- 1 | # HACK: Filename prefixed with 'aaa' to execute this test before the others 2 | # in order to download necessary ressources for all other tests. 3 | 4 | from NERDA.datasets import get_dane_data, download_dane_data 5 | # TODO: should not be necesssary to download before importing NERDA. 6 | # Download necessary ressources 7 | download_dane_data() 8 | from NERDA.models import NERDA 9 | from NERDA.precooked import DA_ELECTRA_DA 10 | import nltk 11 | nltk.download('punkt') 12 | 13 | # instantiate a minimal model. 14 | model = NERDA(dataset_training = get_dane_data('train', 5), 15 | dataset_validation = get_dane_data('dev', 5), 16 | max_len = 128, 17 | transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased', 18 | hyperparameters = {'epochs' : 1, 19 | 'warmup_steps' : 10, 20 | 'train_batch_size': 5, 21 | 'learning_rate': 0.0001}) 22 | 23 | def test_instantiate_NERDA(): 24 | """Test that model has the correct/expected class""" 25 | assert isinstance(model, NERDA) 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /tests/unit_tests/test_performance.py: -------------------------------------------------------------------------------- 1 | from NERDA.datasets import get_dane_data 2 | from NERDA.models import NERDA 3 | import pandas as pd 4 | 5 | # instantiate a minimal model. 6 | model = NERDA(dataset_training = get_dane_data('train', 5), 7 | dataset_validation = get_dane_data('dev', 5), 8 | transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased', 9 | hyperparameters = {'epochs' : 1, 10 | 'warmup_steps' : 10, 11 | 'train_batch_size': 5, 12 | 'learning_rate': 0.0001}) 13 | 14 | test = get_dane_data('test') 15 | perf = model.evaluate_performance(test) 16 | 17 | def test_performance_df(): 18 | assert isinstance(perf, pd.DataFrame) 19 | 20 | def test_performance_len(): 21 | assert len(perf) > 0 22 | 23 | def test_includes_relevant_metrics(): 24 | metrics = ['F1-Score', 'Precision', 'Recall'] 25 | assert all([x in perf.columns for x in metrics]) 26 | 27 | def test_metrics_dtype(): 28 | metrics = ['F1-Score', 'Precision', 'Recall'] 29 | assert all([perf.dtypes[x] == 'float' for x in metrics]) 30 | 31 | -------------------------------------------------------------------------------- /tests/unit_tests/test_precooked.py: -------------------------------------------------------------------------------- 1 | from NERDA.precooked import DA_ELECTRA_DA 2 | 3 | def test_load_precooked(): 4 | """Test that precooked model can be (down)loaded, instantiated and works end-to-end""" 5 | m = DA_ELECTRA_DA() 6 | m.download_network() 7 | m.load_network() 8 | m.predict_text("Jens Hansen har en bondegård. Det har han!") 9 | -------------------------------------------------------------------------------- /tests/unit_tests/test_predictions.py: -------------------------------------------------------------------------------- 1 | from NERDA.datasets import get_dane_data 2 | from NERDA.models import NERDA 3 | import nltk 4 | 5 | # instantiate a minimal model. 6 | model = NERDA(dataset_training = get_dane_data('train', 5), 7 | dataset_validation = get_dane_data('dev', 5), 8 | transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased', 9 | hyperparameters = {'epochs' : 1, 10 | 'warmup_steps' : 10, 11 | 'train_batch_size': 5, 12 | 'learning_rate': 0.0001}) 13 | 14 | 15 | # set example texts to identify entities in. 16 | text_single = "Pernille Rosenkrantz-Theil kommer fra Vejle" 17 | sentences = [nltk.word_tokenize(text_single)] 18 | 19 | def test_predict(): 20 | """Test that predict runs""" 21 | predictions = model.predict(sentences) 22 | 23 | predictions = model.predict(sentences) 24 | 25 | def test_predict_type(): 26 | """Test token predictions""" 27 | assert isinstance(predictions, list) 28 | 29 | def test_predict_length(): 30 | """Test that sentence and prediction lenghts match""" 31 | assert len(sentences[0])==len(predictions[0]) 32 | 33 | def test_predict_text(): 34 | """Test that predict_text runs""" 35 | predictions = model.predict_text(text_single) 36 | 37 | def test_predict_maxlen_exceed(): 38 | """That that exceeding max len does not break predict""" 39 | text = "ice " * 200 40 | sentences = [nltk.word_tokenize(text)] 41 | model.predict(sentences) 42 | 43 | # test confidence scores 44 | words, preds = model.predict_text(text_single, return_confidence=True) 45 | 46 | def test_confs_len(): 47 | assert len(preds[0])==len(preds[1]) 48 | 49 | predictions_text_single = model.predict_text(text_single) 50 | 51 | def test_predict_text_format(): 52 | """Test text predictions""" 53 | assert isinstance(predictions_text_single, tuple) 54 | 55 | def test_predict_text_match_words_predictions(): 56 | assert len(predictions_text_single[0][0]) == len(predictions_text_single[1][0]) 57 | 58 | # multiple sentences. 59 | text_multi = """ 60 | Pernille Rosenkrantz-Theil kommer fra Vejle. 61 | Jens Hansen har en bondegård. 62 | """ 63 | 64 | def test_predict_text_multi(): 65 | """Test that predict_text runs with multiple sentences""" 66 | predictions = model.predict_text(text_multi, batch_size = 2) 67 | 68 | predictions_text_multi = model.predict_text(text_multi, batch_size = 2) 69 | 70 | def test_predict_text_multi_format(): 71 | """Test multi-sentence text predictions has expected format""" 72 | assert isinstance(predictions_text_multi, tuple) 73 | 74 | def test_predict_text_multi_elements_count(): 75 | """Test dimensions of multi-sentence text predictions""" 76 | assert [len(predictions_text_multi[0]), len(predictions_text_multi[1])] == [2, 2] 77 | 78 | def test_predict_text_multi_lens(): 79 | """Test lengths of multi-sentence text predictions""" 80 | s1 = len(predictions_text_multi[0][0]) == len(predictions_text_multi[1][0]) 81 | s2 = len(predictions_text_multi[0][1]) == len(predictions_text_multi[1][1]) 82 | assert all([s1, s2]) 83 | 84 | -------------------------------------------------------------------------------- /tests/unit_tests/test_training.py: -------------------------------------------------------------------------------- 1 | from NERDA.datasets import get_dane_data 2 | from NERDA.models import NERDA 3 | 4 | # instantiate a minimal model. 5 | model = NERDA(dataset_training = get_dane_data('train', 5), 6 | dataset_validation = get_dane_data('dev', 5), 7 | transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased', 8 | hyperparameters = {'epochs' : 1, 9 | 'warmup_steps' : 10, 10 | 'train_batch_size': 5, 11 | 'learning_rate': 0.0001}) 12 | 13 | def test_training(): 14 | """Test if training runs successfully""" 15 | model.train() 16 | 17 | def test_training_exceed_maxlen(): 18 | """Test if traning does not break even though MAX LEN is exceeded""" 19 | m = NERDA(dataset_training = get_dane_data('train', 5), 20 | dataset_validation = get_dane_data('dev', 5), 21 | max_len = 3, 22 | transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased', 23 | hyperparameters = {'epochs' : 1, 24 | 'warmup_steps' : 10, 25 | 'train_batch_size': 5, 26 | 'learning_rate': 0.0001}) 27 | m.train() 28 | 29 | def test_training_bert(): 30 | """Test if traning does not break even though MAX LEN is exceeded""" 31 | m = NERDA(dataset_training = get_dane_data('train', 5), 32 | dataset_validation = get_dane_data('dev', 5), 33 | transformer = 'bert-base-multilingual-uncased', 34 | hyperparameters = {'epochs' : 1, 35 | 'warmup_steps' : 10, 36 | 'train_batch_size': 5, 37 | 'learning_rate': 0.0001}) 38 | m.train() 39 | --------------------------------------------------------------------------------