├── .github
    └── workflows
    │   └── build.yml
├── .gitignore
├── LICENSE
├── NEWS.md
├── README.md
├── admin
    ├── deploy_models.py
    ├── sandbox.py
    └── tuning.py
├── dev-requirements.txt
├── docs
    ├── datasets.md
    ├── index.md
    ├── nerda_models.md
    ├── networks.md
    ├── performance.md
    ├── preamble.py
    ├── precooked_models.md
    ├── predictions.md
    └── workflow.ipynb
├── logo.png
├── mkdocs.yml
├── pytest.ini
├── setup.cfg
├── setup.py
├── src
    └── NERDA
    │   ├── __init__.py
    │   ├── datasets.py
    │   ├── models.py
    │   ├── networks.py
    │   ├── performance.py
    │   ├── precooked.py
    │   ├── predictions.py
    │   ├── preprocessing.py
    │   ├── training.py
    │   └── utils.py
└── tests
    └── unit_tests
        ├── test_aaaNERDA.py
        ├── test_performance.py
        ├── test_precooked.py
        ├── test_predictions.py
        └── test_training.py


/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | #---------------------------------------------------------------------
 2 | # Build, Test and Publish Package
 3 | #---------------------------------------------------------------------
 4 | name: build
 5 | 
 6 | on: [push]
 7 | 
 8 | jobs:
 9 | 
10 |   Build:
11 | 
12 |     runs-on: ubuntu-latest
13 |   
14 |     steps:
15 |     - uses: actions/checkout@v2
16 | 
17 |     - name: Cache Python packages
18 |       uses: actions/cache@v2
19 |       with: 
20 |         path: ~/.cache/pip
21 |         key: ${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }}
22 | 
23 |     - name: Set up Python
24 |       uses: actions/setup-python@v2
25 |       with:
26 |         python-version: '3.7'
27 | 
28 |     - name: Display Python version
29 |       run: python -c "import sys; print(sys.version)"
30 | 
31 |     - name: Install pip and dev requirements
32 |       run: |
33 |         python -m pip install --upgrade pip
34 |         pip install -r dev-requirements.txt
35 | 
36 |     - name: Lint
37 |       run: |
38 |         # stop the build if there are Python syntax errors or undefined names
39 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude=.git,__pycache__,docs/source/conf.py,old,build,dist,admin
40 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
41 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --exclude=.git,__pycache__,docs/source/conf.py,old,build,dist,admin --statistics --format=html --htmldir=flake-report
42 | 
43 |     - name: Upload Lint results
44 |       uses: actions/upload-artifact@v2
45 |       with:
46 |         name: lint-results
47 |         path: flake-report/
48 | 
49 |     - name: Run tests
50 |       run: |
51 |         python setup.py test
52 | 
53 |     - name: Publish test results
54 |       uses: EnricoMi/publish-unit-test-result-action@v1.6
55 |       if: always()
56 |       with:
57 |         github_token: ${{ secrets.GITHUB_TOKEN }}
58 |         files: test-results/**/*.xml
59 | 
60 |     - name: Upload coverage to Codecov
61 |       uses: codecov/codecov-action@v1
62 |       with:
63 |         token: ${{ secrets.CODECOV_TOKEN }}
64 |         file: coverage.xml
65 |         flags: unittests
66 | 
67 |     - name: Deploy docs
68 |       if: github.ref == 'refs/heads/main'
69 |       run: |
70 |         pip install .
71 |         mkdocs gh-deploy --force
72 |         
73 |     - name: Build and publish to TEST PyPI
74 |       if: github.ref != 'refs/heads/main'
75 |       env:
76 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
77 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
78 |       run: |
79 |         python setup.py sdist bdist_wheel
80 |         twine upload -r testpypi dist/*
81 | 
82 |     - name: Publish to PyPI
83 |       env:
84 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
85 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
86 |       if: github.ref == 'refs/heads/main'
87 |       run: |
88 |         python setup.py sdist bdist_wheel
89 |         twine upload dist/*
90 |  
91 |     
92 |     


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # Data and models
  7 | /data
  8 | /daNER
  9 | src/*.pickle
 10 | src/*.pkl
 11 | *.csv
 12 | *.pickle
 13 | 
 14 | # playground
 15 | tester.py
 16 | tester2.py
 17 | src/playground.py
 18 | 
 19 | # C extensions
 20 | *.so
 21 | 
 22 | # Distribution / packaging
 23 | .Python
 24 | build/
 25 | develop-eggs/
 26 | dist/
 27 | downloads/
 28 | eggs/
 29 | .eggs/
 30 | lib/
 31 | lib64/
 32 | parts/
 33 | sdist/
 34 | var/
 35 | wheels/
 36 | pip-wheel-metadata/
 37 | share/python-wheels/
 38 | *.egg-info/
 39 | .installed.cfg
 40 | *.egg
 41 | MANIFEST
 42 | test-results/
 43 | 
 44 | 
 45 | # PyInstaller
 46 | #  Usually these files are written by a python script from a template
 47 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 48 | *.manifest
 49 | *.spec
 50 | 
 51 | # Installer logs
 52 | pip-log.txt
 53 | pip-delete-this-directory.txt
 54 | 
 55 | # Unit test / coverage reports
 56 | htmlcov/
 57 | flake-report/
 58 | .tox/
 59 | .nox/
 60 | .coverage
 61 | .coverage.*
 62 | .cache
 63 | nosetests.xml
 64 | coverage.xml
 65 | *.cover
 66 | *.py,cover
 67 | .hypothesis/
 68 | .pytest_cache/
 69 | 
 70 | # Translations
 71 | *.mo
 72 | *.pot
 73 | 
 74 | # Django stuff:
 75 | *.log
 76 | local_settings.py
 77 | db.sqlite3
 78 | db.sqlite3-journal
 79 | 
 80 | # Flask stuff:
 81 | instance/
 82 | .webassets-cache
 83 | 
 84 | # Scrapy stuff:
 85 | .scrapy
 86 | 
 87 | # documentation
 88 | docs/_build/
 89 | 
 90 | # PyBuilder
 91 | target/
 92 | 
 93 | # Jupyter Notebook
 94 | .ipynb_checkpoints
 95 | 
 96 | # IPython
 97 | profile_default/
 98 | ipython_config.py
 99 | 
100 | # pyenv
101 | .python-version
102 | 
103 | # pipenv
104 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
105 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
106 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
107 | #   install all needed dependencies.
108 | #Pipfile.lock
109 | 
110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
111 | __pypackages__/
112 | 
113 | # Celery stuff
114 | celerybeat-schedule
115 | celerybeat.pid
116 | 
117 | # SageMath parsed files
118 | *.sage.py
119 | 
120 | # Environments
121 | .env
122 | .venv
123 | env/
124 | venv/
125 | ENV/
126 | env.bak/
127 | venv.bak/
128 | 
129 | # Spyder project settings
130 | .spyderproject
131 | .spyproject
132 | 
133 | # Rope project settings
134 | .ropeproject
135 | 
136 | # mkdocs documentation
137 | /site
138 | 
139 | # mypy
140 | .mypy_cache/
141 | .dmypy.json
142 | dmypy.json
143 | 
144 | # Pyre type checker
145 | .pyre/
146 | 
147 | # User generated folders
148 | runs/
149 | 
150 | # User created models
151 | *.bin
152 | 
153 | # tensor board results
154 | .DS_Store
155 | .vscode


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Ekstra Bladet, PIN
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # NERDA 1.0.0
 2 | 
 3 | * NERDA model class is now equipped with functions for saving (loading) weights for a fine-tuned NERDA Network to (from) file. See functions model.save_network() and model.load_network_from_file()
 4 | 
 5 | # NERDA 0.9.7
 6 | 
 7 | * return confidence scores for predictions of all tokens, e.g. model.predict(x, return_confidence=True).
 8 | 
 9 | # NERDA 0.9.6
10 | 
11 | * compute Precision, Recall and Accuracy (optional) with evaluate_performance().
12 | * improve relative imports inside package.
13 | 
14 | # NERDA 0.9.5
15 | 
16 | * ... bugfixes.
17 | 
18 | # NERDA 0.9.4
19 | 
20 | * functionality for dynamic quantization, fp32 to fp16, padding parametrized.
21 | 
22 | # NERDA 0.9.2
23 | 
24 | * remove precooked DA_BERT_ML_16BIT, include precooked DA_DISTILBERT_ML.
25 | 
26 | # NERDA 0.9.1
27 | 
28 | * include 16 bit FP precooked DA_BERT_ML_16BIT.
29 | 
30 | # NERDA 0.9.0
31 | 
32 | * Support new versions of `transformers` (4.x) and `torch` 
33 | 
34 | # NERDA 0.8.7
35 | 
36 | * BUGFIX: Restrict torch version.
37 | * Do not import datasets as part of Precooked Models.
38 | * Do not load datasets if not provided by user.
39 | 
40 | # NERDA 0.8.6
41 | 
42 | * First official release.
43 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # NERDA <img src="https://raw.githubusercontent.com/ebanalyse/NERDA/main/logo.png" align="right" height=250/>
  2 | 
  3 | ![Build status](https://github.com/ebanalyse/NERDA/workflows/build/badge.svg)
  4 | [![codecov](https://codecov.io/gh/ebanalyse/NERDA/branch/main/graph/badge.svg?token=OB6LGFQZYX)](https://codecov.io/gh/ebanalyse/NERDA)
  5 | ![PyPI](https://img.shields.io/pypi/v/NERDA.svg)
  6 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/NERDA?color=green)
  7 | ![License](https://img.shields.io/badge/license-MIT-blue.svg)
  8 | 
  9 | Not only is `NERDA` a mesmerizing muppet-like character. `NERDA` is also
 10 | a python package, that offers a slick easy-to-use interface for fine-tuning 
 11 | pretrained transformers for Named Entity Recognition
 12 |  (=NER) tasks. 
 13 | 
 14 | You can also utilize `NERDA` to access a selection of *precooked* `NERDA` models, 
 15 |  that you can use right off the shelf for NER tasks.
 16 | 
 17 | `NERDA` is built on `huggingface` `transformers` and the popular `pytorch`
 18 |  framework.
 19 | 
 20 | ## Installation guide
 21 | `NERDA` can be installed from [PyPI](https://pypi.org/project/NERDA/) with 
 22 | 
 23 | ```
 24 | pip install NERDA
 25 | ```
 26 | 
 27 | If you want the development version then install directly from [GitHub](https://github.com/ebanalyse/NERDA).
 28 | 
 29 | ## Named-Entity Recogntion tasks
 30 | Named-entity recognition (NER) (also known as (named) entity identification, 
 31 | entity chunking, and entity extraction) is a subtask of information extraction
 32 | that seeks to locate and classify named entities mentioned in unstructured 
 33 | text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.<sup>[1]</sup>
 34 | 
 35 | [1]: https://en.wikipedia.org/wiki/Named-entity_recognition
 36 | 
 37 | ### Example Task:
 38 | 
 39 | **Task** 
 40 | 
 41 | Identify person names and organizations in text:
 42 | 
 43 | *Jim bought 300 shares of Acme Corp.*
 44 | 
 45 | **Solution**
 46 | 
 47 | | **Named Entity**   | **Type**              | 
 48 | |--------------------|-----------------------|
 49 | | 'Jim'              | Person                |
 50 | | 'Acme Corp.'       | Organization          |
 51 | 
 52 | Read more about NER on [Wikipedia](https://en.wikipedia.org/wiki/Named-entity_recognition).
 53 | 
 54 | ## Train Your Own `NERDA` Model
 55 | 
 56 | Say, we want to fine-tune a pretrained [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) transformer for NER in English.
 57 | 
 58 | Load package.
 59 | 
 60 | ```python
 61 | from NERDA.models import NERDA
 62 | ```
 63 | 
 64 | Instantiate a `NERDA` model (*with default settings*) for the 
 65 | [`CoNLL-2003`](https://www.clips.uantwerpen.be/conll2003/ner/) 
 66 | English NER data set. 
 67 | 
 68 | ```python
 69 | from NERDA.datasets import get_conll_data
 70 | model = NERDA(dataset_training = get_conll_data('train'),
 71 |               dataset_validation = get_conll_data('valid'),
 72 |               transformer = 'bert-base-multilingual-uncased')
 73 | ```
 74 | 
 75 | By default the network architecture is analogous to that of the models in [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). 
 76 | 
 77 | The model can then be trained/fine-tuned by invoking the `train` method, e.g.
 78 | 
 79 | ```python
 80 | model.train()
 81 | ```
 82 | 
 83 | **Note**: this will take some time depending on the dimensions of your machine 
 84 | (if you want to skip training, you can go ahead and use one of the models, 
 85 | that we have already precooked for you in stead).
 86 | 
 87 | After the model has been trained, the model can be used for predicting 
 88 | named entities in new texts.
 89 | 
 90 | ```python
 91 | # text to identify named entities in.
 92 | text = 'Old MacDonald had a farm'
 93 | model.predict_text(text)
 94 | ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
 95 | ```
 96 | This means, that the model identified 'Old MacDonald' as a *PER*son.
 97 | 
 98 | Please note, that the `NERDA` model configuration above was instantiated 
 99 | with all default settings. You can however customize your `NERDA` model
100 | in a lot of ways:
101 | 
102 | - Use your own data set (finetune a transformer for any given language)
103 | - Choose whatever transformer you like
104 | - Set all of the hyperparameters for the model
105 | - You can even apply your own Network Architecture 
106 | 
107 | Read more about advanced usage of `NERDA` in the [detailed documentation](https://ebanalyse.github.io/NERDA/workflow).
108 | 
109 | ## Use a Precooked `NERDA` model
110 | 
111 | We have precooked a number of `NERDA` models for Danish and English, that you can download 
112 | and use right off the shelf. 
113 | 
114 | Here is an example.
115 | 
116 | Instantiate a multilingual BERT model, that has been finetuned for NER in Danish,
117 | `DA_BERT_ML`.
118 | 
119 | ```python
120 | from NERDA.precooked import DA_BERT_ML
121 | model = DA_BERT_ML()
122 | ```
123 | 
124 | Down(load) network from web:
125 | 
126 | ```python
127 | model.download_network()
128 | model.load_network()
129 | ```
130 | 
131 | You can now predict named entities in new (Danish) texts
132 | 
133 | ```python
134 | # (Danish) text to identify named entities in:
135 | # 'Jens Hansen har en bondegård' = 'Old MacDonald had a farm'
136 | text = 'Jens Hansen har en bondegård'
137 | model.predict_text(text)
138 | ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
139 | ```
140 | 
141 | ### List of Precooked Models
142 | 
143 | The table below shows the precooked `NERDA` models publicly available for download.
144 | 
145 | | **Model**       | **Language** | **Transformer**   | **Dataset** | **F1-score** |  
146 | |-----------------|--------------|-------------------|---------|-----|
147 | | `DA_BERT_ML`    | Danish       | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 82.8  | 
148 | `DA_ELECTRA_DA` | Danish       | [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 79.8             |
149 | | `EN_BERT_ML`    | English      | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)| [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 90.4              |
150 | | `EN_ELECTRA_EN` | English       | [English ELECTRA](https://huggingface.co/google/electra-small-discriminator) | [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 89.1             |
151 | 
152 | **F1-score** is the micro-averaged F1-score across entity tags and is 
153 | evaluated on the respective test sets (that have not been used for training nor
154 | validation of the models).
155 | 
156 | Note, that we have not spent a lot of time on actually fine-tuning the models,
157 | so there could be room for improvement. If you are able to improve the models,
158 | we will be happy to hear from you and include your `NERDA` model.
159 | 
160 | ### Model Performance
161 | 
162 | The table below summarizes the performance (F1-scores) of the precooked `NERDA` models.
163 | 
164 | | **Level**     | `DA_BERT_ML` | `DA_ELECTRA_DA` | `EN_BERT_ML` | `EN_ELECTRA_EN` |
165 | |---------------|--------------|-----------------|--------------|-----------------|
166 | | B-PER         | 93.8         | 92.0            | 96.0         | 95.1            |      
167 | | I-PER         | 97.8         | 97.1            | 98.5         | 97.9            |   
168 | | B-ORG         | 69.5         | 66.9            | 88.4         | 86.2            |     
169 | | I-ORG         | 69.9         | 70.7            | 85.7         | 83.1            |   
170 | | B-LOC         | 82.5         | 79.0            | 92.3         | 91.1            |     
171 | | I-LOC         | 31.6         | 44.4            | 83.9         | 80.5            |     
172 | | B-MISC        | 73.4         | 68.6            | 81.8         | 80.1            |     
173 | | I-MISC        | 86.1         | 63.6            | 63.4         | 68.4            |   
174 | | **AVG_MICRO** | 82.8         | 79.8            | 90.4         | 89.1            |      
175 | | **AVG_MACRO** | 75.6         | 72.8            | 86.3         | 85.3            |
176 | 
177 | ## 'NERDA'?
178 | '`NERDA`' originally stands for *'Named Entity Recognition for DAnish'*. However, this
179 | is somewhat misleading, since the functionality is no longer limited to Danish. 
180 | On the contrary it generalizes to all other languages, i.e. `NERDA` supports 
181 | fine-tuning of transformers for NER tasks for any arbitrary 
182 | language.
183 | 
184 | ## Background
185 | `NERDA` is developed as a part of [Ekstra Bladet](https://ekstrabladet.dk/)’s activities on Platform Intelligence in News (PIN). PIN is an industrial research project that is carried out in collaboration between the [Technical University of Denmark](https://www.dtu.dk/), [University of Copenhagen](https://www.ku.dk/) and [Copenhagen Business School](https://www.cbs.dk/) with funding from [Innovation Fund Denmark](https://innovationsfonden.dk/). The project runs from 2020-2023 and develops recommender systems and natural language processing systems geared for news publishing, some of which are open sourced like `NERDA`.
186 | 
187 | ## Shout-outs
188 | - Thanks to [Alexandra Institute](https://alexandra.dk/) for with the [`danlp`](https://github.com/alexandrainst/danlp) package to have encouraged us to develop this package.
189 | - Thanks to [Malte Højmark-Bertelsen](https://www.linkedin.com/in/malte-h%C3%B8jmark-bertelsen-9a618017b/) and [Kasper Junge](https://www.linkedin.com/in/kasper-juunge/?originalSubdomain=dk) for giving feedback on `NERDA`.
190 | 
191 | ## Read more
192 | The detailed documentation for `NERDA` including code references and
193 | extended workflow examples can be accessed [here](https://ebanalyse.github.io/NERDA/).
194 | 
195 | ## Cite this work
196 | 
197 | ```
198 | @inproceedings{nerda,
199 |   title = {NERDA},
200 |   author = {Kjeldgaard, Lars and Nielsen, Lukas},
201 |   year = {2021},
202 |   publisher = {{GitHub}},
203 |   url = {https://github.com/ebanalyse/NERDA}
204 | }
205 | ```
206 | 
207 | ## Contact
208 | We hope, that you will find `NERDA` useful.
209 | 
210 | Please direct any questions and feedbacks to
211 | [us](mailto:lars.kjeldgaard@eb.dk)!
212 | 
213 | If you want to contribute (which we encourage you to), open a
214 | [PR](https://github.com/ebanalyse/NERDA/pulls).
215 | 
216 | If you encounter a bug or want to suggest an enhancement, please 
217 | [open an issue](https://github.com/ebanalyse/NERDA/issues).
218 | 
219 | 


--------------------------------------------------------------------------------
/admin/deploy_models.py:
--------------------------------------------------------------------------------
 1 | from NERDA.datasets import get_conll_data, get_dane_data
 2 | import pandas as pd
 3 | import torch
 4 | import boto3
 5 | 
 6 | def deploy_model_to_s3(model, test_set = get_dane_data('test')):
 7 |     """Deploy Model to S3
 8 | 
 9 |     Args:
10 |         model: NERDA model.
11 |         test_set: Test set for evaluating performance.
12 | 
13 |     Returns:
14 |         str: message saying, if model was uploaded successfully. 
15 |         Model and text file with performance numbers uploaded 
16 |         as side-effects.
17 |     """
18 | 
19 |     model_name = type(model).__name__
20 | 
21 |     file_model = f'{model_name}.bin'
22 |     torch.save(model.network.state_dict(), file_model)
23 |     
24 |     # compute performance on test set and save.
25 |     performance = model.evaluate_performance(test_set)
26 |     
27 |     # write to csv.
28 |     file_performance = f'{model_name}_performance.csv'
29 |     performance.to_csv(file_performance, index = False)
30 |     
31 |     # upload to S3 bucket.
32 |     s3 = boto3.resource('s3')
33 |     s3.Bucket('nerda').upload_file(
34 |             Filename=file_model, 
35 |             Key = file_model)
36 |     s3.Bucket('nerda').upload_file(
37 |             Filename=file_performance, 
38 |             Key = file_performance)
39 | 
40 |     return "Model deployed to S3 successfully." 
41 | 
42 | if __name__ == '__main__':
43 |     from NERDA.precooked import EN_ELECTRA_EN
44 |     model = EN_ELECTRA_EN()
45 |     model.train()  
46 | 
47 |     deploy_model_to_s3(model)
48 | 
49 | 


--------------------------------------------------------------------------------
/admin/sandbox.py:
--------------------------------------------------------------------------------
  1 | from NERDA.models import NERDA
  2 | from NERDA.datasets import get_conll_data, get_dane_data
  3 | from transformers import AutoTokenizer
  4 | trans = 'bert-base-multilingual-uncased'
  5 | tokenizer = AutoTokenizer.from_pretrained(trans, do_lower_case = True)
  6 | data = get_dane_data('train')
  7 | 
  8 | sents = data.get('sentences')
  9 | 
 10 | out = []
 11 | 
 12 | for sent in sents:
 13 |     sent = sents[3595]
 14 |     tokens = []
 15 |     for word in sent:
 16 |             tokens.extend(tokenizer.tokenize(word))
 17 |     out.append(tokens)
 18 | 
 19 | lens = [len(x) for x in out]
 20 | 
 21 | max(lens)
 22 | 
 23 | sents[3595]
 24 | 
 25 | 
 26 | from transformers import AutoTokenizer, AutoModel, AutoConfig 
 27 | t = 'google/electra-small-discriminator'
 28 | cfg = AutoModel.from_pretrained(t)
 29 | 
 30 | 
 31 | 
 32 | 
 33 | 
 34 | 
 35 | 
 36 | 
 37 | 
 38 | 
 39 | 
 40 | #trn = get_conll_data('train')
 41 | #idx_min = 3110
 42 | #idx_max = 3115
 43 | #valid = get_conll_data('valid')
 44 | #valid['sentences'] = valid['sentences'][idx_min:idx_max+1]
 45 | #valid['tags'] = valid['tags'][idx_min:idx_max+1]
 46 | #trn['sentences'] = trn['sentences'][idx_min:idx_max+1]
 47 | #trn['tags'] = trn['tags'][idx_min:idx_max+1]
 48 | # model = NERDA(dataset_training=trn,
 49 |  #             dataset_validation = valid)
 50 | #model.train()
 51 | #k=0
 52 | #trn['sentences'][3111]
 53 | #from transformers import AutoTokenizer
 54 | #t = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')
 55 | #valid = get_conll_data('valid')
 56 | 
 57 | filename = 'en_bert_ml.pkl'
 58 | # pickle.dump(model, open(filename, 'wb'))
 59 | import pickle
 60 | file = open(filename,'rb')
 61 | model = pickle.load(file) 
 62 | test = get_conll_data('test')
 63 | model.evaluate_performance(test, batch_size = 10)
 64 | #for entry in range(3120,3140):
 65 | #    print(entry)
 66 | #    sent = trn['sentences'][entry]
 67 | #    [t.tokenize(word) for word in sent]
 68 | 
 69 | test = get_conll_data('test')
 70 | idx_min = 202
 71 | idx_max = 202
 72 | # valid = get_conll_data('valid')
 73 | #valid['sentences'] = valid['sentences'][idx_min:idx_max+1]
 74 | #valid['tags'] = valid['tags'][idx_min:idx_max+1]
 75 | test['sentences'] = test['sentences'][idx_min:idx_max+1]
 76 | test['tags'] = test['tags'][idx_min:idx_max+1]
 77 | model.evaluate_performance(test)
 78 | # model = NERDA(dataset_training=trn,
 79 |  #             dataset_validation = valid)
 80 | #model.train()
 81 | #k=0
 82 | #trn['sentences'][3111]
 83 | #from transformers import AutoTokenizer
 84 | #t = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')
 85 | #valid = get_conll_data('valid')
 86 | 
 87 | <<<<<<< HEAD:admin/sandbox.py
 88 | 
 89 | transformer = "google/electra-small-discriminator"
 90 | from transformers import AutoTokenizer, AutoModel, AutoConfig 
 91 | trans = AutoConfig.from_pretrained(transformer)
 92 | 
 93 | def tester():
 94 | 
 95 |     try:
 96 |         model = AutoModel.from_pretrained('google/electra-small-discriminator')
 97 |     except:
 98 |         print("Oops!", sys.exc_info()[0], "occurred.")
 99 | 
100 |     return model
101 | =======
102 | from NERDA.datasets import get_dane_data
103 | trn = get_conll_data('train', 5)
104 | valid = get_conll_data('dev', 5)
105 | transformer = 'bert-base-multilingual-uncased',
106 | model = NERDA(transformer = transformer,
107 |               dataset_training = trn,
108 |               dataset_validation = valid)
109 | >>>>>>> b5eea087ece5f61ec70aa3f99cd4c99b418ebb92:sandbox.py
110 | 


--------------------------------------------------------------------------------
/admin/tuning.py:
--------------------------------------------------------------------------------
 1 | from sys import getdefaultencoding
 2 | from NERDA.models import NERDA
 3 | from NERDA.datasets import get_dane_data
 4 | from hyperopt import fmin, hp, tpe, space_eval
 5 | from hyperopt.pyll import scope
 6 | import numpy as np
 7 | 
 8 | def objective(params):
 9 |     
10 |     print(params)
11 | 
12 |     model = NERDA(dataset_training = get_dane_data('train', 20),
13 |                   dataset_validation = get_dane_data('dev', 20),
14 |                   hyperparameters = params)
15 | 
16 |     model.train()
17 | 
18 |     return model.valid_loss
19 | 
20 | def run_parameter_optimization(objective, number_of_evals = 3):
21 |    
22 |     hpspace = {
23 |             'learning_rate': hp.loguniform('lr', np.log(0.00005), np.log(0.01)),
24 |             'train_batch_size': scope.int(hp.uniform('batch_size', 8, 16)),
25 |             'epochs': scope.int(hp.uniform('epochs', 1, 3)),
26 |             'warmup_steps': hp.choice('warmup_steps', [0, 250, 500]),
27 |         }
28 | 
29 |     print('Running hyperparameter optimization...')
30 | 
31 |     best_params = fmin(objective, space = hpspace, algo = tpe.suggest, max_evals= number_of_evals)
32 | 
33 |     return best_params
34 | 
35 | # best_params = run_parameter_optimization(objective = objective, number_of_evals=3)
36 | 


--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | flake8-html
2 | wheel
3 | twine
4 | mkdocs-material 
5 | mkdocstrings
6 | mknotebooks
7 | jupyter


--------------------------------------------------------------------------------
/docs/datasets.md:
--------------------------------------------------------------------------------
1 | # Datasets
2 | ::: NERDA.datasets


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
  1 | # NERDA <img src="https://raw.githubusercontent.com/ebanalyse/NERDA/main/logo.png" align="right" height=250/>
  2 | 
  3 | ![Build status](https://github.com/ebanalyse/NERDA/workflows/build/badge.svg)
  4 | [![codecov](https://codecov.io/gh/ebanalyse/NERDA/branch/main/graph/badge.svg?token=OB6LGFQZYX)](https://codecov.io/gh/ebanalyse/NERDA)
  5 | ![PyPI](https://img.shields.io/pypi/v/NERDA.svg)
  6 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/NERDA?color=green)
  7 | ![License](https://img.shields.io/badge/license-MIT-blue.svg)
  8 | 
  9 | Not only is `NERDA` a mesmerizing muppet-like character. `NERDA` is also
 10 | a python package, that offers a slick easy-to-use interface for fine-tuning 
 11 | pretrained transformers for Named Entity Recognition
 12 |  (=NER) tasks. 
 13 | 
 14 | You can also utilize `NERDA` to access a selection of *precooked* `NERDA` models, 
 15 |  that you can use right off the shelf for NER tasks.
 16 | 
 17 | `NERDA` is built on `huggingface` `transformers` and the popular `pytorch`
 18 |  framework.
 19 | 
 20 | ## Installation guide
 21 | `NERDA` can be installed from [PyPI](https://pypi.org/project/NERDA/) with 
 22 | 
 23 | ```
 24 | pip install NERDA
 25 | ```
 26 | 
 27 | If you want the development version then install directly from [GitHub](https://github.com/ebanalyse/NERDA).
 28 | 
 29 | ## Named-Entity Recogntion tasks
 30 | Named-entity recognition (NER) (also known as (named) entity identification, 
 31 | entity chunking, and entity extraction) is a subtask of information extraction
 32 | that seeks to locate and classify named entities mentioned in unstructured 
 33 | text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.<sup>[1]</sup>
 34 | 
 35 | [1]: https://en.wikipedia.org/wiki/Named-entity_recognition
 36 | 
 37 | ### Example Task:
 38 | 
 39 | **Task** 
 40 | 
 41 | Identify person names and organizations in text:
 42 | 
 43 | *Jim bought 300 shares of Acme Corp.*
 44 | 
 45 | **Solution**
 46 | 
 47 | | **Named Entity**   | **Type**              | 
 48 | |--------------------|-----------------------|
 49 | | 'Jim'              | Person                |
 50 | | 'Acme Corp.'       | Organization          |
 51 | 
 52 | Read more about NER on [Wikipedia](https://en.wikipedia.org/wiki/Named-entity_recognition).
 53 | 
 54 | ## Train Your Own `NERDA` Model
 55 | 
 56 | Say, we want to fine-tune a pretrained [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) transformer for NER in English.
 57 | 
 58 | Load package.
 59 | 
 60 | ```python
 61 | from NERDA.models import NERDA
 62 | ```
 63 | 
 64 | Instantiate a `NERDA` model (*with default settings*) for the 
 65 | [`CoNLL-2003`](https://www.clips.uantwerpen.be/conll2003/ner/) 
 66 | English NER data set. 
 67 | 
 68 | ```python
 69 | from NERDA.datasets import get_conll_data
 70 | model = NERDA(dataset_training = get_conll_data('train'),
 71 |               dataset_validation = get_conll_data('valid'),
 72 |               transformer = 'bert-base-multilingual-uncased')
 73 | ```
 74 | 
 75 | By default the network architecture is analogous to that of the models in [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). 
 76 | 
 77 | The model can then be trained/fine-tuned by invoking the `train` method, e.g.
 78 | 
 79 | ```python
 80 | model.train()
 81 | ```
 82 | 
 83 | **Note**: this will take some time depending on the dimensions of your machine 
 84 | (if you want to skip training, you can go ahead and use one of the models, 
 85 | that we have already precooked for you in stead).
 86 | 
 87 | After the model has been trained, the model can be used for predicting 
 88 | named entities in new texts.
 89 | 
 90 | ```python
 91 | # text to identify named entities in.
 92 | text = 'Old MacDonald had a farm'
 93 | model.predict_text(text)
 94 | ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
 95 | ```
 96 | This means, that the model identified 'Old MacDonald' as a *PER*son.
 97 | 
 98 | Please note, that the `NERDA` model configuration above was instantiated 
 99 | with all default settings. You can however customize your `NERDA` model
100 | in a lot of ways:
101 | 
102 | - Use your own data set (finetune a transformer for any given language)
103 | - Choose whatever transformer you like
104 | - Set all of the hyperparameters for the model
105 | - You can even apply your own Network Architecture 
106 | 
107 | Read more about advanced usage of `NERDA` in the [detailed documentation](https://ebanalyse.github.io/NERDA/workflow).
108 | 
109 | ## Use a Precooked `NERDA` model
110 | 
111 | We have precooked a number of `NERDA` models for Danish and English, that you can download 
112 | and use right off the shelf. 
113 | 
114 | Here is an example.
115 | 
116 | Instantiate a multilingual BERT model, that has been finetuned for NER in Danish,
117 | `DA_BERT_ML`.
118 | 
119 | ```python
120 | from NERDA.precooked import DA_BERT_ML()
121 | model = DA_BERT_ML()
122 | ```
123 | 
124 | Down(load) network from web:
125 | 
126 | ```python
127 | model.download_network()
128 | model.load_network()
129 | ```
130 | 
131 | You can now predict named entities in new (Danish) texts
132 | 
133 | ```python
134 | # (Danish) text to identify named entities in:
135 | # 'Jens Hansen har en bondegård' = 'Old MacDonald had a farm'
136 | text = 'Jens Hansen har en bondegård'
137 | model.predict_text(text)
138 | ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
139 | ```
140 | 
141 | ### List of Precooked Models
142 | 
143 | The table below shows the precooked `NERDA` models publicly available for download.
144 | 
145 | | **Model**       | **Language** | **Transformer**   | **Dataset** | **F1-score** |  
146 | |-----------------|--------------|-------------------|---------|-----|
147 | | `DA_BERT_ML`    | Danish       | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 82.8  | 
148 | `DA_ELECTRA_DA` | Danish       | [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 79.8             |
149 | | `EN_BERT_ML`    | English      | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)| [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 90.4              |
150 | | `EN_ELECTRA_EN` | English       | [English ELECTRA](https://huggingface.co/google/electra-small-discriminator) | [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 89.1             |
151 | 
152 | **F1-score** is the micro-averaged F1-score across entity tags and is 
153 | evaluated on the respective test sets (that have not been used for training nor
154 | validation of the models).
155 | 
156 | Note, that we have not spent a lot of time on actually fine-tuning the models,
157 | so there could be room for improvement. If you are able to improve the models,
158 | we will be happy to hear from you and include your `NERDA` model.
159 | 
160 | ### Model Performance
161 | 
162 | The table below summarizes the performance (F1-scores) of the precooked `NERDA` models.
163 | 
164 | | **Level**     | `DA_BERT_ML` | `DA_ELECTRA_DA` | `EN_BERT_ML` | `EN_ELECTRA_EN` |
165 | |---------------|--------------|-----------------|--------------|-----------------|
166 | | B-PER         | 93.8         | 92.0            | 96.0         | 95.1            |      
167 | | I-PER         | 97.8         | 97.1            | 98.5         | 97.9            |   
168 | | B-ORG         | 69.5         | 66.9            | 88.4         | 86.2            |     
169 | | I-ORG         | 69.9         | 70.7            | 85.7         | 83.1            |   
170 | | B-LOC         | 82.5         | 79.0            | 92.3         | 91.1            |     
171 | | I-LOC         | 31.6         | 44.4            | 83.9         | 80.5            |     
172 | | B-MISC        | 73.4         | 68.6            | 81.8         | 80.1            |     
173 | | I-MISC        | 86.1         | 63.6            | 63.4         | 68.4            |   
174 | | **AVG_MICRO** | 82.8         | 79.8            | 90.4         | 89.1            |      
175 | | **AVG_MACRO** | 75.6         | 72.8            | 86.3         | 85.3            |
176 | 
177 | ## 'NERDA'?
178 | '`NERDA`' originally stands for *'Named Entity Recognition for DAnish'*. However, this
179 | is somewhat misleading, since the functionality is no longer limited to Danish. 
180 | On the contrary it generalizes to all other languages, i.e. `NERDA` supports 
181 | fine-tuning of transformers for NER tasks for any arbitrary 
182 | language.
183 | 
184 | ## Background
185 | `NERDA` is developed as a part of [Ekstra Bladet](https://ekstrabladet.dk/)’s activities on Platform Intelligence in News (PIN). PIN is an industrial research project that is carried out in collaboration between the [Technical University of Denmark](https://www.dtu.dk/), [University of Copenhagen](https://www.ku.dk/) and [Copenhagen Business School](https://www.cbs.dk/) with funding from [Innovation Fund Denmark](https://innovationsfonden.dk/). The project runs from 2020-2023 and develops recommender systems and natural language processing systems geared for news publishing, some of which are open sourced like `NERDA`.
186 | 
187 | ## Shout-outs
188 | - Thanks to [Alexandra Institute](https://alexandra.dk/) for with the [`danlp`](https://github.com/alexandrainst/danlp) package to have encouraged us to develop this package.
189 | - Thanks to [Malte Højmark-Bertelsen](https://www.linkedin.com/in/malte-h%C3%B8jmark-bertelsen-9a618017b/) and [Kasper Junge](https://www.linkedin.com/in/kasper-juunge/?originalSubdomain=dk) for giving feedback on `NERDA`.
190 | 
191 | ## Contact
192 | We hope, that you will find `NERDA` useful.
193 | 
194 | Please direct any questions and feedbacks to
195 | [us](mailto:lars.kjeldgaard@eb.dk)!
196 | 
197 | If you want to contribute (which we encourage you to), open a
198 | [PR](https://github.com/ebanalyse/NERDA/pulls).
199 | 
200 | If you encounter a bug or want to suggest an enhancement, please 
201 | [open an issue](https://github.com/ebanalyse/NERDA/issues).
202 | 
203 | 


--------------------------------------------------------------------------------
/docs/nerda_models.md:
--------------------------------------------------------------------------------
1 | # NERDA Models
2 | ::: NERDA.models


--------------------------------------------------------------------------------
/docs/networks.md:
--------------------------------------------------------------------------------
1 | # Networks
2 | ::: NERDA.networks


--------------------------------------------------------------------------------
/docs/performance.md:
--------------------------------------------------------------------------------
1 | # Performance
2 | ::: NERDA.performance


--------------------------------------------------------------------------------
/docs/preamble.py:
--------------------------------------------------------------------------------
1 | # suppress warnings for notebook
2 | import warnings
3 | warnings.filterwarnings("ignore")
4 | # download nltk 'punkt' in order to use nltk word/sent-tokenize
5 | import nltk
6 | nltk.download('punkt')


--------------------------------------------------------------------------------
/docs/precooked_models.md:
--------------------------------------------------------------------------------
1 | # Precooked NERDA models
2 | ::: NERDA.precooked


--------------------------------------------------------------------------------
/docs/predictions.md:
--------------------------------------------------------------------------------
1 | # Predictions
2 | ::: NERDA.predictions


--------------------------------------------------------------------------------
/docs/workflow.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "language_info": {
  4 |    "codemirror_mode": {
  5 |     "name": "ipython",
  6 |     "version": 3
  7 |    },
  8 |    "file_extension": ".py",
  9 |    "mimetype": "text/x-python",
 10 |    "name": "python",
 11 |    "nbconvert_exporter": "python",
 12 |    "pygments_lexer": "ipython3",
 13 |    "version": "3.9.0-final"
 14 |   },
 15 |   "orig_nbformat": 2,
 16 |   "kernelspec": {
 17 |    "name": "python3",
 18 |    "display_name": "Python 3.9.0 64-bit ('3.9.0')",
 19 |    "metadata": {
 20 |     "interpreter": {
 21 |      "hash": "36071112a161297f2fd106003050184fbdff34ed057f375faa6d2f5f0cad40eb"
 22 |     }
 23 |    }
 24 |   }
 25 |  },
 26 |  "nbformat": 4,
 27 |  "nbformat_minor": 2,
 28 |  "cells": [
 29 |   {
 30 |    "source": [
 31 |     "# Workflow Examples"
 32 |    ],
 33 |    "cell_type": "markdown",
 34 |    "metadata": {}
 35 |   },
 36 |   {
 37 |    "source": [
 38 |     "`NERDA` offers a simple easy-to-use interface for fine-tuning transformers for Named-Entity Recognition (=NER).  We call this family of models `NERDA` models.\n",
 39 |     "\n",
 40 |     "`NERDA` can be used in two ways. You can either (1) train your own customized `NERDA` model or (2) download and use one of our precooked `NERDA` models for inference i.e. identifying named entities in new texts."
 41 |    ],
 42 |    "cell_type": "markdown",
 43 |    "metadata": {}
 44 |   },
 45 |   {
 46 |    "source": [
 47 |     "## Train Your Own `NERDA` model"
 48 |    ],
 49 |    "cell_type": "markdown",
 50 |    "metadata": {}
 51 |   },
 52 |   {
 53 |    "source": [
 54 |     "We want to fine-tune a transformer for English. \n",
 55 |     "\n",
 56 |     "First, we download an English NER dataset [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) with annotated Named Entities, that we will use for training and evaluation of our model."
 57 |    ],
 58 |    "cell_type": "markdown",
 59 |    "metadata": {}
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 4,
 64 |    "metadata": {},
 65 |    "outputs": [
 66 |     {
 67 |      "output_type": "error",
 68 |      "ename": "ModuleNotFoundError",
 69 |      "evalue": "No module named 'NERDA'",
 70 |      "traceback": [
 71 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 72 |       "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
 73 |       "\u001b[0;32m<ipython-input-4-920b857cbb44>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# don't print warnings for this session\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mNERDA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mget_dane_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_dane_data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mdownload_dane_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 74 |       "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'NERDA'"
 75 |      ]
 76 |     }
 77 |    ],
 78 |    "source": [
 79 |     "from NERDA.datasets import get_conll_data, download_conll_data\n",
 80 |     "download_conll_data()"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "source": [
 85 |     "CoNLL-2003 operates with the following types of named entities:\n",
 86 |     "\n",
 87 |     "1. **PER**sons \n",
 88 |     "2. **ORG**anizations \n",
 89 |     "3. **LOC**ations \n",
 90 |     "4. **MISC**ellaneous \n",
 91 |     "5. **O**utside (Not a named Entity)\n",
 92 |     "\n",
 93 |     "An observation from the CoNLL-2003 data set looks like this."
 94 |    ],
 95 |    "cell_type": "markdown",
 96 |    "metadata": {}
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 3,
101 |    "metadata": {},
102 |    "outputs": [
103 |     {
104 |      "output_type": "error",
105 |      "ename": "NameError",
106 |      "evalue": "name 'get_dane_data' is not defined",
107 |      "traceback": [
108 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
109 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
110 |       "\u001b[0;32m<ipython-input-3-0e6c3d5c6962>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtraining\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_dane_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'train'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mvalidation\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_dane_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'dev'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;31m# example\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraining\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'sentences'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mtags\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraining\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'tags'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
111 |       "\u001b[0;31mNameError\u001b[0m: name 'get_dane_data' is not defined"
112 |      ]
113 |     }
114 |    ],
115 |    "source": [
116 |     "# extract the first _5_ rows from the training and validation data splits.\n",
117 |     "training = get_conll_data('train', 5)\n",
118 |     "validation = get_conll_data('valid', 5)\n",
119 |     "# example\n",
120 |     "sentence = training.get('sentences')[0]\n",
121 |     "tags = training.get('tags')[0]\n",
122 |     "print(\"\\n\".join([\"{}/{}\".format(word, tag) for word, tag in zip(sentence, tags)]))"
123 |    ]
124 |   },
125 |   {
126 |    "source": [
127 |     "If you provide your own dataset, it must have the same structure:\n",
128 |     "\n",
129 |     "- It must be a dictionary\n",
130 |     "- The dictionary must contain\n",
131 |     "    - 'sentences': a list of word-tokenized sentences with one sentence per entry \n",
132 |     "    - 'tags': a list with the corresponding named-entity tags.\n",
133 |     "\n",
134 |     "The data set does however *not* have to follow the Inside-Outside-Beginning (IOB) tagging scheme<sup>[1]</sup>.\n",
135 |     "\n",
136 |     "[1]: https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)\n",
137 |     "\n",
138 |     "The IOB tagging scheme implies, that words that are beginning of named entities are tagged with *'B-'* and words 'inside' (=continuations of) named entities are tagged with *'I-'*. That means that 'Joe Biden' should be tagged as `Joe(B-PER) Biden(I-PER)`.\n",
139 |     "\n",
140 |     "Now, instantiate a `NERDA` model for finetuning an [ELECTRA](https://huggingface.co/google/electra-small-discriminator) transformer for NER. "
141 |    ],
142 |    "cell_type": "markdown",
143 |    "metadata": {}
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "from NERDA.models import NERDA\n",
152 |     "tag_scheme = ['B-PER',\n",
153 |     "              'I-PER', \n",
154 |     "              'B-ORG', \n",
155 |     "              'I-ORG', \n",
156 |     "              'B-LOC', \n",
157 |     "              'I-LOC', \n",
158 |     "              'B-MISC', \n",
159 |     "              'I-MISC']\n",
160 |     "model = NERDA(dataset_training = training,\n",
161 |     "              dataset_validation = validation,\n",
162 |     "              tag_scheme = tag_scheme,\n",
163 |     "              tag_outside = 'O',\n",
164 |     "              transformer = 'google/electra-small-discriminator',\n",
165 |     "              hyperparameters = {'epochs' : 1,\n",
166 |     "                                 'warmup_steps' : 10,\n",
167 |     "                                 'train_batch_size': 5,\n",
168 |     "                                 'learning_rate': 0.0001},)"
169 |    ]
170 |   },
171 |   {
172 |    "source": [
173 |     "Note, this model configuration only uses 5 sentences for model training to minimize execution time. Also the hyperparameters for the model have been chosen in order to minimize execution time. Therefore this example only serves to illustrate the functionality i.e. the resulting model will suck.\n",
174 |     "\n",
175 |     "By default the network architecture is analogous that of the models in [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). \n",
176 |     "\n",
177 |     "The model can be trained right away by invoking the `train` method."
178 |    ],
179 |    "cell_type": "markdown",
180 |    "metadata": {}
181 |   },
182 |   {
183 |    "source": [
184 |     "model.train()"
185 |    ],
186 |    "cell_type": "code",
187 |    "metadata": {},
188 |    "execution_count": null,
189 |    "outputs": []
190 |   },
191 |   {
192 |    "source": [
193 |     "We can compute the performance of the model on a test set (limited to 5 sentences):"
194 |    ],
195 |    "cell_type": "markdown",
196 |    "metadata": {}
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": null,
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "test = get_conll_data('test', 5)\n",
205 |     "model.evaluate_performance(test)"
206 |    ]
207 |   },
208 |   {
209 |    "source": [
210 |     "Unsurprisingly, the model sucks in this case due to the ludicrous specification.\n",
211 |     "\n",
212 |     "Named Entities in new texts can be predicted with `predict` functions."
213 |    ],
214 |    "cell_type": "markdown",
215 |    "metadata": {}
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 2,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "text = \"Old MacDonald had a farm\"\n",
224 |     "model.predict_text(text)"
225 |    ]
226 |   },
227 |   {
228 |    "source": [
229 |     "Needless to say the predicted entities for this model are nonsensical.\n",
230 |     "\n",
231 |     "To get a more reasonable model, provide more data and a more meaningful model specification.\n",
232 |     "\n",
233 |     "In general `NERDA` has the following handles, that you use.\n",
234 |     "\n",
235 |     "1. provide your own data set \n",
236 |     "2. choose whatever pretrained transformer you would like to fine-tune\n",
237 |     "3. provide your own set of hyperparameters and lastly\n",
238 |     "4. provide your own `torch` network (architecture). You can do this by instantiating a `NERDA` model with the parameter 'network' set to your own network (torch.nn.Module)."
239 |    ],
240 |    "cell_type": "markdown",
241 |    "metadata": {}
242 |   },
243 |   {
244 |    "source": [
245 |     "## Use a Precooked `NERDA` model"
246 |    ],
247 |    "cell_type": "markdown",
248 |    "metadata": {}
249 |   },
250 |   {
251 |    "source": [
252 |     "We have precooked a number of `NERDA` models, that you can download \n",
253 |     "and use right off the shelf. \n",
254 |     "\n",
255 |     "Here is an example.\n",
256 |     "\n",
257 |     "Instantiate a `NERDA` model based on the English [ELECTRA](https://huggingface.co/google/electra-small-discriminator) transformer, that has been finetuned for NER in English,\n",
258 |     "`EN_ELECTRA_EN`."
259 |    ],
260 |    "cell_type": "markdown",
261 |    "metadata": {}
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": null,
266 |    "metadata": {},
267 |    "outputs": [],
268 |    "source": [
269 |     "\n",
270 |     "from NERDA.precooked import EN_ELECTRA_EN\n",
271 |     "model = EN_ELECTRA_EN()\n",
272 |     "\n"
273 |    ]
274 |   },
275 |   {
276 |    "source": [
277 |     "(Down)load network:"
278 |    ],
279 |    "cell_type": "markdown",
280 |    "metadata": {}
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": null,
285 |    "metadata": {},
286 |    "outputs": [],
287 |    "source": [
288 |     "\n",
289 |     "model.download_network()\n",
290 |     "model.load_network()\n"
291 |    ]
292 |   },
293 |   {
294 |    "source": [
295 |     "This model performs much better:"
296 |    ],
297 |    "cell_type": "markdown",
298 |    "metadata": {}
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": null,
303 |    "metadata": {},
304 |    "outputs": [],
305 |    "source": [
306 |     "model.evaluate_performance(get_conll_data('test', 100))"
307 |    ]
308 |   },
309 |   {
310 |    "source": [
311 |     "Predict named entities in new texts"
312 |    ],
313 |    "cell_type": "markdown",
314 |    "metadata": {}
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": null,
319 |    "metadata": {},
320 |    "outputs": [],
321 |    "source": [
322 |     "text = 'Old MacDonald had a farm'\n",
323 |     "model.predict_text(text)\n"
324 |    ]
325 |   },
326 |   {
327 |    "source": [
328 |     "### List of Precooked Models"
329 |    ],
330 |    "cell_type": "markdown",
331 |    "metadata": {}
332 |   },
333 |   {
334 |    "source": [
335 |     "The table below shows the precooked `NERDA` models publicly available for download. We have trained models for Danish and English.\n",
336 |     "\n",
337 |     "\n",
338 |     "| **Model**       | **Language** | **Transformer**   | **Dataset** | **F1-score** |  \n",
339 |     "|-----------------|--------------|-------------------|---------|-----|\n",
340 |     "| `DA_BERT_ML`    | Danish       | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 82.8  | \n",
341 |     "`DA_ELECTRA_DA` | Danish       | [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 79.8             |\n",
342 |     "| `EN_BERT_ML`    | English      | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)| [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 90.4              |\n",
343 |     "| `EN_ELECTRA_EN` | English       | [English ELECTRA](https://huggingface.co/google/electra-small-discriminator) | [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 89.1             |\n",
344 |     "\n",
345 |     "**F1-score** is the micro-averaged F1-score across entity tags and is \n",
346 |     "evaluated on the respective test sets (that have not been used for training nor\n",
347 |     "validation of the models).\n",
348 |     "\n",
349 |     "Note, that we have not spent a lot of time on actually fine-tuning the models,\n",
350 |     "so there could be room for improvement. If you are able to improve the models,\n",
351 |     "we will be happy to hear from you and include your `NERDA` model.\n",
352 |     "\n",
353 |     "#### Performance of Precooked Models\n",
354 |     "\n",
355 |     "The table below summarizes the performance as measured by F1-scores of the model\n",
356 |     " configurations, that `NERDA` ships with. \n",
357 |     "\n",
358 |     "| **Level**     | `DA_BERT_ML` | `DA_ELECTRA_DA` | `EN_BERT_ML` | `EN_ELECTRA_EN` |\n",
359 |     "|---------------|-----------|------------|-------------|----------------|\n",
360 |     "| B-PER         | 93.8      | 92.0       | 96.0        | 95.1           |      \n",
361 |     "| I-PER         | 97.8      | 97.1       | 98.5        | 97.9           |   \n",
362 |     "| B-ORG         | 69.5      | 66.9       | 88.4        | 86.2           |     \n",
363 |     "| I-ORG         | 69.9      | 70.7       | 85.7        | 83.1           |   \n",
364 |     "| B-LOC         | 82.5      | 79.0       | 92.3        | 91.1           |     \n",
365 |     "| I-LOC         | 31.6      | 44.4       | 83.9        | 80.5           |     \n",
366 |     "| B-MISC        | 73.4      | 68.6       | 81.8        | 80.1           |     \n",
367 |     "| I-MISC        | 86.1      | 63.6       | 63.4        | 68.4           |   \n",
368 |     "| **AVG_MICRO** | 82.8      | 79.8       | 90.4        | 89.1           |      \n",
369 |     "| **AVG_MACRO** | 75.6      | 72.8       | 86.3        | 85.3           |"
370 |    ],
371 |    "cell_type": "markdown",
372 |    "metadata": {}
373 |   },
374 |   {
375 |    "source": [
376 |     "This concludes our walkthrough of `NERDA`. If you have any questions, please do not hesitate to [contact us](mailto:lars.kjeldgaard@eb.dk)!"
377 |    ],
378 |    "cell_type": "markdown",
379 |    "metadata": {}
380 |   }
381 |  ]
382 | }


--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/NERDA/ae45d7e5368059721d1073384201433ea7a6e820/logo.png


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: NERDA
 2 | theme:
 3 |   name: "material"
 4 | 
 5 | plugins:
 6 |   - search
 7 |   - mkdocstrings:
 8 |       handlers:
 9 |         python:
10 |           setup_commands:
11 |             - import sys
12 |             - sys.path.append("src")
13 |   - mknotebooks:
14 |       execute: True
15 |       preamble: "docs/preamble.py"
16 | 
17 | nav:
18 |     - Home: index.md
19 |     - Workflow Examples: workflow.ipynb
20 |     - Code Reference:
21 |         - NERDA Models: nerda_models.md
22 |         - Precooked NERDA Models: precooked_models.md
23 |         - Datasets: datasets.md
24 |         - Predictions: predictions.md
25 |         - Networks: networks.md
26 |         - Performance: performance.md
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = -v -s --junitxml=test-results/tests.xml --cov=./ --cov-report=xml


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [aliases]
2 | test=pytest


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="NERDA", 
 8 |     version="1.0.0",
 9 |     author="Lars Kjeldgaard, Lukas Christian Nielsen",
10 |     author_email="lars.kjeldgaard@eb.dk",
11 |     description="A Framework for Finetuning Transformers for Named-Entity Recognition",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/ebanalyse/NERDA",
15 |     packages=setuptools.find_packages(where='src'),
16 |     package_dir={'': 'src'},
17 |     python_requires='>=3.6',
18 |     install_requires=[
19 |         'torch',
20 |         'transformers',
21 |         'sklearn',
22 |         'nltk',
23 |         'pandas',
24 |         'progressbar',
25 |         'pyconll'
26 |     ],
27 |     setup_requires=['pytest-runner'],
28 |     tests_require=['pytest',
29 |                    'pytest-cov'],
30 |     classifiers=[
31 |         "Programming Language :: Python :: 3",
32 |         "License :: OSI Approved :: MIT License",
33 |     ],
34 |     include_package_data=True
35 |     )
36 | 


--------------------------------------------------------------------------------
/src/NERDA/__init__.py:
--------------------------------------------------------------------------------
1 | import NERDA


--------------------------------------------------------------------------------
/src/NERDA/datasets.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This section covers functionality for (down)loading Named Entity 
  3 | Recognition data sets.
  4 | """
  5 | 
  6 | import csv
  7 | import os
  8 | import pyconll
  9 | from io import BytesIO
 10 | from itertools import compress
 11 | from pathlib import Path
 12 | from typing import Union, List, Dict
 13 | from urllib.request import urlopen
 14 | from zipfile import ZipFile
 15 | import ssl
 16 | 
 17 | def download_unzip(url_zip: str,
 18 |                    dir_extract: str) -> str:
 19 |     """Download and unzip a ZIP archive to folder.
 20 | 
 21 |     Loads a ZIP file from URL and extracts all of the files to a 
 22 |     given folder. Does not save the ZIP file itself.
 23 | 
 24 |     Args:
 25 |         url_zip (str): URL to ZIP file.
 26 |         dir_extract (str): Directory where files are extracted.
 27 | 
 28 |     Returns:
 29 |         str: a message telling, if the archive was succesfully
 30 |         extracted. Obviously the files in the ZIP archive are
 31 |         extracted to the desired directory as a side-effect.
 32 |     """
 33 |     
 34 |     # suppress ssl certification
 35 |     ctx = ssl.create_default_context()
 36 |     ctx.check_hostname = False
 37 |     ctx.verify_mode = ssl.CERT_NONE
 38 | 
 39 |     print(f'Reading {url_zip}')
 40 |     with urlopen(url_zip, context=ctx) as zipresp:
 41 |         with ZipFile(BytesIO(zipresp.read())) as zfile:
 42 |             zfile.extractall(dir_extract)
 43 | 
 44 |     return f'archive extracted to {dir_extract}'
 45 | 
 46 | def download_dane_data(dir: str = None) -> str:
 47 |     """Download DaNE data set.
 48 | 
 49 |     Downloads the 'DaNE' data set annotated for Named Entity
 50 |     Recognition developed and hosted by 
 51 |     [Alexandra Institute](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).
 52 | 
 53 |     Args:
 54 |         dir (str, optional): Directory where DaNE datasets will be saved. If no directory is provided, data will be saved to a hidden folder '.dane' in your home directory.  
 55 |                            
 56 |     Returns:
 57 |         str: a message telling, if the archive was in fact 
 58 |         succesfully extracted. Obviously the DaNE datasets are
 59 |         extracted to the desired directory as a side-effect.
 60 |     
 61 |     Examples:
 62 |         >>> download_dane_data()
 63 |         >>> download_dane_data(dir = 'DaNE')
 64 |         
 65 |     """
 66 |     # set to default directory if nothing else has been provided by user.
 67 |     if dir is None:
 68 |         dir = os.path.join(str(Path.home()), '.dane')
 69 | 
 70 |     return download_unzip(url_zip = 'http://danlp-downloads.alexandra.dk/datasets/ddt.zip',
 71 |                           dir_extract = dir)
 72 | 
 73 | def get_dane_data(split: str = 'train', 
 74 |                   limit: int = None, 
 75 |                   dir: str = None) -> dict:
 76 |     """Load DaNE data split.
 77 | 
 78 |     Loads a single data split from the DaNE data set kindly hosted
 79 |     by [Alexandra Institute](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).
 80 | 
 81 |     Args:
 82 |         split (str, optional): Choose which split to load. Choose 
 83 |             from 'train', 'dev' and 'test'. Defaults to 'train'.
 84 |         limit (int, optional): Limit the number of observations to be 
 85 |             returned from a given split. Defaults to None, which implies 
 86 |             that the entire data split is returned.
 87 |         dir (str, optional): Directory where data is cached. If set to 
 88 |             None, the function will try to look for files in '.dane' folder in home directory.
 89 | 
 90 |     Returns:
 91 |         dict: Dictionary with word-tokenized 'sentences' and named 
 92 |         entity 'tags' in IOB format.
 93 | 
 94 |     Examples:
 95 |         Get test split
 96 |         >>> get_dane_data('test')
 97 | 
 98 |         Get first 5 observations from training split
 99 |         >>> get_dane_data('train', limit = 5)
100 | 
101 |     """
102 |     assert isinstance(split, str)
103 |     splits = ['train', 'dev', 'test']
104 |     assert split in splits, f'Choose between the following splits: {splits}'
105 | 
106 |     # set to default directory if nothing else has been provided by user.
107 |     if dir is None:
108 |         dir = os.path.join(str(Path.home()), '.dane')
109 |     assert os.path.isdir(dir), f'Directory {dir} does not exist. Try downloading DaNE data with download_dane_data()'
110 |     
111 |     file_path = os.path.join(dir, f'ddt.{split}.conllu')
112 |     assert os.path.isfile(file_path), f'File {file_path} does not exist. Try downloading DaNE data with download_dane_data()'
113 |     
114 |     split = pyconll.load_from_file(file_path)
115 | 
116 |     sentences = []
117 |     entities = []
118 | 
119 |     for sent in split:
120 |         sentences.append([token.form for token in sent._tokens])
121 |         entities.append([token.misc['name'].pop() for token in sent._tokens])
122 |     
123 |     if limit is not None:
124 |         sentences = sentences[:limit]
125 |         entities = entities[:limit]
126 |     
127 |     return {'sentences': sentences, 'tags': entities}
128 |     
129 |     
130 | 
131 | def download_conll_data(dir: str = None) -> str:
132 |     """Download CoNLL-2003 English data set.
133 | 
134 |     Downloads the [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) 
135 |     English data set annotated for Named Entity Recognition.
136 | 
137 |     Args:
138 |         dir (str, optional): Directory where CoNLL-2003 datasets will be saved. If no directory is provided, data will be saved to a hidden folder '.dane' in your home directory.  
139 |                            
140 |     Returns:
141 |         str: a message telling, if the archive was in fact 
142 |         succesfully extracted. Obviously the CoNLL datasets are
143 |         extracted to the desired directory as a side-effect.
144 |     
145 |     Examples:
146 |         >>> download_conll_data()
147 |         >>> download_conll_data(dir = 'conll')
148 |         
149 |     """
150 |     # set to default directory if nothing else has been provided by user.
151 |     if dir is None:
152 |         dir = os.path.join(str(Path.home()), '.conll')
153 | 
154 |     return download_unzip(url_zip = 'https://data.deepai.org/conll2003.zip',
155 |                           dir_extract = dir)
156 | 
157 | def get_conll_data(split: str = 'train', 
158 |                    limit: int = None, 
159 |                    dir: str = None) -> dict:
160 |     """Load CoNLL-2003 (English) data split.
161 | 
162 |     Loads a single data split from the 
163 |     [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) 
164 |     (English) data set.
165 | 
166 |     Args:
167 |         split (str, optional): Choose which split to load. Choose 
168 |             from 'train', 'valid' and 'test'. Defaults to 'train'.
169 |         limit (int, optional): Limit the number of observations to be 
170 |             returned from a given split. Defaults to None, which implies 
171 |             that the entire data split is returned.
172 |         dir (str, optional): Directory where data is cached. If set to 
173 |             None, the function will try to look for files in '.conll' folder in home directory.
174 | 
175 |     Returns:
176 |         dict: Dictionary with word-tokenized 'sentences' and named 
177 |         entity 'tags' in IOB format.
178 | 
179 |     Examples:
180 |         Get test split
181 |         >>> get_conll_data('test')
182 | 
183 |         Get first 5 observations from training split
184 |         >>> get_conll_data('train', limit = 5)
185 | 
186 |     """
187 |     assert isinstance(split, str)
188 |     splits = ['train', 'valid', 'test']
189 |     assert split in splits, f'Choose between the following splits: {splits}'
190 | 
191 |     # set to default directory if nothing else has been provided by user.
192 |     if dir is None:
193 |         dir = os.path.join(str(Path.home()), '.conll')
194 |     assert os.path.isdir(dir), f'Directory {dir} does not exist. Try downloading CoNLL-2003 data with download_conll_data()'
195 |     
196 |     file_path = os.path.join(dir, f'{split}.txt')
197 |     assert os.path.isfile(file_path), f'File {file_path} does not exist. Try downloading CoNLL-2003 data with download_conll_data()'
198 | 
199 |     # read data from file.
200 |     data = []
201 |     with open(file_path, 'r') as file:
202 |         reader = csv.reader(file, delimiter = ' ')
203 |         for row in reader:
204 |             data.append([row])
205 | 
206 |     sentences = []
207 |     sentence = []
208 |     entities = []
209 |     tags = []
210 | 
211 |     for row in data:
212 |         # extract first element of list.
213 |         row = row[0]
214 |         # TO DO: move to data reader.
215 |         if len(row) > 0 and row[0] != '-DOCSTART-':
216 |             sentence.append(row[0])
217 |             tags.append(row[-1])        
218 |         if len(row) == 0 and len(sentence) > 0:
219 |             # clean up sentence/tags.
220 |             # remove white spaces.
221 |             selector = [word != ' ' for word in sentence]
222 |             sentence = list(compress(sentence, selector))
223 |             tags = list(compress(tags, selector))
224 |             # append if sentence length is still greater than zero..
225 |             if len(sentence) > 0:
226 |                 sentences.append(sentence)
227 |                 entities.append(tags)
228 |             sentence = []
229 |             tags = []
230 |             
231 |    
232 |     if limit is not None:
233 |         sentences = sentences[:limit]
234 |         entities = entities[:limit]
235 |     
236 |     return {'sentences': sentences, 'tags': entities}
237 | 
238 |     
239 | 
240 | 
241 | 
242 | 
243 |     
244 |     
245 | 
246 | 
247 | 
248 |     
249 | 
250 | 
251 | 


--------------------------------------------------------------------------------
/src/NERDA/models.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This section covers the interface for `NERDA` models, that is 
  3 | implemented as its own Python class [NERDA.models.NERDA][].
  4 | 
  5 | The interface enables you to easily 
  6 | 
  7 | - specify your own [NERDA.models.NERDA][] model
  8 | - train it
  9 | - evaluate it
 10 | - use it to predict entities in new texts.
 11 | """
 12 | from NERDA.datasets import get_conll_data
 13 | from NERDA.networks import NERDANetwork
 14 | from NERDA.predictions import predict, predict_text
 15 | from NERDA.performance import compute_f1_scores, flatten
 16 | from NERDA.training import train_model
 17 | import pandas as pd
 18 | import numpy as np
 19 | import torch
 20 | import os
 21 | import sys
 22 | import sklearn.preprocessing
 23 | from sklearn.metrics import accuracy_score
 24 | from transformers import AutoModel, AutoTokenizer, AutoConfig
 25 | from typing import List
 26 | 
 27 | class NERDA:
 28 |     """NERDA model
 29 | 
 30 |     A NERDA model object containing a complete model configuration.
 31 |     The model can be trained with the `train` method. Afterwards
 32 |     new observations can be predicted with the `predict` and
 33 |     `predict_text` methods. The performance of the model can be
 34 |     evaluated on a set of new observations with the 
 35 |     `evaluate_performance` method.
 36 | 
 37 |     Examples:
 38 |         Model for a VERY small subset (5 observations) of English NER data
 39 |         >>> from NERDA.datasets import get_conll_data
 40 |         >>> trn = get_conll_data('train', 5)
 41 |         >>> valid = get_conll_data('valid', 5)
 42 |         >>> tag_scheme = ['B-PER', 'I-PER', 'B-LOC', 'I-LOC',
 43 |                           'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
 44 |         >>> tag_outside = 'O'
 45 |         >>> transformer = 'bert-base-multilingual-uncased'
 46 |         >>> model = NERDA(transformer = transformer,
 47 |                           tag_scheme = tag_scheme,
 48 |                           tag_outside = tag_outside,
 49 |                           dataset_training = trn,
 50 |                           dataset_validation = valid)
 51 | 
 52 |         Model for complete English NER data set CoNLL-2003 with modified hyperparameters
 53 |         >>> trn = get_conll_data('train')
 54 |         >>> valid = get_conll_data('valid')
 55 |         >>> transformer = 'bert-base-multilingual-uncased'
 56 |         >>> hyperparameters = {'epochs' : 3,
 57 |                                'warmup_steps' : 400,
 58 |                                'train_batch_size': 16,
 59 |                                'learning_rate': 0.0001},
 60 |         >>> model = NERDA(transformer = transformer,
 61 |                           dataset_training = trn,
 62 |                           dataset_validation = valid,
 63 |                           tag_scheme = tag_scheme,
 64 |                           tag_outside = tag_outside,
 65 |                           dropout = 0.1,
 66 |                           hyperparameters = hyperparameters)
 67 | 
 68 |     Attributes:
 69 |         network (torch.nn.Module): network for Named Entity 
 70 |             Recognition task.
 71 |         tag_encoder (sklearn.preprocessing.LabelEncoder): encoder for the
 72 |             NER labels/tags.
 73 |         transformer_model (transformers.PreTrainedModel): (Auto)Model derived from the
 74 |             transformer.
 75 |         transformer_tokenizer (transformers.PretrainedTokenizer): (Auto)Tokenizer
 76 |             derived from the transformer.
 77 |         transformer_config (transformers.PretrainedConfig): (Auto)Config derived from
 78 |             the transformer. 
 79 |         train_losses (list): holds training losses, once the model has been 
 80 |             trained.
 81 |         valid_loss (float): holds validation loss, once the model has been trained.
 82 |     """
 83 |     def __init__(self, 
 84 |                  transformer: str = 'bert-base-multilingual-uncased',
 85 |                  device: str = None, 
 86 |                  tag_scheme: List[str] = [
 87 |                             'B-PER',
 88 |                             'I-PER', 
 89 |                             'B-ORG', 
 90 |                             'I-ORG', 
 91 |                             'B-LOC', 
 92 |                             'I-LOC', 
 93 |                             'B-MISC', 
 94 |                             'I-MISC'
 95 |                             ],
 96 |                  tag_outside: str = 'O',
 97 |                  dataset_training: dict = None,
 98 |                  dataset_validation: dict = None,
 99 |                  max_len: int = 128,
100 |                  network: torch.nn.Module = NERDANetwork,
101 |                  dropout: float = 0.1,
102 |                  hyperparameters: dict = {'epochs' : 4,
103 |                                           'warmup_steps' : 500,
104 |                                           'train_batch_size': 13,
105 |                                           'learning_rate': 0.0001},
106 |                  tokenizer_parameters: dict = {'do_lower_case' : True},
107 |                  validation_batch_size: int = 8,
108 |                  num_workers: int = 1) -> None:
109 |         """Initialize NERDA model
110 | 
111 |         Args:
112 |             transformer (str, optional): which pretrained 'huggingface' 
113 |                 transformer to use. 
114 |             device (str, optional): the desired device to use for computation. 
115 |                 If not provided by the user, we take a guess.
116 |             tag_scheme (List[str], optional): All available NER 
117 |                 tags for the given data set EXCLUDING the special outside tag, 
118 |                 that is handled separately.
119 |             tag_outside (str, optional): the value of the special outside tag. 
120 |                 Defaults to 'O'.
121 |             dataset_training (dict, optional): the training data. Must consist 
122 |                 of 'sentences': word-tokenized sentences and 'tags': corresponding 
123 |                 NER tags. You can look at examples of, how the dataset should 
124 |                 look like by invoking functions get_dane_data() or get_conll_data().
125 |                 Defaults to None, in which case the English CoNLL-2003 data set is used. 
126 |             dataset_validation (dict, optional): the validation data. Must consist
127 |                 of 'sentences': word-tokenized sentences and 'tags': corresponding 
128 |                 NER tags. You can look at examples of, how the dataset should 
129 |                 look like by invoking functions get_dane_data() or get_conll_data().
130 |                 Defaults to None, in which case the English CoNLL-2003 data set 
131 |                 is used.
132 |             max_len (int, optional): the maximum sentence length (number of 
133 |                 tokens after applying the transformer tokenizer) for the transformer. 
134 |                 Sentences are truncated accordingly. Look at your data to get an 
135 |                 impression of, what could be a meaningful setting. Also be aware 
136 |                 that many transformers have a maximum accepted length. Defaults 
137 |                 to 128. 
138 |             network (torch.nn.module, optional): network to be trained. Defaults
139 |                 to a default generic `NERDANetwork`. Can be replaced with your own 
140 |                 customized network architecture. It must however take the same 
141 |                 arguments as `NERDANetwork`.
142 |             dropout (float, optional): dropout probability. Defaults to 0.1.
143 |             hyperparameters (dict, optional): Hyperparameters for the model. Defaults
144 |                 to {'epochs' : 3, 'warmup_steps' : 500, 'train_batch_size': 16, 
145 |                 'learning_rate': 0.0001}.
146 |             tokenizer_parameters (dict, optional): parameters for the transformer 
147 |                 tokenizer. Defaults to {'do_lower_case' : True}.
148 |             validation_batch_size (int, optional): batch size for validation. Defaults
149 |                 to 8.
150 |             num_workers (int, optional): number of workers for data loader.
151 |         """
152 |         
153 |         # set device automatically if not provided by user.
154 |         if device is None:
155 |             self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
156 |             print("Device automatically set to:", self.device)
157 |         else:
158 |             self.device = device
159 |             print("Device set to:", self.device)
160 |         self.tag_scheme = tag_scheme
161 |         self.tag_outside = tag_outside
162 |         self.transformer = transformer  
163 |         self.dataset_training = dataset_training
164 |         self.dataset_validation = dataset_validation
165 |         self.hyperparameters = hyperparameters
166 |         self.tag_outside = tag_outside
167 |         self.tag_scheme = tag_scheme
168 |         tag_complete = [tag_outside] + tag_scheme
169 |         # fit encoder to _all_ possible tags.
170 |         self.max_len = max_len
171 |         self.tag_encoder = sklearn.preprocessing.LabelEncoder()
172 |         self.tag_encoder.fit(tag_complete)
173 |         self.transformer_model = AutoModel.from_pretrained(transformer)
174 |         self.transformer_tokenizer = AutoTokenizer.from_pretrained(transformer, **tokenizer_parameters)
175 |         self.transformer_config = AutoConfig.from_pretrained(transformer)  
176 |         self.network = NERDANetwork(self.transformer_model, self.device, len(tag_complete), dropout = dropout)
177 |         self.network.to(self.device)
178 |         self.validation_batch_size = validation_batch_size
179 |         self.num_workers = num_workers
180 |         self.train_losses = []
181 |         self.valid_loss = np.nan
182 |         self.quantized = False
183 |         self.halved = False
184 | 
185 |     def train(self) -> str:
186 |         """Train Network
187 | 
188 |         Trains the network from the NERDA model specification.
189 | 
190 |         Returns:
191 |             str: a message saying if the model was trained succesfully.
192 |             The network in the 'network' attribute is trained as a 
193 |             side-effect. Training losses and validation loss are saved 
194 |             in 'training_losses' and 'valid_loss' 
195 |             attributes respectively as side-effects.
196 |         """
197 |         network, train_losses, valid_loss = train_model(network = self.network,
198 |                                                         tag_encoder = self.tag_encoder,
199 |                                                         tag_outside = self.tag_outside,
200 |                                                         transformer_tokenizer = self.transformer_tokenizer,
201 |                                                         transformer_config = self.transformer_config,
202 |                                                         dataset_training = self.dataset_training,
203 |                                                         dataset_validation = self.dataset_validation,
204 |                                                         validation_batch_size = self.validation_batch_size,
205 |                                                         max_len = self.max_len,
206 |                                                         device = self.device,
207 |                                                         num_workers = self.num_workers,
208 |                                                         **self.hyperparameters)
209 |         
210 |         # attach as attributes to class
211 |         setattr(self, "network", network)
212 |         setattr(self, "train_losses", train_losses)
213 |         setattr(self, "valid_loss", valid_loss)
214 | 
215 |         return "Model trained successfully"
216 | 
217 |     def load_network_from_file(self, model_path = "model.bin") -> str:
218 |         """Load Pretrained NERDA Network from file
219 | 
220 |         Loads weights for a pretrained NERDA Network from file.
221 | 
222 |         Args:
223 |             model_path (str, optional): Path for model file. 
224 |                 Defaults to "model.bin".
225 | 
226 |         Returns:
227 |             str: message telling if weights for network were
228 |             loaded succesfully.
229 |         """
230 |         # TODO: change assert to Raise.
231 |         assert os.path.exists(model_path), "File does not exist. You can download network with download_network()"
232 |         self.network.load_state_dict(torch.load(model_path, map_location = torch.device(self.device)))
233 |         self.network.device = self.device
234 |         return f'Weights for network loaded from {model_path}'
235 | 
236 |     def save_network(self, model_path:str = "model.bin") -> None:
237 |         """Save Weights of NERDA Network
238 | 
239 |         Saves weights for a fine-tuned NERDA Network to file.
240 | 
241 |         Args:
242 |             model_path (str, optional): Path for model file. 
243 |                 Defaults to "model.bin".
244 | 
245 |         Returns:
246 |             Nothing. Saves model to file as a side-effect.
247 |         """
248 |         torch.save(self.network.state_dict(), model_path)
249 |         print(f"Network written to file {model_path}")
250 | 
251 |     def quantize(self):
252 |         """Apply dynamic quantization to increase performance.
253 | 
254 |         Quantization and half precision inference are mutually exclusive.
255 | 
256 |         Read more: https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html
257 | 
258 |         Returns:
259 |             Nothing. Applies dynamic quantization to Network as a side-effect.
260 |         """
261 |         assert not (self.quantized), "Dynamic quantization already applied"
262 |         assert not (self.halved), "Can't run both quantization and half precision"
263 | 
264 |         self.network = torch.quantization.quantize_dynamic(
265 |             self.network, {torch.nn.Linear}, dtype=torch.qint8
266 |         )
267 |         self.quantized = True
268 | 
269 |     def half(self):
270 |         """Convert weights from Float32 to Float16 to increase performance
271 | 
272 |         Quantization and half precision inference are mutually exclusive.
273 | 
274 |         Read more: https://pytorch.org/docs/master/generated/torch.nn.Module.html?highlight=half#torch.nn.Module.half
275 | 
276 |         Returns: 
277 |             Nothing. Model is "halved" as a side-effect.
278 |         """
279 |         assert not (self.halved), "Half precision already applied"
280 |         assert not (self.quantized), "Can't run both quantization and half precision"
281 | 
282 |         self.network.half()
283 |         self.halved = True
284 | 
285 |     def predict(self, sentences: List[List[str]],
286 |                 return_confidence: bool = False,
287 |                 **kwargs) -> List[List[str]]:
288 |         """Predict Named Entities in Word-Tokenized Sentences
289 | 
290 |         Predicts word-tokenized sentences with trained model.
291 | 
292 |         Args:
293 |             sentences (List[List[str]]): word-tokenized sentences.
294 |             kwargs: arbitrary keyword arguments. For instance
295 |                 'batch_size' and 'num_workers'.
296 |             return_confidence (bool, optional): if True, return
297 |                 confidence scores for all predicted tokens. Defaults
298 |                 to False.
299 | 
300 |         Returns:
301 |             List[List[str]]: Predicted tags for sentences - one
302 |             predicted tag/entity per word token.
303 |         """
304 |         return predict(network = self.network, 
305 |                        sentences = sentences,
306 |                        transformer_tokenizer = self.transformer_tokenizer,
307 |                        transformer_config = self.transformer_config,
308 |                        max_len = self.max_len,
309 |                        device = self.device,
310 |                        tag_encoder = self.tag_encoder,
311 |                        tag_outside = self.tag_outside,
312 |                        return_confidence = return_confidence,
313 |                        **kwargs)
314 | 
315 |     def predict_text(self, text: str, 
316 |                      return_confidence:bool = False, **kwargs) -> list:
317 |         """Predict Named Entities in a Text
318 | 
319 |         Args:
320 |             text (str): text to predict entities in.
321 |             kwargs: arbitrary keyword arguments. For instance
322 |                 'batch_size' and 'num_workers'.
323 |             return_confidence (bool, optional): if True, return
324 |                 confidence scores for all predicted tokens. Defaults
325 |                 to False.
326 | 
327 |         Returns:
328 |             tuple: word-tokenized sentences and predicted 
329 |             tags/entities.
330 |         """
331 |         return predict_text(network = self.network, 
332 |                             text = text,
333 |                             transformer_tokenizer = self.transformer_tokenizer,
334 |                             transformer_config = self.transformer_config,
335 |                             max_len = self.max_len,
336 |                             device = self.device,
337 |                             tag_encoder = self.tag_encoder,
338 |                             tag_outside = self.tag_outside,
339 |                             return_confidence=return_confidence,
340 |                             **kwargs)
341 | 
342 |     def evaluate_performance(self, dataset: dict, 
343 |                              return_accuracy: bool=False,
344 |                              **kwargs) -> pd.DataFrame:
345 |         """Evaluate Performance
346 | 
347 |         Evaluates the performance of the model on an arbitrary
348 |         data set.
349 | 
350 |         Args:
351 |             dataset (dict): Data set that must consist of
352 |                 'sentences' and NER'tags'. You can look at examples
353 |                  of, how the dataset should look like by invoking functions 
354 |                  get_dane_data() or get_conll_data().
355 |             kwargs: arbitrary keyword arguments for predict. For
356 |                 instance 'batch_size' and 'num_workers'.
357 |             return_accuracy (bool): Return accuracy
358 |                 as well? Defaults to False.
359 | 
360 |             
361 |         Returns:
362 |             DataFrame with performance numbers, F1-scores,
363 |             Precision and Recall. Returns dictionary with
364 |             this AND accuracy, if return_accuracy is set to
365 |             True.
366 |         """
367 |         
368 |         tags_predicted = self.predict(dataset.get('sentences'), 
369 |                                       **kwargs)
370 |         
371 |         # compute F1 scores by entity type
372 |         f1 = compute_f1_scores(y_pred = tags_predicted, 
373 |                                y_true = dataset.get('tags'),
374 |                                labels = self.tag_scheme,
375 |                                average = None)
376 |         
377 |         # create DataFrame with performance scores (=F1)
378 |         df = list(zip(self.tag_scheme, f1[2], f1[0], f1[1]))
379 |         df = pd.DataFrame(df, columns = ['Level', 'F1-Score', 'Precision', 'Recall'])    
380 |         
381 |         # compute MICRO-averaged F1-scores and add to table.
382 |         f1_micro = compute_f1_scores(y_pred = tags_predicted, 
383 |                                      y_true = dataset.get('tags'),
384 |                                      labels = self.tag_scheme,
385 |                                      average = 'micro')
386 |         f1_micro = pd.DataFrame({'Level' : ['AVG_MICRO'], 
387 |                                  'F1-Score': [f1_micro[2]],
388 |                                  'Precision': [np.nan],
389 |                                  'Recall': [np.nan]})
390 |         df = df.append(f1_micro)
391 | 
392 |         # compute MACRO-averaged F1-scores and add to table.
393 |         f1_macro = compute_f1_scores(y_pred = tags_predicted, 
394 |                                      y_true = dataset.get('tags'),
395 |                                      labels = self.tag_scheme,
396 |                                      average = 'macro')
397 |         f1_macro = pd.DataFrame({'Level' : ['AVG_MICRO'], 
398 |                                  'F1-Score': [f1_macro[2]],
399 |                                  'Precision': [np.nan],
400 |                                  'Recall': [np.nan]})
401 |         df = df.append(f1_macro)
402 | 
403 |         # compute and return accuracy if desired
404 |         if return_accuracy:
405 |             accuracy = accuracy_score(y_pred = flatten(tags_predicted), 
406 |                                       y_true = flatten(dataset.get('tags')))
407 |             return {'f1':df, 'accuracy': accuracy}
408 |       
409 |         return df
410 | 
411 | 


--------------------------------------------------------------------------------
/src/NERDA/networks.py:
--------------------------------------------------------------------------------
 1 | """This section covers `torch` networks for `NERDA`"""
 2 | import torch
 3 | import torch.nn as nn
 4 | from transformers import AutoConfig
 5 | from NERDA.utils import match_kwargs
 6 | 
 7 | class NERDANetwork(nn.Module):
 8 |     """A Generic Network for NERDA models.
 9 | 
10 |     The network has an analogous architecture to the models in
11 |     [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf).
12 | 
13 |     Can be replaced with a custom user-defined network with 
14 |     the restriction, that it must take the same arguments.
15 |     """
16 | 
17 |     def __init__(self, transformer: nn.Module, device: str, n_tags: int, dropout: float = 0.1) -> None:
18 |         """Initialize a NERDA Network
19 | 
20 |         Args:
21 |             transformer (nn.Module): huggingface `torch` transformer.
22 |             device (str): Computational device.
23 |             n_tags (int): Number of unique entity tags (incl. outside tag)
24 |             dropout (float, optional): Dropout probability. Defaults to 0.1.
25 |         """
26 |         super(NERDANetwork, self).__init__()
27 |         
28 |         # extract transformer name
29 |         transformer_name = transformer.name_or_path
30 |         # extract AutoConfig, from which relevant parameters can be extracted.
31 |         transformer_config = AutoConfig.from_pretrained(transformer_name)
32 | 
33 |         self.transformer = transformer
34 |         self.dropout = nn.Dropout(dropout)
35 |         self.tags = nn.Linear(transformer_config.hidden_size, n_tags)
36 |         self.device = device
37 | 
38 |     # NOTE: 'offsets 'are not used in model as-is, but they are expected as output
39 |     # down-stream. So _DON'T_ remove! :)
40 |     def forward(self, 
41 |                 input_ids: torch.Tensor, 
42 |                 masks: torch.Tensor, 
43 |                 token_type_ids: torch.Tensor, 
44 |                 target_tags: torch.Tensor, 
45 |                 offsets: torch.Tensor) -> torch.Tensor:
46 |         """Model Forward Iteration
47 | 
48 |         Args:
49 |             input_ids (torch.Tensor): Input IDs.
50 |             masks (torch.Tensor): Attention Masks.
51 |             token_type_ids (torch.Tensor): Token Type IDs.
52 |             target_tags (torch.Tensor): Target tags. Are not used 
53 |                 in model as-is, but they are expected downstream,
54 |                 so they can not be left out.
55 |             offsets (torch.Tensor): Offsets to keep track of original
56 |                 words. Are not used in model as-is, but they are 
57 |                 expected as down-stream, so they can not be left out.
58 | 
59 |         Returns:
60 |             torch.Tensor: predicted values.
61 |         """
62 | 
63 |         # TODO: can be improved with ** and move everything to device in a
64 |         # single step.
65 |         transformer_inputs = {
66 |             'input_ids': input_ids.to(self.device),
67 |             'masks': masks.to(self.device),
68 |             'token_type_ids': token_type_ids.to(self.device)
69 |             }
70 |         
71 |         # match args with transformer
72 |         transformer_inputs = match_kwargs(self.transformer.forward, **transformer_inputs)
73 |            
74 |         outputs = self.transformer(**transformer_inputs)[0]
75 | 
76 |         # apply drop-out
77 |         outputs = self.dropout(outputs)
78 | 
79 |         # outputs for all labels/tags
80 |         outputs = self.tags(outputs)
81 | 
82 |         return outputs
83 | 
84 | 


--------------------------------------------------------------------------------
/src/NERDA/performance.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This section covers functionality for computing performance
 3 | for [NERDA.models.NERDA][] models.
 4 | """
 5 | 
 6 | from typing import List
 7 | from sklearn.metrics import precision_recall_fscore_support
 8 | import warnings
 9 | 
10 | def flatten(l: list):
11 |     """Flattens list"""
12 |     return [item for sublist in l for item in sublist]
13 | 
14 | 
15 | def compute_f1_scores(y_pred: List[List[str]], 
16 |                       y_true: List[List[str]], 
17 |                       labels: List[str],
18 |                       **kwargs) -> list:
19 |     """Compute F1 scores.
20 |     
21 |     Computes F1 Scores
22 | 
23 |     Args:
24 |         y_pred (List): predicted values.
25 |         y_true (List): observed/true values.
26 |         labels (List): all possible tags.
27 |         kwargs: all optional arguments for precision/recall function.
28 | 
29 |     Returns:
30 |         list: resulting F1 scores.
31 | 
32 |     """  
33 |     # check inputs.
34 |     assert sum([len(t) < len(p) for t, p in zip(y_true, y_pred)]) == 0, "Length of predictions must not exceed length of observed values"
35 | 
36 |     # check, if some lengths of observed values exceed predicted values.
37 |     n_exceeds = sum([len(t) > len(p) for t, p in zip(y_true, y_pred)])
38 |     if n_exceeds > 0:
39 |         warnings.warn(f'length of observed values exceeded lengths of predicted values in {n_exceeds} cases and were truncated. _Consider_ increasing max_len parameter for your model.')
40 | 
41 |     # truncate observed values dimensions to match predicted values,
42 |     # this is needed if predictions have been truncated earlier in 
43 |     # the flow.
44 |     y_true = [t[:len(p)] for t, p in zip(y_true, y_pred)]
45 |     
46 |     y_pred = flatten(y_pred)
47 |     y_true = flatten(y_true) 
48 | 
49 |     f1_scores = precision_recall_fscore_support(y_true = y_true,
50 |                                                 y_pred = y_pred,
51 |                                                 labels = labels,
52 |                                                 **kwargs) 
53 | 
54 |     return f1_scores                                                                
55 | 


--------------------------------------------------------------------------------
/src/NERDA/precooked.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This sections covers NERDA Models that have been 'precooked' by 
  3 | Ekstra Bladet and are publicly available for download.
  4 | """
  5 | from NERDA.datasets import get_dane_data, get_conll_data
  6 | from NERDA.models import NERDA
  7 | import os
  8 | import urllib
  9 | from pathlib import Path
 10 | from progressbar import ProgressBar
 11 | 
 12 | pbar = None
 13 | 
 14 | # helper function to show progressbar
 15 | def show_progress(block_num, block_size, total_size):
 16 |     global pbar
 17 |     if pbar is None:
 18 |         pbar = ProgressBar(maxval=total_size)
 19 | 
 20 |     downloaded = block_num * block_size
 21 |     pbar.start()
 22 |     if downloaded < total_size:
 23 |         pbar.update(downloaded)
 24 |     else:
 25 |         pbar.finish()
 26 |         pbar = None
 27 | 
 28 | class Precooked(NERDA):
 29 |     """Precooked NERDA Model
 30 | 
 31 |     NERDA model specification that has been precooked/pretrained
 32 |     and is available for download.
 33 | 
 34 |     Inherits from [NERDA.models.NERDA][].
 35 |     """
 36 |     def __init__(self, **kwargs) -> None:
 37 |         """Initialize Precooked NERDA Model
 38 | 
 39 |         Args:
 40 |             kwargs: all arguments for NERDA Model.
 41 |         """
 42 |         super().__init__(**kwargs)
 43 | 
 44 |     def download_network(self, dir = None) -> None:
 45 |         """Download Precooked Network from Web
 46 | 
 47 |         Args:
 48 |             dir (str, optional): Directory where the model file
 49 |                 will be saved. Defaults to None, in which case
 50 |                 the model will be saved in a folder '.nerda' in
 51 |                 your home directory.
 52 | 
 53 |         Returns:
 54 |             str: Message saying if the download was successfull.
 55 |             Model is downloaded as a side-effect.
 56 |         """
 57 | 
 58 |         model_name = type(self).__name__
 59 | 
 60 |         # url for public S3 bucket with NERDA models.
 61 |         url_s3 = 'https://nerda.s3-eu-west-1.amazonaws.com'
 62 |         url_model = f'{url_s3}/{model_name}.bin'
 63 |         
 64 |         if dir is None:
 65 |             dir = os.path.join(str(Path.home()), '.nerda')
 66 | 
 67 |         if not os.path.exists(dir):
 68 |             os.mkdir(dir)
 69 |             
 70 |         file_path = os.path.join(dir, f'{model_name}.bin')
 71 |         
 72 |         print(
 73 |         """
 74 |         Please make sure, that you're running the latest version of 'NERDA'
 75 |         otherwise the model is not guaranteed to work.
 76 |         """
 77 |         )
 78 |         print(f'Downloading {url_model} to {file_path}')
 79 |         urllib.request.urlretrieve(url_model, file_path, show_progress)
 80 | 
 81 |         return "Network downloaded successfully. Load network with 'load_network'."
 82 | 
 83 |     def load_network(self, file_path: str = None) -> None:
 84 |         """Load Pretrained Network
 85 | 
 86 |         Loads pretrained network from file.
 87 | 
 88 |         Args:
 89 |             file_path (str, optional): Path to model file. Defaults to None,
 90 |                 in which case, the function points to the '.nerda' folder
 91 |                 the home directory.
 92 |         """
 93 | 
 94 |         model_name = type(self).__name__
 95 |         
 96 |         if file_path is None:
 97 |             file_path = os.path.join(str(Path.home()), '.nerda', f'{model_name}.bin')
 98 | 
 99 |         assert os.path.exists(file_path), "File does not exist! You can download network with download_network()"
100 |         print(
101 |         """
102 |         Model loaded. Please make sure, that you're running the latest version 
103 |         of 'NERDA' otherwise the model is not guaranteed to work.
104 |         """
105 |         )
106 |         self.load_network_from_file(file_path)
107 |         
108 | class DA_BERT_ML(Precooked):
109 |     """NERDA [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) 
110 |     for Danish Finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).
111 |     
112 |     Inherits from [NERDA.precooked.Precooked][].
113 |     
114 |     Examples:
115 |         >>> from NERDA.precooked import DA_BERT_ML()
116 |         >>> model = DA_BERT_ML()
117 |         >>> model.download_network()
118 |         >>> model.load_network()
119 |         >>> text = 'Jens Hansen har en bondegård'
120 |         >>> model.predict_text(text)
121 |         ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
122 |     
123 |     """
124 |     def __init__(self, device: str = None) -> None:
125 |         """Initialize model"""
126 |         super().__init__(transformer = 'bert-base-multilingual-uncased',
127 |                          device = device,
128 |                          tag_scheme = [
129 |                             'B-PER',
130 |                             'I-PER', 
131 |                             'B-ORG', 
132 |                             'I-ORG', 
133 |                             'B-LOC', 
134 |                             'I-LOC', 
135 |                             'B-MISC', 
136 |                             'I-MISC'
137 |                             ],
138 |                          tag_outside = 'O',
139 |                          max_len = 128,
140 |                          dropout = 0.1,
141 |                          hyperparameters = {'epochs' : 4,
142 |                                             'warmup_steps' : 500,
143 |                                             'train_batch_size': 13,
144 |                                             'learning_rate': 0.0001},
145 |                          tokenizer_parameters = {'do_lower_case' : True})
146 | 
147 | class DA_DISTILBERT_ML(Precooked):
148 |     """NERDA [Multilingual BERT](https://huggingface.co/distilbert-base-multilingual-cased) 
149 |     for Danish Finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).
150 | 
151 |     Inherits from [NERDA.precooked.Precooked][].
152 |     
153 |     Examples:
154 |         >>> from NERDA.precooked import DA_DISTILBERT_ML()
155 |         >>> model = DA_DISTILBERT_ML()
156 |         >>> model.download_network()
157 |         >>> model.load_network()
158 |         >>> text = 'Jens Hansen har en bondegård'
159 |         >>> model.predict_text(text)
160 |         ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
161 |     
162 |     """
163 |     def __init__(self, device: str = None) -> None:
164 |         """Initialize model"""
165 |         super().__init__(transformer = 'distilbert-base-multilingual-cased',
166 |                          device = device,
167 |                          tag_scheme = [
168 |                             'B-PER',
169 |                             'I-PER', 
170 |                             'B-ORG', 
171 |                             'I-ORG', 
172 |                             'B-LOC', 
173 |                             'I-LOC', 
174 |                             'B-MISC', 
175 |                             'I-MISC'
176 |                             ],
177 |                          tag_outside = 'O',
178 |                          max_len = 128,
179 |                          dropout = 0.1,
180 |                          hyperparameters = {'epochs' : 4,
181 |                                             'warmup_steps' : 500,
182 |                                             'train_batch_size': 13,
183 |                                             'learning_rate': 0.0001},
184 |                          tokenizer_parameters = {'do_lower_case' : False})
185 | 
186 | class DA_ELECTRA_DA(Precooked):
187 |     """NERDA [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased) 
188 |     for Danish finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).
189 | 
190 |     Inherits from [NERDA.precooked.Precooked][].
191 | 
192 |     Examples:
193 |         >>> from NERDA.precooked import DA_ELECTRA_DA()
194 |         >>> model = DA_ELECTRA_DA()
195 |         >>> model.download_network()
196 |         >>> model.load_network()
197 |         >>> text = 'Jens Hansen har en bondegård'
198 |         >>> model.predict_text(text)
199 |         ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
200 | 
201 |     """
202 |     def __init__(self, device: str = None) -> None:
203 |         """Initialize model"""
204 |         super().__init__(transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
205 |                          device = device,
206 |                          tag_scheme = [
207 |                             'B-PER',
208 |                             'I-PER', 
209 |                             'B-ORG', 
210 |                             'I-ORG', 
211 |                             'B-LOC', 
212 |                             'I-LOC', 
213 |                             'B-MISC', 
214 |                             'I-MISC'
215 |                             ],
216 |                          tag_outside = 'O',
217 |                          max_len = 128,
218 |                          dropout = 0.1,
219 |                          hyperparameters = {'epochs' : 5,
220 |                                             'warmup_steps' : 500,
221 |                                             'train_batch_size': 13,
222 |                                             'learning_rate': 0.0001},
223 |                          tokenizer_parameters = {'do_lower_case' : True})
224 | 
225 | class EN_ELECTRA_EN(Precooked):
226 |     """NERDA [English ELECTRA](https://huggingface.co/google/electra-small-discriminator) 
227 |     for English finetuned on [CoNLL-2003 data set](https://www.clips.uantwerpen.be/conll2003/ner/).
228 | 
229 |     Inherits from [NERDA.precooked.Precooked][].
230 | 
231 |     Examples:
232 |         >>> from NERDA.precooked import EN_ELECTRA_EN()
233 |         >>> model = EN_ELECTRA_EN()
234 |         >>> model.download_network()
235 |         >>> model.load_network()
236 |         >>> text = 'Old MacDonald had a farm'
237 |         >>> model.predict_text(text)
238 |         ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
239 | 
240 |     """
241 |     def __init__(self, device: str = None) -> None:
242 |         """Initialize model"""
243 |         super().__init__(transformer = 'google/electra-small-discriminator',
244 |                          device = device,
245 |                          tag_scheme = [
246 |                             'B-PER',
247 |                             'I-PER', 
248 |                             'B-ORG', 
249 |                             'I-ORG', 
250 |                             'B-LOC', 
251 |                             'I-LOC', 
252 |                             'B-MISC', 
253 |                             'I-MISC'
254 |                             ],
255 |                          tag_outside = 'O',
256 |                          max_len = 128,
257 |                          dropout = 0.1,
258 |                          hyperparameters = {'epochs' : 4,
259 |                                             'warmup_steps' : 250,
260 |                                             'train_batch_size': 13,
261 |                                             'learning_rate': 8e-05},
262 |                          tokenizer_parameters = {'do_lower_case' : True})
263 | 
264 | 
265 | class EN_BERT_ML(Precooked):
266 |     """NERDA [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) 
267 |     for English finetuned on [CoNLL-2003 data set](https://www.clips.uantwerpen.be/conll2003/ner/).
268 |     
269 |     Inherits from [NERDA.precooked.Precooked][].
270 |     
271 |     Examples:
272 |         >>> from NERDA.precooked import EN_BERT_ML()
273 |         >>> model = EN_BERT_ML()
274 |         >>> model.download_network()
275 |         >>> model.load_network()
276 |         >>> text = 'Old MacDonald had a farm'
277 |         >>> model.predict_text(text)
278 |         ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
279 |     
280 |     """
281 |     def __init__(self, device: str = None) -> None:
282 |         """Initialize model"""
283 |         super().__init__(transformer = 'bert-base-multilingual-uncased',
284 |                          device = device,
285 |                          tag_scheme = [
286 |                             'B-PER',
287 |                             'I-PER', 
288 |                             'B-ORG', 
289 |                             'I-ORG', 
290 |                             'B-LOC', 
291 |                             'I-LOC', 
292 |                             'B-MISC', 
293 |                             'I-MISC'
294 |                             ],
295 |                          tag_outside = 'O',
296 |                          max_len = 128,
297 |                          dropout = 0.1,
298 |                          hyperparameters = {'epochs' : 4,
299 |                                             'warmup_steps' : 500,
300 |                                             'train_batch_size': 13,
301 |                                             'learning_rate': 0.0001},
302 |                          tokenizer_parameters = {'do_lower_case' : True})
303 | 
304 | 
305 | 
306 | 
307 | 


--------------------------------------------------------------------------------
/src/NERDA/predictions.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This section covers functionality for computing predictions
  3 | with a [NERDA.models.NERDA][] model.
  4 | """
  5 | 
  6 | from NERDA.preprocessing import create_dataloader
  7 | import torch
  8 | import numpy as np
  9 | from tqdm import tqdm 
 10 | from nltk.tokenize import sent_tokenize, word_tokenize
 11 | from typing import List, Callable
 12 | import transformers
 13 | import sklearn.preprocessing
 14 | 
 15 | def sigmoid_transform(x):
 16 |     prob = 1/(1 + np.exp(-x))
 17 |     return prob
 18 | 
 19 | def predict(network: torch.nn.Module, 
 20 |             sentences: List[List[str]],
 21 |             transformer_tokenizer: transformers.PreTrainedTokenizer,
 22 |             transformer_config: transformers.PretrainedConfig,
 23 |             max_len: int,
 24 |             device: str,
 25 |             tag_encoder: sklearn.preprocessing.LabelEncoder,
 26 |             tag_outside: str,
 27 |             batch_size: int = 8,
 28 |             num_workers: int = 1,
 29 |             return_tensors: bool = False,
 30 |             return_confidence: bool = False,
 31 |             pad_sequences: bool = True) -> List[List[str]]:
 32 |     """Compute predictions.
 33 | 
 34 |     Computes predictions for a list with word-tokenized sentences 
 35 |     with a `NERDA` model.
 36 | 
 37 |     Args:
 38 |         network (torch.nn.Module): Network.
 39 |         sentences (List[List[str]]): List of lists with word-tokenized
 40 |             sentences.
 41 |         transformer_tokenizer (transformers.PreTrainedTokenizer): 
 42 |             tokenizer for transformer model.
 43 |         transformer_config (transformers.PretrainedConfig): config
 44 |             for transformer model.
 45 |         max_len (int): Maximum length of sentence after applying 
 46 |             transformer tokenizer.
 47 |         device (str): Computational device.
 48 |         tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder
 49 |             for Named-Entity tags.
 50 |         tag_outside (str): Special 'outside' NER tag.
 51 |         batch_size (int, optional): Batch Size for DataLoader. 
 52 |             Defaults to 8.
 53 |         num_workers (int, optional): Number of workers. Defaults
 54 |             to 1.
 55 |         return_tensors (bool, optional): if True, return tensors.
 56 |         return_confidence (bool, optional): if True, return
 57 |             confidence scores for all predicted tokens. Defaults
 58 |             to False.
 59 |         pad_sequences (bool, optional): if True, pad sequences. 
 60 |             Defaults to True.
 61 | 
 62 |     Returns:
 63 |         List[List[str]]: List of lists with predicted Entity
 64 |         tags.
 65 |     """
 66 |     # make sure, that input has the correct format. 
 67 |     assert isinstance(sentences, list), "'sentences' must be a list of list of word-tokens"
 68 |     assert isinstance(sentences[0], list), "'sentences' must be a list of list of word-tokens"
 69 |     assert isinstance(sentences[0][0], str), "'sentences' must be a list of list of word-tokens"
 70 |     
 71 |     # set network to appropriate mode.
 72 |     network.eval()
 73 | 
 74 |     # fill 'dummy' tags (expected input for dataloader).
 75 |     tag_fill = [tag_encoder.classes_[0]]
 76 |     tags_dummy = [tag_fill * len(sent) for sent in sentences]
 77 |     
 78 |     dl = create_dataloader(sentences = sentences,
 79 |                            tags = tags_dummy, 
 80 |                            transformer_tokenizer = transformer_tokenizer,
 81 |                            transformer_config = transformer_config,
 82 |                            max_len = max_len, 
 83 |                            batch_size = batch_size, 
 84 |                            tag_encoder = tag_encoder,
 85 |                            tag_outside = tag_outside,
 86 |                            num_workers = num_workers,
 87 |                            pad_sequences = pad_sequences)
 88 | 
 89 |     predictions = []
 90 |     probabilities = []
 91 |     tensors = []
 92 |     
 93 |     with torch.no_grad():
 94 |         for _, dl in enumerate(dl): 
 95 | 
 96 |             outputs = network(**dl)   
 97 | 
 98 |             # conduct operations on sentence level.
 99 |             for i in range(outputs.shape[0]):
100 |                 
101 |                 # extract prediction and transform.
102 | 
103 |                 # find max by row.
104 |                 values, indices = outputs[i].max(dim=1)
105 |                 
106 |                 preds = tag_encoder.inverse_transform(indices.cpu().numpy())
107 |                 probs = values.cpu().numpy()
108 | 
109 |                 if return_tensors:
110 |                     tensors.append(outputs)    
111 | 
112 |                 # subset predictions for original word tokens.
113 |                 preds = [prediction for prediction, offset in zip(preds.tolist(), dl.get('offsets')[i]) if offset]
114 |                 if return_confidence:
115 |                     probs = [prob for prob, offset in zip(probs.tolist(), dl.get('offsets')[i]) if offset]
116 |             
117 |                 # Remove special tokens ('CLS' + 'SEP').
118 |                 preds = preds[1:-1]
119 |                 if return_confidence:
120 |                     probs = probs[1:-1]
121 |             
122 |                 # make sure resulting predictions have same length as
123 |                 # original sentence.
124 |             
125 |                 # TODO: Move assert statement to unit tests. Does not work 
126 |                 # in boundary.
127 |                 # assert len(preds) == len(sentences[i])            
128 |                 predictions.append(preds)
129 |                 if return_confidence:
130 |                     probabilities.append(probs)
131 |             
132 |             if return_confidence:
133 |                 return predictions, probabilities
134 | 
135 |             if return_tensors:
136 |                 return tensors
137 | 
138 |     return predictions
139 | 
140 | def predict_text(network: torch.nn.Module, 
141 |                  text: str,
142 |                  transformer_tokenizer: transformers.PreTrainedTokenizer,
143 |                  transformer_config: transformers.PretrainedConfig,
144 |                  max_len: int,
145 |                  device: str,
146 |                  tag_encoder: sklearn.preprocessing.LabelEncoder,
147 |                  tag_outside: str,
148 |                  batch_size: int = 8,
149 |                  num_workers: int = 1,
150 |                  pad_sequences: bool = True,
151 |                  return_confidence: bool = False,
152 |                  sent_tokenize: Callable = sent_tokenize,
153 |                  word_tokenize: Callable = word_tokenize) -> tuple:
154 |     """Compute Predictions for Text.
155 | 
156 |     Computes predictions for a text with `NERDA` model. 
157 |     Text is tokenized into sentences before computing predictions.
158 | 
159 |     Args:
160 |         network (torch.nn.Module): Network.
161 |         text (str): text to predict entities in.
162 |         transformer_tokenizer (transformers.PreTrainedTokenizer): 
163 |             tokenizer for transformer model.
164 |         transformer_config (transformers.PretrainedConfig): config
165 |             for transformer model.
166 |         max_len (int): Maximum length of sentence after applying 
167 |             transformer tokenizer.
168 |         device (str): Computational device.
169 |         tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder
170 |             for Named-Entity tags.
171 |         tag_outside (str): Special 'outside' NER tag.
172 |         batch_size (int, optional): Batch Size for DataLoader. 
173 |             Defaults to 8.
174 |         num_workers (int, optional): Number of workers. Defaults
175 |             to 1.
176 |         pad_sequences (bool, optional): if True, pad sequences. 
177 |             Defaults to True.
178 |         return_confidence (bool, optional): if True, return 
179 |             confidence scores for predicted tokens. Defaults
180 |             to False.
181 | 
182 |     Returns:
183 |         tuple: sentence- and word-tokenized text with corresponding
184 |         predicted named-entity tags.
185 |     """
186 |     assert isinstance(text, str), "'text' must be a string."
187 |     sentences = sent_tokenize(text)
188 | 
189 |     sentences = [word_tokenize(sentence) for sentence in sentences]
190 | 
191 |     predictions = predict(network = network, 
192 |                           sentences = sentences,
193 |                           transformer_tokenizer = transformer_tokenizer,
194 |                           transformer_config = transformer_config,
195 |                           max_len = max_len,
196 |                           device = device,
197 |                           return_confidence = return_confidence,
198 |                           batch_size = batch_size,
199 |                           num_workers = num_workers,
200 |                           pad_sequences = pad_sequences,
201 |                           tag_encoder = tag_encoder,
202 |                           tag_outside = tag_outside)
203 | 
204 |     return sentences, predictions
205 | 
206 | 


--------------------------------------------------------------------------------
/src/NERDA/preprocessing.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import warnings
  3 | import transformers
  4 | import sklearn.preprocessing
  5 | 
  6 | class NERDADataSetReader():
  7 |     """Generic NERDA DataSetReader"""
  8 |     
  9 |     def __init__(self, 
 10 |                 sentences: list, 
 11 |                 tags: list, 
 12 |                 transformer_tokenizer: transformers.PreTrainedTokenizer, 
 13 |                 transformer_config: transformers.PretrainedConfig, 
 14 |                 max_len: int, 
 15 |                 tag_encoder: sklearn.preprocessing.LabelEncoder, 
 16 |                 tag_outside: str,
 17 |                 pad_sequences : bool = True) -> None:
 18 |         """Initialize DataSetReader
 19 | 
 20 |         Initializes DataSetReader that prepares and preprocesses 
 21 |         DataSet for Named-Entity Recognition Task and training.
 22 | 
 23 |         Args:
 24 |             sentences (list): Sentences.
 25 |             tags (list): Named-Entity tags.
 26 |             transformer_tokenizer (transformers.PreTrainedTokenizer): 
 27 |                 tokenizer for transformer.
 28 |             transformer_config (transformers.PretrainedConfig): Config
 29 |                 for transformer model.
 30 |             max_len (int): Maximum length of sentences after applying
 31 |                 transformer tokenizer.
 32 |             tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder
 33 |                 for Named-Entity tags.
 34 |             tag_outside (str): Special Outside tag.
 35 |             pad_sequences (bool): Pad sequences to max_len. Defaults
 36 |                 to True.
 37 |         """
 38 |         self.sentences = sentences
 39 |         self.tags = tags
 40 |         self.transformer_tokenizer = transformer_tokenizer
 41 |         self.max_len = max_len
 42 |         self.tag_encoder = tag_encoder
 43 |         self.pad_token_id = transformer_config.pad_token_id
 44 |         self.tag_outside_transformed = tag_encoder.transform([tag_outside])[0]
 45 |         self.pad_sequences = pad_sequences
 46 |     
 47 |     def __len__(self):
 48 |         return len(self.sentences)
 49 | 
 50 |     def __getitem__(self, item):
 51 |         sentence = self.sentences[item]
 52 |         tags = self.tags[item]
 53 |         # encode tags
 54 |         tags = self.tag_encoder.transform(tags)
 55 |         
 56 |         # check inputs for consistancy
 57 |         assert len(sentence) == len(tags)
 58 | 
 59 |         input_ids = []
 60 |         target_tags = []
 61 |         tokens = []
 62 |         offsets = []
 63 |         
 64 |         # for debugging purposes
 65 |         # print(item)
 66 |         for i, word in enumerate(sentence):
 67 |             # bert tokenization
 68 |             wordpieces = self.transformer_tokenizer.tokenize(word)
 69 |             tokens.extend(wordpieces)
 70 |             # make room for CLS if there is an identified word piece
 71 |             if len(wordpieces)>0:
 72 |                 offsets.extend([1]+[0]*(len(wordpieces)-1))
 73 |             # Extends the ner_tag if the word has been split by the wordpiece tokenizer
 74 |             target_tags.extend([tags[i]] * len(wordpieces)) 
 75 |                
 76 |         # Make room for adding special tokens (one for both 'CLS' and 'SEP' special tokens)
 77 |         # max_len includes _all_ tokens.
 78 |         if len(tokens) > self.max_len-2:
 79 |             msg = f'Sentence #{item} length {len(tokens)} exceeds max_len {self.max_len} and has been truncated'
 80 |             warnings.warn(msg)
 81 |         tokens = tokens[:self.max_len-2] 
 82 |         target_tags = target_tags[:self.max_len-2]
 83 |         offsets = offsets[:self.max_len-2]
 84 | 
 85 |         # encode tokens for BERT
 86 |         # TO DO: prettify this.
 87 |         input_ids = self.transformer_tokenizer.convert_tokens_to_ids(tokens)
 88 |         input_ids = [self.transformer_tokenizer.cls_token_id] + input_ids + [self.transformer_tokenizer.sep_token_id]
 89 |         
 90 |         # fill out other inputs for model.    
 91 |         target_tags = [self.tag_outside_transformed] + target_tags + [self.tag_outside_transformed] 
 92 |         masks = [1] * len(input_ids)
 93 |         # set to 0, because we are not doing NSP or QA type task (across multiple sentences)
 94 |         # token_type_ids distinguishes sentences.
 95 |         token_type_ids = [0] * len(input_ids) 
 96 |         offsets = [1] + offsets + [1]
 97 | 
 98 |         # Padding to max length 
 99 |         # compute padding length
100 |         if self.pad_sequences:
101 |             padding_len = self.max_len - len(input_ids)
102 |             input_ids = input_ids + ([self.pad_token_id] * padding_len)
103 |             masks = masks + ([0] * padding_len)  
104 |             offsets = offsets + ([0] * padding_len)
105 |             token_type_ids = token_type_ids + ([0] * padding_len)
106 |             target_tags = target_tags + ([self.tag_outside_transformed] * padding_len)  
107 |     
108 |         return {'input_ids' : torch.tensor(input_ids, dtype = torch.long),
109 |                 'masks' : torch.tensor(masks, dtype = torch.long),
110 |                 'token_type_ids' : torch.tensor(token_type_ids, dtype = torch.long),
111 |                 'target_tags' : torch.tensor(target_tags, dtype = torch.long),
112 |                 'offsets': torch.tensor(offsets, dtype = torch.long)} 
113 |       
114 | def create_dataloader(sentences, 
115 |                       tags, 
116 |                       transformer_tokenizer, 
117 |                       transformer_config, 
118 |                       max_len,  
119 |                       tag_encoder, 
120 |                       tag_outside,
121 |                       batch_size = 1,
122 |                       num_workers = 1,
123 |                       pad_sequences = True):
124 | 
125 |     if not pad_sequences and batch_size > 1:
126 |         print("setting pad_sequences to True, because batch_size is more than one.")
127 |         pad_sequences = True
128 | 
129 |     data_reader = NERDADataSetReader(
130 |         sentences = sentences, 
131 |         tags = tags,
132 |         transformer_tokenizer = transformer_tokenizer, 
133 |         transformer_config = transformer_config,
134 |         max_len = max_len,
135 |         tag_encoder = tag_encoder,
136 |         tag_outside = tag_outside,
137 |         pad_sequences = pad_sequences)
138 |         # Don't pad sequences if batch size == 1. This improves performance.
139 | 
140 |     data_loader = torch.utils.data.DataLoader(
141 |         data_reader, batch_size = batch_size, num_workers = num_workers
142 |     )
143 | 
144 |     return data_loader
145 | 
146 | 


--------------------------------------------------------------------------------
/src/NERDA/training.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from .preprocessing import create_dataloader
  3 | from sklearn import preprocessing
  4 | from transformers import AdamW, get_linear_schedule_with_warmup
  5 | import random
  6 | import torch
  7 | from tqdm import tqdm
  8 | 
  9 | def train(model, data_loader, optimizer, device, scheduler, n_tags):
 10 |     """One Iteration of Training"""
 11 | 
 12 |     model.train()    
 13 |     final_loss = 0.0
 14 |     
 15 |     for dl in tqdm(data_loader, total=len(data_loader)):
 16 | 
 17 |         optimizer.zero_grad()
 18 |         outputs = model(**dl)
 19 |         loss = compute_loss(outputs, 
 20 |                             dl.get('target_tags'),
 21 |                             dl.get('masks'), 
 22 |                             device, 
 23 |                             n_tags)
 24 |         loss.backward()
 25 |         optimizer.step()
 26 |         scheduler.step()
 27 |         final_loss += loss.item()
 28 | 
 29 |     # Return average loss
 30 |     return final_loss / len(data_loader)
 31 | 
 32 | def validate(model, data_loader, device, n_tags):
 33 |     """One Iteration of Validation"""
 34 | 
 35 |     model.eval()
 36 |     final_loss = 0.0
 37 | 
 38 |     for dl in tqdm(data_loader, total=len(data_loader)):
 39 |         
 40 |         outputs = model(**dl)
 41 |         loss = compute_loss(outputs, 
 42 |                             dl.get('target_tags'),
 43 |                             dl.get('masks'), 
 44 |                             device, 
 45 |                             n_tags)
 46 |         final_loss += loss.item()
 47 |     
 48 |     # Return average loss.
 49 |     return final_loss / len(data_loader)   
 50 | 
 51 | def compute_loss(preds, target_tags, masks, device, n_tags):
 52 |     
 53 |     # initialize loss function.
 54 |     lfn = torch.nn.CrossEntropyLoss()
 55 | 
 56 |     # Compute active loss to not compute loss of paddings
 57 |     active_loss = masks.view(-1) == 1
 58 | 
 59 |     active_logits = preds.view(-1, n_tags)
 60 |     active_labels = torch.where(
 61 |         active_loss,
 62 |         target_tags.view(-1),
 63 |         torch.tensor(lfn.ignore_index).type_as(target_tags)
 64 |     )
 65 | 
 66 |     active_labels = torch.as_tensor(active_labels, device = torch.device(device), dtype = torch.long)
 67 |     
 68 |     # Only compute loss on actual token predictions
 69 |     loss = lfn(active_logits, active_labels)
 70 | 
 71 |     return loss
 72 | 
 73 | def enforce_reproducibility(seed = 42) -> None:
 74 |     """Enforce Reproducibity
 75 | 
 76 |     Enforces reproducibility of models to the furthest 
 77 |     possible extent. This is done by setting fixed seeds for
 78 |     random number generation etcetera. 
 79 | 
 80 |     For atomic operations there is currently no simple way to
 81 |     enforce determinism, as the order of parallel operations
 82 |     is not known.
 83 | 
 84 |     Args:
 85 |         seed (int, optional): Fixed seed. Defaults to 42.  
 86 |     """
 87 |     # Sets seed manually for both CPU and CUDA
 88 |     torch.manual_seed(seed)
 89 |     torch.cuda.manual_seed_all(seed)
 90 |     # CUDNN
 91 |     torch.backends.cudnn.deterministic = True
 92 |     torch.backends.cudnn.benchmark = False
 93 |     # System based
 94 |     random.seed(seed)
 95 |     np.random.seed(seed)
 96 | 
 97 | def train_model(network,
 98 |                 tag_encoder,
 99 |                 tag_outside,
100 |                 transformer_tokenizer,
101 |                 transformer_config,
102 |                 dataset_training, 
103 |                 dataset_validation, 
104 |                 max_len = 128,
105 |                 train_batch_size = 16,
106 |                 validation_batch_size = 8,
107 |                 epochs = 5,
108 |                 warmup_steps = 0,
109 |                 learning_rate = 5e-5,
110 |                 device = None,
111 |                 fixed_seed = 42,
112 |                 num_workers = 1):
113 |     
114 |     if fixed_seed is not None:
115 |         enforce_reproducibility(fixed_seed)
116 |     
117 |     # compute number of unique tags from encoder.
118 |     n_tags = tag_encoder.classes_.shape[0]
119 | 
120 |     # prepare datasets for modelling by creating data readers and loaders
121 |     dl_train = create_dataloader(sentences = dataset_training.get('sentences'),
122 |                                  tags = dataset_training.get('tags'), 
123 |                                  transformer_tokenizer = transformer_tokenizer, 
124 |                                  transformer_config = transformer_config,
125 |                                  max_len = max_len, 
126 |                                  batch_size = train_batch_size, 
127 |                                  tag_encoder = tag_encoder,
128 |                                  tag_outside = tag_outside,
129 |                                  num_workers = num_workers)
130 |     dl_validate = create_dataloader(sentences = dataset_validation.get('sentences'), 
131 |                                     tags = dataset_validation.get('tags'),
132 |                                     transformer_tokenizer = transformer_tokenizer,
133 |                                     transformer_config = transformer_config, 
134 |                                     max_len = max_len, 
135 |                                     batch_size = validation_batch_size, 
136 |                                     tag_encoder = tag_encoder,
137 |                                     tag_outside = tag_outside,
138 |                                     num_workers = num_workers)
139 | 
140 |     optimizer_parameters = network.parameters()
141 | 
142 |     num_train_steps = int(len(dataset_training.get('sentences')) / train_batch_size * epochs)
143 |     
144 |     optimizer = AdamW(optimizer_parameters, lr = learning_rate)
145 |     scheduler = get_linear_schedule_with_warmup(
146 |         optimizer, num_warmup_steps = warmup_steps, num_training_steps = num_train_steps
147 |     )
148 | 
149 |     train_losses = []
150 |     best_valid_loss = np.inf
151 | 
152 |     for epoch in range(epochs):
153 |         
154 |         print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
155 | 
156 |         train_loss = train(network, dl_train, optimizer, device, scheduler, n_tags)
157 |         train_losses.append(train_loss)
158 |         valid_loss = validate(network, dl_validate, device, n_tags)
159 | 
160 |         print(f"Train Loss = {train_loss} Valid Loss = {valid_loss}")
161 | 
162 |         if valid_loss < best_valid_loss:
163 |             best_parameters = network.state_dict()            
164 |             best_valid_loss = valid_loss
165 | 
166 |     # return best model
167 |     network.load_state_dict(best_parameters)
168 | 
169 |     return network, train_losses, best_valid_loss
170 | 
171 | 
172 |         
173 | 


--------------------------------------------------------------------------------
/src/NERDA/utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Callable
 2 | 
 3 | def match_kwargs(function: Callable, **kwargs) -> dict:
 4 |     """Matches Arguments with Function
 5 | 
 6 |     Match keywords arguments with the arguments of a function.
 7 | 
 8 |     Args:
 9 |         function (function): Function to match arguments for.
10 |         kwargs: keyword arguments to match against.
11 | 
12 |     Returns:
13 |         dict: dictionary with matching arguments and their
14 |         respective values.
15 | 
16 |     """
17 |     arg_count = function.__code__.co_argcount
18 |     args = function.__code__.co_varnames[:arg_count]
19 | 
20 |     args_dict = {}
21 |     for k, v in kwargs.items():
22 |         if k in args:
23 |             args_dict[k] = v
24 | 
25 |     return args_dict
26 | 


--------------------------------------------------------------------------------
/tests/unit_tests/test_aaaNERDA.py:
--------------------------------------------------------------------------------
 1 | # HACK: Filename prefixed with 'aaa' to execute this test before the others
 2 | # in order to download necessary ressources for all other tests.
 3 | 
 4 | from NERDA.datasets import get_dane_data, download_dane_data
 5 | # TODO: should not be necesssary to download before importing NERDA.
 6 | # Download necessary ressources
 7 | download_dane_data()
 8 | from NERDA.models import NERDA
 9 | from NERDA.precooked import DA_ELECTRA_DA
10 | import nltk
11 | nltk.download('punkt')
12 | 
13 | # instantiate a minimal model.
14 | model = NERDA(dataset_training = get_dane_data('train', 5),
15 |               dataset_validation = get_dane_data('dev', 5),
16 |               max_len = 128,
17 |               transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
18 |               hyperparameters = {'epochs' : 1,
19 |                                  'warmup_steps' : 10,
20 |                                  'train_batch_size': 5,
21 |                                  'learning_rate': 0.0001})
22 | 
23 | def test_instantiate_NERDA():
24 |     """Test that model has the correct/expected class"""
25 |     assert isinstance(model, NERDA)
26 | 
27 | 
28 | 
29 | 
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 
36 | 
37 | 
38 | 
39 | 
40 | 
41 | 


--------------------------------------------------------------------------------
/tests/unit_tests/test_performance.py:
--------------------------------------------------------------------------------
 1 | from NERDA.datasets import get_dane_data
 2 | from NERDA.models import NERDA
 3 | import pandas as pd
 4 | 
 5 | # instantiate a minimal model.
 6 | model = NERDA(dataset_training = get_dane_data('train', 5),
 7 |               dataset_validation = get_dane_data('dev', 5),
 8 |               transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
 9 |               hyperparameters = {'epochs' : 1,
10 |                                  'warmup_steps' : 10,
11 |                                  'train_batch_size': 5,
12 |                                  'learning_rate': 0.0001})
13 | 
14 | test = get_dane_data('test')
15 | perf = model.evaluate_performance(test)
16 | 
17 | def test_performance_df():
18 |     assert isinstance(perf, pd.DataFrame)
19 | 
20 | def test_performance_len():
21 |     assert len(perf) > 0
22 | 
23 | def test_includes_relevant_metrics():
24 |     metrics = ['F1-Score', 'Precision', 'Recall']
25 |     assert all([x in perf.columns for x in metrics])
26 | 
27 | def test_metrics_dtype():
28 |     metrics = ['F1-Score', 'Precision', 'Recall']
29 |     assert all([perf.dtypes[x] == 'float' for x in metrics])
30 | 
31 | 


--------------------------------------------------------------------------------
/tests/unit_tests/test_precooked.py:
--------------------------------------------------------------------------------
1 | from NERDA.precooked import DA_ELECTRA_DA
2 | 
3 | def test_load_precooked():
4 |     """Test that precooked model can be (down)loaded, instantiated and works end-to-end"""
5 |     m = DA_ELECTRA_DA()
6 |     m.download_network()
7 |     m.load_network()
8 |     m.predict_text("Jens Hansen har en bondegård. Det har han!")
9 | 


--------------------------------------------------------------------------------
/tests/unit_tests/test_predictions.py:
--------------------------------------------------------------------------------
 1 | from NERDA.datasets import get_dane_data
 2 | from NERDA.models import NERDA
 3 | import nltk
 4 | 
 5 | # instantiate a minimal model.
 6 | model = NERDA(dataset_training = get_dane_data('train', 5),
 7 |               dataset_validation = get_dane_data('dev', 5),
 8 |               transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
 9 |               hyperparameters = {'epochs' : 1,
10 |                                  'warmup_steps' : 10,
11 |                                  'train_batch_size': 5,
12 |                                  'learning_rate': 0.0001})
13 | 
14 | 
15 | # set example texts to identify entities in.
16 | text_single = "Pernille Rosenkrantz-Theil kommer fra Vejle"
17 | sentences = [nltk.word_tokenize(text_single)]
18 | 
19 | def test_predict():
20 |     """Test that predict runs"""
21 |     predictions = model.predict(sentences)
22 | 
23 | predictions = model.predict(sentences)
24 | 
25 | def test_predict_type():
26 |     """Test token predictions"""
27 |     assert isinstance(predictions, list)
28 | 
29 | def test_predict_length():
30 |     """Test that sentence and prediction lenghts match"""
31 |     assert len(sentences[0])==len(predictions[0])
32 | 
33 | def test_predict_text():
34 |     """Test that predict_text runs"""
35 |     predictions = model.predict_text(text_single)
36 | 
37 | def test_predict_maxlen_exceed():
38 |     """That that exceeding max len does not break predict"""
39 |     text = "ice " * 200
40 |     sentences = [nltk.word_tokenize(text)]
41 |     model.predict(sentences)
42 | 
43 | # test confidence scores
44 | words, preds = model.predict_text(text_single, return_confidence=True)
45 | 
46 | def test_confs_len():
47 |     assert len(preds[0])==len(preds[1])
48 | 
49 | predictions_text_single = model.predict_text(text_single)
50 | 
51 | def test_predict_text_format():
52 |     """Test text predictions"""
53 |     assert isinstance(predictions_text_single, tuple)
54 | 
55 | def test_predict_text_match_words_predictions():
56 |     assert len(predictions_text_single[0][0]) == len(predictions_text_single[1][0])
57 | 
58 | # multiple sentences.
59 | text_multi = """
60 | Pernille Rosenkrantz-Theil kommer fra Vejle.
61 | Jens Hansen har en bondegård.
62 | """
63 | 
64 | def test_predict_text_multi():
65 |     """Test that predict_text runs with multiple sentences"""
66 |     predictions = model.predict_text(text_multi, batch_size = 2)
67 | 
68 | predictions_text_multi = model.predict_text(text_multi, batch_size = 2)
69 | 
70 | def test_predict_text_multi_format():
71 |     """Test multi-sentence text predictions has expected format"""
72 |     assert isinstance(predictions_text_multi, tuple)
73 | 
74 | def test_predict_text_multi_elements_count():
75 |     """Test dimensions of multi-sentence text predictions"""
76 |     assert [len(predictions_text_multi[0]), len(predictions_text_multi[1])] == [2, 2]
77 | 
78 | def test_predict_text_multi_lens():
79 |     """Test lengths of multi-sentence text predictions"""
80 |     s1 = len(predictions_text_multi[0][0]) == len(predictions_text_multi[1][0])
81 |     s2 = len(predictions_text_multi[0][1]) == len(predictions_text_multi[1][1])
82 |     assert all([s1, s2])
83 | 
84 | 


--------------------------------------------------------------------------------
/tests/unit_tests/test_training.py:
--------------------------------------------------------------------------------
 1 | from NERDA.datasets import get_dane_data
 2 | from NERDA.models import NERDA
 3 | 
 4 | # instantiate a minimal model.
 5 | model = NERDA(dataset_training = get_dane_data('train', 5),
 6 |               dataset_validation = get_dane_data('dev', 5),
 7 |               transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
 8 |               hyperparameters = {'epochs' : 1,
 9 |                                  'warmup_steps' : 10,
10 |                                  'train_batch_size': 5,
11 |                                  'learning_rate': 0.0001})
12 | 
13 | def test_training():
14 |     """Test if training runs successfully"""
15 |     model.train()
16 | 
17 | def test_training_exceed_maxlen():
18 |     """Test if traning does not break even though MAX LEN is exceeded"""
19 |     m = NERDA(dataset_training = get_dane_data('train', 5),
20 |               dataset_validation = get_dane_data('dev', 5),
21 |               max_len = 3,
22 |               transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
23 |               hyperparameters = {'epochs' : 1,
24 |                                  'warmup_steps' : 10,
25 |                                  'train_batch_size': 5,
26 |                                  'learning_rate': 0.0001})
27 |     m.train()
28 | 
29 | def test_training_bert():
30 |     """Test if traning does not break even though MAX LEN is exceeded"""
31 |     m = NERDA(dataset_training = get_dane_data('train', 5),
32 |               dataset_validation = get_dane_data('dev', 5),
33 |               transformer = 'bert-base-multilingual-uncased',
34 |               hyperparameters = {'epochs' : 1,
35 |                                  'warmup_steps' : 10,
36 |                                  'train_batch_size': 5,
37 |                                  'learning_rate': 0.0001})
38 |     m.train()
39 | 


--------------------------------------------------------------------------------