├── .github
└── workflows
│ └── build.yml
├── .gitignore
├── LICENSE
├── NEWS.md
├── README.md
├── admin
├── deploy_models.py
├── sandbox.py
└── tuning.py
├── dev-requirements.txt
├── docs
├── datasets.md
├── index.md
├── nerda_models.md
├── networks.md
├── performance.md
├── preamble.py
├── precooked_models.md
├── predictions.md
└── workflow.ipynb
├── logo.png
├── mkdocs.yml
├── pytest.ini
├── setup.cfg
├── setup.py
├── src
└── NERDA
│ ├── __init__.py
│ ├── datasets.py
│ ├── models.py
│ ├── networks.py
│ ├── performance.py
│ ├── precooked.py
│ ├── predictions.py
│ ├── preprocessing.py
│ ├── training.py
│ └── utils.py
└── tests
└── unit_tests
├── test_aaaNERDA.py
├── test_performance.py
├── test_precooked.py
├── test_predictions.py
└── test_training.py
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | #---------------------------------------------------------------------
2 | # Build, Test and Publish Package
3 | #---------------------------------------------------------------------
4 | name: build
5 |
6 | on: [push]
7 |
8 | jobs:
9 |
10 | Build:
11 |
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - uses: actions/checkout@v2
16 |
17 | - name: Cache Python packages
18 | uses: actions/cache@v2
19 | with:
20 | path: ~/.cache/pip
21 | key: ${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }}
22 |
23 | - name: Set up Python
24 | uses: actions/setup-python@v2
25 | with:
26 | python-version: '3.7'
27 |
28 | - name: Display Python version
29 | run: python -c "import sys; print(sys.version)"
30 |
31 | - name: Install pip and dev requirements
32 | run: |
33 | python -m pip install --upgrade pip
34 | pip install -r dev-requirements.txt
35 |
36 | - name: Lint
37 | run: |
38 | # stop the build if there are Python syntax errors or undefined names
39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude=.git,__pycache__,docs/source/conf.py,old,build,dist,admin
40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
41 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --exclude=.git,__pycache__,docs/source/conf.py,old,build,dist,admin --statistics --format=html --htmldir=flake-report
42 |
43 | - name: Upload Lint results
44 | uses: actions/upload-artifact@v2
45 | with:
46 | name: lint-results
47 | path: flake-report/
48 |
49 | - name: Run tests
50 | run: |
51 | python setup.py test
52 |
53 | - name: Publish test results
54 | uses: EnricoMi/publish-unit-test-result-action@v1.6
55 | if: always()
56 | with:
57 | github_token: ${{ secrets.GITHUB_TOKEN }}
58 | files: test-results/**/*.xml
59 |
60 | - name: Upload coverage to Codecov
61 | uses: codecov/codecov-action@v1
62 | with:
63 | token: ${{ secrets.CODECOV_TOKEN }}
64 | file: coverage.xml
65 | flags: unittests
66 |
67 | - name: Deploy docs
68 | if: github.ref == 'refs/heads/main'
69 | run: |
70 | pip install .
71 | mkdocs gh-deploy --force
72 |
73 | - name: Build and publish to TEST PyPI
74 | if: github.ref != 'refs/heads/main'
75 | env:
76 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
77 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
78 | run: |
79 | python setup.py sdist bdist_wheel
80 | twine upload -r testpypi dist/*
81 |
82 | - name: Publish to PyPI
83 | env:
84 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
85 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
86 | if: github.ref == 'refs/heads/main'
87 | run: |
88 | python setup.py sdist bdist_wheel
89 | twine upload dist/*
90 |
91 |
92 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # Data and models
7 | /data
8 | /daNER
9 | src/*.pickle
10 | src/*.pkl
11 | *.csv
12 | *.pickle
13 |
14 | # playground
15 | tester.py
16 | tester2.py
17 | src/playground.py
18 |
19 | # C extensions
20 | *.so
21 |
22 | # Distribution / packaging
23 | .Python
24 | build/
25 | develop-eggs/
26 | dist/
27 | downloads/
28 | eggs/
29 | .eggs/
30 | lib/
31 | lib64/
32 | parts/
33 | sdist/
34 | var/
35 | wheels/
36 | pip-wheel-metadata/
37 | share/python-wheels/
38 | *.egg-info/
39 | .installed.cfg
40 | *.egg
41 | MANIFEST
42 | test-results/
43 |
44 |
45 | # PyInstaller
46 | # Usually these files are written by a python script from a template
47 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
48 | *.manifest
49 | *.spec
50 |
51 | # Installer logs
52 | pip-log.txt
53 | pip-delete-this-directory.txt
54 |
55 | # Unit test / coverage reports
56 | htmlcov/
57 | flake-report/
58 | .tox/
59 | .nox/
60 | .coverage
61 | .coverage.*
62 | .cache
63 | nosetests.xml
64 | coverage.xml
65 | *.cover
66 | *.py,cover
67 | .hypothesis/
68 | .pytest_cache/
69 |
70 | # Translations
71 | *.mo
72 | *.pot
73 |
74 | # Django stuff:
75 | *.log
76 | local_settings.py
77 | db.sqlite3
78 | db.sqlite3-journal
79 |
80 | # Flask stuff:
81 | instance/
82 | .webassets-cache
83 |
84 | # Scrapy stuff:
85 | .scrapy
86 |
87 | # documentation
88 | docs/_build/
89 |
90 | # PyBuilder
91 | target/
92 |
93 | # Jupyter Notebook
94 | .ipynb_checkpoints
95 |
96 | # IPython
97 | profile_default/
98 | ipython_config.py
99 |
100 | # pyenv
101 | .python-version
102 |
103 | # pipenv
104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
107 | # install all needed dependencies.
108 | #Pipfile.lock
109 |
110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
111 | __pypackages__/
112 |
113 | # Celery stuff
114 | celerybeat-schedule
115 | celerybeat.pid
116 |
117 | # SageMath parsed files
118 | *.sage.py
119 |
120 | # Environments
121 | .env
122 | .venv
123 | env/
124 | venv/
125 | ENV/
126 | env.bak/
127 | venv.bak/
128 |
129 | # Spyder project settings
130 | .spyderproject
131 | .spyproject
132 |
133 | # Rope project settings
134 | .ropeproject
135 |
136 | # mkdocs documentation
137 | /site
138 |
139 | # mypy
140 | .mypy_cache/
141 | .dmypy.json
142 | dmypy.json
143 |
144 | # Pyre type checker
145 | .pyre/
146 |
147 | # User generated folders
148 | runs/
149 |
150 | # User created models
151 | *.bin
152 |
153 | # tensor board results
154 | .DS_Store
155 | .vscode
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Ekstra Bladet, PIN
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # NERDA 1.0.0
2 |
3 | * NERDA model class is now equipped with functions for saving (loading) weights for a fine-tuned NERDA Network to (from) file. See functions model.save_network() and model.load_network_from_file()
4 |
5 | # NERDA 0.9.7
6 |
7 | * return confidence scores for predictions of all tokens, e.g. model.predict(x, return_confidence=True).
8 |
9 | # NERDA 0.9.6
10 |
11 | * compute Precision, Recall and Accuracy (optional) with evaluate_performance().
12 | * improve relative imports inside package.
13 |
14 | # NERDA 0.9.5
15 |
16 | * ... bugfixes.
17 |
18 | # NERDA 0.9.4
19 |
20 | * functionality for dynamic quantization, fp32 to fp16, padding parametrized.
21 |
22 | # NERDA 0.9.2
23 |
24 | * remove precooked DA_BERT_ML_16BIT, include precooked DA_DISTILBERT_ML.
25 |
26 | # NERDA 0.9.1
27 |
28 | * include 16 bit FP precooked DA_BERT_ML_16BIT.
29 |
30 | # NERDA 0.9.0
31 |
32 | * Support new versions of `transformers` (4.x) and `torch`
33 |
34 | # NERDA 0.8.7
35 |
36 | * BUGFIX: Restrict torch version.
37 | * Do not import datasets as part of Precooked Models.
38 | * Do not load datasets if not provided by user.
39 |
40 | # NERDA 0.8.6
41 |
42 | * First official release.
43 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NERDA
2 |
3 | 
4 | [](https://codecov.io/gh/ebanalyse/NERDA)
5 | 
6 | 
7 | 
8 |
9 | Not only is `NERDA` a mesmerizing muppet-like character. `NERDA` is also
10 | a python package, that offers a slick easy-to-use interface for fine-tuning
11 | pretrained transformers for Named Entity Recognition
12 | (=NER) tasks.
13 |
14 | You can also utilize `NERDA` to access a selection of *precooked* `NERDA` models,
15 | that you can use right off the shelf for NER tasks.
16 |
17 | `NERDA` is built on `huggingface` `transformers` and the popular `pytorch`
18 | framework.
19 |
20 | ## Installation guide
21 | `NERDA` can be installed from [PyPI](https://pypi.org/project/NERDA/) with
22 |
23 | ```
24 | pip install NERDA
25 | ```
26 |
27 | If you want the development version then install directly from [GitHub](https://github.com/ebanalyse/NERDA).
28 |
29 | ## Named-Entity Recognition tasks
30 | Named-entity recognition (NER) (also known as (named) entity identification,
31 | entity chunking, and entity extraction) is a subtask of information extraction
32 | that seeks to locate and classify named entities mentioned in unstructured
33 | text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.[1]
34 |
35 | [1]: https://en.wikipedia.org/wiki/Named-entity_recognition
36 |
37 | ### Example Task:
38 |
39 | **Task**
40 |
41 | Identify person names and organizations in text:
42 |
43 | *Jim bought 300 shares of Acme Corp.*
44 |
45 | **Solution**
46 |
47 | | **Named Entity** | **Type** |
48 | |--------------------|-----------------------|
49 | | 'Jim' | Person |
50 | | 'Acme Corp.' | Organization |
51 |
52 | Read more about NER on [Wikipedia](https://en.wikipedia.org/wiki/Named-entity_recognition).
53 |
54 | ## Train Your Own `NERDA` Model
55 |
56 | Say, we want to fine-tune a pretrained [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) transformer for NER in English.
57 |
58 | Load package.
59 |
60 | ```python
61 | from NERDA.models import NERDA
62 | ```
63 |
64 | Instantiate a `NERDA` model (*with default settings*) for the
65 | [`CoNLL-2003`](https://www.clips.uantwerpen.be/conll2003/ner/)
66 | English NER data set.
67 |
68 | ```python
69 | from NERDA.datasets import get_conll_data
70 | model = NERDA(dataset_training = get_conll_data('train'),
71 | dataset_validation = get_conll_data('valid'),
72 | transformer = 'bert-base-multilingual-uncased')
73 | ```
74 |
75 | By default the network architecture is analogous to that of the models in [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf).
76 |
77 | The model can then be trained/fine-tuned by invoking the `train` method, e.g.
78 |
79 | ```python
80 | model.train()
81 | ```
82 |
83 | **Note**: this will take some time depending on the dimensions of your machine
84 | (if you want to skip training, you can go ahead and use one of the models,
85 | that we have already precooked for you instead).
86 |
87 | After the model has been trained, the model can be used for predicting
88 | named entities in new texts.
89 |
90 | ```python
91 | # text to identify named entities in.
92 | text = 'Old MacDonald had a farm'
93 | model.predict_text(text)
94 | ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
95 | ```
96 | This means, that the model identified 'Old MacDonald' as a *PER*son.
97 |
98 | Please note, that the `NERDA` model configuration above was instantiated
99 | with all default settings. You can however customize your `NERDA` model
100 | in a lot of ways:
101 |
102 | - Use your own data set (finetune a transformer for any given language)
103 | - Choose whatever transformer you like
104 | - Set all of the hyperparameters for the model
105 | - You can even apply your own Network Architecture
106 |
107 | Read more about advanced usage of `NERDA` in the [detailed documentation](https://ebanalyse.github.io/NERDA/workflow).
108 |
109 | ## Use a Precooked `NERDA` model
110 |
111 | We have precooked a number of `NERDA` models for Danish and English, that you can download
112 | and use right off the shelf.
113 |
114 | Here is an example.
115 |
116 | Instantiate a multilingual BERT model, that has been finetuned for NER in Danish,
117 | `DA_BERT_ML`.
118 |
119 | ```python
120 | from NERDA.precooked import DA_BERT_ML
121 | model = DA_BERT_ML()
122 | ```
123 |
124 | Down(load) network from web:
125 |
126 | ```python
127 | model.download_network()
128 | model.load_network()
129 | ```
130 |
131 | You can now predict named entities in new (Danish) texts
132 |
133 | ```python
134 | # (Danish) text to identify named entities in:
135 | # 'Jens Hansen har en bondegård' = 'Old MacDonald had a farm'
136 | text = 'Jens Hansen har en bondegård'
137 | model.predict_text(text)
138 | ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
139 | ```
140 |
141 | ### List of Precooked Models
142 |
143 | The table below shows the precooked `NERDA` models publicly available for download.
144 |
145 | | **Model** | **Language** | **Transformer** | **Dataset** | **F1-score** |
146 | |-----------------|--------------|-------------------|---------|-----|
147 | | `DA_BERT_ML` | Danish | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 82.8 |
148 | | `DA_ELECTRA_DA` | Danish | [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 79.8 |
149 | | `EN_BERT_ML` | English | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)| [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 90.4 |
150 | | `EN_ELECTRA_EN` | English | [English ELECTRA](https://huggingface.co/google/electra-small-discriminator) | [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 89.1 |
151 |
152 | **F1-score** is the micro-averaged F1-score across entity tags and is
153 | evaluated on the respective test sets (that have not been used for training nor
154 | validation of the models).
155 |
156 | Note, that we have not spent a lot of time on actually fine-tuning the models,
157 | so there could be room for improvement. If you are able to improve the models,
158 | we will be happy to hear from you and include your `NERDA` model.
159 |
160 | ### Model Performance
161 |
162 | The table below summarizes the performance (F1-scores) of the precooked `NERDA` models.
163 |
164 | | **Level** | `DA_BERT_ML` | `DA_ELECTRA_DA` | `EN_BERT_ML` | `EN_ELECTRA_EN` |
165 | |---------------|--------------|-----------------|--------------|-----------------|
166 | | B-PER | 93.8 | 92.0 | 96.0 | 95.1 |
167 | | I-PER | 97.8 | 97.1 | 98.5 | 97.9 |
168 | | B-ORG | 69.5 | 66.9 | 88.4 | 86.2 |
169 | | I-ORG | 69.9 | 70.7 | 85.7 | 83.1 |
170 | | B-LOC | 82.5 | 79.0 | 92.3 | 91.1 |
171 | | I-LOC | 31.6 | 44.4 | 83.9 | 80.5 |
172 | | B-MISC | 73.4 | 68.6 | 81.8 | 80.1 |
173 | | I-MISC | 86.1 | 63.6 | 63.4 | 68.4 |
174 | | **AVG_MICRO** | 82.8 | 79.8 | 90.4 | 89.1 |
175 | | **AVG_MACRO** | 75.6 | 72.8 | 86.3 | 85.3 |
176 |
177 | ## 'NERDA'?
178 | '`NERDA`' originally stands for *'Named Entity Recognition for DAnish'*. However, this
179 | is somewhat misleading, since the functionality is no longer limited to Danish.
180 | On the contrary it generalizes to all other languages, i.e. `NERDA` supports
181 | fine-tuning of transformers for NER tasks for any arbitrary
182 | language.
183 |
184 | ## Background
185 | `NERDA` is developed as a part of [Ekstra Bladet](https://ekstrabladet.dk/)’s activities on Platform Intelligence in News (PIN). PIN is an industrial research project that is carried out in collaboration between the [Technical University of Denmark](https://www.dtu.dk/), [University of Copenhagen](https://www.ku.dk/) and [Copenhagen Business School](https://www.cbs.dk/) with funding from [Innovation Fund Denmark](https://innovationsfonden.dk/). The project runs from 2020-2023 and develops recommender systems and natural language processing systems geared for news publishing, some of which are open sourced like `NERDA`.
186 |
187 | ## Shout-outs
188 | - Thanks to [Alexandra Institute](https://alexandra.dk/), whose [`danlp`](https://github.com/alexandrainst/danlp) package encouraged us to develop this package.
189 | - Thanks to [Malte Højmark-Bertelsen](https://www.linkedin.com/in/malte-h%C3%B8jmark-bertelsen-9a618017b/) and [Kasper Junge](https://www.linkedin.com/in/kasper-juunge/?originalSubdomain=dk) for giving feedback on `NERDA`.
190 |
191 | ## Read more
192 | The detailed documentation for `NERDA` including code references and
193 | extended workflow examples can be accessed [here](https://ebanalyse.github.io/NERDA/).
194 |
195 | ## Cite this work
196 |
197 | ```
198 | @inproceedings{nerda,
199 | title = {NERDA},
200 | author = {Kjeldgaard, Lars and Nielsen, Lukas},
201 | year = {2021},
202 | publisher = {{GitHub}},
203 | url = {https://github.com/ebanalyse/NERDA}
204 | }
205 | ```
206 |
207 | ## Contact
208 | We hope, that you will find `NERDA` useful.
209 |
210 | Please direct any questions and feedbacks to
211 | [us](mailto:lars.kjeldgaard@eb.dk)!
212 |
213 | If you want to contribute (which we encourage you to), open a
214 | [PR](https://github.com/ebanalyse/NERDA/pulls).
215 |
216 | If you encounter a bug or want to suggest an enhancement, please
217 | [open an issue](https://github.com/ebanalyse/NERDA/issues).
218 |
219 |
--------------------------------------------------------------------------------
/admin/deploy_models.py:
--------------------------------------------------------------------------------
1 | from NERDA.datasets import get_conll_data, get_dane_data
2 | import pandas as pd
3 | import torch
4 | import boto3
5 |
def deploy_model_to_s3(model, test_set = None):
    """Deploy Model to S3

    Saves the model's network weights and a CSV with its performance on
    the test set, then uploads both files to the 'nerda' S3 bucket.

    Args:
        model: NERDA model.
        test_set: Test set for evaluating performance. If None, the
            DaNE test set is loaded (lazily, at call time).

    Returns:
        str: message saying, if model was uploaded successfully.
            Model and text file with performance numbers uploaded
            as side-effects.
    """
    # NOTE(fix): the test set used to be a default argument
    # (test_set = get_dane_data('test')), which is evaluated once at import
    # time and triggers a dataset download merely by importing this module.
    # Load it lazily instead.
    if test_set is None:
        test_set = get_dane_data('test')

    # files are named after the concrete model class, e.g. 'EN_ELECTRA_EN'.
    model_name = type(model).__name__

    file_model = f'{model_name}.bin'
    torch.save(model.network.state_dict(), file_model)

    # compute performance on test set and save.
    performance = model.evaluate_performance(test_set)

    # write to csv.
    file_performance = f'{model_name}_performance.csv'
    performance.to_csv(file_performance, index = False)

    # upload to S3 bucket.
    s3 = boto3.resource('s3')
    s3.Bucket('nerda').upload_file(
        Filename=file_model,
        Key = file_model)
    s3.Bucket('nerda').upload_file(
        Filename=file_performance,
        Key = file_performance)

    return "Model deployed to S3 successfully."
41 |
if __name__ == '__main__':
    # Fine-tune the precooked English ELECTRA model from scratch and
    # ship the resulting weights + performance report to S3.
    from NERDA.precooked import EN_ELECTRA_EN
    electra_model = EN_ELECTRA_EN()
    electra_model.train()

    deploy_model_to_s3(electra_model)
48 |
49 |
--------------------------------------------------------------------------------
/admin/sandbox.py:
--------------------------------------------------------------------------------
# Scratch/sandbox script for ad-hoc experiments -- not part of the package.
from NERDA.models import NERDA
from NERDA.datasets import get_conll_data, get_dane_data
from transformers import AutoTokenizer
import pickle
import sys

trans = 'bert-base-multilingual-uncased'
tokenizer = AutoTokenizer.from_pretrained(trans, do_lower_case = True)
data = get_dane_data('train')

sents = data.get('sentences')

out = []

# Tokenize every sentence to inspect tokenized sentence lengths.
# NOTE(fix): the original loop body overwrote the loop variable with
# 'sent = sents[3595]' (a leftover debugging line), so every iteration
# tokenized the same sentence. The loop now tokenizes each sentence.
for sent in sents:
    tokens = []
    for word in sent:
        tokens.extend(tokenizer.tokenize(word))
    out.append(tokens)

lens = [len(x) for x in out]

max(lens)

sents[3595]  # inspect a specific (long) sentence


from transformers import AutoTokenizer, AutoModel, AutoConfig
t = 'google/electra-small-discriminator'
cfg = AutoModel.from_pretrained(t)


#trn = get_conll_data('train')
#idx_min = 3110
#idx_max = 3115
#valid = get_conll_data('valid')
#valid['sentences'] = valid['sentences'][idx_min:idx_max+1]
#valid['tags'] = valid['tags'][idx_min:idx_max+1]
#trn['sentences'] = trn['sentences'][idx_min:idx_max+1]
#trn['tags'] = trn['tags'][idx_min:idx_max+1]
# model = NERDA(dataset_training=trn,
#               dataset_validation = valid)
#model.train()

# Load a previously pickled model and evaluate it on the CoNLL test set.
filename = 'en_bert_ml.pkl'
# pickle.dump(model, open(filename, 'wb'))
# NOTE(fix): use a context manager so the file handle is closed.
with open(filename, 'rb') as file:
    model = pickle.load(file)
test = get_conll_data('test')
model.evaluate_performance(test, batch_size = 10)

# Evaluate on a single test sentence.
test = get_conll_data('test')
idx_min = 202
idx_max = 202
test['sentences'] = test['sentences'][idx_min:idx_max+1]
test['tags'] = test['tags'][idx_min:idx_max+1]
model.evaluate_performance(test)

# NOTE(fix): the original file contained an unresolved git merge conflict
# (<<<<<<< HEAD ... >>>>>>>), which is a syntax error. The HEAD side
# (belonging to admin/sandbox.py) has been kept.
transformer = "google/electra-small-discriminator"
from transformers import AutoTokenizer, AutoModel, AutoConfig
trans = AutoConfig.from_pretrained(transformer)

def tester():
    """Try to load the ELECTRA discriminator; report and re-raise failures."""
    try:
        model = AutoModel.from_pretrained('google/electra-small-discriminator')
    # NOTE(fix): was a bare 'except:' that swallowed everything and then
    # returned an unbound 'model' (NameError). Print and re-raise instead.
    except Exception:
        print("Oops!", sys.exc_info()[0], "occurred.")
        raise

    return model
--------------------------------------------------------------------------------
/admin/tuning.py:
--------------------------------------------------------------------------------
1 | from sys import getdefaultencoding
2 | from NERDA.models import NERDA
3 | from NERDA.datasets import get_dane_data
4 | from hyperopt import fmin, hp, tpe, space_eval
5 | from hyperopt.pyll import scope
6 | import numpy as np
7 |
def objective(params):
    """Objective for hyperparameter search.

    Trains a NERDA model on a small DaNE sample with the given
    hyperparameters and returns the resulting validation loss
    (the quantity hyperopt minimizes).
    """
    print(params)

    # small samples (20 observations) keep each evaluation cheap.
    nerda_model = NERDA(dataset_training = get_dane_data('train', 20),
                        dataset_validation = get_dane_data('dev', 20),
                        hyperparameters = params)

    nerda_model.train()

    return nerda_model.valid_loss
19 |
def run_parameter_optimization(objective, number_of_evals = 3):
    """Minimize 'objective' over the hyperparameter space with hyperopt TPE.

    Args:
        objective: callable taking a dict of hyperparameters and
            returning the value to minimize (validation loss).
        number_of_evals: maximum number of objective evaluations.

    Returns:
        dict: best hyperparameter values found by hyperopt.
    """
    # search space: log-uniform learning rate, integer batch size and
    # epoch count, and a discrete choice of warmup steps.
    search_space = {
        'learning_rate': hp.loguniform('lr', np.log(0.00005), np.log(0.01)),
        'train_batch_size': scope.int(hp.uniform('batch_size', 8, 16)),
        'epochs': scope.int(hp.uniform('epochs', 1, 3)),
        'warmup_steps': hp.choice('warmup_steps', [0, 250, 500]),
    }

    print('Running hyperparameter optimization...')

    return fmin(objective,
                space = search_space,
                algo = tpe.suggest,
                max_evals = number_of_evals)
34 |
35 | # best_params = run_parameter_optimization(objective = objective, number_of_evals=3)
36 |
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | flake8-html
2 | wheel
3 | twine
4 | mkdocs-material
5 | mkdocstrings
6 | mknotebooks
7 | jupyter
--------------------------------------------------------------------------------
/docs/datasets.md:
--------------------------------------------------------------------------------
1 | # Datasets
2 | ::: NERDA.datasets
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # NERDA
2 |
3 | 
4 | [](https://codecov.io/gh/ebanalyse/NERDA)
5 | 
6 | 
7 | 
8 |
9 | Not only is `NERDA` a mesmerizing muppet-like character. `NERDA` is also
10 | a python package, that offers a slick easy-to-use interface for fine-tuning
11 | pretrained transformers for Named Entity Recognition
12 | (=NER) tasks.
13 |
14 | You can also utilize `NERDA` to access a selection of *precooked* `NERDA` models,
15 | that you can use right off the shelf for NER tasks.
16 |
17 | `NERDA` is built on `huggingface` `transformers` and the popular `pytorch`
18 | framework.
19 |
20 | ## Installation guide
21 | `NERDA` can be installed from [PyPI](https://pypi.org/project/NERDA/) with
22 |
23 | ```
24 | pip install NERDA
25 | ```
26 |
27 | If you want the development version then install directly from [GitHub](https://github.com/ebanalyse/NERDA).
28 |
29 | ## Named-Entity Recognition tasks
30 | Named-entity recognition (NER) (also known as (named) entity identification,
31 | entity chunking, and entity extraction) is a subtask of information extraction
32 | that seeks to locate and classify named entities mentioned in unstructured
33 | text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.[1]
34 |
35 | [1]: https://en.wikipedia.org/wiki/Named-entity_recognition
36 |
37 | ### Example Task:
38 |
39 | **Task**
40 |
41 | Identify person names and organizations in text:
42 |
43 | *Jim bought 300 shares of Acme Corp.*
44 |
45 | **Solution**
46 |
47 | | **Named Entity** | **Type** |
48 | |--------------------|-----------------------|
49 | | 'Jim' | Person |
50 | | 'Acme Corp.' | Organization |
51 |
52 | Read more about NER on [Wikipedia](https://en.wikipedia.org/wiki/Named-entity_recognition).
53 |
54 | ## Train Your Own `NERDA` Model
55 |
56 | Say, we want to fine-tune a pretrained [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) transformer for NER in English.
57 |
58 | Load package.
59 |
60 | ```python
61 | from NERDA.models import NERDA
62 | ```
63 |
64 | Instantiate a `NERDA` model (*with default settings*) for the
65 | [`CoNLL-2003`](https://www.clips.uantwerpen.be/conll2003/ner/)
66 | English NER data set.
67 |
68 | ```python
69 | from NERDA.datasets import get_conll_data
70 | model = NERDA(dataset_training = get_conll_data('train'),
71 | dataset_validation = get_conll_data('valid'),
72 | transformer = 'bert-base-multilingual-uncased')
73 | ```
74 |
75 | By default the network architecture is analogous to that of the models in [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf).
76 |
77 | The model can then be trained/fine-tuned by invoking the `train` method, e.g.
78 |
79 | ```python
80 | model.train()
81 | ```
82 |
83 | **Note**: this will take some time depending on the dimensions of your machine
84 | (if you want to skip training, you can go ahead and use one of the models,
85 | that we have already precooked for you instead).
86 |
87 | After the model has been trained, the model can be used for predicting
88 | named entities in new texts.
89 |
90 | ```python
91 | # text to identify named entities in.
92 | text = 'Old MacDonald had a farm'
93 | model.predict_text(text)
94 | ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
95 | ```
96 | This means, that the model identified 'Old MacDonald' as a *PER*son.
97 |
98 | Please note, that the `NERDA` model configuration above was instantiated
99 | with all default settings. You can however customize your `NERDA` model
100 | in a lot of ways:
101 |
102 | - Use your own data set (finetune a transformer for any given language)
103 | - Choose whatever transformer you like
104 | - Set all of the hyperparameters for the model
105 | - You can even apply your own Network Architecture
106 |
107 | Read more about advanced usage of `NERDA` in the [detailed documentation](https://ebanalyse.github.io/NERDA/workflow).
108 |
109 | ## Use a Precooked `NERDA` model
110 |
111 | We have precooked a number of `NERDA` models for Danish and English, that you can download
112 | and use right off the shelf.
113 |
114 | Here is an example.
115 |
116 | Instantiate a multilingual BERT model, that has been finetuned for NER in Danish,
117 | `DA_BERT_ML`.
118 |
119 | ```python
120 | from NERDA.precooked import DA_BERT_ML
121 | model = DA_BERT_ML()
122 | ```
123 |
124 | Down(load) network from web:
125 |
126 | ```python
127 | model.download_network()
128 | model.load_network()
129 | ```
130 |
131 | You can now predict named entities in new (Danish) texts
132 |
133 | ```python
134 | # (Danish) text to identify named entities in:
135 | # 'Jens Hansen har en bondegård' = 'Old MacDonald had a farm'
136 | text = 'Jens Hansen har en bondegård'
137 | model.predict_text(text)
138 | ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])
139 | ```
140 |
141 | ### List of Precooked Models
142 |
143 | The table below shows the precooked `NERDA` models publicly available for download.
144 |
145 | | **Model** | **Language** | **Transformer** | **Dataset** | **F1-score** |
146 | |-----------------|--------------|-------------------|---------|-----|
147 | | `DA_BERT_ML` | Danish | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 82.8 |
148 | | `DA_ELECTRA_DA` | Danish | [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 79.8 |
149 | | `EN_BERT_ML` | English | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)| [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 90.4 |
150 | | `EN_ELECTRA_EN` | English | [English ELECTRA](https://huggingface.co/google/electra-small-discriminator) | [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 89.1 |
151 |
152 | **F1-score** is the micro-averaged F1-score across entity tags and is
153 | evaluated on the respective test sets (that have not been used for training nor
154 | validation of the models).
155 |
156 | Note, that we have not spent a lot of time on actually fine-tuning the models,
157 | so there could be room for improvement. If you are able to improve the models,
158 | we will be happy to hear from you and include your `NERDA` model.
159 |
160 | ### Model Performance
161 |
162 | The table below summarizes the performance (F1-scores) of the precooked `NERDA` models.
163 |
164 | | **Level** | `DA_BERT_ML` | `DA_ELECTRA_DA` | `EN_BERT_ML` | `EN_ELECTRA_EN` |
165 | |---------------|--------------|-----------------|--------------|-----------------|
166 | | B-PER | 93.8 | 92.0 | 96.0 | 95.1 |
167 | | I-PER | 97.8 | 97.1 | 98.5 | 97.9 |
168 | | B-ORG | 69.5 | 66.9 | 88.4 | 86.2 |
169 | | I-ORG | 69.9 | 70.7 | 85.7 | 83.1 |
170 | | B-LOC | 82.5 | 79.0 | 92.3 | 91.1 |
171 | | I-LOC | 31.6 | 44.4 | 83.9 | 80.5 |
172 | | B-MISC | 73.4 | 68.6 | 81.8 | 80.1 |
173 | | I-MISC | 86.1 | 63.6 | 63.4 | 68.4 |
174 | | **AVG_MICRO** | 82.8 | 79.8 | 90.4 | 89.1 |
175 | | **AVG_MACRO** | 75.6 | 72.8 | 86.3 | 85.3 |
176 |
177 | ## 'NERDA'?
178 | '`NERDA`' originally stands for *'Named Entity Recognition for DAnish'*. However, this
179 | is somewhat misleading, since the functionality is no longer limited to Danish.
180 | On the contrary it generalizes to all other languages, i.e. `NERDA` supports
181 | fine-tuning of transformers for NER tasks for any arbitrary
182 | language.
183 |
184 | ## Background
185 | `NERDA` is developed as a part of [Ekstra Bladet](https://ekstrabladet.dk/)’s activities on Platform Intelligence in News (PIN). PIN is an industrial research project that is carried out in collaboration between the [Technical University of Denmark](https://www.dtu.dk/), [University of Copenhagen](https://www.ku.dk/) and [Copenhagen Business School](https://www.cbs.dk/) with funding from [Innovation Fund Denmark](https://innovationsfonden.dk/). The project runs from 2020-2023 and develops recommender systems and natural language processing systems geared for news publishing, some of which are open sourced like `NERDA`.
186 |
187 | ## Shout-outs
188 | - Thanks to [Alexandra Institute](https://alexandra.dk/), creators of the [`danlp`](https://github.com/alexandrainst/danlp) package, for encouraging us to develop this package.
189 | - Thanks to [Malte Højmark-Bertelsen](https://www.linkedin.com/in/malte-h%C3%B8jmark-bertelsen-9a618017b/) and [Kasper Junge](https://www.linkedin.com/in/kasper-juunge/?originalSubdomain=dk) for giving feedback on `NERDA`.
190 |
191 | ## Contact
192 | We hope, that you will find `NERDA` useful.
193 |
194 | Please direct any questions and feedback to
195 | [us](mailto:lars.kjeldgaard@eb.dk)!
196 |
197 | If you want to contribute (which we encourage you to), open a
198 | [PR](https://github.com/ebanalyse/NERDA/pulls).
199 |
200 | If you encounter a bug or want to suggest an enhancement, please
201 | [open an issue](https://github.com/ebanalyse/NERDA/issues).
202 |
203 |
--------------------------------------------------------------------------------
/docs/nerda_models.md:
--------------------------------------------------------------------------------
1 | # NERDA Models
2 | ::: NERDA.models
--------------------------------------------------------------------------------
/docs/networks.md:
--------------------------------------------------------------------------------
1 | # Networks
2 | ::: NERDA.networks
--------------------------------------------------------------------------------
/docs/performance.md:
--------------------------------------------------------------------------------
1 | # Performance
2 | ::: NERDA.performance
--------------------------------------------------------------------------------
/docs/preamble.py:
--------------------------------------------------------------------------------
"""Notebook preamble executed by mknotebooks before running docs/workflow.ipynb
(configured via the `preamble` option in mkdocs.yml)."""

# suppress warnings for notebook
import warnings
warnings.filterwarnings("ignore")
# download nltk 'punkt' in order to use nltk word/sent-tokenize
# (NERDA's text prediction relies on NLTK tokenizers, which need this model)
import nltk
nltk.download('punkt')
--------------------------------------------------------------------------------
/docs/precooked_models.md:
--------------------------------------------------------------------------------
1 | # Precooked NERDA models
2 | ::: NERDA.precooked
--------------------------------------------------------------------------------
/docs/predictions.md:
--------------------------------------------------------------------------------
1 | # Predictions
2 | ::: NERDA.predictions
--------------------------------------------------------------------------------
/docs/workflow.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "language_info": {
4 | "codemirror_mode": {
5 | "name": "ipython",
6 | "version": 3
7 | },
8 | "file_extension": ".py",
9 | "mimetype": "text/x-python",
10 | "name": "python",
11 | "nbconvert_exporter": "python",
12 | "pygments_lexer": "ipython3",
13 | "version": "3.9.0-final"
14 | },
15 | "orig_nbformat": 2,
16 | "kernelspec": {
17 | "name": "python3",
18 | "display_name": "Python 3.9.0 64-bit ('3.9.0')",
19 | "metadata": {
20 | "interpreter": {
21 | "hash": "36071112a161297f2fd106003050184fbdff34ed057f375faa6d2f5f0cad40eb"
22 | }
23 | }
24 | }
25 | },
26 | "nbformat": 4,
27 | "nbformat_minor": 2,
28 | "cells": [
29 | {
30 | "source": [
31 | "# Workflow Examples"
32 | ],
33 | "cell_type": "markdown",
34 | "metadata": {}
35 | },
36 | {
37 | "source": [
38 | "`NERDA` offers a simple easy-to-use interface for fine-tuning transformers for Named-Entity Recognition (=NER). We call this family of models `NERDA` models.\n",
39 | "\n",
40 | "`NERDA` can be used in two ways. You can either (1) train your own customized `NERDA` model or (2) download and use one of our precooked `NERDA` models for inference i.e. identifying named entities in new texts."
41 | ],
42 | "cell_type": "markdown",
43 | "metadata": {}
44 | },
45 | {
46 | "source": [
47 | "## Train Your Own `NERDA` model"
48 | ],
49 | "cell_type": "markdown",
50 | "metadata": {}
51 | },
52 | {
53 | "source": [
54 | "We want to fine-tune a transformer for English. \n",
55 | "\n",
56 | "First, we download an English NER dataset [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) with annotated Named Entities, that we will use for training and evaluation of our model."
57 | ],
58 | "cell_type": "markdown",
59 | "metadata": {}
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "output_type": "error",
68 | "ename": "ModuleNotFoundError",
69 | "evalue": "No module named 'NERDA'",
70 | "traceback": [
71 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
72 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
73 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# don't print warnings for this session\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mNERDA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mget_dane_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_dane_data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdownload_dane_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
74 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'NERDA'"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "from NERDA.datasets import get_conll_data, download_conll_data\n",
80 | "download_conll_data()"
81 | ]
82 | },
83 | {
84 | "source": [
85 | "CoNLL-2003 operates with the following types of named entities:\n",
86 | "\n",
87 | "1. **PER**sons \n",
88 | "2. **ORG**anizations \n",
89 | "3. **LOC**ations \n",
90 | "4. **MISC**ellaneous \n",
91 | "5. **O**utside (Not a named Entity)\n",
92 | "\n",
93 | "An observation from the CoNLL-2003 data set looks like this."
94 | ],
95 | "cell_type": "markdown",
96 | "metadata": {}
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 3,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "output_type": "error",
105 | "ename": "NameError",
106 | "evalue": "name 'get_dane_data' is not defined",
107 | "traceback": [
108 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
109 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
110 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtraining\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_dane_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'train'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mvalidation\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_dane_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'dev'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# example\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraining\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'sentences'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mtags\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraining\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'tags'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
111 | "\u001b[0;31mNameError\u001b[0m: name 'get_dane_data' is not defined"
112 | ]
113 | }
114 | ],
115 | "source": [
116 | "# extract the first _5_ rows from the training and validation data splits.\n",
117 | "training = get_conll_data('train', 5)\n",
118 | "validation = get_conll_data('valid', 5)\n",
119 | "# example\n",
120 | "sentence = training.get('sentences')[0]\n",
121 | "tags = training.get('tags')[0]\n",
122 | "print(\"\\n\".join([\"{}/{}\".format(word, tag) for word, tag in zip(sentence, tags)]))"
123 | ]
124 | },
125 | {
126 | "source": [
127 | "If you provide your own dataset, it must have the same structure:\n",
128 | "\n",
129 | "- It must be a dictionary\n",
130 | "- The dictionary must contain\n",
131 | " - 'sentences': a list of word-tokenized sentences with one sentence per entry \n",
132 | " - 'tags': a list with the corresponding named-entity tags.\n",
133 | "\n",
134 | "The data set does however *not* have to follow the Inside-Outside-Beginning (IOB) tagging scheme[1].\n",
135 | "\n",
136 | "[1]: https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)\n",
137 | "\n",
138 | "The IOB tagging scheme implies, that words that are beginning of named entities are tagged with *'B-'* and words 'inside' (=continuations of) named entities are tagged with *'I-'*. That means that 'Joe Biden' should be tagged as `Joe(B-PER) Biden(I-PER)`.\n",
139 | "\n",
140 | "Now, instantiate a `NERDA` model for finetuning an [ELECTRA](https://huggingface.co/google/electra-small-discriminator) transformer for NER. "
141 | ],
142 | "cell_type": "markdown",
143 | "metadata": {}
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "from NERDA.models import NERDA\n",
152 | "tag_scheme = ['B-PER',\n",
153 | " 'I-PER', \n",
154 | " 'B-ORG', \n",
155 | " 'I-ORG', \n",
156 | " 'B-LOC', \n",
157 | " 'I-LOC', \n",
158 | " 'B-MISC', \n",
159 | " 'I-MISC']\n",
160 | "model = NERDA(dataset_training = training,\n",
161 | " dataset_validation = validation,\n",
162 | " tag_scheme = tag_scheme,\n",
163 | " tag_outside = 'O',\n",
164 | " transformer = 'google/electra-small-discriminator',\n",
165 | " hyperparameters = {'epochs' : 1,\n",
166 | " 'warmup_steps' : 10,\n",
167 | " 'train_batch_size': 5,\n",
168 | " 'learning_rate': 0.0001},)"
169 | ]
170 | },
171 | {
172 | "source": [
173 | "Note, this model configuration only uses 5 sentences for model training to minimize execution time. Also the hyperparameters for the model have been chosen in order to minimize execution time. Therefore this example only serves to illustrate the functionality i.e. the resulting model will suck.\n",
174 | "\n",
175 | "By default the network architecture is analogous that of the models in [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). \n",
176 | "\n",
177 | "The model can be trained right away by invoking the `train` method."
178 | ],
179 | "cell_type": "markdown",
180 | "metadata": {}
181 | },
182 | {
183 | "source": [
184 | "model.train()"
185 | ],
186 | "cell_type": "code",
187 | "metadata": {},
188 | "execution_count": null,
189 | "outputs": []
190 | },
191 | {
192 | "source": [
193 | "We can compute the performance of the model on a test set (limited to 5 sentences):"
194 | ],
195 | "cell_type": "markdown",
196 | "metadata": {}
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "test = get_conll_data('test', 5)\n",
205 | "model.evaluate_performance(test)"
206 | ]
207 | },
208 | {
209 | "source": [
210 | "Unsurprisingly, the model sucks in this case due to the ludicrous specification.\n",
211 | "\n",
212 | "Named Entities in new texts can be predicted with `predict` functions."
213 | ],
214 | "cell_type": "markdown",
215 | "metadata": {}
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 2,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "text = \"Old MacDonald had a farm\"\n",
224 | "model.predict_text(text)"
225 | ]
226 | },
227 | {
228 | "source": [
229 | "Needless to say the predicted entities for this model are nonsensical.\n",
230 | "\n",
231 | "To get a more reasonable model, provide more data and a more meaningful model specification.\n",
232 | "\n",
233 | "In general `NERDA` has the following handles, that you use.\n",
234 | "\n",
235 | "1. provide your own data set \n",
236 | "2. choose whatever pretrained transformer you would like to fine-tune\n",
237 | "3. provide your own set of hyperparameters and lastly\n",
238 | "4. provide your own `torch` network (architecture). You can do this by instantiating a `NERDA` model with the parameter 'network' set to your own network (torch.nn.Module)."
239 | ],
240 | "cell_type": "markdown",
241 | "metadata": {}
242 | },
243 | {
244 | "source": [
245 | "## Use a Precooked `NERDA` model"
246 | ],
247 | "cell_type": "markdown",
248 | "metadata": {}
249 | },
250 | {
251 | "source": [
252 | "We have precooked a number of `NERDA` models, that you can download \n",
253 | "and use right off the shelf. \n",
254 | "\n",
255 | "Here is an example.\n",
256 | "\n",
257 | "Instantiate a `NERDA` model based on the English [ELECTRA](https://huggingface.co/google/electra-small-discriminator) transformer, that has been finetuned for NER in English,\n",
258 | "`EN_ELECTRA_EN`."
259 | ],
260 | "cell_type": "markdown",
261 | "metadata": {}
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "\n",
270 | "from NERDA.precooked import EN_ELECTRA_EN\n",
271 | "model = EN_ELECTRA_EN()\n",
272 | "\n"
273 | ]
274 | },
275 | {
276 | "source": [
277 | "(Down)load network:"
278 | ],
279 | "cell_type": "markdown",
280 | "metadata": {}
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "\n",
289 | "model.download_network()\n",
290 | "model.load_network()\n"
291 | ]
292 | },
293 | {
294 | "source": [
295 | "This model performs much better:"
296 | ],
297 | "cell_type": "markdown",
298 | "metadata": {}
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "model.evaluate_performance(get_conll_data('test', 100))"
307 | ]
308 | },
309 | {
310 | "source": [
311 | "Predict named entities in new texts"
312 | ],
313 | "cell_type": "markdown",
314 | "metadata": {}
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "text = 'Old MacDonald had a farm'\n",
323 | "model.predict_text(text)\n"
324 | ]
325 | },
326 | {
327 | "source": [
328 | "### List of Precooked Models"
329 | ],
330 | "cell_type": "markdown",
331 | "metadata": {}
332 | },
333 | {
334 | "source": [
335 | "The table below shows the precooked `NERDA` models publicly available for download. We have trained models for Danish and English.\n",
336 | "\n",
337 | "\n",
338 | "| **Model** | **Language** | **Transformer** | **Dataset** | **F1-score** | \n",
339 | "|-----------------|--------------|-------------------|---------|-----|\n",
340 | "| `DA_BERT_ML` | Danish | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 82.8 | \n",
341 |         "| `DA_ELECTRA_DA` | Danish | [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane) | 79.8 |\n",
342 | "| `EN_BERT_ML` | English | [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)| [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 90.4 |\n",
343 | "| `EN_ELECTRA_EN` | English | [English ELECTRA](https://huggingface.co/google/electra-small-discriminator) | [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/) | 89.1 |\n",
344 | "\n",
345 | "**F1-score** is the micro-averaged F1-score across entity tags and is \n",
346 | "evaluated on the respective test sets (that have not been used for training nor\n",
347 | "validation of the models).\n",
348 | "\n",
349 | "Note, that we have not spent a lot of time on actually fine-tuning the models,\n",
350 | "so there could be room for improvement. If you are able to improve the models,\n",
351 | "we will be happy to hear from you and include your `NERDA` model.\n",
352 | "\n",
353 | "#### Performance of Precooked Models\n",
354 | "\n",
355 | "The table below summarizes the performance as measured by F1-scores of the model\n",
356 | " configurations, that `NERDA` ships with. \n",
357 | "\n",
358 | "| **Level** | `DA_BERT_ML` | `DA_ELECTRA_DA` | `EN_BERT_ML` | `EN_ELECTRA_EN` |\n",
359 | "|---------------|-----------|------------|-------------|----------------|\n",
360 | "| B-PER | 93.8 | 92.0 | 96.0 | 95.1 | \n",
361 | "| I-PER | 97.8 | 97.1 | 98.5 | 97.9 | \n",
362 | "| B-ORG | 69.5 | 66.9 | 88.4 | 86.2 | \n",
363 | "| I-ORG | 69.9 | 70.7 | 85.7 | 83.1 | \n",
364 | "| B-LOC | 82.5 | 79.0 | 92.3 | 91.1 | \n",
365 | "| I-LOC | 31.6 | 44.4 | 83.9 | 80.5 | \n",
366 | "| B-MISC | 73.4 | 68.6 | 81.8 | 80.1 | \n",
367 | "| I-MISC | 86.1 | 63.6 | 63.4 | 68.4 | \n",
368 | "| **AVG_MICRO** | 82.8 | 79.8 | 90.4 | 89.1 | \n",
369 | "| **AVG_MACRO** | 75.6 | 72.8 | 86.3 | 85.3 |"
370 | ],
371 | "cell_type": "markdown",
372 | "metadata": {}
373 | },
374 | {
375 | "source": [
376 | "This concludes our walkthrough of `NERDA`. If you have any questions, please do not hesitate to [contact us](mailto:lars.kjeldgaard@eb.dk)!"
377 | ],
378 | "cell_type": "markdown",
379 | "metadata": {}
380 | }
381 | ]
382 | }
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/NERDA/ae45d7e5368059721d1073384201433ea7a6e820/logo.png
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: NERDA
2 | theme:
3 | name: "material"
4 |
5 | plugins:
6 | - search
7 | - mkdocstrings:
8 | handlers:
9 | python:
10 | setup_commands:
11 | - import sys
12 | - sys.path.append("src")
13 | - mknotebooks:
14 | execute: True
15 | preamble: "docs/preamble.py"
16 |
17 | nav:
18 | - Home: index.md
19 | - Workflow Examples: workflow.ipynb
20 | - Code Reference:
21 | - NERDA Models: nerda_models.md
22 | - Precooked NERDA Models: precooked_models.md
23 | - Datasets: datasets.md
24 | - Predictions: predictions.md
25 | - Networks: networks.md
26 | - Performance: performance.md
27 |
28 |
29 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = -v -s --junitxml=test-results/tests.xml --cov=./ --cov-report=xml
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [aliases]
2 | test=pytest
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools

# Use the README as the long description rendered on PyPI.
with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="NERDA",
    version="1.0.0",
    author="Lars Kjeldgaard, Lukas Christian Nielsen",
    author_email="lars.kjeldgaard@eb.dk",
    description="A Framework for Finetuning Transformers for Named-Entity Recognition",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/ebanalyse/NERDA",
    packages=setuptools.find_packages(where='src'),
    package_dir={'': 'src'},
    python_requires='>=3.6',
    install_requires=[
        'torch',
        'transformers',
        # FIX: 'sklearn' is a deprecated dummy package on PyPI; the real
        # distribution providing sklearn.* (used in src/NERDA/models.py)
        # is 'scikit-learn'.
        'scikit-learn',
        'nltk',
        'pandas',
        'progressbar',
        'pyconll'
    ],
    setup_requires=['pytest-runner'],
    tests_require=['pytest',
                   'pytest-cov'],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
    ],
    include_package_data=True
)
--------------------------------------------------------------------------------
/src/NERDA/__init__.py:
--------------------------------------------------------------------------------
1 | import NERDA
--------------------------------------------------------------------------------
/src/NERDA/datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | This section covers functionality for (down)loading Named Entity
3 | Recognition data sets.
4 | """
5 |
6 | import csv
7 | import os
8 | import pyconll
9 | from io import BytesIO
10 | from itertools import compress
11 | from pathlib import Path
12 | from typing import Union, List, Dict
13 | from urllib.request import urlopen
14 | from zipfile import ZipFile
15 | import ssl
16 |
def download_unzip(url_zip: str,
                   dir_extract: str) -> str:
    """Download and unzip a ZIP archive to folder.

    Loads a ZIP file from URL and extracts all of the files to a
    given folder. Does not save the ZIP file itself.

    Args:
        url_zip (str): URL to ZIP file.
        dir_extract (str): Directory where files are extracted.

    Returns:
        str: a message telling, if the archive was succesfully
            extracted. Obviously the files in the ZIP archive are
            extracted to the desired directory as a side-effect.
    """
    # Deliberately disable SSL certificate verification so downloads
    # succeed from hosts with certificate issues.
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

    print(f'Reading {url_zip}')
    # Buffer the whole archive in memory, then extract every member.
    with urlopen(url_zip, context=ssl_context) as response:
        archive_bytes = BytesIO(response.read())
    with ZipFile(archive_bytes) as archive:
        archive.extractall(dir_extract)

    return f'archive extracted to {dir_extract}'
45 |
def download_dane_data(dir: str = None) -> str:
    """Download DaNE data set.

    Downloads the 'DaNE' data set annotated for Named Entity
    Recognition developed and hosted by
    [Alexandra Institute](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).

    Args:
        dir (str, optional): Directory where DaNE datasets will be saved. If no directory is provided, data will be saved to a hidden folder '.dane' in your home directory.

    Returns:
        str: a message telling, if the archive was in fact
            succesfully extracted. Obviously the DaNE datasets are
            extracted to the desired directory as a side-effect.

    Examples:
        >>> download_dane_data()
        >>> download_dane_data(dir = 'DaNE')

    """
    # Fall back to the hidden '.dane' cache folder in the home directory.
    target_dir = dir if dir is not None else os.path.join(str(Path.home()), '.dane')

    return download_unzip(url_zip = 'http://danlp-downloads.alexandra.dk/datasets/ddt.zip',
                          dir_extract = target_dir)
72 |
def get_dane_data(split: str = 'train',
                  limit: int = None,
                  dir: str = None) -> dict:
    """Load DaNE data split.

    Loads a single data split from the DaNE data set kindly hosted
    by [Alexandra Institute](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).

    Args:
        split (str, optional): Choose which split to load. Choose
            from 'train', 'dev' and 'test'. Defaults to 'train'.
        limit (int, optional): Limit the number of observations to be
            returned from a given split. Defaults to None, which implies
            that the entire data split is returned.
        dir (str, optional): Directory where data is cached. If set to
            None, the function will try to look for files in '.dane' folder in home directory.

    Returns:
        dict: Dictionary with word-tokenized 'sentences' and named
        entity 'tags' in IOB format.

    Examples:
        Get test split
        >>> get_dane_data('test')

        Get first 5 observations from training split
        >>> get_dane_data('train', limit = 5)

    """
    assert isinstance(split, str)
    splits = ['train', 'dev', 'test']
    assert split in splits, f'Choose between the following splits: {splits}'

    # set to default directory if nothing else has been provided by user.
    if dir is None:
        dir = os.path.join(str(Path.home()), '.dane')
    assert os.path.isdir(dir), f'Directory {dir} does not exist. Try downloading DaNE data with download_dane_data()'

    file_path = os.path.join(dir, f'ddt.{split}.conllu')
    assert os.path.isfile(file_path), f'File {file_path} does not exist. Try downloading DaNE data with download_dane_data()'

    conll = pyconll.load_from_file(file_path)

    # One (words, tags) pair per sentence. Note: .pop() consumes the 'name'
    # entry from each token's misc field while extracting it.
    observations = [
        ([token.form for token in sent._tokens],
         [token.misc['name'].pop() for token in sent._tokens])
        for sent in conll
    ]

    if limit is not None:
        observations = observations[:limit]

    return {'sentences': [obs[0] for obs in observations],
            'tags': [obs[1] for obs in observations]}
128 |
129 |
130 |
def download_conll_data(dir: str = None) -> str:
    """Download CoNLL-2003 English data set.

    Downloads the [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/)
    English data set annotated for Named Entity Recognition.

    Args:
        dir (str, optional): Directory where CoNLL-2003 datasets will be saved. If no directory is provided, data will be saved to a hidden folder '.conll' in your home directory.

    Returns:
        str: a message telling, if the archive was in fact
            succesfully extracted. Obviously the CoNLL datasets are
            extracted to the desired directory as a side-effect.

    Examples:
        >>> download_conll_data()
        >>> download_conll_data(dir = 'conll')

    """
    # set to default directory if nothing else has been provided by user.
    # (docstring previously claimed '.dane' — the code has always used '.conll')
    if dir is None:
        dir = os.path.join(str(Path.home()), '.conll')

    return download_unzip(url_zip = 'https://data.deepai.org/conll2003.zip',
                          dir_extract = dir)
156 |
def get_conll_data(split: str = 'train',
                   limit: int = None,
                   dir: str = None) -> dict:
    """Load CoNLL-2003 (English) data split.

    Loads a single data split from the
    [CoNLL-2003](https://www.clips.uantwerpen.be/conll2003/ner/)
    (English) data set.

    Args:
        split (str, optional): Choose which split to load. Choose
            from 'train', 'valid' and 'test'. Defaults to 'train'.
        limit (int, optional): Limit the number of observations to be
            returned from a given split. Defaults to None, which implies
            that the entire data split is returned.
        dir (str, optional): Directory where data is cached. If set to
            None, the function will try to look for files in '.conll' folder in home directory.

    Returns:
        dict: Dictionary with word-tokenized 'sentences' and named
        entity 'tags' in IOB format.

    Examples:
        Get test split
        >>> get_conll_data('test')

        Get first 5 observations from training split
        >>> get_conll_data('train', limit = 5)

    """
    assert isinstance(split, str)
    splits = ['train', 'valid', 'test']
    assert split in splits, f'Choose between the following splits: {splits}'

    # set to default directory if nothing else has been provided by user.
    if dir is None:
        dir = os.path.join(str(Path.home()), '.conll')
    assert os.path.isdir(dir), f'Directory {dir} does not exist. Try downloading CoNLL-2003 data with download_conll_data()'

    file_path = os.path.join(dir, f'{split}.txt')
    assert os.path.isfile(file_path), f'File {file_path} does not exist. Try downloading CoNLL-2003 data with download_conll_data()'

    sentences = []
    entities = []
    sentence = []
    tags = []

    def _flush():
        """Finalize the sentence under construction, dropping stray spaces."""
        nonlocal sentence, tags
        keep = [word != ' ' for word in sentence]
        words = list(compress(sentence, keep))
        # only keep sentences that are non-empty after cleaning.
        if words:
            sentences.append(words)
            entities.append(list(compress(tags, keep)))
        sentence = []
        tags = []

    # Stream rows directly from the reader instead of materializing the
    # whole file first (the original buffered each row in a throwaway list).
    with open(file_path, 'r') as file:
        reader = csv.reader(file, delimiter = ' ')
        for row in reader:
            if row and row[0] != '-DOCSTART-':
                sentence.append(row[0])   # word token (first column)
                tags.append(row[-1])      # NER tag (last column)
            elif not row and sentence:
                # blank line marks a sentence boundary.
                _flush()
    # FIX: flush a trailing sentence when the file lacks a final blank line;
    # the original implementation silently dropped it.
    if sentence:
        _flush()

    if limit is not None:
        sentences = sentences[:limit]
        entities = entities[:limit]

    return {'sentences': sentences, 'tags': entities}
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
--------------------------------------------------------------------------------
/src/NERDA/models.py:
--------------------------------------------------------------------------------
1 | """
2 | This section covers the interface for `NERDA` models, that is
3 | implemented as its own Python class [NERDA.models.NERDA][].
4 |
5 | The interface enables you to easily
6 |
7 | - specify your own [NERDA.models.NERDA][] model
8 | - train it
9 | - evaluate it
10 | - use it to predict entities in new texts.
11 | """
12 | from NERDA.datasets import get_conll_data
13 | from NERDA.networks import NERDANetwork
14 | from NERDA.predictions import predict, predict_text
15 | from NERDA.performance import compute_f1_scores, flatten
16 | from NERDA.training import train_model
17 | import pandas as pd
18 | import numpy as np
19 | import torch
20 | import os
21 | import sys
22 | import sklearn.preprocessing
23 | from sklearn.metrics import accuracy_score
24 | from transformers import AutoModel, AutoTokenizer, AutoConfig
25 | from typing import List
26 |
class NERDA:
    """NERDA model

    A NERDA model object containing a complete model configuration.
    The model can be trained with the `train` method. Afterwards
    new observations can be predicted with the `predict` and
    `predict_text` methods. The performance of the model can be
    evaluated on a set of new observations with the
    `evaluate_performance` method.

    Examples:
        Model for a VERY small subset (5 observations) of English NER data
        >>> from NERDA.datasets import get_conll_data
        >>> trn = get_conll_data('train', 5)
        >>> valid = get_conll_data('valid', 5)
        >>> tag_scheme = ['B-PER', 'I-PER', 'B-LOC', 'I-LOC',
                          'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
        >>> tag_outside = 'O'
        >>> transformer = 'bert-base-multilingual-uncased'
        >>> model = NERDA(transformer = transformer,
                          tag_scheme = tag_scheme,
                          tag_outside = tag_outside,
                          dataset_training = trn,
                          dataset_validation = valid)

        Model for complete English NER data set CoNLL-2003 with modified hyperparameters
        >>> trn = get_conll_data('train')
        >>> valid = get_conll_data('valid')
        >>> transformer = 'bert-base-multilingual-uncased'
        >>> hyperparameters = {'epochs' : 3,
                               'warmup_steps' : 400,
                               'train_batch_size': 16,
                               'learning_rate': 0.0001}
        >>> model = NERDA(transformer = transformer,
                          dataset_training = trn,
                          dataset_validation = valid,
                          tag_scheme = tag_scheme,
                          tag_outside = tag_outside,
                          dropout = 0.1,
                          hyperparameters = hyperparameters)

    Attributes:
        network (torch.nn.Module): network for Named Entity
            Recognition task.
        tag_encoder (sklearn.preprocessing.LabelEncoder): encoder for the
            NER labels/tags.
        transformer_model (transformers.PreTrainedModel): (Auto)Model derived from the
            transformer.
        transformer_tokenizer (transformers.PretrainedTokenizer): (Auto)Tokenizer
            derived from the transformer.
        transformer_config (transformers.PretrainedConfig): (Auto)Config derived from
            the transformer.
        train_losses (list): holds training losses, once the model has been
            trained.
        valid_loss (float): holds validation loss, once the model has been trained.
    """
    def __init__(self,
                 transformer: str = 'bert-base-multilingual-uncased',
                 device: str = None,
                 tag_scheme: List[str] = None,
                 tag_outside: str = 'O',
                 dataset_training: dict = None,
                 dataset_validation: dict = None,
                 max_len: int = 128,
                 network: torch.nn.Module = NERDANetwork,
                 dropout: float = 0.1,
                 hyperparameters: dict = None,
                 tokenizer_parameters: dict = None,
                 validation_batch_size: int = 8,
                 num_workers: int = 1) -> None:
        """Initialize NERDA model

        Args:
            transformer (str, optional): which pretrained 'huggingface'
                transformer to use.
            device (str, optional): the desired device to use for computation.
                If not provided by the user, we take a guess.
            tag_scheme (List[str], optional): All available NER
                tags for the given data set EXCLUDING the special outside tag,
                that is handled separately. Defaults to the CoNLL/DaNE
                PER/ORG/LOC/MISC BIO scheme.
            tag_outside (str, optional): the value of the special outside tag.
                Defaults to 'O'.
            dataset_training (dict, optional): the training data. Must consist
                of 'sentences': word-tokenized sentences and 'tags': corresponding
                NER tags. You can look at examples of, how the dataset should
                look like by invoking functions get_dane_data() or get_conll_data().
                Defaults to None, in which case the English CoNLL-2003 data set is used.
            dataset_validation (dict, optional): the validation data. Must consist
                of 'sentences': word-tokenized sentences and 'tags': corresponding
                NER tags. You can look at examples of, how the dataset should
                look like by invoking functions get_dane_data() or get_conll_data().
                Defaults to None, in which case the English CoNLL-2003 data set
                is used.
            max_len (int, optional): the maximum sentence length (number of
                tokens after applying the transformer tokenizer) for the transformer.
                Sentences are truncated accordingly. Look at your data to get an
                impression of, what could be a meaningful setting. Also be aware
                that many transformers have a maximum accepted length. Defaults
                to 128.
            network (torch.nn.module, optional): network to be trained. Defaults
                to a default generic `NERDANetwork`. Can be replaced with your own
                customized network architecture. It must however take the same
                arguments as `NERDANetwork`.
            dropout (float, optional): dropout probability. Defaults to 0.1.
            hyperparameters (dict, optional): Hyperparameters for the model. Defaults
                to {'epochs' : 4, 'warmup_steps' : 500, 'train_batch_size': 13,
                'learning_rate': 0.0001}.
            tokenizer_parameters (dict, optional): parameters for the transformer
                tokenizer. Defaults to {'do_lower_case' : True}.
            validation_batch_size (int, optional): batch size for validation. Defaults
                to 8.
            num_workers (int, optional): number of workers for data loader.
        """
        # avoid shared mutable default arguments: materialize the documented
        # defaults per instance instead.
        if tag_scheme is None:
            tag_scheme = ['B-PER', 'I-PER',
                          'B-ORG', 'I-ORG',
                          'B-LOC', 'I-LOC',
                          'B-MISC', 'I-MISC']
        if hyperparameters is None:
            hyperparameters = {'epochs': 4,
                               'warmup_steps': 500,
                               'train_batch_size': 13,
                               'learning_rate': 0.0001}
        if tokenizer_parameters is None:
            tokenizer_parameters = {'do_lower_case': True}

        # set device automatically if not provided by user.
        if device is None:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
            print("Device automatically set to:", self.device)
        else:
            self.device = device
            print("Device set to:", self.device)
        self.tag_scheme = tag_scheme
        self.tag_outside = tag_outside
        self.transformer = transformer
        self.dataset_training = dataset_training
        self.dataset_validation = dataset_validation
        self.hyperparameters = hyperparameters
        # fit encoder to _all_ possible tags, i.e. scheme plus outside tag.
        tag_complete = [tag_outside] + tag_scheme
        self.max_len = max_len
        self.tag_encoder = sklearn.preprocessing.LabelEncoder()
        self.tag_encoder.fit(tag_complete)
        self.transformer_model = AutoModel.from_pretrained(transformer)
        self.transformer_tokenizer = AutoTokenizer.from_pretrained(transformer, **tokenizer_parameters)
        self.transformer_config = AutoConfig.from_pretrained(transformer)
        # instantiate the user-provided network class (bug fix: previously
        # NERDANetwork was hard-coded here, silently ignoring the 'network'
        # argument).
        self.network = network(self.transformer_model, self.device, len(tag_complete), dropout = dropout)
        self.network.to(self.device)
        self.validation_batch_size = validation_batch_size
        self.num_workers = num_workers
        self.train_losses = []
        self.valid_loss = np.nan
        self.quantized = False
        self.halved = False

    def train(self) -> str:
        """Train Network

        Trains the network from the NERDA model specification.

        Returns:
            str: a message saying if the model was trained succesfully.
            The network in the 'network' attribute is trained as a
            side-effect. Training losses and validation loss are saved
            in 'train_losses' and 'valid_loss'
            attributes respectively as side-effects.
        """
        network, train_losses, valid_loss = train_model(network = self.network,
                                                        tag_encoder = self.tag_encoder,
                                                        tag_outside = self.tag_outside,
                                                        transformer_tokenizer = self.transformer_tokenizer,
                                                        transformer_config = self.transformer_config,
                                                        dataset_training = self.dataset_training,
                                                        dataset_validation = self.dataset_validation,
                                                        validation_batch_size = self.validation_batch_size,
                                                        max_len = self.max_len,
                                                        device = self.device,
                                                        num_workers = self.num_workers,
                                                        **self.hyperparameters)

        # attach results as attributes (plain assignment instead of setattr).
        self.network = network
        self.train_losses = train_losses
        self.valid_loss = valid_loss

        return "Model trained successfully"

    def load_network_from_file(self, model_path = "model.bin") -> str:
        """Load Pretrained NERDA Network from file

        Loads weights for a pretrained NERDA Network from file.

        Args:
            model_path (str, optional): Path for model file.
                Defaults to "model.bin".

        Returns:
            str: message telling if weights for network were
            loaded succesfully.
        """
        # TODO: change assert to Raise.
        assert os.path.exists(model_path), "File does not exist. You can download network with download_network()"
        self.network.load_state_dict(torch.load(model_path, map_location = torch.device(self.device)))
        self.network.device = self.device
        return f'Weights for network loaded from {model_path}'

    def save_network(self, model_path:str = "model.bin") -> None:
        """Save Weights of NERDA Network

        Saves weights for a fine-tuned NERDA Network to file.

        Args:
            model_path (str, optional): Path for model file.
                Defaults to "model.bin".

        Returns:
            Nothing. Saves model to file as a side-effect.
        """
        torch.save(self.network.state_dict(), model_path)
        print(f"Network written to file {model_path}")

    def quantize(self):
        """Apply dynamic quantization to increase performance.

        Quantization and half precision inference are mutually exclusive.

        Read more: https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html

        Returns:
            Nothing. Applies dynamic quantization to Network as a side-effect.
        """
        assert not (self.quantized), "Dynamic quantization already applied"
        assert not (self.halved), "Can't run both quantization and half precision"

        self.network = torch.quantization.quantize_dynamic(
            self.network, {torch.nn.Linear}, dtype=torch.qint8
        )
        self.quantized = True

    def half(self):
        """Convert weights from Float32 to Float16 to increase performance

        Quantization and half precision inference are mutually exclusive.

        Read more: https://pytorch.org/docs/master/generated/torch.nn.Module.html?highlight=half#torch.nn.Module.half

        Returns:
            Nothing. Model is "halved" as a side-effect.
        """
        assert not (self.halved), "Half precision already applied"
        assert not (self.quantized), "Can't run both quantization and half precision"

        self.network.half()
        self.halved = True

    def predict(self, sentences: List[List[str]],
                return_confidence: bool = False,
                **kwargs) -> List[List[str]]:
        """Predict Named Entities in Word-Tokenized Sentences

        Predicts word-tokenized sentences with trained model.

        Args:
            sentences (List[List[str]]): word-tokenized sentences.
            kwargs: arbitrary keyword arguments. For instance
                'batch_size' and 'num_workers'.
            return_confidence (bool, optional): if True, return
                confidence scores for all predicted tokens. Defaults
                to False.

        Returns:
            List[List[str]]: Predicted tags for sentences - one
            predicted tag/entity per word token.
        """
        return predict(network = self.network,
                       sentences = sentences,
                       transformer_tokenizer = self.transformer_tokenizer,
                       transformer_config = self.transformer_config,
                       max_len = self.max_len,
                       device = self.device,
                       tag_encoder = self.tag_encoder,
                       tag_outside = self.tag_outside,
                       return_confidence = return_confidence,
                       **kwargs)

    def predict_text(self, text: str,
                     return_confidence:bool = False, **kwargs) -> list:
        """Predict Named Entities in a Text

        Args:
            text (str): text to predict entities in.
            kwargs: arbitrary keyword arguments. For instance
                'batch_size' and 'num_workers'.
            return_confidence (bool, optional): if True, return
                confidence scores for all predicted tokens. Defaults
                to False.

        Returns:
            tuple: word-tokenized sentences and predicted
            tags/entities.
        """
        return predict_text(network = self.network,
                            text = text,
                            transformer_tokenizer = self.transformer_tokenizer,
                            transformer_config = self.transformer_config,
                            max_len = self.max_len,
                            device = self.device,
                            tag_encoder = self.tag_encoder,
                            tag_outside = self.tag_outside,
                            return_confidence=return_confidence,
                            **kwargs)

    def evaluate_performance(self, dataset: dict,
                             return_accuracy: bool=False,
                             **kwargs) -> pd.DataFrame:
        """Evaluate Performance

        Evaluates the performance of the model on an arbitrary
        data set.

        Args:
            dataset (dict): Data set that must consist of
                'sentences' and NER'tags'. You can look at examples
                of, how the dataset should look like by invoking functions
                get_dane_data() or get_conll_data().
            kwargs: arbitrary keyword arguments for predict. For
                instance 'batch_size' and 'num_workers'.
            return_accuracy (bool): Return accuracy
                as well? Defaults to False.

        Returns:
            DataFrame with performance numbers, F1-scores,
            Precision and Recall. Returns dictionary with
            this AND accuracy, if return_accuracy is set to
            True.
        """

        tags_predicted = self.predict(dataset.get('sentences'),
                                      **kwargs)

        # compute F1 scores by entity type
        f1 = compute_f1_scores(y_pred = tags_predicted,
                               y_true = dataset.get('tags'),
                               labels = self.tag_scheme,
                               average = None)

        # create DataFrame with performance scores (=F1)
        df = list(zip(self.tag_scheme, f1[2], f1[0], f1[1]))
        df = pd.DataFrame(df, columns = ['Level', 'F1-Score', 'Precision', 'Recall'])

        # compute MICRO-averaged F1-scores and add to table.
        # NOTE: DataFrame.append was removed in pandas 2.0 -> use pd.concat.
        f1_micro = compute_f1_scores(y_pred = tags_predicted,
                                     y_true = dataset.get('tags'),
                                     labels = self.tag_scheme,
                                     average = 'micro')
        f1_micro = pd.DataFrame({'Level' : ['AVG_MICRO'],
                                 'F1-Score': [f1_micro[2]],
                                 'Precision': [np.nan],
                                 'Recall': [np.nan]})
        df = pd.concat([df, f1_micro])

        # compute MACRO-averaged F1-scores and add to table.
        f1_macro = compute_f1_scores(y_pred = tags_predicted,
                                     y_true = dataset.get('tags'),
                                     labels = self.tag_scheme,
                                     average = 'macro')
        # bug fix: this row was previously mislabelled 'AVG_MICRO'.
        f1_macro = pd.DataFrame({'Level' : ['AVG_MACRO'],
                                 'F1-Score': [f1_macro[2]],
                                 'Precision': [np.nan],
                                 'Recall': [np.nan]})
        df = pd.concat([df, f1_macro])

        # compute and return accuracy if desired
        if return_accuracy:
            accuracy = accuracy_score(y_pred = flatten(tags_predicted),
                                      y_true = flatten(dataset.get('tags')))
            return {'f1':df, 'accuracy': accuracy}

        return df
410 |
411 |
--------------------------------------------------------------------------------
/src/NERDA/networks.py:
--------------------------------------------------------------------------------
1 | """This section covers `torch` networks for `NERDA`"""
2 | import torch
3 | import torch.nn as nn
4 | from transformers import AutoConfig
5 | from NERDA.utils import match_kwargs
6 |
class NERDANetwork(nn.Module):
    """A Generic Network for NERDA models.

    The network has an analogous architecture to the models in
    [Hvingelby et al. 2020](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf).

    Can be replaced with a custom user-defined network with
    the restriction, that it must take the same arguments.
    """

    def __init__(self, transformer: nn.Module, device: str, n_tags: int, dropout: float = 0.1) -> None:
        """Initialize a NERDA Network

        Args:
            transformer (nn.Module): huggingface `torch` transformer.
            device (str): Computational device.
            n_tags (int): Number of unique entity tags (incl. outside tag)
            dropout (float, optional): Dropout probability. Defaults to 0.1.
        """
        super().__init__()

        # look up the transformer's AutoConfig via its registered name, in
        # order to read out the hidden size for the classification head.
        transformer_config = AutoConfig.from_pretrained(transformer.name_or_path)

        self.transformer = transformer
        self.dropout = nn.Dropout(dropout)
        self.tags = nn.Linear(transformer_config.hidden_size, n_tags)
        self.device = device

    # NOTE: 'offsets' are not used in the model as-is, but they are expected
    # as output down-stream. So _DON'T_ remove! :)
    def forward(self,
                input_ids: torch.Tensor,
                masks: torch.Tensor,
                token_type_ids: torch.Tensor,
                target_tags: torch.Tensor,
                offsets: torch.Tensor) -> torch.Tensor:
        """Model Forward Iteration

        Args:
            input_ids (torch.Tensor): Input IDs.
            masks (torch.Tensor): Attention Masks.
            token_type_ids (torch.Tensor): Token Type IDs.
            target_tags (torch.Tensor): Target tags. Are not used
                in model as-is, but they are expected downstream,
                so they can not be left out.
            offsets (torch.Tensor): Offsets to keep track of original
                words. Are not used in model as-is, but they are
                expected down-stream, so they can not be left out.

        Returns:
            torch.Tensor: predicted values.
        """
        # move candidate inputs to the computational device, then keep only
        # the ones the underlying transformer's forward() actually accepts.
        # NOTE(review): the key 'masks' may not match the transformer's
        # 'attention_mask' keyword - verify match_kwargs handles this.
        candidate_inputs = {
            'input_ids': input_ids,
            'masks': masks,
            'token_type_ids': token_type_ids,
        }
        candidate_inputs = {name: tensor.to(self.device)
                            for name, tensor in candidate_inputs.items()}
        accepted_inputs = match_kwargs(self.transformer.forward, **candidate_inputs)

        # last hidden states of the transformer.
        hidden = self.transformer(**accepted_inputs)[0]

        # apply drop-out, then project onto the tag space.
        hidden = self.dropout(hidden)
        return self.tags(hidden)
84 |
--------------------------------------------------------------------------------
/src/NERDA/performance.py:
--------------------------------------------------------------------------------
1 | """
2 | This section covers functionality for computing performance
3 | for [NERDA.models.NERDA][] models.
4 | """
5 |
6 | from typing import List
7 | from sklearn.metrics import precision_recall_fscore_support
8 | import warnings
9 |
def flatten(l: list):
    """Flatten a list of lists into a single list."""
    flattened = []
    for sublist in l:
        flattened.extend(sublist)
    return flattened
13 |
14 |
def compute_f1_scores(y_pred: List[List[str]],
                      y_true: List[List[str]],
                      labels: List[str],
                      **kwargs) -> list:
    """Compute F1 scores.

    Computes precision, recall, F1 and support for predicted versus
    observed NER tags via scikit-learn.

    Args:
        y_pred (List): predicted values.
        y_true (List): observed/true values.
        labels (List): all possible tags.
        kwargs: all optional arguments for precision/recall function.

    Returns:
        list: resulting F1 scores.

    """
    # sanity check: a prediction may never be longer than its observed
    # counterpart.
    assert all(len(obs) >= len(pred) for obs, pred in zip(y_true, y_pred)), "Length of predictions must not exceed length of observed values"

    # warn, if some observed sequences are longer than their predictions.
    n_exceeds = sum(len(obs) > len(pred) for obs, pred in zip(y_true, y_pred))
    if n_exceeds > 0:
        warnings.warn(f'length of observed values exceeded lengths of predicted values in {n_exceeds} cases and were truncated. _Consider_ increasing max_len parameter for your model.')

    # truncate observed sequences to the lengths of their predictions;
    # needed when predictions were truncated earlier in the flow.
    y_true = [obs[:len(pred)] for obs, pred in zip(y_true, y_pred)]

    flat_pred = flatten(y_pred)
    flat_true = flatten(y_true)

    return precision_recall_fscore_support(y_true = flat_true,
                                           y_pred = flat_pred,
                                           labels = labels,
                                           **kwargs)
55 |
--------------------------------------------------------------------------------
/src/NERDA/precooked.py:
--------------------------------------------------------------------------------
1 | """
2 | This sections covers NERDA Models that have been 'precooked' by
3 | Ekstra Bladet and are publicly available for download.
4 | """
5 | from NERDA.datasets import get_dane_data, get_conll_data
6 | from NERDA.models import NERDA
7 | import os
8 | import urllib
9 | from pathlib import Path
10 | from progressbar import ProgressBar
11 |
# module-level handle so successive reporthook calls share one bar.
pbar = None

# helper function to show progressbar
def show_progress(block_num, block_size, total_size):
    """urllib reporthook: draw a progress bar while a file downloads."""
    global pbar
    # lazily create one bar per download.
    if pbar is None:
        pbar = ProgressBar(maxval=total_size)
    pbar.start()

    downloaded = block_num * block_size
    if downloaded >= total_size:
        # download complete: close the bar and reset for the next download.
        pbar.finish()
        pbar = None
    else:
        pbar.update(downloaded)
27 |
class Precooked(NERDA):
    """Precooked NERDA Model

    NERDA model specification that has been precooked/pretrained
    and is available for download.

    Inherits from [NERDA.models.NERDA][].
    """
    def __init__(self, **kwargs) -> None:
        """Initialize Precooked NERDA Model

        Args:
            kwargs: all arguments for NERDA Model.
        """
        super().__init__(**kwargs)

    def download_network(self, dir: str = None) -> str:
        """Download Precooked Network from Web

        Downloads the weights file for this model class from the public
        NERDA S3 bucket; the file is named after the concrete subclass.

        Args:
            dir (str, optional): Directory where the model file
                will be saved. Defaults to None, in which case
                the model will be saved in a folder '.nerda' in
                your home directory.

        Returns:
            str: Message saying if the download was successfull.
            Model is downloaded as a side-effect.
        """

        # the class name doubles as the name of the weights file in S3.
        model_name = type(self).__name__

        # url for public S3 bucket with NERDA models.
        url_s3 = 'https://nerda.s3-eu-west-1.amazonaws.com'
        url_model = f'{url_s3}/{model_name}.bin'

        if dir is None:
            dir = os.path.join(str(Path.home()), '.nerda')

        # create target directory if needed ('.nerda' is a single level
        # below the home directory, so os.mkdir suffices).
        if not os.path.exists(dir):
            os.mkdir(dir)

        file_path = os.path.join(dir, f'{model_name}.bin')

        print(
            """
            Please make sure, that you're running the latest version of 'NERDA'
            otherwise the model is not guaranteed to work.
            """
        )
        print(f'Downloading {url_model} to {file_path}')
        # show_progress is passed as urllib's reporthook to render a bar.
        urllib.request.urlretrieve(url_model, file_path, show_progress)

        return "Network downloaded successfully. Load network with 'load_network'."

    def load_network(self, file_path: str = None) -> None:
        """Load Pretrained Network

        Loads pretrained network from file.

        Args:
            file_path (str, optional): Path to model file. Defaults to None,
                in which case, the function points to the '.nerda' folder
                the home directory.
        """

        # the class name doubles as the name of the weights file.
        model_name = type(self).__name__

        if file_path is None:
            file_path = os.path.join(str(Path.home()), '.nerda', f'{model_name}.bin')

        # TODO: change assert to Raise (stripped under 'python -O').
        assert os.path.exists(file_path), "File does not exist! You can download network with download_network()"
        print(
            """
            Model loaded. Please make sure, that you're running the latest version
            of 'NERDA' otherwise the model is not guaranteed to work.
            """
        )
        # delegate the actual state-dict loading to the NERDA base class.
        self.load_network_from_file(file_path)
107 |
class DA_BERT_ML(Precooked):
    """NERDA [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)
    for Danish Finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).

    Inherits from [NERDA.precooked.Precooked][].

    Examples:
        >>> from NERDA.precooked import DA_BERT_ML
        >>> model = DA_BERT_ML()
        >>> model.download_network()
        >>> model.load_network()
        >>> text = 'Jens Hansen har en bondegård'
        >>> model.predict_text(text)
        ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])

    """
    def __init__(self, device: str = None) -> None:
        """Initialize model

        Args:
            device (str, optional): Computational device. Defaults to
                None, in which case the device is guessed upstream.
        """
        super().__init__(transformer = 'bert-base-multilingual-uncased',
                         device = device,
                         tag_scheme = [
                             'B-PER',
                             'I-PER',
                             'B-ORG',
                             'I-ORG',
                             'B-LOC',
                             'I-LOC',
                             'B-MISC',
                             'I-MISC'
                         ],
                         tag_outside = 'O',
                         max_len = 128,
                         dropout = 0.1,
                         hyperparameters = {'epochs' : 4,
                                            'warmup_steps' : 500,
                                            'train_batch_size': 13,
                                            'learning_rate': 0.0001},
                         tokenizer_parameters = {'do_lower_case' : True})
146 |
class DA_DISTILBERT_ML(Precooked):
    """NERDA [Multilingual DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased)
    for Danish Finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).

    Inherits from [NERDA.precooked.Precooked][].

    Examples:
        >>> from NERDA.precooked import DA_DISTILBERT_ML
        >>> model = DA_DISTILBERT_ML()
        >>> model.download_network()
        >>> model.load_network()
        >>> text = 'Jens Hansen har en bondegård'
        >>> model.predict_text(text)
        ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])

    """
    def __init__(self, device: str = None) -> None:
        """Initialize model

        Args:
            device (str, optional): Computational device. Defaults to
                None, in which case the device is guessed upstream.
        """
        super().__init__(transformer = 'distilbert-base-multilingual-cased',
                         device = device,
                         tag_scheme = [
                             'B-PER',
                             'I-PER',
                             'B-ORG',
                             'I-ORG',
                             'B-LOC',
                             'I-LOC',
                             'B-MISC',
                             'I-MISC'
                         ],
                         tag_outside = 'O',
                         max_len = 128,
                         dropout = 0.1,
                         hyperparameters = {'epochs' : 4,
                                            'warmup_steps' : 500,
                                            'train_batch_size': 13,
                                            'learning_rate': 0.0001},
                         # cased model: do NOT lower-case inputs.
                         tokenizer_parameters = {'do_lower_case' : False})
185 |
class DA_ELECTRA_DA(Precooked):
    """NERDA [Danish ELECTRA](https://huggingface.co/Maltehb/-l-ctra-danish-electra-small-uncased)
    for Danish finetuned on [DaNE data set](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).

    Inherits from [NERDA.precooked.Precooked][].

    Examples:
        >>> from NERDA.precooked import DA_ELECTRA_DA
        >>> model = DA_ELECTRA_DA()
        >>> model.download_network()
        >>> model.load_network()
        >>> text = 'Jens Hansen har en bondegård'
        >>> model.predict_text(text)
        ([['Jens', 'Hansen', 'har', 'en', 'bondegård']], [['B-PER', 'I-PER', 'O', 'O', 'O']])

    """
    def __init__(self, device: str = None) -> None:
        """Initialize model

        Args:
            device (str, optional): Computational device. Defaults to
                None, in which case the device is guessed upstream.
        """
        super().__init__(transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
                         device = device,
                         tag_scheme = [
                             'B-PER',
                             'I-PER',
                             'B-ORG',
                             'I-ORG',
                             'B-LOC',
                             'I-LOC',
                             'B-MISC',
                             'I-MISC'
                         ],
                         tag_outside = 'O',
                         max_len = 128,
                         dropout = 0.1,
                         hyperparameters = {'epochs' : 5,
                                            'warmup_steps' : 500,
                                            'train_batch_size': 13,
                                            'learning_rate': 0.0001},
                         tokenizer_parameters = {'do_lower_case' : True})
224 |
class EN_ELECTRA_EN(Precooked):
    """NERDA [English ELECTRA](https://huggingface.co/google/electra-small-discriminator)
    for English finetuned on [CoNLL-2003 data set](https://www.clips.uantwerpen.be/conll2003/ner/).

    Inherits from [NERDA.precooked.Precooked][].

    Examples:
        >>> from NERDA.precooked import EN_ELECTRA_EN
        >>> model = EN_ELECTRA_EN()
        >>> model.download_network()
        >>> model.load_network()
        >>> text = 'Old MacDonald had a farm'
        >>> model.predict_text(text)
        ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']])

    """
    def __init__(self, device: str = None) -> None:
        """Initialize model

        Args:
            device (str, optional): Computational device. Defaults to
                None, in which case the device is guessed upstream.
        """
        super().__init__(transformer = 'google/electra-small-discriminator',
                         device = device,
                         tag_scheme = [
                             'B-PER',
                             'I-PER',
                             'B-ORG',
                             'I-ORG',
                             'B-LOC',
                             'I-LOC',
                             'B-MISC',
                             'I-MISC'
                         ],
                         tag_outside = 'O',
                         max_len = 128,
                         dropout = 0.1,
                         hyperparameters = {'epochs' : 4,
                                            'warmup_steps' : 250,
                                            'train_batch_size': 13,
                                            'learning_rate': 8e-05},
                         tokenizer_parameters = {'do_lower_case' : True})
263 |
264 |
class EN_BERT_ML(Precooked):
    """NERDA [Multilingual BERT](https://huggingface.co/bert-base-multilingual-uncased)
    for English finetuned on [CoNLL-2003 data set](https://www.clips.uantwerpen.be/conll2003/ner/).

    Inherits from [NERDA.precooked.Precooked][].

    Examples:
        >>> from NERDA.precooked import EN_BERT_ML
        >>> model = EN_BERT_ML()
        >>> model.download_network()
        >>> model.load_network()
        >>> text = 'Old MacDonald had a farm'
        >>> model.predict_text(text)
        ([['Old', 'MacDonald', 'had', 'a', 'farm']], [['B-PER', 'I-PER', 'O', 'O', 'O']])

    """
    def __init__(self, device: str = None) -> None:
        """Initialize model

        Args:
            device (str, optional): Computational device. Defaults to
                None, in which case the device is guessed upstream.
        """
        super().__init__(transformer = 'bert-base-multilingual-uncased',
                         device = device,
                         tag_scheme = [
                             'B-PER',
                             'I-PER',
                             'B-ORG',
                             'I-ORG',
                             'B-LOC',
                             'I-LOC',
                             'B-MISC',
                             'I-MISC'
                         ],
                         tag_outside = 'O',
                         max_len = 128,
                         dropout = 0.1,
                         hyperparameters = {'epochs' : 4,
                                            'warmup_steps' : 500,
                                            'train_batch_size': 13,
                                            'learning_rate': 0.0001},
                         tokenizer_parameters = {'do_lower_case' : True})
303 |
304 |
305 |
306 |
307 |
--------------------------------------------------------------------------------
/src/NERDA/predictions.py:
--------------------------------------------------------------------------------
1 | """
2 | This section covers functionality for computing predictions
3 | with a [NERDA.models.NERDA][] model.
4 | """
5 |
6 | from NERDA.preprocessing import create_dataloader
7 | import torch
8 | import numpy as np
9 | from tqdm import tqdm
10 | from nltk.tokenize import sent_tokenize, word_tokenize
11 | from typing import List, Callable
12 | import transformers
13 | import sklearn.preprocessing
14 |
def sigmoid_transform(x):
    """Map a raw score to a probability via the logistic function."""
    exp_neg = np.exp(-x)
    return 1.0 / (1.0 + exp_neg)
18 |
def predict(network: torch.nn.Module,
            sentences: List[List[str]],
            transformer_tokenizer: transformers.PreTrainedTokenizer,
            transformer_config: transformers.PretrainedConfig,
            max_len: int,
            device: str,
            tag_encoder: sklearn.preprocessing.LabelEncoder,
            tag_outside: str,
            batch_size: int = 8,
            num_workers: int = 1,
            return_tensors: bool = False,
            return_confidence: bool = False,
            pad_sequences: bool = True) -> List[List[str]]:
    """Compute predictions.

    Computes predictions for a list with word-tokenized sentences
    with a `NERDA` model.

    Args:
        network (torch.nn.Module): Network.
        sentences (List[List[str]]): List of lists with word-tokenized
            sentences.
        transformer_tokenizer (transformers.PreTrainedTokenizer):
            tokenizer for transformer model.
        transformer_config (transformers.PretrainedConfig): config
            for transformer model.
        max_len (int): Maximum length of sentence after applying
            transformer tokenizer.
        device (str): Computational device. NOTE(review): currently
            unused in this function; network and batches are assumed
            to already live on compatible devices — confirm upstream.
        tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder
            for Named-Entity tags.
        tag_outside (str): Special 'outside' NER tag.
        batch_size (int, optional): Batch Size for DataLoader.
            Defaults to 8.
        num_workers (int, optional): Number of workers. Defaults
            to 1.
        return_tensors (bool, optional): if True, return raw network
            output tensors, one per batch. Ignored when
            return_confidence is also True.
        return_confidence (bool, optional): if True, return
            confidence scores for all predicted tokens. Defaults
            to False.
        pad_sequences (bool, optional): if True, pad sequences.
            Defaults to True.

    Returns:
        List[List[str]]: List of lists with predicted Entity
        tags. If return_confidence, a (predictions, probabilities)
        tuple; if return_tensors, a list with one tensor per batch.
    """
    # make sure, that input has the correct format.
    assert isinstance(sentences, list), "'sentences' must be a list of list of word-tokens"
    assert isinstance(sentences[0], list), "'sentences' must be a list of list of word-tokens"
    assert isinstance(sentences[0][0], str), "'sentences' must be a list of list of word-tokens"

    # set network to evaluation mode (disables dropout etc.).
    network.eval()

    # fill 'dummy' tags (expected input for dataloader).
    tag_fill = [tag_encoder.classes_[0]]
    tags_dummy = [tag_fill * len(sent) for sent in sentences]

    dataloader = create_dataloader(sentences = sentences,
                                   tags = tags_dummy,
                                   transformer_tokenizer = transformer_tokenizer,
                                   transformer_config = transformer_config,
                                   max_len = max_len,
                                   batch_size = batch_size,
                                   tag_encoder = tag_encoder,
                                   tag_outside = tag_outside,
                                   num_workers = num_workers,
                                   pad_sequences = pad_sequences)

    predictions = []
    probabilities = []
    tensors = []

    with torch.no_grad():
        # NOTE: the loop variable used to shadow the dataloader
        # ('for _, dl in enumerate(dl)'); renamed for clarity.
        for batch in dataloader:

            outputs = network(**batch)

            if return_tensors:
                # One tensor per batch. (Previously this append sat inside
                # the per-sentence loop, duplicating the same batch tensor
                # once per sentence.)
                tensors.append(outputs)

            # conduct operations on sentence level.
            for i in range(outputs.shape[0]):

                # find max logit by row; index -> predicted tag,
                # value -> its (unnormalized) confidence score.
                values, indices = outputs[i].max(dim=1)

                preds = tag_encoder.inverse_transform(indices.cpu().numpy())
                probs = values.cpu().numpy()

                # subset predictions for original word tokens, i.e. keep
                # only positions where a word starts (offset == 1).
                offsets = batch.get('offsets')[i]
                preds = [prediction for prediction, offset in zip(preds.tolist(), offsets) if offset]
                if return_confidence:
                    probs = [prob for prob, offset in zip(probs.tolist(), offsets) if offset]

                # Remove special tokens ('CLS' + 'SEP').
                preds = preds[1:-1]
                if return_confidence:
                    probs = probs[1:-1]

                # make sure resulting predictions have same length as
                # original sentence.

                # TODO: Move assert statement to unit tests. Does not work
                # in boundary.
                # assert len(preds) == len(sentences[i])
                predictions.append(preds)
                if return_confidence:
                    probabilities.append(probs)

    if return_confidence:
        return predictions, probabilities

    if return_tensors:
        return tensors

    return predictions
139 |
def predict_text(network: torch.nn.Module,
                 text: str,
                 transformer_tokenizer: transformers.PreTrainedTokenizer,
                 transformer_config: transformers.PretrainedConfig,
                 max_len: int,
                 device: str,
                 tag_encoder: sklearn.preprocessing.LabelEncoder,
                 tag_outside: str,
                 batch_size: int = 8,
                 num_workers: int = 1,
                 pad_sequences: bool = True,
                 return_confidence: bool = False,
                 sent_tokenize: Callable = sent_tokenize,
                 word_tokenize: Callable = word_tokenize) -> tuple:
    """Compute Predictions for Text.

    Splits a raw text into sentences, word-tokenizes each sentence and
    delegates to [predict][] to obtain named-entity tags.

    Args:
        network (torch.nn.Module): Network.
        text (str): text to predict entities in.
        transformer_tokenizer (transformers.PreTrainedTokenizer):
            tokenizer for transformer model.
        transformer_config (transformers.PretrainedConfig): config
            for transformer model.
        max_len (int): Maximum length of sentence after applying
            transformer tokenizer.
        device (str): Computational device.
        tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder
            for Named-Entity tags.
        tag_outside (str): Special 'outside' NER tag.
        batch_size (int, optional): Batch Size for DataLoader.
            Defaults to 8.
        num_workers (int, optional): Number of workers. Defaults
            to 1.
        pad_sequences (bool, optional): if True, pad sequences.
            Defaults to True.
        return_confidence (bool, optional): if True, return
            confidence scores for predicted tokens. Defaults
            to False.
        sent_tokenize (Callable, optional): sentence tokenizer.
            Defaults to NLTK's.
        word_tokenize (Callable, optional): word tokenizer.
            Defaults to NLTK's.

    Returns:
        tuple: sentence- and word-tokenized text with corresponding
        predicted named-entity tags.
    """
    assert isinstance(text, str), "'text' must be a string."

    # sentence-tokenize the text, then word-tokenize every sentence.
    sentences = [word_tokenize(sentence) for sentence in sent_tokenize(text)]

    predictions = predict(network = network,
                          sentences = sentences,
                          transformer_tokenizer = transformer_tokenizer,
                          transformer_config = transformer_config,
                          max_len = max_len,
                          device = device,
                          return_confidence = return_confidence,
                          batch_size = batch_size,
                          num_workers = num_workers,
                          pad_sequences = pad_sequences,
                          tag_encoder = tag_encoder,
                          tag_outside = tag_outside)

    return sentences, predictions
205 |
206 |
--------------------------------------------------------------------------------
/src/NERDA/preprocessing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import warnings
3 | import transformers
4 | import sklearn.preprocessing
5 |
class NERDADataSetReader():
    """Generic NERDA DataSetReader

    torch-style Dataset that turns word-tokenized sentences and their
    NER tags into model-ready tensors: wordpiece input ids, attention
    masks, token type ids, encoded target tags and word-start offsets.
    """

    def __init__(self,
                 sentences: list,
                 tags: list,
                 transformer_tokenizer: transformers.PreTrainedTokenizer,
                 transformer_config: transformers.PretrainedConfig,
                 max_len: int,
                 tag_encoder: sklearn.preprocessing.LabelEncoder,
                 tag_outside: str,
                 pad_sequences : bool = True) -> None:
        """Initialize DataSetReader

        Initializes DataSetReader that prepares and preprocesses
        DataSet for Named-Entity Recognition Task and training.

        Args:
            sentences (list): Sentences.
            tags (list): Named-Entity tags.
            transformer_tokenizer (transformers.PreTrainedTokenizer):
                tokenizer for transformer.
            transformer_config (transformers.PretrainedConfig): Config
                for transformer model.
            max_len (int): Maximum length of sentences after applying
                transformer tokenizer.
            tag_encoder (sklearn.preprocessing.LabelEncoder): Encoder
                for Named-Entity tags.
            tag_outside (str): Special Outside tag.
            pad_sequences (bool): Pad sequences to max_len. Defaults
                to True.
        """
        self.sentences = sentences
        self.tags = tags
        self.transformer_tokenizer = transformer_tokenizer
        self.max_len = max_len
        self.tag_encoder = tag_encoder
        # token id used for padding, taken from the transformer config.
        self.pad_token_id = transformer_config.pad_token_id
        # integer encoding of the special 'outside' tag; also used to
        # label the CLS/SEP positions and padding.
        self.tag_outside_transformed = tag_encoder.transform([tag_outside])[0]
        self.pad_sequences = pad_sequences

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, item):
        """Prepare sentence `item` as a dict of model-ready tensors.

        Tokenizes the sentence into wordpieces, aligns the NER tags
        with the wordpieces, truncates to max_len, adds CLS/SEP
        special tokens and (optionally) pads to max_len.
        """
        sentence = self.sentences[item]
        tags = self.tags[item]
        # encode tags
        tags = self.tag_encoder.transform(tags)

        # check inputs for consistency
        assert len(sentence) == len(tags)

        input_ids = []
        target_tags = []
        tokens = []
        # offsets mark positions where an original word starts (1) vs.
        # continuation wordpieces (0); used downstream to recover
        # word-level predictions.
        offsets = []

        # for debugging purposes
        # print(item)
        for i, word in enumerate(sentence):
            # bert tokenization
            wordpieces = self.transformer_tokenizer.tokenize(word)
            tokens.extend(wordpieces)
            # make room for CLS if there is an identified word piece
            if len(wordpieces)>0:
                offsets.extend([1]+[0]*(len(wordpieces)-1))
            # Extends the ner_tag if the word has been split by the wordpiece tokenizer
            target_tags.extend([tags[i]] * len(wordpieces))

        # Make room for adding special tokens (one for both 'CLS' and 'SEP' special tokens)
        # max_len includes _all_ tokens.
        if len(tokens) > self.max_len-2:
            msg = f'Sentence #{item} length {len(tokens)} exceeds max_len {self.max_len} and has been truncated'
            warnings.warn(msg)
            tokens = tokens[:self.max_len-2]
            target_tags = target_tags[:self.max_len-2]
            offsets = offsets[:self.max_len-2]

        # encode tokens for BERT
        # TO DO: prettify this.
        input_ids = self.transformer_tokenizer.convert_tokens_to_ids(tokens)
        input_ids = [self.transformer_tokenizer.cls_token_id] + input_ids + [self.transformer_tokenizer.sep_token_id]

        # fill out other inputs for model.
        target_tags = [self.tag_outside_transformed] + target_tags + [self.tag_outside_transformed]
        masks = [1] * len(input_ids)
        # set to 0, because we are not doing NSP or QA type task (across multiple sentences)
        # token_type_ids distinguishes sentences.
        token_type_ids = [0] * len(input_ids)
        # CLS and SEP get offset 1 so they survive the word-level subsetting
        # downstream (they are stripped there explicitly).
        offsets = [1] + offsets + [1]

        # Padding to max length
        # compute padding length
        if self.pad_sequences:
            padding_len = self.max_len - len(input_ids)
            input_ids = input_ids + ([self.pad_token_id] * padding_len)
            masks = masks + ([0] * padding_len)
            offsets = offsets + ([0] * padding_len)
            token_type_ids = token_type_ids + ([0] * padding_len)
            target_tags = target_tags + ([self.tag_outside_transformed] * padding_len)

        return {'input_ids' : torch.tensor(input_ids, dtype = torch.long),
                'masks' : torch.tensor(masks, dtype = torch.long),
                'token_type_ids' : torch.tensor(token_type_ids, dtype = torch.long),
                'target_tags' : torch.tensor(target_tags, dtype = torch.long),
                'offsets': torch.tensor(offsets, dtype = torch.long)}
113 |
def create_dataloader(sentences,
                      tags,
                      transformer_tokenizer,
                      transformer_config,
                      max_len,
                      tag_encoder,
                      tag_outside,
                      batch_size = 1,
                      num_workers = 1,
                      pad_sequences = True):
    """Create a torch DataLoader over a NERDADataSetReader.

    Args:
        sentences (list): word-tokenized sentences.
        tags (list): Named-Entity tags aligned with sentences.
        transformer_tokenizer: tokenizer for transformer model.
        transformer_config: config for transformer model.
        max_len (int): maximum sequence length after tokenization.
        tag_encoder (sklearn.preprocessing.LabelEncoder): encoder for
            Named-Entity tags.
        tag_outside (str): special 'outside' NER tag.
        batch_size (int, optional): batch size. Defaults to 1.
        num_workers (int, optional): number of DataLoader workers.
            Defaults to 1.
        pad_sequences (bool, optional): pad sequences to max_len.
            Forced to True whenever batch_size > 1, since unpadded
            variable-length sequences cannot be collated into one
            batch tensor. Defaults to True.

    Returns:
        torch.utils.data.DataLoader: data loader yielding model-ready
        batches.
    """
    if not pad_sequences and batch_size > 1:
        # emit a proper warning (consistent with the truncation warning
        # in NERDADataSetReader) instead of printing to stdout from
        # library code.
        warnings.warn("setting pad_sequences to True, because batch_size is more than one.")
        pad_sequences = True

    data_reader = NERDADataSetReader(
        sentences = sentences,
        tags = tags,
        transformer_tokenizer = transformer_tokenizer,
        transformer_config = transformer_config,
        max_len = max_len,
        tag_encoder = tag_encoder,
        tag_outside = tag_outside,
        pad_sequences = pad_sequences)

    data_loader = torch.utils.data.DataLoader(
        data_reader, batch_size = batch_size, num_workers = num_workers
    )

    return data_loader
145 |
146 |
--------------------------------------------------------------------------------
/src/NERDA/training.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from .preprocessing import create_dataloader
3 | from sklearn import preprocessing
4 | from transformers import AdamW, get_linear_schedule_with_warmup
5 | import random
6 | import torch
7 | from tqdm import tqdm
8 |
def train(model, data_loader, optimizer, device, scheduler, n_tags):
    """One Iteration of Training

    Runs a single pass over the training data, updating model
    parameters batch by batch, and returns the average batch loss.
    """
    model.train()
    total_loss = 0.0

    for batch in tqdm(data_loader, total=len(data_loader)):
        optimizer.zero_grad()

        outputs = model(**batch)
        loss = compute_loss(outputs,
                            batch.get('target_tags'),
                            batch.get('masks'),
                            device,
                            n_tags)

        # backpropagate, then advance optimizer and LR schedule.
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    # Return average loss
    return total_loss / len(data_loader)
31 |
def validate(model, data_loader, device, n_tags):
    """One Iteration of Validation

    Runs the model over the validation data and returns the average
    batch loss. Gradient tracking is disabled: gradients are not
    needed for validation, and building autograd graphs here wasted
    memory and compute (it was previously missing).
    """

    model.eval()
    final_loss = 0.0

    with torch.no_grad():
        for dl in tqdm(data_loader, total=len(data_loader)):

            outputs = model(**dl)
            loss = compute_loss(outputs,
                                dl.get('target_tags'),
                                dl.get('masks'),
                                device,
                                n_tags)
            final_loss += loss.item()

    # Return average loss.
    return final_loss / len(data_loader)
50 |
def compute_loss(preds, target_tags, masks, device, n_tags):
    """Masked cross-entropy loss.

    Flattens predictions and targets, and replaces the label of every
    masked-out (padding) position with the loss function's
    ignore_index so that only real token positions contribute.
    """
    criterion = torch.nn.CrossEntropyLoss()

    # flatten mask: True at positions holding real (non-padding) tokens.
    active_positions = masks.view(-1) == 1

    # flatten logits to (n_positions, n_tags).
    logits = preds.view(-1, n_tags)

    # padding positions get ignore_index so the loss skips them.
    ignore = torch.tensor(criterion.ignore_index).type_as(target_tags)
    labels = torch.where(active_positions, target_tags.view(-1), ignore)
    labels = torch.as_tensor(labels, device = torch.device(device), dtype = torch.long)

    return criterion(logits, labels)
72 |
def enforce_reproducibility(seed = 42) -> None:
    """Enforce Reproducibility

    Fixes the seeds of all random number generators in play (Python,
    NumPy and torch incl. CUDA) and switches cuDNN to deterministic
    mode with benchmark autotuning off.

    For atomic operations there is currently no simple way to
    enforce determinism, as the order of parallel operations
    is not known.

    Args:
        seed (int, optional): Fixed seed. Defaults to 42.
    """
    # Python and NumPy RNGs.
    random.seed(seed)
    np.random.seed(seed)
    # torch RNGs: CPU and all CUDA devices.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN: deterministic kernels, no benchmark autotuning.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
96 |
def train_model(network,
                tag_encoder,
                tag_outside,
                transformer_tokenizer,
                transformer_config,
                dataset_training,
                dataset_validation,
                max_len = 128,
                train_batch_size = 16,
                validation_batch_size = 8,
                epochs = 5,
                warmup_steps = 0,
                learning_rate = 5e-5,
                device = None,
                fixed_seed = 42,
                num_workers = 1):
    """Train network and return the model with the best validation loss.

    Args:
        network: network to train.
        tag_encoder (sklearn.preprocessing.LabelEncoder): encoder for
            Named-Entity tags.
        tag_outside (str): special 'outside' NER tag.
        transformer_tokenizer: tokenizer for transformer model.
        transformer_config: config for transformer model.
        dataset_training (dict): dict with 'sentences' and 'tags'.
        dataset_validation (dict): dict with 'sentences' and 'tags'.
        max_len (int, optional): maximum sequence length. Defaults to 128.
        train_batch_size (int, optional): Defaults to 16.
        validation_batch_size (int, optional): Defaults to 8.
        epochs (int, optional): Defaults to 5.
        warmup_steps (int, optional): scheduler warmup. Defaults to 0.
        learning_rate (float, optional): Defaults to 5e-5.
        device (str, optional): computational device. Defaults to None.
        fixed_seed (int, optional): seed for reproducibility; None
            disables seeding. Defaults to 42.
        num_workers (int, optional): DataLoader workers. Defaults to 1.

    Returns:
        tuple: (network with best parameters loaded, list of training
        losses per epoch, best validation loss).
    """

    def snapshot_state(net):
        # detach + clone: state_dict() returns references to the live
        # parameter tensors, so a plain assignment would be silently
        # mutated by subsequent training steps (making the final
        # load_state_dict a no-op).
        return {k: v.detach().clone() for k, v in net.state_dict().items()}

    if fixed_seed is not None:
        enforce_reproducibility(fixed_seed)

    # compute number of unique tags from encoder.
    n_tags = tag_encoder.classes_.shape[0]

    # prepare datasets for modelling by creating data readers and loaders
    dl_train = create_dataloader(sentences = dataset_training.get('sentences'),
                                 tags = dataset_training.get('tags'),
                                 transformer_tokenizer = transformer_tokenizer,
                                 transformer_config = transformer_config,
                                 max_len = max_len,
                                 batch_size = train_batch_size,
                                 tag_encoder = tag_encoder,
                                 tag_outside = tag_outside,
                                 num_workers = num_workers)
    dl_validate = create_dataloader(sentences = dataset_validation.get('sentences'),
                                    tags = dataset_validation.get('tags'),
                                    transformer_tokenizer = transformer_tokenizer,
                                    transformer_config = transformer_config,
                                    max_len = max_len,
                                    batch_size = validation_batch_size,
                                    tag_encoder = tag_encoder,
                                    tag_outside = tag_outside,
                                    num_workers = num_workers)

    optimizer_parameters = network.parameters()

    num_train_steps = int(len(dataset_training.get('sentences')) / train_batch_size * epochs)

    optimizer = AdamW(optimizer_parameters, lr = learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps = warmup_steps, num_training_steps = num_train_steps
    )

    train_losses = []
    best_valid_loss = np.inf
    # initialize up front so a best model always exists — previously
    # 'best_parameters' was unbound (NameError) when epochs == 0.
    best_parameters = snapshot_state(network)

    for epoch in range(epochs):

        print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

        train_loss = train(network, dl_train, optimizer, device, scheduler, n_tags)
        train_losses.append(train_loss)
        valid_loss = validate(network, dl_validate, device, n_tags)

        print(f"Train Loss = {train_loss} Valid Loss = {valid_loss}")

        if valid_loss < best_valid_loss:
            best_parameters = snapshot_state(network)
            best_valid_loss = valid_loss

    # return best model
    network.load_state_dict(best_parameters)

    return network, train_losses, best_valid_loss
170 |
171 |
172 |
173 |
--------------------------------------------------------------------------------
/src/NERDA/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 |
def match_kwargs(function: Callable, **kwargs) -> dict:
    """Matches Arguments with Function

    Match keyword arguments with the arguments of a function.

    Args:
        function (function): Function to match arguments for.
        kwargs: keyword arguments to match against.

    Returns:
        dict: dictionary with matching arguments and their
            respective values.

    """
    code = function.__code__
    # include keyword-only parameters as well: co_varnames lists
    # positional(-or-keyword) args first, then keyword-only args, so
    # slicing to co_argcount alone silently dropped valid keyword-only
    # arguments.
    arg_count = code.co_argcount + code.co_kwonlyargcount
    args = code.co_varnames[:arg_count]

    # keep only the kwargs the function actually accepts.
    return {k: v for k, v in kwargs.items() if k in args}
26 |
--------------------------------------------------------------------------------
/tests/unit_tests/test_aaaNERDA.py:
--------------------------------------------------------------------------------
# HACK: Filename prefixed with 'aaa' to execute this test before the others
# in order to download necessary resources for all other tests.

from NERDA.datasets import get_dane_data, download_dane_data
# TODO: should not be necessary to download before importing NERDA.
# Download necessary resources
download_dane_data()
from NERDA.models import NERDA
from NERDA.precooked import DA_ELECTRA_DA
import nltk
nltk.download('punkt')

# instantiate a minimal model (tiny data subsets, one epoch) — shared
# fixture for the test below; instantiation itself is part of the test.
model = NERDA(dataset_training = get_dane_data('train', 5),
              dataset_validation = get_dane_data('dev', 5),
              max_len = 128,
              transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters = {'epochs' : 1,
                                 'warmup_steps' : 10,
                                 'train_batch_size': 5,
                                 'learning_rate': 0.0001})

def test_instantiate_NERDA():
    """Test that model has the correct/expected class"""
    assert isinstance(model, NERDA)
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/tests/unit_tests/test_performance.py:
--------------------------------------------------------------------------------
from NERDA.datasets import get_dane_data
from NERDA.models import NERDA
import pandas as pd

# instantiate a minimal model (tiny data subsets, one epoch).
model = NERDA(dataset_training = get_dane_data('train', 5),
              dataset_validation = get_dane_data('dev', 5),
              transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters = {'epochs' : 1,
                                 'warmup_steps' : 10,
                                 'train_batch_size': 5,
                                 'learning_rate': 0.0001})

# evaluate performance once at module level; all tests below only
# inspect the resulting data frame.
test = get_dane_data('test')
perf = model.evaluate_performance(test)

def test_performance_df():
    """Performance evaluation returns a pandas DataFrame."""
    assert isinstance(perf, pd.DataFrame)

def test_performance_len():
    """Performance data frame is non-empty."""
    assert len(perf) > 0

def test_includes_relevant_metrics():
    """F1, precision and recall columns are all present."""
    metrics = ['F1-Score', 'Precision', 'Recall']
    assert all([x in perf.columns for x in metrics])

def test_metrics_dtype():
    """All metric columns hold floats."""
    metrics = ['F1-Score', 'Precision', 'Recall']
    assert all([perf.dtypes[x] == 'float' for x in metrics])
30 |
31 |
--------------------------------------------------------------------------------
/tests/unit_tests/test_precooked.py:
--------------------------------------------------------------------------------
from NERDA.precooked import DA_ELECTRA_DA

def test_load_precooked():
    """Test that precooked model can be (down)loaded, instantiated and works end-to-end"""
    # download and load pretrained network weights, then run a Danish
    # example sentence through text prediction end-to-end.
    m = DA_ELECTRA_DA()
    m.download_network()
    m.load_network()
    m.predict_text("Jens Hansen har en bondegård. Det har han!")
9 |
--------------------------------------------------------------------------------
/tests/unit_tests/test_predictions.py:
--------------------------------------------------------------------------------
from NERDA.datasets import get_dane_data
from NERDA.models import NERDA
import nltk

# instantiate a minimal model (tiny data subsets, one epoch) as a
# shared fixture for all prediction tests below.
model = NERDA(dataset_training = get_dane_data('train', 5),
              dataset_validation = get_dane_data('dev', 5),
              transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters = {'epochs' : 1,
                                 'warmup_steps' : 10,
                                 'train_batch_size': 5,
                                 'learning_rate': 0.0001})


# set example texts to identify entities in.
text_single = "Pernille Rosenkrantz-Theil kommer fra Vejle"
sentences = [nltk.word_tokenize(text_single)]

def test_predict():
    """Test that predict runs"""
    predictions = model.predict(sentences)

# module-level predictions shared by the type/length tests below.
predictions = model.predict(sentences)

def test_predict_type():
    """Test token predictions"""
    assert isinstance(predictions, list)

def test_predict_length():
    """Test that sentence and prediction lengths match"""
    assert len(sentences[0])==len(predictions[0])

def test_predict_text():
    """Test that predict_text runs"""
    predictions = model.predict_text(text_single)

def test_predict_maxlen_exceed():
    """Test that exceeding max len does not break predict"""
    text = "ice " * 200
    sentences = [nltk.word_tokenize(text)]
    model.predict(sentences)

# test confidence scores: preds is a (tags, probabilities) pair here.
words, preds = model.predict_text(text_single, return_confidence=True)

def test_confs_len():
    """Tag list and confidence list have equal sentence counts."""
    assert len(preds[0])==len(preds[1])

predictions_text_single = model.predict_text(text_single)

def test_predict_text_format():
    """Test text predictions"""
    assert isinstance(predictions_text_single, tuple)

def test_predict_text_match_words_predictions():
    """Word count and tag count agree for the first sentence."""
    assert len(predictions_text_single[0][0]) == len(predictions_text_single[1][0])

# multiple sentences.
text_multi = """
Pernille Rosenkrantz-Theil kommer fra Vejle.
Jens Hansen har en bondegård.
"""

def test_predict_text_multi():
    """Test that predict_text runs with multiple sentences"""
    predictions = model.predict_text(text_multi, batch_size = 2)

predictions_text_multi = model.predict_text(text_multi, batch_size = 2)

def test_predict_text_multi_format():
    """Test multi-sentence text predictions has expected format"""
    assert isinstance(predictions_text_multi, tuple)

def test_predict_text_multi_elements_count():
    """Test dimensions of multi-sentence text predictions"""
    assert [len(predictions_text_multi[0]), len(predictions_text_multi[1])] == [2, 2]

def test_predict_text_multi_lens():
    """Test lengths of multi-sentence text predictions"""
    s1 = len(predictions_text_multi[0][0]) == len(predictions_text_multi[1][0])
    s2 = len(predictions_text_multi[0][1]) == len(predictions_text_multi[1][1])
    assert all([s1, s2])
83 |
84 |
--------------------------------------------------------------------------------
/tests/unit_tests/test_training.py:
--------------------------------------------------------------------------------
from NERDA.datasets import get_dane_data
from NERDA.models import NERDA

# instantiate a minimal model (tiny data subsets, one epoch).
model = NERDA(dataset_training = get_dane_data('train', 5),
              dataset_validation = get_dane_data('dev', 5),
              transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters = {'epochs' : 1,
                                 'warmup_steps' : 10,
                                 'train_batch_size': 5,
                                 'learning_rate': 0.0001})

def test_training():
    """Test if training runs successfully"""
    model.train()

def test_training_exceed_maxlen():
    """Test that training does not break even though MAX LEN is exceeded"""
    m = NERDA(dataset_training = get_dane_data('train', 5),
              dataset_validation = get_dane_data('dev', 5),
              max_len = 3,
              transformer = 'Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters = {'epochs' : 1,
                                 'warmup_steps' : 10,
                                 'train_batch_size': 5,
                                 'learning_rate': 0.0001})
    m.train()

def test_training_bert():
    """Test that training runs with a BERT-type transformer"""
    m = NERDA(dataset_training = get_dane_data('train', 5),
              dataset_validation = get_dane_data('dev', 5),
              transformer = 'bert-base-multilingual-uncased',
              hyperparameters = {'epochs' : 1,
                                 'warmup_steps' : 10,
                                 'train_batch_size': 5,
                                 'learning_rate': 0.0001})
    m.train()
39 |
--------------------------------------------------------------------------------