├── .github └── workflows │ └── testing.yml ├── .gitignore ├── LICENSE.txt ├── README.md ├── data ├── crunchbase_companies.json.gz ├── first_names.json ├── form_frequencies.json ├── geonames.json ├── model.PNG ├── products.json ├── reuters_small.tar.gz ├── sentiment │ ├── lexicons │ │ ├── IBM_Debater │ │ │ └── no_unigram.txt │ │ ├── NRC_Sentiment_Emotion │ │ │ ├── NRC-Emotion-Lexicon-Wordlevel-v0.92.txt │ │ │ └── no_sent.txt │ │ ├── NRC_VAD_Lexicon │ │ │ └── Norwegian-no-NRC-VAD-Lexicon.txt │ │ └── socal │ │ │ ├── no_adj.txt │ │ │ ├── no_adv.txt │ │ │ ├── no_google.txt │ │ │ ├── no_int.txt │ │ │ ├── no_noun.txt │ │ │ └── no_verb.txt │ └── norec_sentence │ │ ├── dev.txt │ │ ├── labels.json │ │ ├── test.txt │ │ └── train.txt ├── skweak_logo.jpg ├── skweak_logo_thumbnail.jpg ├── skweak_procedure.png └── wikidata_small_tokenised.json.gz ├── examples ├── ner │ ├── Step by step NER.ipynb │ ├── __init__.py │ ├── conll2003_ner.py │ ├── conll2003_prep.py │ ├── data_utils.py │ ├── eval_utils.py │ └── muc6_ner.py ├── quick_start.ipynb └── sentiment │ ├── Step_by_step.ipynb │ ├── __init__.py │ ├── norec_sentiment.py │ ├── sentiment_lexicons.py │ ├── sentiment_models.py │ ├── transformer_model.py │ └── weak_supervision_sentiment.py ├── poetry.lock ├── poetry.toml ├── pyproject.toml ├── skweak ├── __init__.py ├── aggregation.py ├── analysis.py ├── base.py ├── doclevel.py ├── gazetteers.py ├── generative.py ├── heuristics.py ├── spacy.py ├── utils.py └── voting.py └── tests ├── __init__.py ├── conftest.py ├── test_aggregation.py ├── test_analysis.py ├── test_doclevel.py ├── test_gazetteers.py ├── test_heuristics.py └── test_utils.py /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: testing 2 | 3 | on: [ push ] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ] 12 | fail-fast: false 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | name: Checkout 17 | 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | cache: 'pip' 23 | 24 | - uses: Gr1N/setup-poetry@v8 25 | with: 26 | poetry-version: 1.5.1 27 | 28 | - name: Install Python dependencies 29 | run: | 30 | poetry run pip install -U pip 31 | poetry install --with dev 32 | 33 | # TODO: add mkdocs documentation, make sure examples work 34 | 35 | # - name: Lint with flake8 #TODO: use ruff 36 | # run: | 37 | # # stop the build if there are Python syntax errors or undefined names 38 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 39 | # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 40 | # flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 41 | 42 | - name: Test with pytest 43 | run: poetry run pytest 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .venv/ 3 | build/ 4 | sdist/ 5 | dist/ 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (C) 2021-2026 Norsk Regnesentral 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # skweak: Weak supervision for NLP 2 | 3 | [![GitHub license](https://img.shields.io/github/license/NorskRegnesentral/skweak)](https://github.com/NorskRegnesentral/skweak/blob/main/LICENSE.txt) 4 | [![GitHub stars](https://img.shields.io/github/stars/NorskRegnesentral/skweak)](https://github.com/NorskRegnesentral/skweak/stargazers) 5 | ![PyPI](https://img.shields.io/pypi/v/skweak) 6 | ![Testing](https://github.com/NorskRegnesentral/skweak/actions/workflows/testing.yml/badge.svg) 7 | 8 |
9 | 
10 | <img alt="skweak logo" src="data/skweak_logo.jpg"/>
11 | 
12 | 
13 | **Skweak is no longer actively maintained** (if you are interested in taking over the project, give us a shout). 
14 | 
15 | Labelled data remains a scarce resource in many practical NLP scenarios. This is especially the case when working with resource-poor languages (or text domains), or when using task-specific labels without pre-existing datasets. The only available option is often to collect and annotate texts by hand, which is expensive and time-consuming. 
16 | 
17 | `skweak` (pronounced `/skwi:k/`) is a Python-based software toolkit that provides a concrete solution to this problem using weak supervision. `skweak` is built around a very simple idea: Instead of annotating texts by hand, we define a set of _labelling functions_ to automatically label our documents, and then _aggregate_ their results to obtain a labelled version of our corpus. 
18 | 
19 | The labelling functions may take various forms, such as domain-specific heuristics (like pattern-matching rules), gazetteers (based on large dictionaries), machine learning models, or even annotations from crowd-workers. The aggregation is done using a statistical model that automatically estimates the relative accuracy (and confusions) of each labelling function by comparing their predictions with one another. 
20 | 
21 | `skweak` can be applied to both sequence labelling and text classification, and comes with a complete API that makes it possible to create, apply and aggregate labelling functions with just a few lines of code. The toolkit is also tightly integrated with [SpaCy](http://www.spacy.io), which makes it easy to incorporate into existing NLP pipelines. Give it a try! 
22 | 
23 | 
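Once the labelling functions have been aggregated (see the Quickstart further down), the resulting labelled corpus can be exported to whatever training framework you prefer. As a rough illustration (a sketch only, not part of the skweak API — the helper name and output path below are made up), here is one way to dump the aggregated annotations into a spaCy training file, assuming the aggregated layer is stored under `doc.spans["hmm"]` as in the Quickstart example:

```python
from spacy.tokens import DocBin

def export_training_corpus(docs, output_path="train.spacy"):
    """Illustrative only: write skweak-aggregated documents to a spaCy training file."""
    db = DocBin()
    for doc in docs:
        # Copy the aggregated span group (named "hmm" in the Quickstart below)
        # into doc.ents, so that a regular spaCy NER component can be trained on it.
        doc.ents = list(doc.spans["hmm"])
        db.add(doc)
    db.to_disk(output_path)
```

The resulting `.spacy` file can then be passed to `python -m spacy train` together with a standard NER config; see the Wiki for the full workflow.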
24 | 25 | **Full Paper**:
26 | Pierre Lison, Jeremy Barnes and Aliaksandr Hubin (2021), "[skweak: Weak Supervision Made Easy for NLP](https://aclanthology.org/2021.acl-demo.40/)", *ACL 2021 (System demonstrations)*. 27 | 28 | **Documentation & API**: See the [Wiki](https://github.com/NorskRegnesentral/skweak/wiki) for details on how to use `skweak`. 29 | 30 |
31 | 32 | 33 | https://user-images.githubusercontent.com/11574012/114999146-e0995300-9ea1-11eb-8288-2bb54dc043e7.mp4 34 | 35 |
36 | 
37 | 
38 | 
39 | ## Dependencies
40 | 
41 | - `spacy` >= 3.0.0
42 | - `hmmlearn` >= 0.3.0
43 | - `pandas` >= 0.23
44 | - `numpy` >= 1.18
45 | 
46 | You also need Python >= 3.6.
47 | 
48 | 
49 | ## Install
50 | 
51 | The easiest way to install `skweak` is through `pip`:
52 | 
53 | ```shell
54 | pip install skweak
55 | ```
56 | 
57 | or if you want to install from the repo:
58 | 
59 | ```shell
60 | pip install --user git+https://github.com/NorskRegnesentral/skweak
61 | ```
62 | 
63 | The above installation only includes the core library (not the additional examples in `examples`).
64 | 
65 | Note: some examples and tests may require trained spaCy pipelines. These can be downloaded automatically using the following command (here for the pipeline `en_core_web_sm`):
66 | ```shell
67 | python -m spacy download en_core_web_sm
68 | ```
69 | 
70 | 
71 | ## Basic Overview
72 | 
73 | 
74 | 
75 | <img alt="Overview of skweak" src="data/skweak_procedure.png"/>
76 | 
77 | 
78 | Weak supervision with `skweak` goes through the following steps:
79 | - **Start**: First, you need raw (unlabelled) data from your text domain. `skweak` is built on top of [SpaCy](http://www.spacy.io), and operates with SpaCy `Doc` objects, so you first need to convert your documents to `Doc` objects using SpaCy.
80 | - **Step 1**: Then, we need to define a range of labelling functions that will take those documents and annotate spans with labels. Those labelling functions can come from heuristics, gazetteers, machine learning models, etc. See the [documentation](https://github.com/NorskRegnesentral/skweak/wiki) for more details.
81 | - **Step 2**: Once the labelling functions have been applied to your corpus, you need to _aggregate_ their results in order to obtain a single annotation layer (instead of the multiple, possibly conflicting annotations from the labelling functions). This is done in `skweak` using a generative model that automatically estimates the relative accuracy and possible confusions of each labelling function.
82 | - **Step 3**: Finally, based on those aggregated labels, we can train our final model. Step 2 gives us a labelled corpus that (probabilistically) aggregates the outputs of all labelling functions, and you can use this labelled data to estimate any kind of machine learning model. You are free to use whichever model/framework you prefer.
83 | 
84 | ## Quickstart
85 | 
86 | Here is a minimal example with three labelling functions (LFs) applied to a single document:
87 | 
88 | ```python
89 | import spacy, re
90 | from skweak import heuristics, gazetteers, generative, utils
91 | 
92 | # LF 1: heuristic to detect occurrences of MONEY entities
93 | def money_detector(doc):
94 |    for tok in doc[1:]:
95 |       if tok.text[0].isdigit() and tok.nbor(-1).is_currency:
96 |           yield tok.i-1, tok.i+1, "MONEY"
97 | lf1 = heuristics.FunctionAnnotator("money", money_detector)
98 | 
99 | # LF 2: detection of years with a regex
100 | lf2 = heuristics.TokenConstraintAnnotator("years", lambda tok: re.match(r"(19|20)\d{2}$",
101 |                                           tok.text), "DATE")
102 | 
103 | # LF 3: a gazetteer with a few names
104 | NAMES = [("Barack", "Obama"), ("Donald", "Trump"), ("Joe", "Biden")]
105 | trie = gazetteers.Trie(NAMES)
106 | lf3 = gazetteers.GazetteerAnnotator("presidents", {"PERSON":trie})
107 | 
108 | # We create a corpus (here with a single text)
109 | nlp = spacy.load("en_core_web_sm")
110 | doc = nlp("Donald Trump paid $750 in federal income taxes in 2016")
111 | 
112 | # apply the labelling functions
113 | doc = lf3(lf2(lf1(doc)))
114 | 
115 | # create and fit the HMM aggregation model
116 | hmm = generative.HMM("hmm", ["PERSON", "DATE", "MONEY"])
117 | hmm.fit([doc]*10)
118 | 
119 | # once fitted, we simply apply the model to aggregate all functions
120 | doc = hmm(doc)
121 | 
122 | # we can then visualise the final result (in Jupyter)
123 | utils.display_entities(doc, "hmm")
124 | ```
125 | 
126 | Obviously, to get the most out of `skweak`, you will need more than three labelling functions. And, most importantly, you will need a larger corpus including as many documents as possible from your domain, so that the model can derive good estimates of the relative accuracy of each labelling function.
127 | 
128 | ## Documentation
129 | 
130 | See the [Wiki](https://github.com/NorskRegnesentral/skweak/wiki).
131 | 
132 | 
133 | ## License
134 | 
135 | `skweak` is released under an MIT License.
136 | 137 | The MIT License is a short and simple permissive license allowing both commercial and non-commercial use of the software. The only requirement is to preserve 138 | the copyright and license notices (see file [License](https://github.com/NorskRegnesentral/skweak/blob/main/LICENSE.txt)). Licensed works, modifications, and larger works may be distributed under different terms and without source code. 139 | 140 | ## Citation 141 | 142 | See our paper describing the framework: 143 | 144 | Pierre Lison, Jeremy Barnes and Aliaksandr Hubin (2021), "[skweak: Weak Supervision Made Easy for NLP](https://aclanthology.org/2021.acl-demo.40/)", *ACL 2021 (System demonstrations)*. 145 | 146 | ```bibtex 147 | @inproceedings{lison-etal-2021-skweak, 148 | title = "skweak: Weak Supervision Made Easy for {NLP}", 149 | author = "Lison, Pierre and 150 | Barnes, Jeremy and 151 | Hubin, Aliaksandr", 152 | booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations", 153 | month = aug, 154 | year = "2021", 155 | address = "Online", 156 | publisher = "Association for Computational Linguistics", 157 | url = "https://aclanthology.org/2021.acl-demo.40", 158 | doi = "10.18653/v1/2021.acl-demo.40", 159 | pages = "337--346", 160 | } 161 | ``` 162 | -------------------------------------------------------------------------------- /data/crunchbase_companies.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NorskRegnesentral/skweak/2b6db15e8429dbda062b2cc9cc74e69f51a0a8b6/data/crunchbase_companies.json.gz -------------------------------------------------------------------------------- /data/model.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NorskRegnesentral/skweak/2b6db15e8429dbda062b2cc9cc74e69f51a0a8b6/data/model.PNG -------------------------------------------------------------------------------- /data/reuters_small.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NorskRegnesentral/skweak/2b6db15e8429dbda062b2cc9cc74e69f51a0a8b6/data/reuters_small.tar.gz -------------------------------------------------------------------------------- /data/sentiment/lexicons/socal/no_adv.txt: -------------------------------------------------------------------------------- 1 | vidunderlig 5 2 | herlig 5 3 | nydelig 5 4 | unntaksvis 5 5 | utmerket 5 6 | fantastisk 5 7 | fantastisk 5 8 | spektakulært 5 9 | feilfritt 5 10 | nydelig 5 11 | enestående 5 12 | perfekt 5 13 | fantastisk 5 14 | utrolig 5 15 | guddommelig 5 16 | fantastisk 5 17 | upåklagelig 5 18 | utelukkende 5 19 | fabelaktig 5 20 | bedårende 5 21 | strålende 5 22 | utrolig 5 23 | blendende 5 24 | utrolig 5 25 | strålende 5 26 | veldig bra 5 27 | orgasmisk 5 28 | hyggelig 4 29 | behagelig 4 30 | jublende 4 31 | flott 4 32 | magisk 4 33 | pent 4 34 | sprudlende 4 35 | engasjerende 4 36 | elegant 4 37 | beundringsverdig 4 38 | ypperlig 4 39 | elskelig 4 40 | mesterlig 4 41 | genialt 4 42 | fantastisk 4 43 | forfriskende 4 44 | lykkelig 4 45 | kjærlig 4 46 | høyest 4 47 | fenomenalt 4 48 | sjarmerende 4 49 | innovativt 4 50 | deilig 4 51 | mirakuløst 4 52 | fortryllende 4 53 | engrossingly 4 54 | morsomt 4 55 | vakkert 4 56 | intelligent 4 57 | gledelig 4 58 | attraktivt 4 59 | utsøkt 4 60 | kjevefall 4 61 | velvillig 
4 62 | strålende 4 63 | entusiastisk 4 64 | oppladbart 4 65 | fredelig 3 66 | stilig 3 67 | eksotisk 3 68 | omfattende 3 69 | omhyggelig 3 70 | søtt 3 71 | fantasifullt 3 72 | prisverdig 3 73 | enormt 3 74 | høflig 3 75 | kjærlig 3 76 | populært 3 77 | søt 3 78 | lett 3 79 | bra 3 80 | rikt 3 81 | robust 3 82 | tilfredsstillende 3 83 | nylig 3 84 | gratis 3 85 | gripende 3 86 | muntert 3 87 | nøyaktig 3 88 | positivt 3 89 | spennende 3 90 | spennende 3 91 | gunstig 3 92 | kreativt 3 93 | festlig 3 94 | lidenskapelig 3 95 | fagmessig 3 96 | fengslende 3 97 | elegant 3 98 | kunstnerisk 3 99 | behendig 3 100 | imponerende 3 101 | intellektuelt 3 102 | levende 3 103 | ekstraordinært 3 104 | smart 3 105 | fantasifullt 3 106 | ergonomisk 3 107 | riktig 3 108 | sømløst 3 109 | fritt 3 110 | vittig 3 111 | uredd 3 112 | lyst 3 113 | fleksibelt 3 114 | adeptly 3 115 | ømt 3 116 | klokt 3 117 | triumferende 3 118 | uanstrengt 3 119 | hyggelig 3 120 | uproariously 3 121 | enormt 3 122 | morsomt 3 123 | hjertelig 3 124 | rikelig 3 125 | vellykket 3 126 | humoristisk 3 127 | tålmodig 3 128 | minneverdig 3 129 | uvurderlig 3 130 | underholdende 3 131 | ergonomisk 3 132 | dristig 3 133 | kraftig 3 134 | beleilig 3 135 | rungende 3 136 | adroitly 3 137 | romantisk 3 138 | forbløffende 3 139 | heroisk 3 140 | energisk 3 141 | sjelelig 3 142 | sjenerøst 3 143 | modig 3 144 | tappert 3 145 | rimelig 3 146 | pålitelig 3 147 | rimelig 3 148 | billig 3 149 | heldigvis 2 150 | profesjonelt 2 151 | bemerkelsesverdig 2 152 | elegant 2 153 | suspensivt 2 154 | intrikat 2 155 | treffende 2 156 | pent ferdig 2 157 | konsekvent 2 158 | pålitelig 2 159 | lyrisk 2 160 | passende 2 161 | virkelig 2 162 | riktig 2 163 | intensivt 2 164 | hjertelig 2 165 | evig 2 166 | gjerne 2 167 | modig 2 168 | forseggjort 2 169 | fersk 2 170 | godt 2 171 | overbærende 2 172 | overbevisende 2 173 | effektivt 2 174 | fargerikt 2 175 | gradvis 2 176 | rolig 2 177 | hederlig 2 178 | kjærlig 2 179 | dyktig 2 180 | liberalt 2 181 | lekent 2 182 | omtenksomt 2 183 | nøyaktig 2 184 | sannferdig 2 185 | målrettet 2 186 | heldigvis 2 187 | forsiktig 2 188 | komfortabelt 2 189 | grundig 2 190 | ivrig 2 191 | pent 2 192 | kompetent 2 193 | lovende 2 194 | pen 2 195 | nøye 2 196 | fantastisk 2 197 | delikat 2 198 | aktivt 2 199 | uskyldig 2 200 | kjærlig 2 201 | umåtelig 2 202 | trofast 2 203 | kapabel 2 204 | sammenhengende 2 205 | vanedannende 2 206 | oppfinnsomt 2 207 | reflekterende 2 208 | hjelpsomt 2 209 | nobelt 2 210 | ydmykt 2 211 | dyptgående 2 212 | ivrig 2 213 | oppriktig 2 214 | smart 2 215 | høflig 2 216 | interessant 2 217 | mystisk 2 218 | sentimentalt 2 219 | smart 2 220 | formidabelt 2 221 | fint 2 222 | lett 2 223 | eksepsjonell 2 224 | eterisk 2 225 | hovedsakelig 2 226 | ridderlig 2 227 | strategisk 2 228 | greit 2 229 | elektronisk 2 230 | kunstnerisk 2 231 | moralsk 2 232 | erotisk 2 233 | rørende 2 234 | kraftig 2 235 | optimistisk 2 236 | sterk 2 237 | spirituelt 2 238 | sympatisk 2 239 | nostalgisk 2 240 | smakfullt 2 241 | trygt 2 242 | monumentalt 2 243 | hjerteskjærende 2 244 | pent 2 245 | trygt 2 246 | avgjørende 2 247 | ansvarlig 2 248 | stolt 2 249 | forståelig nok 2 250 | mektig 2 251 | autentisk 2 252 | kompromissløst 2 253 | bedre 2 254 | digitali 2 255 | rask 2 256 | gratis 2 257 | klar 2 258 | rent 2 259 | universelt 1 260 | intuitivt 1 261 | forbausende 1 262 | ren 1 263 | stilistisk 1 264 | kjent 1 265 | rikelig 1 266 | digitalt 1 267 | lydløst 1 268 | andpusten 1 269 | naturlig 1 270 | komisk 1 271 | svimmel 
1 272 | realistisk 1 273 | nøye 1 274 | skarpt 1 275 | uskyldig 1 276 | intimt 1 277 | helhet 1 278 | offisielt 1 279 | troverdig 1 280 | straks 1 281 | musikalsk 1 282 | merkbart 1 283 | unikt 1 284 | logisk 1 285 | lunefullt 1 286 | lett 1 287 | passende 1 288 | klassisk 1 289 | effektivt 1 290 | slående 1 291 | helst 1 292 | første 1 293 | hovedsakelig 1 294 | beskjedent 1 295 | rimelig 1 296 | tilstrekkelig 1 297 | elektrisk 1 298 | betydelig 1 299 | gjenkjennelig 1 300 | i det vesentlige 1 301 | tilstrekkelig 1 302 | mykt 1 303 | sikkert 1 304 | intenst 1 305 | solid 1 306 | umåtelig 1 307 | høytidelig 1 308 | varmt 1 309 | relevant 1 310 | rettferdig 1 311 | dyktig 1 312 | sikkert 1 313 | ordentlig 1 314 | normalt 1 315 | rent 1 316 | overbevisende 1 317 | billig 1 318 | sentralt 1 319 | tydelig 1 320 | bevisst 1 321 | sannsynlig 1 322 | forholdsvis 1 323 | nok 1 324 | rett frem 1 325 | sammenlignbart 1 326 | responsivt 1 327 | utpreget 1 328 | raskt 1 329 | følsomt 1 330 | spontant 1 331 | villig 1 332 | anstendig 1 333 | brønn 1 334 | dyrt 1 335 | smart 1 336 | virkelig 1 337 | legitimt 1 338 | uendelig 1 339 | raskt 1 340 | lett 1 341 | fremtredende 1 342 | verdifullt 1 343 | tilfeldig 1 344 | nyttig 1 345 | jevnt 1 346 | skyll 1 347 | gradvis 1 348 | spesielt 1 349 | sterkt 1 350 | jevnt og trutt 1 351 | automatisk 1 352 | stille 1 353 | troverdig 1 354 | tilfredsstillende 1 355 | flørtende 1 356 | uvanlig 1 357 | med rette 1 358 | globalt 1 359 | med respekt 1 360 | quirkily 1 361 | uavhengig 1 362 | enormt 1 363 | ekte 1 364 | realistisk 1 365 | myk 1 366 | stor 1 367 | rettferdig 1 368 | uheldig 1 369 | komisk 1 370 | ukonvensjonelt 1 371 | vitenskapelig 1 372 | uforutsigbart 1 373 | vanlig 1 374 | klar 1 375 | forførende 1 376 | muntert 1 377 | hypnotisk 1 378 | pålitelig 1 379 | ambisiøst 1 380 | nonchalant 1 381 | sikkert 1 382 | kompakt 1 383 | ekstra 1 384 | akseptabelt 1 385 | økonomisk 1 386 | funksjonelt 1 387 | leselig 1 388 | valgfritt 1 389 | konkurransedyktig 1 390 | merkbart 1 391 | glatt -1 392 | kjølig -1 393 | hysterisk -1 394 | stort -1 395 | blinkende -1 396 | overfladisk -1 397 | ially -1 398 | dramatisk -1 399 | rett ut -1 400 | underlig -1 401 | nedslående -1 402 | tvangsmessig -1 403 | sjelden -1 404 | tett -1 405 | kaldt -1 406 | marginalt -1 407 | skarpt -1 408 | ostelig -1 409 | snevert -1 410 | sjokkerende -1 411 | kort -1 412 | forbausende -1 413 | tydelig -1 414 | foruroligende -1 415 | svakt -1 416 | alvorlig -1 417 | løst -1 418 | opprørende -1 419 | ujevnt -1 420 | tung -1 421 | hard -1 422 | uunngåelig -1 423 | nervøst -1 424 | kommersielt -1 425 | nølende -1 426 | lite -1 427 | eksternt -1 428 | vilt -1 429 | trist -1 430 | fantastisk -1 431 | uendelig -1 432 | tomgang -1 433 | negativt -1 434 | minimalt -1 435 | ekstremt -1 436 | beskyttende -1 437 | rampete -1 438 | tett -1 439 | motvillig -1 440 | sakte -1 441 | unødvendig -1 442 | lurt -1 443 | gjennomsiktig -1 444 | gal -1 445 | følelsesmessig -1 446 | urolig -1 447 | knapt -1 448 | omtrent -1 449 | hakkete -1 450 | inkonsekvent -1 451 | tungt -1 452 | rastløs -1 453 | kompleks -1 454 | merkelig -1 455 | konvensjonelt -1 456 | spesielt -1 457 | stereotyp -1 458 | utenfor emnet -1 459 | trendig -1 460 | lang -1 461 | klinisk -1 462 | forsiktig -1 463 | politisk -1 464 | religiøst -1 465 | vanskelig -1 466 | radikalt -1 467 | feilaktig -1 468 | gjentatt -1 469 | uhyggelig -1 470 | uinteressant -1 471 | svakt -1 472 | overflødig -1 473 | mørkt -1 474 | kryptisk -1 475 | løs -1 476 | kunstig 
-1 477 | campily -1 478 | sporadisk -1 479 | forenklet -1 480 | sterkt -1 481 | unnskyldende -1 482 | uløselig -1 483 | flamboyant -1 484 | idealistisk -1 485 | vantro -1 486 | vanlig -1 487 | billig -1 488 | ulykkelig -1 489 | sakte -1 490 | sent -1 491 | vedvarende -1 492 | ufullstendig -1 493 | temperamentsfull -1 494 | ironisk nok -1 495 | merkelig -1 496 | blindende -1 497 | trassig -1 498 | uklart -1 499 | mørk -1 500 | innfødt -1 501 | uregelmessig -1 502 | urealistisk -1 503 | gratis -2 504 | kjølig -2 505 | heldigvis -2 506 | urimelig -2 507 | repeterende -2 508 | upassende -2 509 | uforklarlig -2 510 | unødvendig -2 511 | brashly -2 512 | dårlig -2 513 | ubarmhjertig -2 514 | ubehagelig -2 515 | lat -2 516 | støyende -2 517 | bare -2 518 | alvorlig -2 519 | voldsomt -2 520 | beryktet -2 521 | grovt -2 522 | likegyldig -2 523 | naken -2 524 | klønete -2 525 | lunken -2 526 | ulogisk -2 527 | mindnumbingly -2 528 | amatøraktig -2 529 | latterlig -2 530 | klumpete -2 531 | uberørt -2 532 | urettmessig -2 533 | umulig -2 534 | feil -2 535 | dessverre -2 536 | angivelig -2 537 | forutsigbart -2 538 | flatt -2 539 | skyldfølende -2 540 | vanvittig -2 541 | innblandet -2 542 | tregt -2 543 | uvillig -2 544 | uavbrutt -2 545 | urettferdig -2 546 | tåpelig -2 547 | dessverre -2 548 | engstelig -2 549 | sappily -2 550 | takknemlig -2 551 | urettferdig -2 552 | sårt -2 553 | icily -2 554 | hardt -2 555 | knapt -2 556 | upassende -2 557 | høyt -2 558 | lystig -2 559 | unnvikende -2 560 | kjedelig -2 561 | apprehensive -2 562 | neppe -2 563 | vagt -2 564 | overbevisende -2 565 | utålmodig -2 566 | unøyaktig -2 567 | dessverre -2 568 | voldsomt -2 569 | overdreven -2 570 | uintelligent -2 571 | feil -2 572 | skissert -2 573 | kjedelig -2 574 | sjalu -2 575 | svakt -2 576 | offensivt -2 577 | vilkårlig -2 578 | ubarmhjertig -2 579 | kjedelig -2 580 | desperat -2 581 | tankeløst -2 582 | beklager -2 583 | altfor -2 584 | mislykket -2 585 | skjelvende -2 586 | lam -2 587 | tre -2 588 | ukontrollerbart -2 589 | strengt -2 590 | desperat -2 591 | tøft -2 592 | forvirrende -2 593 | fantasiløst -2 594 | negativt -2 595 | rotete -2 596 | mistenkelig -2 597 | ulovlig -2 598 | feil -2 599 | overveldende -2 600 | sauete -2 601 | tankeløst -2 602 | ['d | ville] _rather -2 603 | generisk -2 604 | akutt -2 605 | nerdete -2 606 | urolig -2 607 | mutt -2 608 | høyt -2 609 | morsomt -2 610 | deprimerende -2 611 | uforståelig -2 612 | katatonisk -2 613 | endimensjonalt -2 614 | syk -2 615 | generelt -2 616 | ikke tiltalende -2 617 | sta -2 618 | møysommelig -2 619 | ugunstig -2 620 | tilfeldig -2 621 | frekt -2 622 | dystert -2 623 | slurvet -2 624 | sinnsykt -2 625 | latterlig -2 626 | beruset -2 627 | impulsivt -2 628 | vanskelig -2 629 | vakuum -2 630 | grådig -2 631 | naivt -2 632 | syndig -3 633 | uendelig -3 634 | uhyrlig -3 635 | lurt -3 636 | aggressivt -3 637 | kynisk -3 638 | sint -3 639 | ubehagelig -3 640 | terminalt -3 641 | dystert -3 642 | søppel -3 643 | motbydelig -3 644 | frekt -3 645 | skummelt -3 646 | frekt -3 647 | skremmende -3 648 | inhabil -3 649 | for -3 650 | blindt -3 651 | håpløst -3 652 | sinnsløs -3 653 | pretensiøst -3 654 | vilt -3 655 | skuffende -3 656 | absurd -3 657 | tunghendt -3 658 | kjedelig -3 659 | tett -3 660 | truende -3 661 | b-film -3 662 | farlig -3 663 | illevarslende -3 664 | utilgivende -3 665 | grovt -3 666 | rabiat -3 667 | hjemsøkende -3 668 | fryktelig -3 669 | uheldigvis -3 670 | urovekkende -3 671 | kjedelig -3 672 | skamløst -3 673 | krøllete -3 674 | 
dystert -3 675 | blatant -3 676 | egoistisk -3 677 | dårlig -3 678 | bisarrt -3 679 | grafisk -3 680 | tragisk -3 681 | problematisk -3 682 | kronisk -3 683 | død -3 684 | irriterende -3 685 | irriterende -3 686 | analt -3 687 | dødelig -3 688 | meningsløst -3 689 | arrogant -3 690 | skammelig -3 691 | dårlig -3 692 | latterlig -4 693 | uutholdelig -4 694 | unnskyldelig -4 695 | djevelsk -4 696 | ukikkelig -4 697 | avskrivning -4 698 | pervers -4 699 | dumt -4 700 | uakseptabelt -4 701 | kriminelt -4 702 | grusomt -4 703 | latterlig -4 704 | smertefullt -4 705 | notorisk -4 706 | inanely -4 707 | patetisk -4 708 | uanstendig -4 709 | meningsløst -4 710 | ynkelig -4 711 | livløst -4 712 | fornærmende -4 713 | ondsinnet -4 714 | psykotisk -4 715 | opprørende -4 716 | patologisk -4 717 | utnyttende -4 718 | idiotisk -4 719 | fryktelig -5 720 | kvalmende -5 721 | utilgivelig -5 722 | katastrofalt -5 723 | fryktelig -5 724 | sykt -5 725 | veldig -5 726 | brutalt -5 727 | forferdelig -5 728 | forferdelig -5 729 | elendig -5 730 | fryktelig -5 731 | forferdelig -5 732 | ondskapsfullt -5 733 | frastøtende -5 734 | trist -5 735 | fryktelig -5 736 | skjemmende -5 737 | ulykkelig -5 738 | grotesk -5 739 | alvorlig -5 740 | ondskapsfullt -5 741 | motbydelig -5 742 | ulidelig -5 743 | forferdelig -5 744 | fryktelig -5 745 | forferdelig -5 746 | styggt -5 747 | nøye 2 748 | kort -1 749 | ensom -1 750 | streetwise 1 751 | slu -1 752 | vital 1 753 | mind-blowingly 2 754 | melodramatisk -2 755 | ulastelig 5 756 | nasalt -1 757 | dyktig 2 758 | hjerteskjærende -2 759 | uredelig -3 760 | plettfritt 4 761 | tynt -1 762 | lystig 2 763 | vakkert 4 764 | utsmykket 2 765 | pent 2 766 | dynamisk 2 767 | kjedelig -3 768 | utilstrekkelig -1 769 | sportslig 2 770 | usammenhengende -2 771 | rettferdig -2 772 | veldedig 1 773 | flittig 2 774 | del tomt -2 775 | greit 1 776 | skjevt -1 777 | fristende 1 778 | klokt 2 779 | kostbar -1 780 | uoverkommelig -3 781 | tomt -1 782 | naturskjønt 4 783 | seremonielt 1 784 | surrealistisk -1 785 | prisbelønt 5 786 | fascinerende 5 787 | frustrerende -2 788 | moro 3 789 | periodevis -1 790 | vennlig 2 791 | kraftig 2 792 | veltalende 3 793 | freakishly -3 794 | skremmende -1 795 | bare 1 796 | omrørende 2 797 | etisk 1 798 | forsvarlig 1 799 | hensynsløs -2 800 | litt -1 801 | utrolig -2 802 | fiendishly -3 803 | skikkelig 1 804 | sørgelig -4 805 | kjapt 1 806 | rausende 3 807 | gledelig 4 808 | motbydelig -3 809 | nådeløst -3 810 | rettferdig 1 811 | nådig 2 812 | frodig 3 813 | lykksalig 4 814 | historisk 1 815 | kortfattet -1 816 | svakt -1 817 | halvhjertet -1 818 | raskt 1 819 | skremmende -2 820 | banebrytende 3 821 | på villspor -1 822 | bittert -2 823 | besatt -3 824 | hjelpeløst -3 825 | hilsen -2 826 | fortjent 1 827 | rasende -4 828 | ubønnhørlig -2 829 | uelegant -1 830 | rørende 3 831 | rolig 1 832 | spent 3 833 | godartet 1 834 | målløst -1 835 | forvirrende -2 836 | skjemmende -4 837 | raskt -2 838 | moderat 1 839 | grovt -2 840 | fantastisk 5 841 | stolt -1 842 | beroligende 1 843 | svakt -2 844 | majestetisk 4 845 | snikende -4 846 | distraherende -1 847 | skummelt -1 848 | skrytende -1 849 | utmerket 4 850 | uklokt -2 851 | iherdig 3 852 | rasende -2 853 | ufarlig 1 854 | forgjeves -2 855 | lakonisk -1 856 | oppgitt -2 857 | lønnsomt 1 858 | forvirrende -2 859 | bekymringsfullt -3 860 | kvalmende -3 861 | lunefull -2 862 | fanatisk -3 863 | uforsiktig -1 864 | abysmalt -4 865 | bærekraftig 2 866 | foraktelig -3 867 | glumly -2 868 | uberegnelig -1 869 | 
sparsommelig 1 870 | torturøst -4 871 | ublu -4 872 | selvtilfreds -2 873 | feil -1 874 | skadelig -2 875 | smertefritt 1 876 | feil -1 877 | luskent -1 878 | episk 4 879 | -------------------------------------------------------------------------------- /data/sentiment/lexicons/socal/no_int.txt: -------------------------------------------------------------------------------- 1 | minst -3 2 | mindre -1.5 3 | knapt -1.5 4 | neppe -1.5 5 | nesten -1.5 6 | ikke for -1.5 7 | ikke bare 0.5 8 | ikke bare 0.5 9 | ikke bare 0.5 10 | bare -0.5 11 | litt -0.5 12 | litt -0.5 13 | litt -0.5 14 | marginalt -0.5 15 | relativt -0.3 16 | mildt -0.3 17 | moderat -0.3 18 | noe -0.3 19 | delvis -0.3 20 | litt -0.3 21 | uten tvil -0.2 22 | stort sett -0.2 23 | hovedsakelig -0.2 24 | minst -0.9 25 | til en viss grad -0.2 26 | til en viss grad -0.2 27 | slags -0.3 28 | sorta -0.3 29 | slags -0.3 30 | ganske -0.3 31 | ganske -0.2 32 | pen -0.1 33 | heller -0.1 34 | umiddelbart 0.1 35 | ganske 0.1 36 | perfekt 0.1 37 | konsekvent 0.1 38 | virkelig 0.2 39 | klart 0.2 40 | åpenbart 0.2 41 | absolutt 0.2 42 | helt 0.2 43 | definitivt 0.2 44 | absolutt 0.2 45 | konstant 0.2 46 | høyt 0.2 47 | veldig 0.2 48 | betydelig 0.2 49 | merkbart 0.2 50 | karakteristisk 0.2 51 | ofte 0.2 52 | forferdelig 0.2 53 | totalt 0.2 54 | stort sett 0.2 55 | fullt 0.2 56 | ekstra 0.3 57 | virkelig 0.3 58 | spesielt 0.3 59 | spesielt 0.3 60 | jævla 0.3 61 | intensivt 0.3 62 | rett og slett 0.3 63 | helt 0.3 64 | sterkt 0.3 65 | bemerkelsesverdig 0.3 66 | stort sett 0.3 67 | utrolig 0.3 68 | påfallende 0.3 69 | fantastisk 0.3 70 | i det vesentlige 0.3 71 | uvanlig 0.3 72 | dramatisk 0.3 73 | intenst 0.3 74 | ekstremt 0.4 75 | så 0.4 76 | utrolig 0.4 77 | fryktelig 0.4 78 | enormt 0.4 79 | umåtelig 0.4 80 | slik 0.4 81 | utrolig 0.4 82 | sinnsykt 0.4 83 | opprørende 0.4 84 | radikalt 0.4 85 | blærende 0.4 86 | unntaksvis 0.4 87 | overstigende 0.4 88 | uten tvil 0.4 89 | vei 0.4 90 | langt 0.4 91 | dypt 0.4 92 | super 0.4 93 | dypt 0.4 94 | universelt 0.4 95 | rikelig 0.4 96 | uendelig 0.4 97 | eksponentielt 0.4 98 | enormt 0.4 99 | grundig 0.4 100 | lidenskapelig 0.4 101 | voldsomt 0.4 102 | latterlig 0.4 103 | uanstendig 0.4 104 | vilt 0.4 105 | ekstraordinært 0.5 106 | spektakulært 0.5 107 | fenomenalt 0.5 108 | monumentalt 0.5 109 | utrolig 0.5 110 | helt 0.5 111 | mer -0.5 112 | enda mer 0.5 113 | mer enn 0.5 114 | mest 1 115 | ytterste 1 116 | totalt 0.5 117 | monumental 0.5 118 | flott 0.5 119 | enorm 0.5 120 | enorme 0.5 121 | massiv 0.5 122 | fullført 0.4 123 | uendelig 0.4 124 | uendelig 0.4 125 | absolutt 0.5 126 | rungende 0.4 127 | uskadd 0.4 128 | drop dead 0.4 129 | massiv 0.5 130 | kollossal 0.5 131 | utrolig 0.5 132 | ufattelig 0.5 133 | abject 0.5 134 | en slik 0.4 135 | en slik 0.4 136 | fullstendig 0.4 137 | dobbelt 0.3 138 | klar 0.3 139 | klarere 0.2 140 | klareste 0.5 141 | stor 0.3 142 | større 0.2 143 | største 0.5 144 | åpenbart 0.03 145 | alvorlig 0.3 146 | dyp 0.3 147 | dypere 0.2 148 | dypeste 0.5 149 | betydelig 0.2 150 | viktig 0.3 151 | større 0.2 152 | avgjørende 0.3 153 | umiddelbar 0.1 154 | synlig 0.1 155 | merkbar 0.1 156 | konsistent 0.1 157 | høy 0.2 158 | høyere 0.1 159 | høyeste 0.5 160 | ekte 0.2 161 | sant 0.2 162 | ren 0.2 163 | bestemt 0.2 164 | mye 0.2 165 | liten -0.3 166 | mindre -0.2 167 | minste -0.5 168 | moll -0.3 169 | moderat -0.3 170 | mild -0.3 171 | lett -0.5 172 | minste -0.9 173 | ubetydelig -0.5 174 | ubetydelig -0.5 175 | lav -2 176 | lavere -1.5 177 | laveste -3 178 | få -2 179 | 
færre -1.5 180 | færrest -3 181 | mye 0.3 182 | mange 0.3 183 | flere 0.2 184 | flere 0.2 185 | forskjellige 0.2 186 | noen få -0.3 187 | et par -0.3 188 | et par -0.3 189 | mye 0.3 190 | masse 0.3 191 | i det hele tatt -0.5 192 | mye 0.5 193 | en hel masse 0.5 194 | en enorm mengde på 0.5 195 | enorme antall på 0.5 196 | en pokker på 0.5 197 | en mengde på 0.5 198 | en mutltid på 0.5 199 | tonn 0.5 200 | tonn 0.5 201 | en haug med 0.3 202 | hauger på 0.3 203 | rikelig med 0.3 204 | en viss mengde -0.2 205 | noen -0.2 206 | litt av -0.5 207 | litt av -0.5 208 | litt av -0.5 209 | vanskelig å -1.5 210 | vanskelig til -1.5 211 | tøff til -1.5 212 | ikke i nærheten av -3 213 | ikke alt det -1.2 214 | ikke det -1.5 215 | ut av -2 216 | -------------------------------------------------------------------------------- /data/sentiment/lexicons/socal/no_verb.txt: -------------------------------------------------------------------------------- 1 | kulminerer 4 2 | opphøyelse 4 3 | glede 4 4 | ære 4 5 | stein 4 6 | elsker 4 7 | enthrall 4 8 | ærefrykt 4 9 | fascinere 4 10 | enthrall 4 11 | enthrall 4 12 | elat 4 13 | extol 3 14 | helliggjøre 3 15 | transcend 3 16 | oppnå 3 17 | beundre 3 18 | forbløffe 3 19 | verne om 3 20 | ros 3 21 | glede 3 22 | vie 3 23 | fortrylle 3 24 | elske 3 25 | energiser 3 26 | nyt 3 27 | underholde 3 28 | utmerke seg 3 29 | imponere 3 30 | innovere 3 31 | ivrig 3 32 | kjærlighet 3 33 | tryllebinde 3 34 | ros 3 35 | premie 3 36 | rave 3 37 | glede 3 38 | klang 3 39 | respekt 3 40 | gjenopprette 3 41 | revitalisere 3 42 | smak 3 43 | lykkes 3 44 | overvinne 3 45 | overgå 3 46 | trives 3 47 | triumf 3 48 | vidunder 3 49 | løft 3 50 | capitivere 3 51 | wow 3 52 | spenning 3 53 | vant 3 54 | bekrefte 3 55 | glad 3 56 | forskjønne 3 57 | skatt 3 58 | stavebind 3 59 | trollbundet 3 60 | spennende 3 61 | blende 3 62 | gush 3 63 | hjelp 2 64 | more 2 65 | applaudere 2 66 | setter pris på 2 67 | tiltrekke 2 68 | gi 2 69 | skryte av 2 70 | boost 2 71 | stell 2 72 | kjærtegn 2 73 | feire 2 74 | sjarm 2 75 | koordinere 2 76 | samarbeide 2 77 | minnes 2 78 | kompliment 2 79 | gratulerer 2 80 | erobre 2 81 | bidra 2 82 | samarbeide 2 83 | opprett 2 84 | kreditt 2 85 | dyrke 2 86 | dedikere 2 87 | fortjener 2 88 | omfavne 2 89 | oppmuntre 2 90 | godkjenne 2 91 | engasjere 2 92 | forbedre 2 93 | berike 2 94 | fremkalle 2 95 | legge til rette for 2 96 | favorisere 2 97 | passform 2 98 | oppfylle 2 99 | få 2 100 | glad 2 101 | harmoniser 2 102 | helbrede 2 103 | høydepunkt 2 104 | ære 2 105 | lys 2 106 | senk 2 107 | inspirere 2 108 | interesse 2 109 | intriger 2 110 | le 2 111 | maske 2 112 | motivere 2 113 | pleie 2 114 | overvinne 2 115 | overvant 2 116 | vær så snill 2 117 | fremgang 2 118 | blomstre 2 119 | rens 2 120 | utstråle 2 121 | rally 2 122 | høste 2 123 | forene 2 124 | innløsning 2 125 | avgrense 2 126 | kongelig 2 127 | fornyelse 2 128 | reparasjon 2 129 | løse 2 130 | gjenforene 2 131 | svale 2 132 | belønning 2 133 | rival 2 134 | gnisten 2 135 | underbygge 2 136 | søte 2 137 | svimle 2 138 | sympatisere 2 139 | tillit 2 140 | løft 2 141 | ærverdig 2 142 | vinn 2 143 | verdt 2 144 | aktelse 2 145 | styrke 2 146 | frigjør 2 147 | anbefaler 2 148 | master 2 149 | forbedre 2 150 | overgå 2 151 | skinne 2 152 | pioner 2 153 | fortjeneste 2 154 | styrke 2 155 | extol 2 156 | extoll 2 157 | takk 2 158 | oppdater 2 159 | fortjeneste 2 160 | livne opp 2 161 | frigjør 2 162 | godkjenne 2 163 | forbedre 2 164 | frita 1 165 | godta 1 166 | bekrefte 1 167 | lindre 1 168 | forbedre 1 
169 | forutse 1 170 | blidgjøre 1 171 | håpe 1 172 | assistere 1 173 | passer 1 174 | bli venn 1 175 | fange 1 176 | rens 1 177 | komfort 1 178 | kommune 1 179 | kommunisere 1 180 | kompensere 1 181 | kompromiss 1 182 | kondone 1 183 | overbevise 1 184 | råd 1 185 | korstog 1 186 | verdig 1 187 | doner 1 188 | spare 1 189 | forseggjort 1 190 | pynt ut 1 191 | styrke 1 192 | aktivere 1 193 | gi 1 194 | opplyse 1 195 | overlate 1 196 | tenke 1 197 | etablere 1 198 | utvikle seg 1 199 | opphisse 1 200 | opplevelse 1 201 | bli kjent 1 202 | flatere 1 203 | tilgi 1 204 | befeste 1 205 | foster 1 206 | boltre seg 1 207 | pynt 1 208 | generere 1 209 | glans 1 210 | glitter 1 211 | glød 1 212 | tilfredsstille 1 213 | guide 1 214 | sele 1 215 | informer 1 216 | arve 1 217 | spøk 1 218 | siste 1 219 | som 1 220 | formidle 1 221 | nominere 1 222 | gi næring 1 223 | adlyde 1 224 | tilbud 1 225 | overliste 1 226 | holde ut 1 227 | seire 1 228 | utsette 1 229 | beskytt 1 230 | purr 1 231 | reaktiver 1 232 | berolige 1 233 | gjenvinne 1 234 | tilbakelent 1 235 | gjenopprette 1 236 | slapp av 1 237 | avlaste 1 238 | oppussing 1 239 | renovere 1 240 | omvende deg 1 241 | hvile 1 242 | redning 1 243 | gjenopplive 1 244 | modnes 1 245 | hilsen 1 246 | tilfredsstille 1 247 | sikker 1 248 | del 1 249 | betyr 1 250 | forenkle 1 251 | smil 1 252 | krydder 1 253 | stabiliser 1 254 | standardisere 1 255 | stimulere 1 256 | stiver 1 257 | avta 1 258 | tilstrekkelig 1 259 | dress 1 260 | støtte 1 261 | tåle 1 262 | hyllest 1 263 | oppgradere 1 264 | overliste 1 265 | promotere 1 266 | empati 1 267 | rette 1 268 | overladning 1 269 | plass til 1 270 | multitask 1 271 | oppnå 1 272 | utdannet 1 273 | strømlinjeforme 1 274 | effektivitet 1 275 | blomstre 1 276 | tjen 1 277 | innkvartering 1 278 | berolige 1 279 | oppbygg 1 280 | bli venn 1 281 | mykgjøre 1 282 | felicitate 1 283 | frikoble 1 284 | overstige 1 285 | avmystifisere 1 286 | verdi 1 287 | titillate 1 288 | reienforce 1 289 | hjelp 1 290 | garanti 1 291 | komplement 1 292 | kapitaliser 1 293 | pris 1 294 | oppnå 1 295 | argumentere -1 296 | kamp -1 297 | uskarphet -1 298 | svak -1 299 | brudd -1 300 | blåmerke -1 301 | feil -1 302 | avbryt -1 303 | utfordring -1 304 | chide -1 305 | tette -1 306 | kollidere -1 307 | kamp -1 308 | tvinge -1 309 | komplisere -1 310 | concoct -1 311 | samsvar -1 312 | konfrontere -1 313 | krever -1 314 | kvake -1 315 | dawdle -1 316 | reduksjon -1 317 | forsinkelse -1 318 | død -1 319 | avskrive -1 320 | avvik -1 321 | diktere -1 322 | motet -1 323 | avskjed -1 324 | dispensere -1 325 | misfornøyde -1 326 | kast -1 327 | tvist -1 328 | distrahere -1 329 | grøft -1 330 | skilsmisse -1 331 | dominere -1 332 | nedskift -1 333 | svindle -1 334 | fare -1 335 | håndheve -1 336 | oppsluk -1 337 | vikle -1 338 | misunnelse -1 339 | slett -1 340 | feil -1 341 | unngå -1 342 | overdrive -1 343 | ekskluder -1 344 | utføre -1 345 | eksponere -1 346 | slukk -1 347 | feign -1 348 | fidget -1 349 | flykte -1 350 | forby -1 351 | bekymre -1 352 | skremme -1 353 | rynke pannen -1 354 | fumle -1 355 | gamble -1 356 | forherlige -1 357 | grip -1 358 | grip -1 359 | stønn -1 360 | knurring -1 361 | brummen -1 362 | hamstring -1 363 | vondt -1 364 | ignorere -1 365 | implikere -1 366 | bønnfall -1 367 | fengsel -1 368 | indusere -1 369 | betennelse -1 370 | forstyrre -1 371 | avbryt -1 372 | rus -1 373 | trenge inn -1 374 | oversvømmet -1 375 | klagesang -1 376 | lekkasje -1 377 | avvikle -1 378 | blander -1 379 | oppfører seg feil -1 380 | 
feilkast -1 381 | villede -1 382 | villedet -1 383 | feilinformasjon -1 384 | Mishandle -1 385 | feil -1 386 | mistrust -1 387 | misforstå -1 388 | misbruk -1 389 | stønn -1 390 | mønstre -1 391 | mutter -1 392 | nøytralisere -1 393 | oppheve -1 394 | utelat -1 395 | utgang -1 396 | overoppnå -1 397 | overløp -1 398 | overse -1 399 | overmakt -1 400 | overkjørt -1 401 | overreagerer -1 402 | overforenkle -1 403 | overvelde -1 404 | skjemme bort -1 405 | omkomme -1 406 | forfølge -1 407 | plod -1 408 | forby -1 409 | lirke -1 410 | avslutt -1 411 | rasjonalisere -1 412 | tilbakevise -1 413 | trekke seg tilbake -1 414 | avstå -1 415 | rehash -1 416 | gjengjelde -1 417 | retrett -1 418 | kvitt -1 419 | rip -1 420 | risiko -1 421 | romantiser -1 422 | sag -1 423 | skåld -1 424 | skremme -1 425 | svi -1 426 | scowl -1 427 | skrape -1 428 | granske -1 429 | sjokk -1 430 | skråstrek -1 431 | slug -1 432 | smugle -1 433 | snappe -1 434 | snike -1 435 | sob -1 436 | forstuing -1 437 | stammer -1 438 | stikk -1 439 | stjal -1 440 | bortkommen -1 441 | fast -1 442 | stunt -1 443 | undertrykke -1 444 | snuble -1 445 | sverget -1 446 | rive -1 447 | erte -1 448 | dekk -1 449 | revet -1 450 | overtredelse -1 451 | felle -1 452 | overtredelse -1 453 | triks -1 454 | trudge -1 455 | angre -1 456 | underbruk -1 457 | angre -1 458 | unravel -1 459 | røtter -1 460 | avta -1 461 | varp -1 462 | sutre -1 463 | pisk -1 464 | wince -1 465 | sår -1 466 | gjesp -1 467 | kjef -1 468 | lengter -1 469 | idolize -1 470 | hemme -1 471 | pålegge -1 472 | bekymring -1 473 | emne -1 474 | tåle -1 475 | fluster -1 476 | snivel -1 477 | insinuere -1 478 | coddle -1 479 | oppscenen -1 480 | underutnytte -1 481 | squirm -1 482 | mikromanage -1 483 | hund -1 484 | hollywoodise -1 485 | sidespor -1 486 | karikatur -1 487 | uenighet -1 488 | standard -1 489 | dø -1 490 | problemer -1 491 | mistillit -1 492 | skyld -1 493 | lekter -1 494 | overoppblås -1 495 | tømme -1 496 | vondt -1 497 | krampe -1 498 | jostle -1 499 | rasle -1 500 | uklar -1 501 | rust -1 502 | feil -1 503 | lur -1 504 | knuse -1 505 | placate -1 506 | overoppheting -1 507 | døve -1 508 | prute -1 509 | cuss -1 510 | uenighet -1 511 | uoverensstemmelse -1 512 | slapp -1 513 | misfarging -1 514 | avslutte -1 515 | tretthet -1 516 | motbevise -1 517 | syltetøy -1 518 | bolt -1 519 | offer -1 520 | sverte -1 521 | belch -1 522 | feiltolke -1 523 | forlenge -1 524 | typecast -1 525 | klynge -1 526 | gjennomsyre -1 527 | koble fra -1 528 | susing -1 529 | hobble -1 530 | drivhjul -1 531 | liten -1 532 | overreach -1 533 | deform -1 534 | rangel -1 535 | prevaricate -1 536 | forhåndsdømme -1 537 | raske -1 538 | peeve -1 539 | misforstå -1 540 | misforstått -1 541 | feil fremstilling -1 542 | jabber -1 543 | irk -1 544 | impinge -1 545 | hoodwink -1 546 | gawk -1 547 | frazzle -1 548 | dupe -1 549 | desorienterende -1 550 | lure -1 551 | skremmende -1 552 | karpe -1 553 | tukt -1 554 | blab -1 555 | blabber -1 556 | beleirer -1 557 | belabor -1 558 | bjørn -1 559 | avskaffe -2 560 | anklage -2 561 | agitere -2 562 | hevder -2 563 | bakhold -2 564 | amputere -2 565 | sinne -2 566 | irritere -2 567 | motvirke -2 568 | angrep -2 569 | avverge -2 570 | babble -2 571 | grevling -2 572 | balk -2 573 | forvis -2 574 | slo -2 575 | tro -2 576 | pass opp -2 577 | bite -2 578 | blære -2 579 | blokk -2 580 | tabbe -2 581 | bry -2 582 | skryte -2 583 | bestikkelse -2 584 | bust -2 585 | feil -2 586 | gnage -2 587 | billigere -2 588 | kvele -2 589 | sammenstøt -2 590 | tvinge -2 
591 | kollaps -2 592 | commiserate -2 593 | skjul -2 594 | begrense -2 595 | konflikt -2 596 | forvirre -2 597 | konspirere -2 598 | begrense -2 599 | motsier -2 600 | contrive -2 601 | begjære -2 602 | krympe -2 603 | lamme -2 604 | kritisere -2 605 | knuse -2 606 | begrense -2 607 | skade -2 608 | forfall -2 609 | lure -2 610 | nederlag -2 611 | tømme -2 612 | trykk -2 613 | frata -2 614 | latterliggjøre -2 615 | forringe -2 616 | skuffe -2 617 | ikke godkjenner -2 618 | diskreditere -2 619 | diskriminere -2 620 | motløs -2 621 | misliker -2 622 | forstyrre -2 623 | misfornøyd -2 624 | forvreng -2 625 | nød -2 626 | forstyrr -2 627 | undergang -2 628 | avløp -2 629 | drukner -2 630 | dump -2 631 | eliminere -2 632 | flau -2 633 | emote -2 634 | inngrep -2 635 | erodere -2 636 | kaste ut -2 637 | eksos -2 638 | utvise -2 639 | fabrikere -2 640 | falsk -2 641 | vakle -2 642 | flaunt -2 643 | flyndre -2 644 | kraft -2 645 | taper -2 646 | forsak -2 647 | sørge -2 648 | hemme -2 649 | sikring -2 650 | hindre -2 651 | sult -2 652 | kjas -2 653 | svekke -2 654 | hindre -2 655 | pådra -2 656 | inept -2 657 | infisere -2 658 | angrep -2 659 | påføre -2 660 | skade -2 661 | invadere -2 662 | irritere -2 663 | fare -2 664 | mangel -2 665 | lyve -2 666 | taper -2 667 | tapte -2 668 | manipulere -2 669 | rot -2 670 | spotte -2 671 | drap -2 672 | nag -2 673 | negere -2 674 | forsømmelse -2 675 | besatt -2 676 | hindre -2 677 | fornærme -2 678 | motsette -2 679 | overaktiv -2 680 | overskygge -2 681 | lamme -2 682 | nedlatende -2 683 | perplex -2 684 | overhode -2 685 | forstyrrelse -2 686 | plyndre -2 687 | pontifikat -2 688 | pout -2 689 | preen -2 690 | late som -2 691 | tiltale -2 692 | provosere -2 693 | straffe -2 694 | avvis -2 695 | rekyl -2 696 | nekte -2 697 | regress -2 698 | tilbakefall -2 699 | si fra deg -2 700 | undertrykk -2 701 | bebreid -2 702 | mislik -2 703 | begrense -2 704 | forsinke -2 705 | hevn -2 706 | gå tilbake -2 707 | tilbakekalle -2 708 | opprør -2 709 | brudd -2 710 | sap -2 711 | skjelle -2 712 | skru -2 713 | gripe -2 714 | skill -2 715 | knuse -2 716 | skjul -2 717 | makulere -2 718 | unngå -2 719 | skulk -2 720 | baktalelse -2 721 | spor -2 722 | smøre -2 723 | hån -2 724 | snorke -2 725 | spank -2 726 | gyte -2 727 | sputter -2 728 | sløse -2 729 | stilk -2 730 | skremme -2 731 | stjele -2 732 | kveler -2 733 | stagnere -2 734 | kvele -2 735 | stamme -2 736 | strekke -2 737 | sliter -2 738 | bukke under -2 739 | lider -2 740 | kvele -2 741 | tukle -2 742 | hån -2 743 | true -2 744 | thrash -2 745 | slit -2 746 | tråkke -2 747 | bagatellisere -2 748 | undergrave -2 749 | underwhelm -2 750 | vex -2 751 | bryte -2 752 | skeptisk -2 753 | avfall -2 754 | svekk -2 755 | vilje -2 756 | vri deg -2 757 | myrde -2 758 | blind -2 759 | uenig -2 760 | utstøte -2 761 | vandre -2 762 | klage -2 763 | disenchant -2 764 | revulse -2 765 | duehull -2 766 | flabbergast -2 767 | harry -2 768 | piss -2 769 | feil -2 770 | ødelegge -2 771 | skadedyr -2 772 | skjevhet -2 773 | panorere -2 774 | dra -2 775 | _ned -2 776 | mar -2 777 | klage -2 778 | skade -2 779 | forverre -2 780 | vandalisere -2 781 | avslutt -2 782 | funksjonsfeil -2 783 | tosk -2 784 | slave -2 785 | taint -2 786 | ødelagt -2 787 | flekk -2 788 | rykket ned -2 789 | sprit -2 790 | utukt -2 791 | rane -2 792 | trist -2 793 | diss -2 794 | medskyldig -2 795 | ondskap -2 796 | manglende evne -2 797 | sverte -2 798 | forurense -2 799 | smerte -2 800 | feilberegne -2 801 | mope -2 802 | plage -2 803 | accost -2 804 | 
unnerve -2 805 | skam -2 806 | irettesett -2 807 | overdrive -2 808 | feilbehandling -2 809 | myr -2 810 | ondartet -2 811 | trussel -2 812 | jeer -2 813 | ugyldiggjøre -2 814 | innflytelse -2 815 | heckle -2 816 | hamstrung -2 817 | gripe -2 818 | ryper -2 819 | flout -2 820 | enervate -2 821 | emasculate -2 822 | manglende respekt -2 823 | vanære -2 824 | nedsett -2 825 | debase -2 826 | kolliderer -2 827 | bungle -2 828 | besmirch -2 829 | aunguish -2 830 | fornærme -2 831 | forverre -2 832 | forfalle -2 833 | bash -2 834 | bar -3 835 | misbruk -3 836 | forverre -3 837 | alarm -3 838 | fremmedgjøre -3 839 | atrofi -3 840 | baffel -3 841 | tro -3 842 | nedgjøre -3 843 | forvirret -3 844 | eksplosjon -3 845 | bombardere -3 846 | brutalisere -3 847 | kantrer -3 848 | careen -3 849 | jukse -3 850 | fordømme -3 851 | forvirre -3 852 | korroderer -3 853 | korrupt -3 854 | stapp -3 855 | forbannelse -3 856 | utartet -3 857 | fornedre -3 858 | fordømme -3 859 | beklager -3 860 | forverres -3 861 | fortvilelse -3 862 | forkaste -3 863 | avsky -3 864 | rasende -3 865 | slaveri -3 866 | utrydde -3 867 | irritere -3 868 | utnytte -3 869 | utrydde -3 870 | mislykkes -3 871 | frustrer -3 872 | gløtt -3 873 | vevstol -3 874 | mangel -3 875 | molest -3 876 | utslette -3 877 | undertrykke -3 878 | pervertere -3 879 | plagerize -3 880 | pest -3 881 | herjing -3 882 | irettesette -3 883 | angre -3 884 | vekke opp igjen -3 885 | avvis -3 886 | avvise -3 887 | latterliggjøring -3 888 | rue -3 889 | sabotasje -3 890 | spott -3 891 | skrik -3 892 | koke -3 893 | skrumpe -3 894 | smelle -3 895 | kvele -3 896 | spyd -3 897 | sulte -3 898 | stink -3 899 | underkaste -3 900 | undergrave -3 901 | hindre -3 902 | pine -3 903 | opprørt -3 904 | usurp -3 905 | jamre -3 906 | forverres -3 907 | overfall -3 908 | halshugge -3 909 | ærekrenke -3 910 | nedbryter -3 911 | rive -3 912 | demoralisere -3 913 | fornærmelse -3 914 | råte -3 915 | suger -3 916 | bastardize -3 917 | kvalme -3 918 | plyndring -3 919 | ydmyke -3 920 | hvem_ PRP _ reek -3 921 | not_ dritt -3 922 | desillusjon -3 923 | forårsake -3 924 | stank -3 925 | stinket -3 926 | voldtekt -3 927 | kvinnelig -3 928 | ødelegge -3 929 | håpløshet -3 930 | røkelse -3 931 | fattige -3 932 | trakassere -3 933 | forurense -3 934 | ødelegge -3 935 | traumatisere -3 936 | skandalisere -3 937 | repugn -3 938 | raseri -3 939 | plagiere -3 940 | lambaste -3 941 | imperil -3 942 | glødere -3 943 | excoriate -3 944 | rådgiver -3 945 | nedsettelse -3 946 | despoil -3 947 | vanhellige -3 948 | demonisere -3 949 | bespottelse -3 950 | hjernevask -3 951 | browbeat -3 952 | appal -3 953 | forferdelig -3 954 | befoul -3 955 | plage -4 956 | utslette -4 957 | forråde -4 958 | jævla -4 959 | avskyr -4 960 | gruer -4 961 | hater -4 962 | forferdelig -4 963 | rasende -4 964 | mortify -4 965 | frastøtte -4 966 | ruin -4 967 | slakter -4 968 | ødelegge -4 969 | slakt -4 970 | forferdelig -4 971 | terrorisere -4 972 | oppkast -4 973 | kunne_ panikk -4 974 | opprør -4 975 | tortur -4 976 | spott -4 977 | avsky -4 978 | utføre -4 979 | vanære -4 980 | avsky -5 981 | forferdelig -5 982 | avsky -5 983 | kannibalisere -5 984 | uren -5 985 | forakte -5 986 | -------------------------------------------------------------------------------- /data/sentiment/norec_sentence/labels.json: -------------------------------------------------------------------------------- 1 | {"Negative": "0", "Neutral": "1", "Positive": "2"} 
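The SO-CAL-style lexicon files above all share the same layout: one entry per line, consisting of a Norwegian word or phrase followed by a numeric score (a sentiment value in the word lists and, presumably, an intensifier weight in `no_int.txt`), while `labels.json` maps the NoReC sentence-level class names to label indices. As a purely illustrative sketch (not code from the repository; the helper name is made up), such files could be loaded along these lines:

```python
import json

def load_socal_lexicon(path):
    """Illustrative sketch: read a '<word or phrase> <score>' lexicon file into a dict."""
    lexicon = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                continue
            *entry, score = parts          # the last field is the numeric score
            lexicon[" ".join(entry)] = float(score)
    return lexicon

adverbs = load_socal_lexicon("data/sentiment/lexicons/socal/no_adv.txt")

with open("data/sentiment/norec_sentence/labels.json", encoding="utf-8") as f:
    label2id = json.load(f)   # {"Negative": "0", "Neutral": "1", "Positive": "2"}
```

The sentiment examples under `examples/sentiment/` (e.g. `sentiment_lexicons.py`) presumably contain the project's own handling of these files.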
-------------------------------------------------------------------------------- /data/skweak_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NorskRegnesentral/skweak/2b6db15e8429dbda062b2cc9cc74e69f51a0a8b6/data/skweak_logo.jpg -------------------------------------------------------------------------------- /data/skweak_logo_thumbnail.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NorskRegnesentral/skweak/2b6db15e8429dbda062b2cc9cc74e69f51a0a8b6/data/skweak_logo_thumbnail.jpg -------------------------------------------------------------------------------- /data/skweak_procedure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NorskRegnesentral/skweak/2b6db15e8429dbda062b2cc9cc74e69f51a0a8b6/data/skweak_procedure.png -------------------------------------------------------------------------------- /data/wikidata_small_tokenised.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NorskRegnesentral/skweak/2b6db15e8429dbda062b2cc9cc74e69f51a0a8b6/data/wikidata_small_tokenised.json.gz -------------------------------------------------------------------------------- /examples/ner/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data_utils, conll2003_ner, eval_utils, muc6_ner, conll2003_prep -------------------------------------------------------------------------------- /examples/ner/conll2003_ner.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Tuple 2 | import re, json, os 3 | import snips_nlu_parsers 4 | from skweak.base import CombinedAnnotator, SpanAnnotator 5 | from skweak.spacy import ModelAnnotator, TruecaseAnnotator 6 | from skweak.heuristics import FunctionAnnotator, TokenConstraintAnnotator, SpanConstraintAnnotator, SpanEditorAnnotator 7 | from skweak.gazetteers import GazetteerAnnotator, extract_json_data 8 | from skweak.doclevel import DocumentHistoryAnnotator, DocumentMajorityAnnotator 9 | from skweak.aggregation import MajorityVoter 10 | from skweak import utils 11 | from spacy.tokens import Doc, Span # type: ignore 12 | from . import data_utils 13 | 14 | # Data files for gazetteers 15 | WIKIDATA = os.path.dirname(__file__) + "/../../data/wikidata_tokenised.json" 16 | WIKIDATA_SMALL = os.path.dirname(__file__) + "/../../data/wikidata_small_tokenised.json" 17 | COMPANY_NAMES = os.path.dirname(__file__) + "/../../data/company_names_tokenised.json" 18 | GEONAMES = os.path.dirname(__file__) + "/../../data/geonames.json" 19 | CRUNCHBASE = os.path.dirname(__file__) + "/../../data/crunchbase.json" 20 | PRODUCTS = os.path.dirname(__file__) + "/../../data/products.json" 21 | FIRST_NAMES = os.path.dirname(__file__) + "/../../data/first_names.json" 22 | FORM_FREQUENCIES = os.path.dirname(__file__) + "/../../data/form_frequencies.json" 23 | 24 | 25 | ############################################ 26 | # Combination of all annotators 27 | ############################################ 28 | 29 | 30 | class NERAnnotator(CombinedAnnotator): 31 | """Annotator of entities in documents, combining several sub-annotators (such as gazetteers, 32 | spacy models etc.). To add all annotators currently implemented, call add_all(). 
""" 33 | 34 | def add_all(self): 35 | """Adds all implemented annotation functions, models and filters""" 36 | 37 | print("Loading shallow functions") 38 | self.add_shallow() 39 | print("Loading Spacy NER models") 40 | self.add_models() 41 | print("Loading gazetteer supervision modules") 42 | self.add_gazetteers() 43 | print("Loading document-level supervision sources") 44 | self.add_doc_level() 45 | 46 | return self 47 | 48 | def add_shallow(self): 49 | """Adds shallow annotation functions""" 50 | 51 | # Detection of dates, time, money, and numbers 52 | self.add_annotator(FunctionAnnotator("date_detector", date_generator)) 53 | self.add_annotator(FunctionAnnotator("time_detector", time_generator)) 54 | self.add_annotator(FunctionAnnotator("money_detector", money_generator)) 55 | 56 | # Detection based on casing 57 | proper_detector = TokenConstraintAnnotator("proper_detector", utils.is_likely_proper, "ENT") 58 | 59 | # Detection based on casing, but allowing some lowercased tokens 60 | proper2_detector = TokenConstraintAnnotator("proper2_detector", utils.is_likely_proper, "ENT") 61 | proper2_detector.add_gap_tokens(data_utils.LOWERCASED_TOKENS | data_utils.NAME_PREFIXES) 62 | 63 | # Detection based on part-of-speech tags 64 | nnp_detector = TokenConstraintAnnotator("nnp_detector", lambda tok: tok.tag_ in {"NNP", "NNPS"}, "ENT") 65 | 66 | # Detection based on dependency relations (compound phrases) 67 | compound = lambda tok: utils.is_likely_proper(tok) and utils.in_compound(tok) 68 | compound_detector = TokenConstraintAnnotator("compound_detector", compound, "ENT") 69 | 70 | exclusives = ["date_detector", "time_detector", "money_detector"] 71 | for annotator in [proper_detector, proper2_detector, nnp_detector, compound_detector]: 72 | annotator.add_incompatible_sources(exclusives) 73 | annotator.add_gap_tokens(["'s", "-"]) 74 | self.add_annotator(annotator) 75 | 76 | # We add one variants for each NE detector, looking at infrequent tokens 77 | infrequent_name = "infrequent_%s" % annotator.name 78 | self.add_annotator(SpanConstraintAnnotator(infrequent_name, annotator.name, utils.is_infrequent)) 79 | 80 | # Other types (legal references etc.) 
81 | misc_detector = FunctionAnnotator("misc_detector", misc_generator) 82 | legal_detector = FunctionAnnotator("legal_detector", legal_generator) 83 | 84 | # Detection of companies with a legal type 85 | ends_with_legal_suffix = lambda x: x[-1].lower_.rstrip(".") in data_utils.LEGAL_SUFFIXES 86 | company_type_detector = SpanConstraintAnnotator("company_type_detector", "proper2_detector", 87 | ends_with_legal_suffix, "COMPANY") 88 | 89 | # Detection of full person names 90 | full_name_detector = SpanConstraintAnnotator("full_name_detector", "proper2_detector", 91 | FullNameDetector(), "PERSON") 92 | 93 | for annotator in [misc_detector, legal_detector, company_type_detector, full_name_detector]: 94 | annotator.add_incompatible_sources(exclusives) 95 | self.add_annotator(annotator) 96 | 97 | # General number detector 98 | number_detector = FunctionAnnotator("number_detector", number_generator) 99 | number_detector.add_incompatible_sources(exclusives + ["legal_detector", "company_type_detector"]) 100 | self.add_annotator(number_detector) 101 | 102 | self.add_annotator(SnipsAnnotator("snips")) 103 | return self 104 | 105 | def add_models(self): 106 | """Adds Spacy NER models to the annotator""" 107 | 108 | self.add_annotator(ModelAnnotator("core_web_md", "en_core_web_md")) 109 | self.add_annotator(TruecaseAnnotator("core_web_md_truecase", "en_core_web_md", FORM_FREQUENCIES)) 110 | self.add_annotator(ModelAnnotator("BTC", os.path.dirname(__file__) + "/../../data/btc")) 111 | self.add_annotator( TruecaseAnnotator("BTC_truecase", os.path.dirname(__file__) + "/../../data/btc", FORM_FREQUENCIES)) 112 | 113 | # Avoid spans that start with an article 114 | editor = lambda span: span[1:] if span[0].lemma_ in {"the", "a", "an"} else span 115 | self.add_annotator(SpanEditorAnnotator("edited_BTC", "BTC", editor)) 116 | self.add_annotator(SpanEditorAnnotator("edited_BTC_truecase", "BTC_truecase", editor)) 117 | self.add_annotator(SpanEditorAnnotator("edited_core_web_md", "core_web_md", editor)) 118 | self.add_annotator(SpanEditorAnnotator("edited_core_web_md_truecase", "core_web_md_truecase", editor)) 119 | 120 | return self 121 | 122 | def add_gazetteers(self, full_load=True): 123 | """Adds gazetteer supervision models (company names and wikidata).""" 124 | 125 | # Annotation of company names based on a large list of companies 126 | # company_tries = extract_json_data(COMPANY_NAMES) if full_load else {} 127 | 128 | # Annotation of company, person and location names based on wikidata 129 | wiki_tries = extract_json_data(WIKIDATA) if full_load else {} 130 | 131 | # Annotation of company, person and location names based on wikidata (only entries with descriptions) 132 | wiki_small_tries = extract_json_data(WIKIDATA_SMALL) 133 | 134 | # Annotation of location names based on geonames 135 | geo_tries = extract_json_data(GEONAMES) 136 | 137 | # Annotation of organisation and person names based on crunchbase open data 138 | crunchbase_tries = extract_json_data(CRUNCHBASE) 139 | 140 | # Annotation of product names 141 | products_tries = extract_json_data(PRODUCTS) 142 | 143 | exclusives = ["date_detector", "time_detector", "money_detector", "number_detector"] 144 | for name, tries in {"wiki":wiki_tries, "wiki_small":wiki_small_tries, 145 | "geo":geo_tries, "crunchbase":crunchbase_tries, "products":products_tries}.items(): 146 | 147 | # For each KB, we create two gazetters (case-sensitive or not) 148 | cased_gazetteer = GazetteerAnnotator("%s_cased"%name, tries, case_sensitive=True) 149 | uncased_gazetteer = 
GazetteerAnnotator("%s_uncased"%name, tries, case_sensitive=False) 150 | cased_gazetteer.add_incompatible_sources(exclusives) 151 | uncased_gazetteer.add_incompatible_sources(exclusives) 152 | self.add_annotators(cased_gazetteer, uncased_gazetteer) 153 | 154 | # We also add new sources for multitoken entities (which have higher confidence) 155 | multitoken_cased = SpanConstraintAnnotator("multitoken_%s"%(cased_gazetteer.name), 156 | cased_gazetteer.name, lambda s: len(s) > 1) 157 | multitoken_uncased = SpanConstraintAnnotator("multitoken_%s"%(uncased_gazetteer.name), 158 | uncased_gazetteer.name, lambda s: len(s) > 1) 159 | self.add_annotators(multitoken_cased, multitoken_uncased) 160 | 161 | return self 162 | 163 | def add_doc_level(self): 164 | """Adds document-level supervision sources""" 165 | 166 | self.add_annotator(ConLL2003Standardiser()) 167 | 168 | maj_voter = MajorityVoter("doclevel_voter", ["LOC", "MISC", "ORG", "PER"], 169 | initial_weights={"doc_history":0, "doc_majority":0}) 170 | maj_voter.add_underspecified_label("ENT", {"LOC", "MISC", "ORG", "PER"}) 171 | self.add_annotator(maj_voter) 172 | 173 | self.add_annotator(DocumentHistoryAnnotator("doc_history_cased", "doclevel_voter", ["PER", "ORG"])) 174 | self.add_annotator(DocumentHistoryAnnotator("doc_history_uncased", "doclevel_voter", ["PER", "ORG"], 175 | case_sentitive=False)) 176 | 177 | maj_voter = MajorityVoter("doclevel_voter", ["LOC", "MISC", "ORG", "PER"], 178 | initial_weights={"doc_majority":0}) 179 | maj_voter.add_underspecified_label("ENT", {"LOC", "MISC", "ORG", "PER"}) 180 | self.add_annotator(maj_voter) 181 | 182 | self.add_annotator(DocumentMajorityAnnotator("doc_majority_cased", "doclevel_voter")) 183 | self.add_annotator(DocumentMajorityAnnotator("doc_majority_uncased", "doclevel_voter", 184 | case_sensitive=False)) 185 | return self 186 | 187 | 188 | ############################################ 189 | # Heuristics 190 | ############################################ 191 | 192 | 193 | def date_generator(doc): 194 | """Searches for occurrences of date patterns in text""" 195 | 196 | spans = [] 197 | 198 | i = 0 199 | while i < len(doc): 200 | tok = doc[i] 201 | if tok.lemma_ in data_utils.DAYS | data_utils.DAYS_ABBRV: 202 | spans.append((i, i + 1, "DATE")) 203 | elif tok.is_digit and re.match("\\d+$", tok.text) and int(tok.text) > 1920 and int(tok.text) < 2040: 204 | spans.append((i, i + 1, "DATE")) 205 | elif tok.lemma_ in data_utils.MONTHS | data_utils.MONTHS_ABBRV: 206 | if tok.tag_ == "MD": # Skipping "May" used as auxiliary 207 | pass 208 | elif i > 0 and re.match("\\d+$", doc[i - 1].text) and int(doc[i - 1].text) < 32: 209 | spans.append((i - 1, i + 1, "DATE")) 210 | elif i > 1 and re.match("\\d+(?:st|nd|rd|th)$", doc[i - 2].text) and doc[i - 1].lower_ == "of": 211 | spans.append((i - 2, i + 1, "DATE")) 212 | elif i < len(doc) - 1 and re.match("\\d+$", doc[i + 1].text) and int(doc[i + 1].text) < 32: 213 | spans.append((i, i + 2, "DATE")) 214 | i += 1 215 | else: 216 | spans.append((i, i + 1, "DATE")) 217 | i += 1 218 | 219 | for start, end, content in utils.merge_contiguous_spans(spans, doc): 220 | yield start, end, content 221 | 222 | 223 | def time_generator(doc): 224 | """Searches for occurrences of time patterns in text""" 225 | 226 | i = 0 227 | while i < len(doc): 228 | tok = doc[i] 229 | 230 | if (i < len(doc) - 1 and tok.text[0].isdigit() and 231 | doc[i + 1].lower_ in {"am", "pm", "a.m.", "p.m.", "am.", "pm."}): 232 | yield i, i + 2, "TIME" 233 | i += 1 234 | elif tok.text[0].isdigit() and 
re.match("\\d{1,2}\\:\\d{1,2}", tok.text): 235 | yield i, i + 1, "TIME" 236 | i += 1 237 | i += 1 238 | 239 | 240 | def money_generator(doc): 241 | """Searches for occurrences of money patterns in text""" 242 | 243 | i = 0 244 | while i < len(doc): 245 | tok = doc[i] 246 | if tok.text[0].isdigit(): 247 | j = i + 1 248 | while (j < len(doc) and (doc[j].text[0].isdigit() or doc[j].norm_ in data_utils.MAGNITUDES)): 249 | j += 1 250 | 251 | found_symbol = False 252 | if i > 0 and doc[i - 1].text in (data_utils.CURRENCY_CODES | data_utils.CURRENCY_SYMBOLS): 253 | i = i - 1 254 | found_symbol = True 255 | if (j < len(doc) and doc[j].text in 256 | (data_utils.CURRENCY_CODES | data_utils.CURRENCY_SYMBOLS | {"euros", "cents", "rubles"})): 257 | j += 1 258 | found_symbol = True 259 | 260 | if found_symbol: 261 | yield i, j, "MONEY" 262 | i = j 263 | else: 264 | i += 1 265 | 266 | 267 | def number_generator(doc): 268 | """Searches for occurrences of number patterns (cardinal, ordinal, quantity or percent) in text""" 269 | 270 | i = 0 271 | while i < len(doc): 272 | tok = doc[i] 273 | 274 | if tok.lower_ in data_utils.ORDINALS: 275 | yield i, i + 1, "ORDINAL" 276 | 277 | elif re.search("\\d", tok.text): 278 | j = i + 1 279 | while (j < len(doc) and (doc[j].norm_ in data_utils.MAGNITUDES)): 280 | j += 1 281 | if j < len(doc) and doc[j].lower_.rstrip(".") in data_utils.UNITS: 282 | j += 1 283 | yield i, j, "QUANTITY" 284 | elif j < len(doc) and doc[j].lower_ in ["%", "percent", "pc.", "pc", "pct", "pct.", "percents", 285 | "percentage"]: 286 | j += 1 287 | yield i, j, "PERCENT" 288 | else: 289 | yield i, j, "CARDINAL" 290 | i = j - 1 291 | i += 1 292 | 293 | 294 | class FullNameDetector(): 295 | """Search for occurrences of full person names (first name followed by at least one title token)""" 296 | 297 | def __init__(self): 298 | fd = open(FIRST_NAMES) 299 | self.first_names = set(json.load(fd)) 300 | fd.close() 301 | 302 | def __call__(self, span: Span) -> bool: 303 | # We assume full names are between 2 and 5 tokens 304 | if len(span) < 2 or len(span) > 5: 305 | return False 306 | 307 | return (span[0].text in self.first_names and 308 | span[-1].is_alpha and span[-1].is_title) 309 | 310 | 311 | class SnipsAnnotator(SpanAnnotator): 312 | """Annotation using the Snips NLU entity parser. 313 | You must install "snips-nlu-parsers" (pip install snips-nlu-parsers) to make it work. 314 | """ 315 | 316 | def __init__(self, name: str): 317 | """Initialise the annotation tool.""" 318 | 319 | super(SnipsAnnotator, self).__init__(name) 320 | self.parser = snips_nlu_parsers.BuiltinEntityParser.build(language="en") 321 | 322 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 323 | """Runs the parser on the spacy document, and convert the result to labels.""" 324 | 325 | text = doc.text 326 | 327 | # The current version of Snips has a bug that makes it crash with some rare 328 | # Turkish characters, or mentions of "billion years" 329 | text = text.replace("’", "'").replace("”", "\"").replace("“", "\"").replace("—", "-") 330 | text = text.encode("iso-8859-15", "ignore").decode("iso-8859-15") 331 | text = re.sub("(\\d+) ([bm]illion(?: (?:\\d+|one|two|three|four|five|six|seven" + 332 | "|eight|nine|ten))? 
years?)", "\\g<1>.0 \\g<2>", text) 333 | 334 | results = self.parser.parse(text) 335 | for result in results: 336 | span = doc.char_span(result["range"]["start"], result["range"]["end"]) 337 | if span is None or span.text.lower() in {"now"} or span.text in {"may"}: 338 | continue 339 | label = None 340 | if (result["entity_kind"] == "snips/number" and span.text.lower() not in 341 | {"one", "some", "few", "many", "several"}): 342 | label = "CARDINAL" 343 | elif (result["entity_kind"] == "snips/ordinal" and span.text.lower() not in 344 | {"first", "second", "the first", "the second"}): 345 | label = "ORDINAL" 346 | elif result["entity_kind"] == "snips/temperature": 347 | label = "QUANTITY" 348 | elif result["entity_kind"] == "snips/amountOfMoney": 349 | label = "MONEY" 350 | elif result["entity_kind"] == "snips/percentage": 351 | label = "PERCENT" 352 | elif result["entity_kind"] in {"snips/date", "snips/datePeriod", "snips/datetime"}: 353 | label = "DATE" 354 | elif result["entity_kind"] in {"snips/time", "snips/timePeriod"}: 355 | label = "TIME" 356 | 357 | if label: 358 | yield span.start, span.end, label 359 | 360 | def legal_generator(doc): 361 | legal_spans = [] 362 | for span in utils.get_spans(doc, ["proper2_detector", "nnp_detector"]): 363 | if not utils.is_likely_proper(doc[span.end-1]): 364 | continue 365 | last_token = doc[span.end-1].text.title().rstrip("s") 366 | 367 | if last_token in data_utils.LEGAL: 368 | legal_spans.append((span.start,span.end, "LAW")) 369 | 370 | 371 | # Handling legal references such as Article 5 372 | for i in range(len(doc) - 1): 373 | if doc[i].text.rstrip("s") in {"Article", "Paragraph", "Section", "Chapter", "§"}: 374 | if doc[i + 1].text[0].isdigit() or doc[i + 1].text in data_utils.ROMAN_NUMERALS: 375 | start, end = i, i + 2 376 | if (i < len(doc) - 3 and doc[i + 2].text in {"-", "to", "and"} 377 | and (doc[i + 3].text[0].isdigit() or doc[i + 3].text in data_utils.ROMAN_NUMERALS)): 378 | end = i + 4 379 | legal_spans.append((start, end, "LAW")) 380 | 381 | # Merge contiguous spans of legal references ("Article 5, Paragraph 3") 382 | legal_spans = utils.merge_contiguous_spans(legal_spans, doc) 383 | for start, end, label in legal_spans: 384 | yield start, end, label 385 | 386 | 387 | def misc_generator(doc): 388 | """Detects occurrences of countries and various less-common entities (NORP, FAC, EVENT, LANG)""" 389 | 390 | spans = set(doc.spans["proper2_detector"]) 391 | spans |= {doc[i:i+1] for i in range(len(doc))} 392 | 393 | for span in sorted(spans): 394 | 395 | span_text = span.text 396 | if span_text.isupper(): 397 | span_text = span_text.title() 398 | last_token = doc[span.end-1].text 399 | 400 | if span_text in data_utils.COUNTRIES: 401 | yield span.start, span.end, "GPE" 402 | 403 | if len(span) <= 3 and (span_text in data_utils.NORPS or last_token in data_utils.NORPS 404 | or last_token.rstrip("s") in data_utils.NORPS): 405 | yield span.start, span.end, "NORP" 406 | 407 | if span_text in data_utils.LANGUAGES and doc[span.start].tag_=="NNP": 408 | yield span.start, span.end, "LANGUAGE" 409 | 410 | if last_token in data_utils.FACILITIES and len(span) > 1: 411 | yield span.start, span.end, "FAC" 412 | 413 | if last_token in data_utils.EVENTS and len(span) > 1: 414 | yield span.start, span.end, "EVENT" 415 | 416 | 417 | 418 | ############################################ 419 | # Standardisation of the output labels 420 | ############################################ 421 | 422 | 423 | class ConLL2003Standardiser(SpanAnnotator): 424 | """Annotator taking 
existing annotations and standardising them 425 | to fit the ConLL 2003 tag scheme""" 426 | 427 | def __init__(self): 428 | super(ConLL2003Standardiser, self).__init__("") 429 | 430 | def __call__(self, doc): 431 | """Annotates one single document""" 432 | 433 | for source in doc.spans: 434 | 435 | new_spans = [] 436 | for span in doc.spans[source]: 437 | if "\n" in span.text: 438 | continue 439 | elif span.label_=="PERSON": 440 | new_spans.append(Span(doc, span.start, span.end, label="PER")) 441 | elif span.label_ in {"ORGANIZATION", "ORGANISATION", "COMPANY"}: 442 | new_spans.append(Span(doc, span.start, span.end, label="ORG")) 443 | elif span.label_ in {"GPE"}: 444 | new_spans.append(Span(doc, span.start, span.end, label="LOC")) 445 | elif span.label_ in {"EVENT", "FAC", "LANGUAGE", "LAW", "NORP", "PRODUCT", "WORK_OF_ART"}: 446 | new_spans.append(Span(doc, span.start, span.end, label="MISC")) 447 | else: 448 | new_spans.append(span) 449 | doc.spans[source] = new_spans 450 | return doc 451 | 452 | -------------------------------------------------------------------------------- /examples/ner/conll2003_prep.py: -------------------------------------------------------------------------------- 1 | 2 | from .conll2003_ner import (WIKIDATA, WIKIDATA_SMALL, CRUNCHBASE, PRODUCTS, 3 | GEONAMES, COMPANY_NAMES) 4 | 5 | from . import data_utils 6 | import pickle, re, json 7 | import spacy 8 | 9 | """Contains scripts used to compile the lists of entities from Wikipedia, Geonames, 10 | Crunchbase and DBPedia. Those scripts can be ignored in most cases, as it is easier 11 | to directly rely on the already compiled json files. """ 12 | 13 | 14 | ############################################ 15 | # Compilation of data sources 16 | ############################################ 17 | 18 | 19 | def compile_wikidata(wikidata="../data/WikidataNE_20170320_NECKAR_1_0.json_.gz", only_with_descriptions=False): 20 | """Compiles a JSON file with the wiki data""" 21 | 22 | 23 | import gzip, json 24 | fd = gzip.open(wikidata) 25 | wikidata = {"PERSON":{}, "LOC":{}, "GPE":{}, "ORG":{}} 26 | location_qs = set() 27 | for l in fd: 28 | d = json.loads(l) 29 | neClass = str(d["neClass"]) 30 | name = d["norm_name"] 31 | if ("en_sitelink" not in d and neClass !="PER"): 32 | continue 33 | if "en_sitelink" in d: 34 | if "," in d["en_sitelink"] or "(" in d["en_sitelink"]: 35 | continue 36 | if name[0].isdigit() or name[-1].isdigit() or len(name) < 2: 37 | continue 38 | if neClass=="PER": 39 | neClass = "PERSON" 40 | elif neClass=="LOC": 41 | if {'Mountain Range', 'River', 'Sea', 'Continent', 'Mountain'}.intersection(d.get("location_type",set())): 42 | neClass = "LOC" 43 | else: 44 | neClass ="GPE" 45 | location_qs.add(d["id"]) 46 | elif neClass=="ORG" and d["id"] in location_qs: 47 | continue 48 | if "alias" in d: 49 | d["nb_aliases"] = len(d["alias"]) 50 | del d["alias"] 51 | for key_to_remove in ["de_sitelink", '$oid', "id", "coordinate", "official_website", "_id"]: 52 | if key_to_remove in d: 53 | del d[key_to_remove] 54 | if name in wikidata[neClass]: 55 | merge = wikidata[neClass][name] if len(str(wikidata[neClass][name])) > len(str(d)) else d 56 | merge["nb_entities"] = wikidata[neClass][name].get("nb_entities", 1) + 1 57 | wikidata[neClass][name] = merge 58 | else: 59 | wikidata[neClass][name] = d 60 | 61 | fd = open("data/frequencies.pkl", "rb") 62 | frequencies = pickle.load(fd) 63 | fd.close() 64 | 65 | # We only keep entities with a certain frequency 66 | for neClass in ["PERSON", "LOC", "ORG", "GPE"]: 67 | for entity in 
list(wikidata[neClass].keys()): 68 | if entity.lower() in frequencies and frequencies[entity.lower()]>10000: 69 | del wikidata[neClass][entity] 70 | 71 | # And prune those that cannot be encoded using latin characters 72 | for neClass in ["PERSON", "LOC", "ORG", "GPE"]: 73 | for entity in list(wikidata[neClass].keys()): 74 | try: 75 | entity.encode('iso-8859-15') 76 | except UnicodeEncodeError: 77 | del wikidata[neClass][entity] 78 | 79 | 80 | wikidata2 = {neClass:{} for neClass in wikidata} 81 | for neClass in wikidata: 82 | entities_for_class = set() 83 | for entity in wikidata[neClass]: 84 | nb_tokens = len(entity.split()) 85 | if nb_tokens > 10: 86 | continue 87 | if only_with_descriptions and "description" not in wikidata[neClass][entity]: 88 | continue 89 | entities_for_class.add(entity) 90 | if "en_sitelink" in wikidata[neClass][entity]: 91 | entities_for_class.add(wikidata[neClass][entity]["en_sitelink"]) 92 | wikidata2[neClass] = entities_for_class #type: ignore 93 | 94 | fd = open(WIKIDATA_SMALL if only_with_descriptions else WIKIDATA, "w") 95 | json.dump({key:sorted(names) for key,names in wikidata2.items()}, fd) 96 | fd.close() 97 | 98 | 99 | def get_alternative_company_names(name, vocab=None): 100 | """Extract a list of alternative company names (with or without legal suffix etc.)""" 101 | 102 | alternatives = {name} 103 | while True: 104 | current_nb_alternatives = len(alternatives) 105 | 106 | for alternative in list(alternatives): 107 | tokens = alternative.split() 108 | if len(tokens)==1: 109 | continue 110 | 111 | # We add an alternative name without the legal suffix 112 | if tokens[-1].lower().rstrip(".") in data_utils.LEGAL_SUFFIXES: 113 | alternatives.add(" ".join(tokens[:-1])) 114 | 115 | if tokens[-1].lower() in {"limited", "corporation"}: 116 | alternatives.add(" ".join(tokens[:-1])) 117 | 118 | if tokens[-1].lower().rstrip(".") in {"corp", "inc", "co"}: 119 | if alternative.endswith("."): 120 | alternatives.add(alternative.rstrip(".")) 121 | else: 122 | alternatives.add(alternative+".") 123 | 124 | # If the last token is a country name (like The SAS Group Norway), add an alternative without 125 | if tokens[-1] in data_utils.COUNTRIES: 126 | alternatives.add(" ".join(tokens[:-1])) 127 | 128 | # If the name starts with a the, add an alternative without it 129 | if tokens[0].lower()=="the": 130 | alternatives.add(" ".join(tokens[1:])) 131 | 132 | # If the name ends with a generic token such as "Telenor International", add an alternative without 133 | if vocab is not None and tokens[-1] in data_utils.GENERIC_TOKENS and any([tok for tok in tokens if vocab[tok].rank==0]): 134 | alternatives.add(" ".join(tokens[:-1])) 135 | 136 | if len(alternatives)==current_nb_alternatives: 137 | break 138 | 139 | # We require the alternatives to have at least 2 characters (4 characters if the name does not look like an acronym) 140 | alternatives = {alt for alt in alternatives if len(alt) > 1 and alt.lower().rstrip(".") not in data_utils.LEGAL_SUFFIXES} 141 | alternatives = {alt for alt in alternatives if len(alt) > 3 or alt.isupper()} 142 | 143 | return alternatives 144 | 145 | 146 | def compile_company_names(): 147 | """Compiles a JSON file with company names""" 148 | 149 | vocab = spacy.load("en_core_web_md").vocab 150 | 151 | fd = open("../data/graph/entity.sql.json") 152 | company_entities = set() 153 | other_org_entities = set() 154 | for l in fd: 155 | dico = json.loads(l) 156 | if ("factset_entity_type_description" not in dico or dico["factset_entity_type_description" ] not in 
157 | {"Private Company", "Subsidiary", "Extinct", "Public Company", "Holding Company", "College/University", 158 | "Government", "Non-Profit Organization", "Operating Division", "Foundation/Endowment"}): 159 | continue 160 | name = dico["factset_entity_name"] 161 | name = name.split("(")[0].split(",")[0].strip(" \n\t/") 162 | if not name: 163 | continue 164 | 165 | alternatives = get_alternative_company_names(name, vocab) 166 | if dico["factset_entity_type_description" ] in {"College/University", "Government", "Non-Profit Organization", "Foundation/Endowment"}: 167 | other_org_entities.update(alternatives) 168 | else: 169 | company_entities.update(alternatives) 170 | fd.close() 171 | print("Number of extracted entities: %i companies and %i other organisations"%(len(company_entities), len(other_org_entities))) 172 | fd = open(COMPANY_NAMES, "w") 173 | json.dump({"COMPANY":sorted(company_entities), "ORG":sorted(other_org_entities)}, fd) 174 | fd.close() 175 | 176 | 177 | def compile_geographical_data(geo_source="../data/allCountries.txt", population_threshold=100000): 178 | """Compiles a JSON file with geographical locations""" 179 | 180 | names = set() 181 | fd = open(geo_source) 182 | for i, line in enumerate(fd): 183 | line_feats = line.split("\t") 184 | if len(line_feats) < 15: 185 | continue 186 | population = int(line_feats[14]) 187 | if population < population_threshold: 188 | continue 189 | name = line_feats[1].strip() 190 | names.add(name) 191 | name = re.sub(".*(?:Kingdom|Republic|Province|State|Commonwealth|Region|City|Federation) of ", "", name).strip() 192 | names.add(name) 193 | name = name.replace(" City", "").replace(" Region", "").replace(" District", "").replace(" County", "").replace(" Zone", "").strip() 194 | names.add(name) 195 | name = (name.replace("Arrondissement de ", "").replace("Stadtkreis ", "").replace("Landkreis ", "").strip() 196 | .replace("Departamento de ", "").replace("Département de ", "").replace("Provincia di ", "")).strip() 197 | names.add(name) 198 | name = re.sub("^the ", "", name).strip() 199 | names.add(name) 200 | if i%10000==0: 201 | print("Number of processed lines:", i, "and number of extracted locations:", len(names)) 202 | fd.close() 203 | names = {alt for alt in names if len(alt) > 2 and alt.lower().rstrip(".") not in data_utils.LEGAL_SUFFIXES} 204 | fd = open(GEONAMES, "w") 205 | json.dump({"GPE":sorted(names)}, fd) 206 | fd.close() 207 | 208 | 209 | def compile_crunchbase_data(org_data="../data/organizations.csv", people_data="../data/people.csv"): 210 | """Compiles a JSON file with company and person names from Crunchbase Open Data""" 211 | 212 | company_entities = set() 213 | other_org_entities = set() 214 | 215 | vocab = spacy.load("en_core_web_md").vocab 216 | 217 | fd = open(org_data) 218 | for line in fd: 219 | split = [s.strip() for s in line.rstrip().strip("\"").split("\",\"")] 220 | if len(split) < 5: 221 | continue 222 | name = split[1] 223 | alternatives = get_alternative_company_names(name, vocab) 224 | if split[3] in {"company", "investor"}: 225 | company_entities.update(alternatives) 226 | else: 227 | other_org_entities.update(alternatives) 228 | fd.close() 229 | print("Number of extracted entities: %i companies and %i other organisations"%(len(company_entities), len(other_org_entities))) 230 | 231 | persons = set() 232 | fd = open(people_data) 233 | for line in fd: 234 | split = [s.strip() for s in line.rstrip().strip("\"").split("\",\"")] 235 | if len(split) < 5: 236 | continue 237 | first_name = split[2] 238 | last_name = 
split[3] 239 | alternatives = {"%s %s"%(first_name, last_name)} 240 | # alternatives.add(last_name) 241 | alternatives.add("%s. %s"%(first_name[0], last_name)) 242 | if " " in first_name: 243 | first_split = first_name.split(" ", 1) 244 | alternatives.add("%s %s"%(first_split[0], last_name)) 245 | alternatives.add("%s %s. %s"%(first_split[0], first_split[1][0], last_name)) 246 | alternatives.add("%s. %s. %s"%(first_split[0][0], first_split[1][0], last_name)) 247 | persons.update(alternatives) 248 | 249 | # We require person names to have at least 3 characters (and not be a suffix) 250 | persons = {alt for alt in persons if len(alt) > 2 and alt.lower().rstrip(".") not in data_utils.LEGAL_SUFFIXES} 251 | fd.close() 252 | print("Number of extracted entities: %i person names"%(len(persons))) 253 | 254 | fd = open(CRUNCHBASE, "w") 255 | json.dump({"COMPANY":sorted(company_entities), "ORG":sorted(other_org_entities), "PERSON":sorted(persons)}, fd) 256 | fd.close() 257 | 258 | def compile_product_data(data="../data/dbpedia.json"): 259 | fd = open(data) 260 | all_product_names = set() 261 | for line in fd: 262 | line = line.strip().strip(",") 263 | value = json.loads(line)["label2"]["value"] 264 | if "(" in value: 265 | continue 266 | 267 | product_names = {value} 268 | 269 | # The DBpedia entries are all titled, which cause problems for products such as iPad 270 | if len(value)>2 and value[0] in {"I", "E"} and value[1].isupper() and value[2].islower(): 271 | product_names.add(value[0].lower()+value[1:]) 272 | 273 | # We also add plural entries 274 | for product_name in list(product_names): 275 | if len(product_name.split()) <= 2: 276 | plural = product_name + ("es" if value.endswith("s") else "s") 277 | product_names.add(plural) 278 | 279 | all_product_names.update(product_names) 280 | 281 | fd = open(PRODUCTS, "w") 282 | json.dump({"PRODUCT":sorted(all_product_names)}, fd) 283 | fd.close() 284 | 285 | 286 | def compile_wiki_product_data(data="../data/wiki_products.json"): 287 | fd = open(data) 288 | dict_list = json.load(fd) 289 | fd.close() 290 | products = set() 291 | for product_dict in dict_list: 292 | product_name = product_dict["itemLabel"] 293 | if "(" in product_name or len(product_name) <= 2: 294 | continue 295 | products.add(product_name) 296 | if len(product_name.split()) <= 2: 297 | plural = product_name + ("es" if product_name.endswith("s") else "s") 298 | products.add(plural) 299 | 300 | fd = open(WIKIDATA, "r") 301 | current_dict = json.load(fd) 302 | fd.close() 303 | current_dict["PRODUCT"] = sorted(products) 304 | fd = open(WIKIDATA, "w") 305 | json.dump(current_dict, fd) 306 | fd.close() 307 | 308 | fd = open(WIKIDATA_SMALL, "r") 309 | current_dict = json.load(fd) 310 | fd.close() 311 | current_dict["PRODUCT"] = sorted(products) 312 | fd = open(WIKIDATA_SMALL, "w") 313 | json.dump(current_dict, fd) 314 | fd.close() 315 | -------------------------------------------------------------------------------- /examples/ner/data_utils.py: -------------------------------------------------------------------------------- 1 | 2 | """Class containing some generic entity names (in English)""" 3 | 4 | # List of currency symbols and three-letter codes 5 | CURRENCY_SYMBOLS = {"$", "¥", "£", "€", "kr", "₽", "R$", "₹", "Rp", "₪", "zł", "Rs", "₺", "RS"} 6 | 7 | CURRENCY_CODES = {"USD", "EUR", "CNY", "JPY", "GBP", "NOK", "DKK", "CAD", "RUB", "MXN", "ARS", "BGN", 8 | "BRL", "CHF", "CLP", "CZK", "INR", "IDR", "ILS", "IRR", "IQD", "KRW", "KZT", "NGN", 9 | "QAR", "SEK", "SYP", "TRY", "UAH", "AED", "AUD", 
"COP", "MYR", "SGD", "NZD", "THB", 10 | "HUF", "HKD", "ZAR", "PHP", "KES", "EGP", "PKR", "PLN", "XAU", "VND", "GBX"} 11 | 12 | # sets of tokens used for the shallow patterns 13 | MONTHS = {"January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", 14 | "December"} 15 | MONTHS_ABBRV = {"Jan.", "Feb.", "Mar.", "Apr.", "May.", "Jun.", "Jul.", "Aug.", "Sep.", "Sept.", "Oct.", "Nov.", "Dec."} 16 | DAYS = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"} 17 | DAYS_ABBRV = {"Mon.", "Tu.", "Tue.", "Tues.", "Wed.", "Th.", "Thu.", "Thur.", "Thurs.", "Fri.", "Sat.", "Sun."} 18 | MAGNITUDES = {"million", "billion", "mln", "bln", "bn", "thousand", "m", "k", "b", "m.", "k.", "b.", "mln.", "bln.", 19 | "bn."} 20 | UNITS = {"tons", "tonnes", "barrels", "m", "km", "miles", "kph", "mph", "kg", "°C", "dB", "ft", "gal", "gallons", "g", 21 | "kW", "s", "oz", 22 | "m2", "km2", "yards", "W", "kW", "kWh", "kWh/yr", "Gb", "MW", "kilometers", "meters", "liters", "litres", "g", 23 | "grams", "tons/yr", 24 | 'pounds', 'cubits', 'degrees', 'ton', 'kilograms', 'inches', 'inch', 'megawatts', 'metres', 'feet', 'ounces', 25 | 'watts', 'megabytes', 26 | 'gigabytes', 'terabytes', 'hectares', 'centimeters', 'millimeters', "F", "Celsius"} 27 | ORDINALS = ({"first, second, third", "fourth", "fifth", "sixth", "seventh"} | 28 | {"%i1st" % i for i in range(100)} | {"%i2nd" % i for i in range(100)} | {"%ith" % i for i in range(1000)}) 29 | ROMAN_NUMERALS = {'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 30 | 'XVII', 31 | 'XVIII', 'XIX', 'XX', 'XXI', 'XXII', 'XXIII', 'XXIV', 'XXV', 'XXVI', 'XXVII', 'XXVIII', 'XXIX', 'XXX'} 32 | 33 | # Full list of country names 34 | COUNTRIES = {'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua', 'Argentina', 'Armenia', 'Australia', 35 | 'Austria', 36 | 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 37 | 'Bhutan', 38 | 'Bolivia', 'Bosnia Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina', 'Burundi', 39 | 'Cambodia', 'Cameroon', 40 | 'Canada', 'Cape Verde', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 41 | 'Congo', 'Costa Rica', 42 | 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 43 | 'East Timor', 44 | 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 45 | 'Finland', 'France', 46 | 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 47 | 'Guinea-Bissau', 'Guyana', 48 | 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 49 | 'Italy', 'Ivory Coast', 50 | 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 'Korea North', 'Korea South', 'Kosovo', 51 | 'Kuwait', 'Kyrgyzstan', 52 | 'Laos', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 53 | 'Macedonia', 'Madagascar', 54 | 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Mauritania', 'Mauritius', 'Mexico', 55 | 'Micronesia', 56 | 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 57 | 'Nepal', 'Netherlands', 58 | 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Panama', 59 | 'Papua New Guinea', 60 | 'Paraguay', 'Peru', 'Philippines', 
'Poland', 'Portugal', 'Qatar', 'Romania', 'Russian Federation', 61 | 'Rwanda', 'St Kitts & Nevis', 62 | 'St Lucia', 'Saint Vincent & the Grenadines', 'Samoa', 'San Marino', 'Sao Tome & Principe', 'Saudi Arabia', 63 | 'Senegal', 'Serbia', 64 | 'Seychelles', 'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 65 | 'South Africa', 'South Sudan', 66 | 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Swaziland', 'Sweden', 'Switzerland', 'Syria', 'Taiwan', 67 | 'Tajikistan', 'Tanzania', 68 | 'Thailand', 'Togo', 'Tonga', 'Trinidad & Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Tuvalu', 'Uganda', 69 | 'Ukraine', 70 | 'United Arab Emirates', 'United Kingdom', 'United States', 'Uruguay', 'Uzbekistan', 'Vanuatu', 71 | 'Vatican City', 'Venezuela', 72 | 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe', "USA", "UK", "Russia", "South Korea"} 73 | 74 | # Natialities, religious and political groups 75 | NORPS = {'Afghan', 'African', 'Albanian', 'Algerian', 'American', 'Andorran', 'Anglican', 'Angolan', 'Arab', 'Aramean', 76 | 'Argentine', 'Armenian', 77 | 'Asian', 'Australian', 'Austrian', 'Azerbaijani', 'Bahamian', 'Bahraini', 'Baklan', 'Bangladeshi', 'Batswana', 78 | 'Belarusian', 'Belgian', 79 | 'Belizean', 'Beninese', 'Bermudian', 'Bhutanese', 'Bolivian', 'Bosnian', 'Brazilian', 'British', 'Bruneian', 80 | 'Buddhist', 81 | 'Bulgarian', 'Burkinabe', 'Burmese', 'Burundian', 'Californian', 'Cambodian', 'Cameroonian', 'Canadian', 82 | 'Cape Verdian', 'Catholic', 'Caymanian', 83 | 'Central African', 'Central American', 'Chadian', 'Chilean', 'Chinese', 'Christian', 'Christian-Democrat', 84 | 'Christian-Democratic', 85 | 'Colombian', 'Communist', 'Comoran', 'Congolese', 'Conservative', 'Costa Rican', 'Croat', 'Cuban', 'Cypriot', 86 | 'Czech', 'Dane', 'Danish', 87 | 'Democrat', 'Democratic', 'Djibouti', 'Dominican', 'Dutch', 'East European', 'Ecuadorean', 'Egyptian', 88 | 'Emirati', 'English', 'Equatoguinean', 89 | 'Equatorial Guinean', 'Eritrean', 'Estonian', 'Ethiopian', 'Eurasian', 'European', 'Fijian', 'Filipino', 90 | 'Finn', 'Finnish', 'French', 91 | 'Gabonese', 'Gambian', 'Georgian', 'German', 'Germanic', 'Ghanaian', 'Greek', 'Greenlander', 'Grenadan', 92 | 'Grenadian', 'Guadeloupean', 'Guatemalan', 93 | 'Guinea-Bissauan', 'Guinean', 'Guyanese', 'Haitian', 'Hawaiian', 'Hindu', 'Hinduist', 'Hispanic', 'Honduran', 94 | 'Hungarian', 'Icelander', 'Indian', 95 | 'Indonesian', 'Iranian', 'Iraqi', 'Irish', 'Islamic', 'Islamist', 'Israeli', 'Israelite', 'Italian', 'Ivorian', 96 | 'Jain', 'Jamaican', 'Japanese', 97 | 'Jew', 'Jewish', 'Jordanian', 'Kazakhstani', 'Kenyan', 'Kirghiz', 'Korean', 'Kurd', 'Kurdish', 'Kuwaiti', 98 | 'Kyrgyz', 'Labour', 'Latin', 99 | 'Latin American', 'Latvian', 'Lebanese', 'Liberal', 'Liberian', 'Libyan', 'Liechtensteiner', 'Lithuanian', 100 | 'Londoner', 'Luxembourger', 101 | 'Macedonian', 'Malagasy', 'Malawian', 'Malaysian', 'Maldivan', 'Malian', 'Maltese', 'Manxman', 'Marshallese', 102 | 'Martinican', 'Martiniquais', 103 | 'Marxist', 'Mauritanian', 'Mauritian', 'Mexican', 'Micronesian', 'Moldovan', 'Mongolian', 'Montenegrin', 104 | 'Montserratian', 'Moroccan', 105 | 'Motswana', 'Mozambican', 'Muslim', 'Myanmarese', 'Namibian', 'Nationalist', 'Nazi', 'Nauruan', 'Nepalese', 106 | 'Netherlander', 'New Yorker', 107 | 'New Zealander', 'Nicaraguan', 'Nigerian', 'Nordic', 'North American', 'North Korean', 'Norwegian', 'Orthodox', 108 | 'Pakistani', 'Palauan', 109 | 'Palestinian', 'Panamanian', 'Papua New Guinean', 'Paraguayan', 'Parisian', 'Peruvian', 'Philistine', 
'Pole', 110 | 'Polish', 'Portuguese', 111 | 'Protestant', 'Puerto Rican', 'Qatari', 'Republican', 'Roman', 'Romanian', 'Russian', 'Rwandan', 112 | 'Saint Helenian', 'Saint Lucian', 113 | 'Saint Vincentian', 'Salvadoran', 'Sammarinese', 'Samoan', 'San Marinese', 'Sao Tomean', 'Saudi', 114 | 'Saudi Arabian', 'Scandinavian', 'Scottish', 115 | 'Senegalese', 'Serb', 'Serbian', 'Shia', 'Shiite', 'Sierra Leonean', 'Sikh', 'Singaporean', 'Slovak', 116 | 'Slovene', 'Social-Democrat', 'Socialist', 117 | 'Somali', 'South African', 'South American', 'South Korean', 'Soviet', 'Spaniard', 'Spanish', 'Sri Lankan', 118 | 'Sudanese', 'Sunni', 119 | 'Surinamer', 'Swazi', 'Swede', 'Swedish', 'Swiss', 'Syrian', 'Taiwanese', 'Tajik', 'Tanzanian', 'Taoist', 120 | 'Texan', 'Thai', 'Tibetan', 121 | 'Tobagonian', 'Togolese', 'Tongan', 'Tunisian', 'Turk', 'Turkish', 'Turkmen(s)', 'Tuvaluan', 'Ugandan', 122 | 'Ukrainian', 'Uruguayan', 'Uzbek', 123 | 'Uzbekistani', 'Venezuelan', 'Vietnamese', 'Vincentian', 'Virgin Islander', 'Welsh', 'West European', 124 | 'Western', 'Yemeni', 'Yemenite', 125 | 'Yugoslav', 'Zambian', 'Zimbabwean', 'Zionist'} 126 | 127 | # Facilities 128 | FACILITIES = {"Palace", "Temple", "Gate", "Museum", "Bridge", "Road", "Airport", "Hospital", "School", "Tower", 129 | "Station", "Avenue", 130 | "Prison", "Building", "Plant", "Shopping Center", "Shopping Centre", "Mall", "Church", "Synagogue", 131 | "Mosque", "Harbor", "Harbour", 132 | "Rail", "Railway", "Metro", "Tram", "Highway", "Tunnel", 'House', 'Field', 'Hall', 'Place', 'Freeway', 133 | 'Wall', 'Square', 'Park', 134 | 'Hotel'} 135 | 136 | # Legal documents 137 | LEGAL = {"Law", "Agreement", "Act", 'Bill', "Constitution", "Directive", "Treaty", "Code", "Reform", "Convention", 138 | "Resolution", "Regulation", 139 | "Amendment", "Customs", "Protocol", "Charter"} 140 | 141 | # event names 142 | EVENTS = {"War", "Festival", "Show", "Massacre", "Battle", "Revolution", "Olympics", "Games", "Cup", "Week", "Day", 143 | "Year", "Series"} 144 | 145 | # Names of languages 146 | LANGUAGES = {'Afar', 'Abkhazian', 'Avestan', 'Afrikaans', 'Akan', 'Amharic', 'Aragonese', 'Arabic', 'Aramaic', 147 | 'Assamese', 'Avaric', 'Aymara', 148 | 'Azerbaijani', 'Bashkir', 'Belarusian', 'Bulgarian', 'Bambara', 'Bislama', 'Bengali', 'Tibetan', 'Breton', 149 | 'Bosnian', 'Cantonese', 150 | 'Catalan', 'Chechen', 'Chamorro', 'Corsican', 'Cree', 'Czech', 'Chuvash', 'Welsh', 'Danish', 'German', 151 | 'Divehi', 'Dzongkha', 'Ewe', 152 | 'Greek', 'English', 'Esperanto', 'Spanish', 'Castilian', 'Estonian', 'Basque', 'Persian', 'Fulah', 153 | 'Filipino', 'Finnish', 'Fijian', 'Faroese', 154 | 'French', 'Western Frisian', 'Irish', 'Gaelic', 'Galician', 'Guarani', 'Gujarati', 'Manx', 'Hausa', 155 | 'Hebrew', 'Hindi', 'Hiri Motu', 156 | 'Croatian', 'Haitian', 'Hungarian', 'Armenian', 'Herero', 'Indonesian', 'Igbo', 'Inupiaq', 'Ido', 157 | 'Icelandic', 'Italian', 'Inuktitut', 158 | 'Japanese', 'Javanese', 'Georgian', 'Kongo', 'Kikuyu', 'Kuanyama', 'Kazakh', 'Kalaallisut', 'Greenlandic', 159 | 'Central Khmer', 'Kannada', 160 | 'Korean', 'Kanuri', 'Kashmiri', 'Kurdish', 'Komi', 'Cornish', 'Kirghiz', 'Latin', 'Luxembourgish', 'Ganda', 161 | 'Limburgish', 'Lingala', 'Lao', 162 | 'Lithuanian', 'Luba-Katanga', 'Latvian', 'Malagasy', 'Marshallese', 'Maori', 'Macedonian', 'Malayalam', 163 | 'Mongolian', 'Marathi', 'Malay', 164 | 'Maltese', 'Burmese', 'Nauru', 'Bokmål', 'Norwegian', 'Ndebele', 'Nepali', 'Ndonga', 'Dutch', 'Flemish', 165 | 'Nynorsk', 'Navajo', 'Chichewa', 166 | 'Occitan', 
'Ojibwa', 'Oromo', 'Oriya', 'Ossetian', 'Punjabi', 'Pali', 'Polish', 'Pashto', 'Portuguese', 167 | 'Quechua', 'Romansh', 'Rundi', 168 | 'Romanian', 'Russian', 'Kinyarwanda', 'Sanskrit', 'Sardinian', 'Sindhi', 'Sami', 'Sango', 'Sinhalese', 169 | 'Slovak', 'Slovenian', 'Samoan', 170 | 'Shona', 'Somali', 'Albanian', 'Serbian', 'Swati', 'Sotho', 'Sundanese', 'Swedish', 'Swahili', 'Tamil', 171 | 'Telugu', 'Tajik', 'Thai', 172 | 'Tigrinya', 'Turkmen', 'Taiwanese', 'Tagalog', 'Tswana', 'Tonga', 'Turkish', 'Tsonga', 'Tatar', 'Twi', 173 | 'Tahitian', 'Uighur', 'Ukrainian', 174 | 'Urdu', 'Uzbek', 'Venda', 'Vietnamese', 'Volapük', 'Walloon', 'Wolof', 'Xhosa', 'Yiddish', 'Yoruba', 175 | 'Zhuang', 'Mandarin', 176 | 'Mandarin Chinese', 'Chinese', 'Zulu'} 177 | 178 | LEGAL_SUFFIXES = { 179 | 'ltd', # Limited ~13.000 180 | 'llc', # limited liability company (UK) 181 | 'ltda', # limitada (Brazil, Portugal) 182 | 'inc', # Incorporated ~9700 183 | 'co ltd', # Company Limited ~9200 184 | 'corp', # Corporation ~5200 185 | 'sa', # Spółka Akcyjna (Poland), Société Anonyme (France) ~3200 186 | 'plc', # Public Limited Company (Great Britain) ~2100 187 | 'ag', # Aktiengesellschaft (Germany) ~1000 188 | 'gmbh', # Gesellschaft mit beschränkter Haftung (Germany) 189 | 'bhd', # Berhad (Malaysia) ~900 190 | 'jsc', # Joint Stock Company (Russia) ~900 191 | 'co', # Corporation/Company ~900 192 | 'ab', # Aktiebolag (Sweden) ~800 193 | 'ad', # Akcionarsko Društvo (Serbia), Aktsionerno Drujestvo (Bulgaria) ~600 194 | 'tbk', # Terbuka (Indonesia) ~500 195 | 'as', # Anonim Şirket (Turkey), Aksjeselskap (Norway) ~500 196 | 'pjsc', # Public Joint Stock Company (Russia, Ukraine) ~400 197 | 'spa', # Società Per Azioni (Italy) ~300 198 | 'nv', # Naamloze vennootschap (Netherlands, Belgium) ~230 199 | 'dd', # Dioničko Društvo (Croatia) ~220 200 | 'a s', # a/s (Denmark), a.s (Slovakia) ~210 201 | 'oao', # Открытое акционерное общество (Russia) ~190 202 | 'asa', # Allmennaksjeselskap (Norway) ~160 203 | 'ojsc', # Open Joint Stock Company (Russia) ~160 204 | 'lp', # Limited Partnership (US) ~140 205 | 'llp', # limited liability partnership 206 | 'oyj', # julkinen osakeyhtiö (Finland) ~120 207 | 'de cv', # Capital Variable (Mexico) ~120 208 | 'se', # Societas Europaea (Germany) ~100 209 | 'kk', # kabushiki gaisha (Japan) 210 | 'aps', # Anpartsselskab (Denmark) 211 | 'cv', # commanditaire vennootschap (Netherlands) 212 | 'sas', # société par actions simplifiée (France) 213 | 'sro', # Spoločnosť s ručením obmedzeným (Slovakia) 214 | 'oy', # Osakeyhtiö (Finland) 215 | 'kg', # Kommanditgesellschaft (Germany) 216 | 'bv', # Besloten Vennootschap (Netherlands) 217 | 'sarl', # société à responsabilité limitée (France) 218 | 'srl', # Società a responsabilità limitata (Italy) 219 | 'sl' # Sociedad Limitada (Spain) 220 | } 221 | # Generic words that may appear in official company names but are sometimes skipped when mentioned in news articles (e.g. Nordea Bank -> Nordea) 222 | GENERIC_TOKENS = {"International", "Group", "Solutions", "Technologies", "Management", "Association", "Associates", 223 | "Partners", 224 | "Systems", "Holdings", "Services", "Bank", "Fund", "Stiftung", "Company"} 225 | 226 | # List of tokens that are typically lowercase even when they occur in capitalised segments (e.g. 
International Council of Shopping Centers) 227 | LOWERCASED_TOKENS = {"'s", "-", "a", "an", "the", "at", "by", "for", "in", "of", "on", "to", "up", "and"} 228 | 229 | # Prefixes to family names that are often in lowercase 230 | NAME_PREFIXES = {"-", "von", "van", "de", "di", "le", "la", "het", "'t'", "dem", "der", "den", "d'", "ter"} 231 | -------------------------------------------------------------------------------- /examples/ner/eval_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas 3 | import sklearn.metrics 4 | from skweak import utils 5 | from spacy.tokens import Span # type: ignore 6 | 7 | def evaluate(docs, all_labels, target_sources): 8 | """Extracts the evaluation results for one or more sources, and add them to a pandas DataFrame.""" 9 | 10 | if isinstance(target_sources, str): 11 | target_sources = [target_sources] 12 | 13 | records = [] 14 | for source in target_sources: 15 | results = get_results(docs, all_labels, source) 16 | labels = set(results["label_weights"].keys()) 17 | for name in sorted(labels) + ["micro", "weighted", "macro"]: 18 | if name in results: 19 | record = results[name] 20 | record["label"] = name 21 | record["model"] = source 22 | if name in labels: 23 | record["proportion"] = results["label_weights"][name] 24 | records.append(record) 25 | 26 | df = pandas.DataFrame.from_records(records) 27 | df["proportion"] = df.proportion.apply(lambda x: "%.1f %%"%(x*100) if not np.isnan(x) else "") 28 | df["tok_cee"] = df.tok_cee.apply(lambda x: str(x) if not np.isnan(x) else "") 29 | df["tok_acc"] = df.tok_acc.apply(lambda x: str(x) if not np.isnan(x) else "") 30 | df["coverage"] = df.coverage.apply(lambda x: str(x) if not np.isnan(x) else "") 31 | df = df.set_index(["label", "proportion", "model"]).sort_index() 32 | df = df[["tok_precision", "tok_recall", "tok_f1", "tok_cee", "tok_acc", "coverage", 33 | "ent_precision", "ent_recall", "ent_f1"]] 34 | return df 35 | 36 | 37 | 38 | def get_results(docs, all_labels, target_source, conf_threshold=0.5): 39 | """Computes the usual metrics (precision, recall, F1, cross-entropy) on the dataset, using the spacy entities 40 | in each document as gold standard, and the annotations of a given source as the predicted values""" 41 | 42 | 43 | all_numbers = compute_raw_numbers(docs, all_labels, target_source, conf_threshold) 44 | tok_tp, tok_fp, tok_fn, tok_logloss, tok_nb, tok_tp_tn, ent_tp, ent_fp, ent_fn, ent_support, tok_support = all_numbers 45 | 46 | # We then compute the metrics themselves 47 | results = {} 48 | for label in ent_support: 49 | ent_pred = ent_tp[label]+ent_fp[label] + 1E-10 50 | ent_true = ent_tp[label]+ent_fn[label] + 1E-10 51 | tok_pred = tok_tp[label]+tok_fp[label] + 1E-10 52 | tok_true = tok_tp[label]+tok_fn[label] + 1E-10 53 | results[label] = {} 54 | results[label]["ent_precision"] = round(ent_tp[label]/ent_pred, 3) 55 | results[label]["ent_recall"] = round(ent_tp[label]/ent_true, 3) 56 | results[label]["tok_precision"] = round(tok_tp[label]/tok_pred, 3) 57 | results[label]["tok_recall"] = round(tok_tp[label]/tok_true, 3) 58 | 59 | ent_f1_numerator = (results[label]["ent_precision"] * results[label]["ent_recall"]) 60 | ent_f1_denominator = (results[label]["ent_precision"] +results[label]["ent_recall"]) + 1E-10 61 | results[label]["ent_f1"] = 2*round(ent_f1_numerator / ent_f1_denominator, 3) 62 | 63 | tok_f1_numerator = (results[label]["tok_precision"] * results[label]["tok_recall"]) 64 | tok_f1_denominator = 
(results[label]["tok_precision"] +results[label]["tok_recall"]) + 1E-10 65 | results[label]["tok_f1"] = 2*round(tok_f1_numerator / tok_f1_denominator, 3) 66 | 67 | results["macro"] = {"ent_precision":round(np.mean([results[l]["ent_precision"] for l in ent_support]), 3), 68 | "ent_recall":round(np.mean([results[l]["ent_recall"] for l in ent_support]), 3), 69 | "tok_precision":round(np.mean([results[l]["tok_precision"] for l in ent_support]), 3), 70 | "tok_recall":round(np.mean([results[l]["tok_recall"] for l in ent_support]), 3)} 71 | 72 | 73 | label_weights = {l:ent_support[l]/sum(ent_support.values()) for l in ent_support} 74 | results["label_weights"] = label_weights 75 | results["weighted"] = {"ent_precision":round(np.sum([results[l]["ent_precision"]*label_weights[l] 76 | for l in ent_support]), 3), 77 | "ent_recall":round(np.sum([results[l]["ent_recall"]*label_weights[l] 78 | for l in ent_support]), 3), 79 | "tok_precision":round(np.sum([results[l]["tok_precision"]*label_weights[l] 80 | for l in ent_support]), 3), 81 | "tok_recall":round(np.sum([results[l]["tok_recall"]*label_weights[l] 82 | for l in ent_support]), 3)} 83 | 84 | ent_pred = sum([ent_tp[l] for l in ent_support]) + sum([ent_fp[l] for l in ent_support]) + 1E-10 85 | ent_true = sum([ent_tp[l] for l in ent_support]) + sum([ent_fn[l] for l in ent_support]) + 1E-10 86 | tok_pred = sum([tok_tp[l] for l in ent_support]) + sum([tok_fp[l] for l in ent_support]) + 1E-10 87 | tok_true = sum([tok_tp[l] for l in ent_support]) + sum([tok_fn[l] for l in ent_support]) + 1E-10 88 | results["micro"] = {"ent_precision":round(sum([ent_tp[l] for l in ent_support]) / ent_pred, 3), 89 | "ent_recall":round(sum([ent_tp[l] for l in ent_support]) / ent_true, 3), 90 | "tok_precision":round(sum([tok_tp[l] for l in ent_support]) /tok_pred, 3), 91 | "tok_recall":round(sum([tok_tp[l] for l in ent_support]) / tok_true, 3), 92 | "tok_cee":round(tok_logloss/tok_nb, 3), 93 | "tok_acc": round(tok_tp_tn/tok_nb, 3), 94 | "coverage":round((sum(tok_tp.values()) +sum(tok_fp.values())) / sum(tok_support.values()), 3)} 95 | 96 | for metric in ["macro", "weighted", "micro"]: 97 | ent_f1_numerator = (results[metric]["ent_precision"] * results[metric]["ent_recall"]) 98 | ent_f1_denominator = (results[metric]["ent_precision"] +results[metric]["ent_recall"]) + 1E-10 99 | results[metric]["ent_f1"] = 2*round(ent_f1_numerator / ent_f1_denominator, 3) 100 | 101 | tok_f1_numerator = (results[metric]["tok_precision"] * results[metric]["tok_recall"]) 102 | tok_f1_denominator = (results[metric]["tok_precision"] +results[metric]["tok_recall"]) + 1E-10 103 | results[metric]["tok_f1"] = 2*round(tok_f1_numerator / tok_f1_denominator, 3) 104 | 105 | return results 106 | 107 | 108 | def compute_raw_numbers(docs, all_labels, target_source, conf_threshold=0.5): 109 | """Computes the raw metrics (true positives, true negatives, ...) 
on the dataset, using the spacy entities 110 | in each document as gold standard, and the annotations of a given source as the predicted values""" 111 | 112 | # We start by computing the TP, FP and FN values 113 | tok_tp = {} 114 | tok_fp = {} 115 | tok_fn ={} 116 | 117 | tok_logloss = 0 118 | tok_nb = 0 119 | tok_tp_tn = 0 120 | 121 | ent_tp ={} 122 | ent_fp = {} 123 | ent_fn = {} 124 | ent_support = {} 125 | tok_support = {} 126 | 127 | for doc in docs: 128 | if target_source in doc.spans: 129 | spans = utils.get_spans_with_probs(doc, target_source) 130 | else: 131 | spans = [] 132 | spans = [span for (span, prob) in spans if prob >= conf_threshold] 133 | 134 | for label in all_labels: 135 | true_spans = {(ent.start, ent.end) for ent in doc.ents if ent.label_==label} 136 | pred_spans = {(span.start,span.end) for span in spans if span.label_==label} 137 | 138 | ent_tp[label] = ent_tp.get(label,0) + len(true_spans.intersection(pred_spans)) 139 | ent_fp[label] = ent_fp.get(label,0) + len(pred_spans - true_spans) 140 | ent_fn[label] = ent_fn.get(label,0) + len(true_spans - pred_spans) 141 | ent_support[label] = ent_support.get(label, 0) + len(true_spans) 142 | 143 | true_tok_labels = {i for start,end in true_spans for i in range(start, end)} 144 | pred_tok_labels = {i for start,end in pred_spans for i in range(start, end)} 145 | tok_tp[label] = tok_tp.get(label, 0) + len(true_tok_labels.intersection(pred_tok_labels)) 146 | tok_fp[label] = tok_fp.get(label, 0) + len(pred_tok_labels - true_tok_labels) 147 | tok_fn[label] = tok_fn.get(label,0) + len(true_tok_labels - pred_tok_labels) 148 | tok_support[label] = tok_support.get(label, 0) + len(true_tok_labels) 149 | 150 | gold_probs, pred_probs = _get_probs(doc, all_labels, target_source) 151 | tok_logloss += sklearn.metrics.log_loss(gold_probs, pred_probs, normalize=False) 152 | tok_tp_tn += sum(gold_probs.argmax(axis=1) == pred_probs.argmax(axis=1)) 153 | tok_nb += len(doc) 154 | 155 | return (tok_tp, tok_fp, tok_fn, tok_logloss, tok_nb, tok_tp_tn, ent_tp, 156 | ent_fp, ent_fn, ent_support, tok_support) 157 | 158 | 159 | def _get_probs(doc, all_labels, target_source): 160 | """Retrieves the gold and predicted probabilities (as matrices)""" 161 | 162 | out_label_indices = {"O":0} 163 | for label in all_labels: 164 | for prefix in "BI": 165 | out_label_indices["%s-%s" % (prefix, label)] = len(out_label_indices) 166 | 167 | gold_probs = np.zeros((len(doc), len(out_label_indices)), dtype=np.int16) 168 | for ent in doc.ents: 169 | gold_probs[ent.start, out_label_indices.get("B-%s" % ent.label_, 0)] = 1 170 | for i in range(ent.start+1, ent.end): 171 | gold_probs[i, out_label_indices.get("I-%s" % ent.label_, 0)] = 1 172 | 173 | pred_probs = np.zeros(gold_probs.shape) 174 | if target_source in doc.spans and "probs" in doc.spans[target_source].attrs: 175 | for tok_pos, labels in doc.spans[target_source].attrs["probs"].items(): 176 | for label, label_prob in labels.items(): 177 | pred_probs[tok_pos, out_label_indices[label]] = label_prob 178 | pred_probs[:,0] = np.clip(1-pred_probs[:,1:].sum(axis=1), 0.0, 1.0) 179 | else: 180 | vector = utils.spans_to_array(doc, all_labels, [target_source])[:,0] 181 | pred_probs[np.arange(vector.size), vector] = True 182 | 183 | return gold_probs, pred_probs 184 | 185 | 186 | def show_errors(docs, all_labels, target_source, conf_threshold=0.5): 187 | """Utilities to display the errors/omissions of a given source""" 188 | 189 | for i, doc in enumerate(docs): 190 | 191 | spans = utils.get_spans_with_probs(doc, 
target_source) 192 | 193 | print("Doc %i:"%i, doc) 194 | true_spans = {(ent.start, ent.end):ent.label_ for ent in doc.ents} 195 | pred_spans = {(span.start,span.end):span.label_ for span, prob in spans if prob >=conf_threshold} 196 | 197 | for start,end in true_spans: 198 | if (start,end) not in pred_spans: 199 | print("Not found: %s [%i:%i] -> %s"%(doc[start:end], start, end, true_spans[(start,end)])) 200 | elif true_spans[(start,end)]!=pred_spans[(start,end)]: 201 | print("Wrong label: %s [%i:%i] -> %s but predicted as %s"%(doc[start:end], start, end, 202 | true_spans[(start,end)], pred_spans[(start,end)])) 203 | 204 | for start,end in pred_spans: 205 | if (start,end) not in true_spans: 206 | print("Spurious: %s [%i:%i] -> %s"%(doc[start:end], start, end, pred_spans[(start,end)])) 207 | -------------------------------------------------------------------------------- /examples/sentiment/__init__.py: -------------------------------------------------------------------------------- 1 | from . import norec_sentiment, sentiment_lexicons, sentiment_models, transformer_model, weak_supervision_sentiment -------------------------------------------------------------------------------- /examples/sentiment/norec_sentiment.py: -------------------------------------------------------------------------------- 1 | from skweak.base import CombinedAnnotator 2 | from .sentiment_lexicons import LexiconAnnotator, NRC_SentAnnotator, VADAnnotator, SocalAnnotator, BUTAnnotator 3 | from .sentiment_models import DocBOWAnnotator, MultilingualAnnotator, MBertAnnotator 4 | import os 5 | from spacy.tokens import Doc #type: ignore 6 | from typing import Sequence, Tuple, Optional, Iterable 7 | from collections import defaultdict 8 | 9 | 10 | class FullSentimentAnnotator(CombinedAnnotator): 11 | """Annotation based on the heuristic""" 12 | 13 | def add_all(self): 14 | """Adds all implemented annotation functions, models and filters""" 15 | 16 | print("Loading lexicon functions") 17 | self.add_lexicons() 18 | print("Loading learned sentiment model functions") 19 | self.add_ml_models() 20 | 21 | return self 22 | 23 | def add_lexicons(self): 24 | """Adds Spacy NER models to the annotator""" 25 | 26 | self.add_annotator(LexiconAnnotator("norsent_forms", "../data/sentiment/lexicons/norsentlex/Fullform")) 27 | self.add_annotator(LexiconAnnotator("norsent_lemma", "../data/sentiment/lexicons/norsentlex/Lemma")) 28 | 29 | self.add_annotator(VADAnnotator("NRC_VAD", "../data/sentiment/lexicons/NRC_VAD_Lexicon/Norwegian-no-NRC-VAD-Lexicon.txt")) 30 | 31 | self.add_annotator(SocalAnnotator("Socal-adj", "../data/sentiment/lexicons/socal/no_adj.txt")) 32 | 33 | self.add_annotator(SocalAnnotator("Socal-adv", "../data/sentiment/lexicons/socal/no_adv.txt")) 34 | 35 | self.add_annotator(SocalAnnotator("Socal-google", "../data/sentiment/lexicons/socal/no_google.txt")) 36 | 37 | self.add_annotator(SocalAnnotator("Socal-int", "../data/sentiment/lexicons/socal/no_int.txt")) 38 | 39 | self.add_annotator(SocalAnnotator("Socal-noun", "../data/sentiment/lexicons/socal/no_noun.txt")) 40 | self.add_annotator(SocalAnnotator("Socal-verb", "../data/sentiment/lexicons/socal/no_verb.txt")) 41 | 42 | self.add_annotator(SocalAnnotator("IBM", "../data/sentiment/lexicons/IBM_Debater/no_unigram.txt")) 43 | 44 | self.add_annotator(NRC_SentAnnotator("NRC-Sent-Emo", "../data/sentiment/lexicons/NRC_Sentiment_Emotion/no_sent.txt")) 45 | 46 | self.add_annotator(BUTAnnotator("norsent_forms-BUT", "../data/sentiment/lexicons/norsentlex/Fullform")) 47 | 48 | 
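        # The two BUT annotators registered just above and below implement a simple discourse
        # heuristic: they only count positive/negative lexicon entries occurring after the
        # Norwegian contrastive marker "men" ("but"), since the clause following "but" tends to
        # carry the overall sentiment. Like the other labelling functions in this class, they
        # assign a single document-level label: 2 (positive), 1 (neutral) or 0 (negative).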
self.add_annotator(BUTAnnotator("norsent_lemma-BUT", "../data/sentiment/lexicons/norsentlex/Lemma")) 49 | 50 | return self 51 | 52 | def add_ml_models(self): 53 | self.add_annotator(DocBOWAnnotator("doc-level-norec", "../data/sentiment/models/bow")) 54 | self.add_annotator(MultilingualAnnotator("nlptown-bert-multilingual-sentiment")) 55 | self.add_annotator(MBertAnnotator("mbert-sst")) 56 | return self 57 | 58 | -------------------------------------------------------------------------------- /examples/sentiment/sentiment_lexicons.py: -------------------------------------------------------------------------------- 1 | from skweak.base import SpanAnnotator 2 | import os 3 | from spacy.tokens import Doc #type: ignore 4 | from typing import Sequence, Tuple, Optional, Iterable 5 | from collections import defaultdict 6 | 7 | #################################################################### 8 | # Labelling sources based on lexicons 9 | #################################################################### 10 | 11 | class LexiconAnnotator(SpanAnnotator): 12 | """Annotation based on a sentiment lexicon""" 13 | 14 | def __init__(self, name, lexicon_dir, margin=0): 15 | """Creates a new annotator based on a directory of positive and negative lexicon files. """ 16 | super(LexiconAnnotator, self).__init__(name) 17 | 18 | self.margin = margin 19 | 20 | pos_file = None 21 | for file in os.listdir(lexicon_dir): 22 | if "positive" in file.lower() and "txt" in file: 23 | pos_file = os.path.join(lexicon_dir, file) 24 | self.pos = set([l.strip() for l in open(pos_file)]) 25 | if pos_file is None: 26 | print("No positive lexicon file found in {}".format(lexicon_dir)) 27 | 28 | neg_file = None 29 | for file in os.listdir(lexicon_dir): 30 | if "negative" in file.lower() and "txt" in file: 31 | neg_file = os.path.join(lexicon_dir, file) 32 | self.neg = set([l.strip() for l in open(neg_file)]) 33 | if neg_file is None: 34 | print("No negative lexicon file found in {}".format(lexicon_dir)) 35 | 36 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 37 | pos = 0 38 | neg = 0 39 | 40 | # Iterate through tokens and add up positive and negative tokens 41 | for token in doc: 42 | if token.text in self.pos: 43 | pos += 1 44 | if token.text in self.neg: 45 | neg += 1 46 | 47 | # check if there are more pos or neg tokens, plus a margin 48 | # Regarding labels: positive: 2, neutral: 1, negative: 0 49 | if pos > (neg + self.margin): 50 | label = 2 51 | elif neg > (pos + self.margin): 52 | label = 0 53 | else: 54 | label = 1 55 | yield 0, len(doc), label #type: ignore 56 | 57 | 58 | class VADAnnotator(SpanAnnotator): 59 | """Annotation based on a sentiment lexicon""" 60 | 61 | def __init__(self, name, lexicon_path, margin=0.2): 62 | """Creates a new annotator based on the NRC VAD (valence) lexicon. 
""" 63 | super(VADAnnotator, self).__init__(name) 64 | 65 | self.margin = margin 66 | 67 | self.lexicon = defaultdict(lambda: 0.5) 68 | for i, line in enumerate(open(lexicon_path)): 69 | if i > 0: # skip the header 70 | en_term, no_term, v, a, d = line.strip().split("\t") 71 | self.lexicon[no_term] = float(v) 72 | 73 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 74 | scores = [0.5] 75 | 76 | # Iterate through tokens and collect their valence scores 77 | for token in doc: 78 | scores.append(self.lexicon[token.text]) 79 | 80 | mean_score = sum(scores) / len(scores) 81 | # check whether the mean valence is above or below the neutral value (0.5) by more than the margin 82 | # Regarding labels: positive: 2, neutral: 1, negative: 0 83 | if mean_score > (0.5 + self.margin): 84 | label = 2 85 | elif mean_score < (0.5 - self.margin): 86 | label = 0 87 | else: 88 | label = 1 89 | yield 0, len(doc), label #type: ignore 90 | 91 | 92 | 93 | class SocalAnnotator(SpanAnnotator): 94 | """Annotation based on a sentiment lexicon""" 95 | 96 | def __init__(self, name, lexicon_path, margin=0): 97 | """Creates a new annotator based on a SO-CAL-style scored lexicon file. """ 98 | super(SocalAnnotator, self).__init__(name) 99 | 100 | self.margin = margin 101 | 102 | self.lexicon = defaultdict(lambda: 0) 103 | for i, line in enumerate(open(lexicon_path)): 104 | if i > 0: # skip the header 105 | try: 106 | no_term, score = line.strip().split("\t") 107 | self.lexicon[no_term] = float(score) #type: ignore 108 | except ValueError: 109 | print(str(i) + ": " + line) 110 | 111 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 112 | scores = [0] 113 | 114 | # Iterate through tokens and collect their lexicon scores 115 | for token in doc: 116 | scores.append(self.lexicon[token.text]) 117 | 118 | mean_score = sum(scores) / len(scores) 119 | # check whether the mean score is above or below 0 by more than the margin 120 | # Regarding labels: positive: 2, neutral: 1, negative: 0 121 | if mean_score > (0 + self.margin): 122 | label = 2 123 | elif mean_score < (0 - self.margin): 124 | label = 0 125 | else: 126 | label = 1 127 | yield 0, len(doc), label #type: ignore 128 | 129 | 130 | class NRC_SentAnnotator(SpanAnnotator): 131 | """Annotation based on a sentiment lexicon""" 132 | 133 | def __init__(self, name, lexicon_path, margin=0): 134 | """Creates a new annotator based on the NRC sentiment lexicon. 
""" 135 | super(NRC_SentAnnotator, self).__init__(name) 136 | 137 | self.margin = margin 138 | self.pos = set() 139 | self.neg = set() 140 | 141 | for i, line in enumerate(open(lexicon_path)): 142 | try: 143 | no_term, sent, score = line.strip().split("\t") 144 | if int(score) == 1: 145 | if sent == "positive": 146 | self.pos.add(no_term) 147 | if sent == "negative": 148 | self.neg.add(no_term) 149 | except: 150 | pass 151 | 152 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 153 | pos = 0 154 | neg = 0 155 | 156 | # Iterate through tokens and add up positive and negative tokens 157 | for token in doc: 158 | if token.text in self.pos: 159 | pos += 1 160 | if token.text in self.neg: 161 | neg += 1 162 | 163 | # check if there are more pos or neg tokens, plus a margin 164 | # Regarding labels: positive: 2, neutral: 1, negative: 0 165 | if pos > (neg + self.margin): 166 | label = 2 167 | elif neg > (pos + self.margin): 168 | label = 0 169 | else: 170 | label = 1 171 | yield 0, len(doc), label #type: ignore 172 | 173 | 174 | class BUTAnnotator(SpanAnnotator): 175 | """Annotation based on the heuristic""" 176 | 177 | def __init__(self, name, lexicon_dir, margin=0): 178 | """Creates a new annotator based on a Spacy model. """ 179 | super(BUTAnnotator, self).__init__(name) 180 | 181 | self.margin = margin 182 | 183 | pos_file = None 184 | for file in os.listdir(lexicon_dir): 185 | if "positive" in file.lower() and "txt" in file: 186 | pos_file = os.path.join(lexicon_dir, file) 187 | self.pos = set([l.strip() for l in open(pos_file)]) 188 | if pos_file is None: 189 | print("No positive lexicon file found in {}".format(lexicon_dir)) 190 | 191 | neg_file = None 192 | for file in os.listdir(lexicon_dir): 193 | if "negative" in file.lower() and "txt" in file: 194 | neg_file = os.path.join(lexicon_dir, file) 195 | self.neg = set([l.strip() for l in open(neg_file)]) 196 | if neg_file is None: 197 | print("No negative lexicon file found in {}".format(lexicon_dir)) 198 | 199 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 200 | pos = 0 201 | neg = 0 202 | 203 | # Iterate through tokens and add up positive and negative tokens 204 | tokens = [t.text for t in doc] 205 | if "men" in tokens: 206 | idx = tokens.index("men") + 1 207 | for token in tokens[idx:]: 208 | if token in self.pos: 209 | pos += 1 210 | if token in self.neg: 211 | neg += 1 212 | 213 | # check if there are more pos or neg tokens, plus a margin 214 | # Regarding labels: positive: 2, neutral: 1, negative: 0 215 | if pos > (neg + self.margin): 216 | label = 2 217 | elif neg > (pos + self.margin): 218 | label = 0 219 | else: 220 | label = 1 221 | yield 0, len(doc), label #type: ignore 222 | -------------------------------------------------------------------------------- /examples/sentiment/sentiment_models.py: -------------------------------------------------------------------------------- 1 | from skweak.base import SpanAnnotator 2 | import os 3 | from spacy.tokens import Doc # type: ignore 4 | from typing import Sequence, Tuple, Optional, Iterable 5 | from collections import defaultdict 6 | 7 | from sklearn.svm import LinearSVC 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | from sklearn.metrics import f1_score 10 | 11 | from transformers import pipeline, BertForSequenceClassification, BertTokenizer 12 | 13 | import tarfile 14 | import pickle 15 | import os 16 | 17 | 18 | class MBertAnnotator(SpanAnnotator): 19 | """Annotation based on multi-lingual BERT trained on Stanford Sentiment 
Treebank""" 20 | def __init__(self, name): 21 | super(MBertAnnotator, self).__init__(name) 22 | self.classifier = BertForSequenceClassification.from_pretrained("../data/sentiment/models/sst", num_labels=3) 23 | self.classifier.eval() # type: ignore 24 | self.tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased") 25 | print("Loaded mBERT from {}".format("../data/sentiment/models/sst")) 26 | 27 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 28 | 29 | text = [" ".join([t.text for t in doc])] 30 | encoding = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True) 31 | output = self.classifier(**encoding) 32 | # classifier outputs a dict, eg {'label': '5 stars', 'score': 0.99} 33 | # so we need to get the label and transform it to an int 34 | _, p = output.logits.max(1) 35 | label = int(p[0]) 36 | yield 0, len(doc), label # type: ignore 37 | 38 | 39 | class MultilingualAnnotator(SpanAnnotator): 40 | """Annotation based on multi-lingual BERT trained on review data in 6 languages""" 41 | 42 | def __init__(self, name): 43 | """Creates a new annotator based on a Spacy model. """ 44 | super(MultilingualAnnotator, self).__init__(name) 45 | 46 | self.classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment") 47 | print("Loaded nlptown/bert-base-multilingual-uncased-sentiment") 48 | 49 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 50 | 51 | text = [" ".join([t.text for t in doc])] 52 | labels = self.classifier(text)[0] 53 | # classifier outputs a dict, eg {'label': '5 stars', 'score': 0.99} 54 | # so we need to get the label and transform it to an int 55 | pred = int(labels["label"][0]) 56 | 57 | # check if there are more pos or neg tokens, plus a margin 58 | # Regarding labels: positive: 2, neutral: 1, negative: 0 59 | if pred > 3: 60 | label = 2 61 | elif pred < 3: 62 | label = 0 63 | else: 64 | label = 1 65 | yield 0, len(doc), label # type: ignore 66 | 67 | 68 | class DocBOWAnnotator(SpanAnnotator): 69 | """Annotation based on a TF-IDF Bag-of-words document-level classifier""" 70 | 71 | def __init__(self, name, model_path, doclevel_data=None): 72 | """Creates a new annotator based on a Spacy model. """ 73 | super(DocBOWAnnotator, self).__init__(name) 74 | 75 | self.model_path = model_path 76 | self.doclevel_data = doclevel_data 77 | 78 | if self.doclevel_data is not None: 79 | print("Fitting model on {}".format(self.doclevel_data)) 80 | self.fit(doclevel_data) 81 | print("Saving vectorizer and model to {}".format(model_path)) 82 | self.save_model(self.model_path) 83 | else: 84 | try: 85 | self.load_model(self.model_path) 86 | print("Loaded model from {}".format(self.model_path)) 87 | except FileNotFoundError: 88 | print("Trained model not found. 
Train a model first by providing the doclevel_data when instantiating the annotator.") 89 | 90 | def save_model(self, model_path): 91 | os.makedirs(model_path, exist_ok=True) 92 | with open(os.path.join(model_path, "vectorizer.pkl"), "wb") as o: 93 | pickle.dump(self.vectorizer, o) 94 | 95 | with open(os.path.join(model_path, "bow_model.pkl"), "wb") as o: 96 | pickle.dump(self.model, o) 97 | 98 | def load_model(self, model_path): 99 | with open(os.path.join(model_path, "vectorizer.pkl"), "rb") as o: 100 | self.vectorizer = pickle.load(o) 101 | with open(os.path.join(model_path, "bow_model.pkl"), "rb") as o: 102 | self.model = pickle.load(o) 103 | 104 | def open_norec_doc(self, file_path, split="train"): 105 | tar = tarfile.open(file_path, "r:gz") 106 | 107 | train_names = [tarinfo for tarinfo in tar.getmembers() if split in tarinfo.name and ".conllu" in tarinfo.name] 108 | 109 | docs, ratings = [], [] 110 | 111 | for fname in train_names: 112 | content = tar.extractfile(fname) 113 | language = content.readline().decode("utf8").rstrip("\n")[-2:] 114 | rating = content.readline().decode("utf8").rstrip("\n")[-1] 115 | doc_id = content.readline().decode("utf8").rstrip("\n").split()[-1] 116 | 117 | words = [] 118 | for line in content: 119 | line = line.decode("utf8") 120 | if line[0] == '#': 121 | continue 122 | if not line.rstrip("\n"): 123 | continue 124 | else: 125 | words.append(line.split("\t")[1]) 126 | 127 | docs.append(" ".join(words)) 128 | ratings.append(int(rating)) 129 | return docs, ratings 130 | 131 | def fit(self, file_path): 132 | train_docs, train_ratings = self.open_norec_doc(file_path, split="train") 133 | test_docs, test_ratings = self.open_norec_doc(file_path, split="test") 134 | 135 | self.vectorizer = TfidfVectorizer() 136 | trainX = self.vectorizer.fit_transform(train_docs) 137 | self.model = LinearSVC() 138 | self.model.fit(trainX, train_ratings) 139 | 140 | testX = self.vectorizer.transform(test_docs) 141 | 142 | pred = self.model.predict(testX) 143 | print("Doc-level F1: {0:.3f}".format(f1_score(test_ratings, pred, average="macro"))) 144 | 145 | 146 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 147 | 148 | text = [" ".join([t.text for t in doc])] 149 | X = self.vectorizer.transform(text) 150 | pred = self.model.predict(X)[0] 151 | 152 | # check if there are more pos or neg tokens, plus a margin 153 | # Regarding labels: positive: 2, neutral: 1, negative: 0 154 | if pred > 4: 155 | label = 2 156 | elif pred < 3: 157 | label = 0 158 | else: 159 | label = 1 160 | yield 0, len(doc), label 161 | 162 | -------------------------------------------------------------------------------- /examples/sentiment/transformer_model.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, BertForSequenceClassification 2 | from transformers import AdamW 3 | from transformers import get_linear_schedule_with_warmup 4 | 5 | from torch.nn import functional as F 6 | import torch 7 | import numpy as np 8 | 9 | import argparse 10 | from tqdm import tqdm 11 | 12 | from sklearn.metrics import f1_score 13 | 14 | import sys 15 | sys.path.insert(0, "..") 16 | from skweak.utils import docbin_reader 17 | 18 | class SSTDataLoader(): 19 | def __init__(self, datafile, num_examples=None): 20 | labels, examples = [], [] 21 | for line in open(datafile): 22 | label, sent = line.strip().split("\t", 1) 23 | labels.append(int(label)) 24 | examples.append(sent) 25 | if num_examples is not None: 26 | labels = 
labels[:num_examples] 27 | examples = examples[:num_examples] 28 | self.labels = np.array(labels) 29 | self.examples = np.array(examples) 30 | 31 | def get_batches(self, batch_size=32, shuffle=True): 32 | if shuffle: 33 | idxs = np.arange(len(self.labels)) 34 | np.random.shuffle(idxs) 35 | labels = list(self.labels[idxs]) 36 | examples = list(self.examples[idxs]) 37 | else: 38 | labels = list(self.labels) 39 | examples = list(self.examples) 40 | num_batches = self.get_num_batches(batch_size) 41 | i = 0 42 | for batch in range(num_batches): 43 | blabels = torch.tensor(labels[i:i+batch_size]) 44 | bexamples = examples[i:i+batch_size] 45 | i += batch_size 46 | yield (blabels, bexamples) 47 | 48 | def get_num_batches(self, batch_size=32): 49 | num_batches = len(self.labels) // batch_size 50 | if (len(self.labels) % batch_size) > 0: 51 | num_batches += 1 52 | return num_batches 53 | 54 | class DocbinDataLoader(): 55 | def __init__(self, datafile, num_examples=None, gold=False): 56 | labels, examples = [], [] 57 | for doc in docbin_reader(datafile): 58 | examples.append(doc.text) 59 | if gold: 60 | labels.append(doc.user_data["gold"]) 61 | else: 62 | labels.append(list(doc.user_data["agg_spans"]["hmm"].values())[0]) 63 | if num_examples is not None: 64 | labels = labels[:num_examples] 65 | examples = examples[:num_examples] 66 | self.labels = np.array(labels) 67 | self.examples = np.array(examples) 68 | 69 | def get_batches(self, batch_size=32, shuffle=True): 70 | if shuffle: 71 | idxs = np.arange(len(self.labels)) 72 | np.random.shuffle(idxs) 73 | labels = list(self.labels[idxs]) 74 | examples = list(self.examples[idxs]) 75 | else: 76 | labels = list(self.labels) 77 | examples = list(self.examples) 78 | num_batches = self.get_num_batches(batch_size) 79 | i = 0 80 | for batch in range(num_batches): 81 | blabels = torch.tensor(labels[i:i+batch_size]) 82 | bexamples = examples[i:i+batch_size] 83 | i += batch_size 84 | yield (blabels, bexamples) 85 | 86 | def get_num_batches(self, batch_size=32): 87 | num_batches = len(self.labels) // batch_size 88 | if (len(self.labels) % batch_size) > 0: 89 | num_batches += 1 90 | return num_batches 91 | 92 | def train(model, save_dir="../data/sentiment/models/norbert"): 93 | model.train() 94 | 95 | 96 | optimizer = AdamW(model.parameters(), lr=1e-5) 97 | 98 | num_train_steps = int(len(train_loader.examples) / args.train_batch_size) * args.num_train_epochs 99 | 100 | scheduler = get_linear_schedule_with_warmup(optimizer, args.warmup_steps, num_train_steps) 101 | 102 | best_dev_f1 = 0.0 103 | 104 | print("training for {} epochs...".format(args.num_train_epochs)) 105 | 106 | for epoch_num, epoch in enumerate(range(args.num_train_epochs)): 107 | model.train() 108 | train_loss = 0 109 | num_batches = 0 110 | train_preds = [] 111 | train_gold = [] 112 | for b in tqdm(train_loader.get_batches(batch_size=args.train_batch_size), total=train_loader.get_num_batches(batch_size=args.train_batch_size)): 113 | labels, sents = b 114 | encoding = tokenizer(sents, return_tensors='pt', padding=True, truncation=True, max_length=150) 115 | 116 | outputs = model(**encoding) 117 | _, p = outputs.logits.max(1) 118 | train_preds.extend(p.tolist()) 119 | train_gold.extend(labels.tolist()) 120 | loss = F.cross_entropy(outputs.logits, labels) 121 | train_loss += loss.data 122 | num_batches += 1 123 | loss.backward() 124 | optimizer.step() #type: ignore 125 | scheduler.step() #type: ignore 126 | optimizer.zero_grad() #type: ignore 127 | print("Epoch {0}: Loss {1:.3f}".format(epoch_num + 1, 
train_loss / num_batches)) 128 | print("Train F1: {0:.3f}".format(f1_score(train_gold, train_preds, average="macro"))) 129 | 130 | 131 | model.eval() 132 | dev_loss = 0 133 | num_batches = 0 134 | dev_preds = [] 135 | dev_gold = [] 136 | for b in tqdm(dev_loader.get_batches(batch_size=args.eval_batch_size), total=dev_loader.get_num_batches(batch_size=args.eval_batch_size)): 137 | labels, sents = b 138 | encoding = tokenizer(sents, return_tensors='pt', padding=True, truncation=True, max_length=150) 139 | 140 | outputs = model(**encoding) 141 | _, p = outputs.logits.max(1) 142 | dev_preds.extend(p.tolist()) 143 | dev_gold.extend(labels.tolist()) 144 | loss = F.cross_entropy(outputs.logits, labels) 145 | dev_loss += loss.data 146 | num_batches += 1 147 | dev_f1 = f1_score(dev_gold, dev_preds, average="macro") 148 | print("Dev F1: {0:.3f}".format(dev_f1)) 149 | 150 | if dev_f1 > best_dev_f1: #type: ignore 151 | best_dev_f1 = dev_f1 152 | print("Current best dev: {0:.3f}".format(best_dev_f1)) 153 | print("Saving model") 154 | model.save_pretrained(save_dir) 155 | 156 | 157 | def test(model): 158 | print("loading best model on dev data") 159 | model.eval() 160 | test_loss = 0 161 | num_batches = 0 162 | test_preds = [] 163 | test_gold = [] 164 | for b in tqdm(test_loader.get_batches(batch_size=args.eval_batch_size), total=test_loader.get_num_batches(batch_size=args.eval_batch_size)): 165 | labels, sents = b 166 | encoding = tokenizer(sents, return_tensors='pt', padding=True, truncation=True, max_length=150) 167 | 168 | outputs = model(**encoding) 169 | _, p = outputs.logits.max(1) 170 | test_preds.extend(p.tolist()) 171 | test_gold.extend(labels.tolist()) 172 | loss = F.cross_entropy(outputs.logits, labels) 173 | test_loss += loss.data 174 | num_batches += 1 175 | test_f1 = f1_score(test_gold, test_preds, average="macro") 176 | print("Test F1: {0:.3f}".format(test_f1)) 177 | 178 | if __name__ == "__main__": 179 | parser = argparse.ArgumentParser() 180 | parser.add_argument("--train_batch_size", default=16, type=int) 181 | parser.add_argument("--eval_batch_size", default=16, type=int) 182 | parser.add_argument("--num_train_epochs", default=20, type=int) 183 | parser.add_argument("--warmup_steps", default=50, type=int) 184 | parser.add_argument("--model", 185 | default="../data/sentiment/models/norbert") 186 | parser.add_argument("--save_dir", 187 | default="../data/sentiment/models/nobert") 188 | parser.add_argument("--train", action="store_true") 189 | parser.add_argument("--test", action="store_true") 190 | parser.add_argument("--train_on_gold", action="store_true") 191 | 192 | 193 | args = parser.parse_args() 194 | 195 | print("loading data...") 196 | # train_loader = SSTDataLoader("../data/sentiment/sst/train.txt") 197 | # dev_loader = SSTDataLoader("../data/sentiment/sst/dev.txt") 198 | # test_loader = SSTDataLoader("../data/sentiment/sst/test.txt") 199 | train_loader = DocbinDataLoader("../data/sentiment/norec_sentence/train_pred.docbin", num_examples=500, gold=args.train_on_gold) 200 | dev_loader = DocbinDataLoader("../data/sentiment/norec_sentence/dev_pred.docbin", num_examples=500, gold=args.train_on_gold) 201 | test_loader = DocbinDataLoader("../data/sentiment/norec_sentence/test_pred.docbin", gold=True) 202 | 203 | print("loading model...") 204 | tokenizer = BertTokenizer.from_pretrained("ltgoslo/norbert") 205 | model = BertForSequenceClassification.from_pretrained(args.model, num_labels=3) 206 | 207 | if args.train: 208 | train(model, args.save_dir) 209 | 210 | # Test model 211 | if 
args.test: 212 | test(model) 213 | 214 | -------------------------------------------------------------------------------- /examples/sentiment/weak_supervision_sentiment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import spacy 3 | from spacy.tokens import DocBin 4 | import pandas as pd 5 | 6 | from .norec_sentiment import FullSentimentAnnotator 7 | from skweak import utils 8 | from sklearn.metrics import f1_score 9 | from .sentiment_models import MBertAnnotator 10 | 11 | from sklearn.svm import LinearSVC 12 | from sklearn.feature_extraction.text import TfidfVectorizer 13 | 14 | import skweak 15 | 16 | 17 | ################################################################## 18 | # Preprocessing 19 | ################################################################## 20 | 21 | nlp = spacy.load("nb_core_news_md") 22 | 23 | train_doc_bin = DocBin(store_user_data=True) 24 | dev_doc_bin = DocBin(store_user_data=True) 25 | test_doc_bin = DocBin(store_user_data=True) 26 | 27 | train = pd.read_csv("./data/sentiment/norec_sentence/train.txt", delimiter="\t", header=None) #type: ignore 28 | dev = pd.read_csv("./data/sentiment/norec_sentence/dev.txt", delimiter="\t", header=None) #type: ignore 29 | test = pd.read_csv("./data/sentiment/norec_sentence/test.txt", delimiter="\t", header=None) #type: ignore 30 | 31 | for sid, (label, sent) in train.iterrows(): 32 | doc = nlp(sent) 33 | doc.user_data["gold"] = label 34 | train_doc_bin.add(doc) 35 | train_doc_bin.to_disk("./data/sentiment/norec_sentence/train.docbin") 36 | 37 | for sid, (label, sent) in dev.iterrows(): 38 | doc = nlp(sent) 39 | doc.user_data["gold"] = label 40 | dev_doc_bin.add(doc) 41 | dev_doc_bin.to_disk("./data/sentiment/norec_sentence/dev.docbin") 42 | 43 | for sid, (label, sent) in test.iterrows(): 44 | doc = nlp(sent) 45 | doc.user_data["gold"] = label 46 | test_doc_bin.add(doc) 47 | test_doc_bin.to_disk("./data/sentiment/norec_sentence/test.docbin") 48 | 49 | 50 | ################################################################## 51 | # Weak supervision 52 | ################################################################## 53 | 54 | ann = FullSentimentAnnotator() 55 | ann.add_all() 56 | 57 | ann.annotate_docbin("./data/sentiment/norec_sentence/train.docbin", "./data/sentiment/norec_sentence/train_pred.docbin") 58 | 59 | ann.annotate_docbin("./data/sentiment/norec_sentence/dev.docbin", "./data/sentiment/norec_sentence/dev_pred.docbin") 60 | 61 | ann.annotate_docbin("./data/sentiment/norec_sentence/test_pred.docbin", "./data/sentiment/norec_sentence/test_pred.docbin") 62 | 63 | unified_model = skweak.aggregation.HMM("hmm", [0, 1, 2], sequence_labelling=False) #type: ignore 64 | unified_model.fit("./data/sentiment/norec_sentence/train_pred.docbin") 65 | unified_model.annotate_docbin("./data/sentiment/norec_sentence/train_pred.docbin", "./data/sentiment/norec_sentence/train_pred.docbin") 66 | 67 | #unified_model = skweak.aggregation.HMM("hmm", [0, 1, 2], sequence_labelling=False) 68 | #unified_model.fit("./data/sentiment/norec_sentence/dev_pred.docbin") 69 | unified_model.annotate_docbin("./data/sentiment/norec_sentence/dev_pred.docbin", "./data/sentiment/norec_sentence/dev_pred.docbin") 70 | 71 | #unified_model = skweak.aggregation.HMM("hmm", [0, 1, 2], sequence_labelling=False) 72 | #unified_model.fit("./data/sentiment/norec_sentence/test_pred.docbin") 73 | unified_model.annotate_docbin("./data/sentiment/norec_sentence/test_pred.docbin", 
"./data/sentiment/norec_sentence/test_pred.docbin") 74 | 75 | mv = skweak.aggregation.MajorityVoter("mv", [0, 1, 2], sequence_labelling=False) #type: ignore 76 | mv.annotate_docbin("./data/sentiment/norec_sentence/test_pred.docbin", "./data/sentiment/norec_sentence/test_pred.docbin") 77 | 78 | pred_docs = list(utils.docbin_reader("./data/sentiment/norec_sentence/test_pred.docbin")) 79 | 80 | 81 | ################################################################## 82 | # Evaluation of upper bound 83 | ################################################################## 84 | 85 | 86 | train_docs = list(utils.docbin_reader("./data/sentiment/norec_sentence/train.docbin")) 87 | 88 | pred_docs = list(utils.docbin_reader("./data/sentiment/norec_sentence/test_pred.docbin")) 89 | 90 | vectorizer = TfidfVectorizer(ngram_range=(1, 3)) 91 | model = LinearSVC() 92 | 93 | train = [" ".join([t.text for t in doc]) for doc in train_docs] 94 | trainX = vectorizer.fit_transform(train) 95 | train_y = [doc.user_data["gold"] for doc in train_docs] 96 | model.fit(trainX, train_y) 97 | 98 | test = [" ".join([t.text for t in doc]) for doc in pred_docs] 99 | testX = vectorizer.transform(test) 100 | pred = model.predict(testX) 101 | 102 | gold = [d.user_data["gold"] for d in pred_docs] 103 | 104 | f1 = f1_score(gold, pred, average="macro") 105 | print("Upper Bound F1: {0:.3f}".format(f1)) 106 | 107 | ################################################################## 108 | # Evaluation of majority baseline 109 | ################################################################## 110 | 111 | maj_class = [1] * len(gold) 112 | maj_f1 = f1_score(gold, maj_class, average="macro") 113 | print("Majority class: {0:.3f}".format(maj_f1)) 114 | 115 | print("-" * 25) 116 | 117 | ################################################################## 118 | # Evaluation of labelling functions 119 | ################################################################## 120 | 121 | 122 | for lexicon in pred_docs[0].user_data["spans"].keys(): 123 | pred = [] 124 | for d in pred_docs: 125 | for span in d.spans[lexicon]: 126 | pred.append(span.label_) 127 | 128 | lex_f1 = f1_score(gold, pred, average="macro") 129 | print("{0}:\t{1:.3f}".format(lexicon, lex_f1)) 130 | 131 | ################################################################## 132 | # Evaluation of aggregating functions 133 | ################################################################## 134 | 135 | 136 | 137 | for aggregator in ["mv", "hmm"]: 138 | pred = [] 139 | for d in pred_docs: 140 | for span in d.spans[aggregator]: 141 | pred.append(span.label_) 142 | hmm_f1 = f1_score(gold, pred, average="macro") 143 | print("{0}:\t{1:.3f}".format(aggregator, hmm_f1)) 144 | 145 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "skweak" 3 | version = "0.3.3" 4 | description = "Software toolkit for weak supervision in NLP" 5 | authors = ["Perre Lison "] 6 | maintainers = ["Perre Lison "] 7 | keywords = ["weak supervision", "sklearn", "scikit-learn", "nlp", "text processing", "language processing", 8 | "text mining", "text classification", "token classification", "ner", "named entity recognition", "hmm", "spacy"] 9 | 
repository = "https://github.com/NorskRegnesentral/skweak" 10 | license = "MIT" 11 | readme = "README.md" 12 | classifiers = [ 13 | "Programming Language :: Python :: 3", 14 | "Programming Language :: Python :: 3.7", 15 | "Programming Language :: Python :: 3.8", 16 | "Programming Language :: Python :: 3.9", 17 | "Programming Language :: Python :: 3.10", 18 | "Programming Language :: Python :: 3.11", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent", 21 | "Intended Audience :: Developers", 22 | "Intended Audience :: Science/Research", 23 | "Topic :: Text Processing", 24 | "Topic :: Text Processing :: Linguistic", 25 | "Topic :: Scientific/Engineering", 26 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 27 | "Topic :: Scientific/Engineering :: Human Machine Interfaces", 28 | "Topic :: Scientific/Engineering :: Information Analysis", 29 | ] 30 | packages = [{ include = "skweak" }] 31 | 32 | [tool.poetry.dependencies] 33 | python = ">=3.7.4,<3.12" 34 | spacy = "^3.0" 35 | hmmlearn = "~0.3.0" 36 | scipy = "^1.5.4" 37 | pandas = ">=0.23,<3.0" 38 | 39 | [tool.poetry.group.dev] 40 | optional = true 41 | 42 | [tool.poetry.group.dev.dependencies] 43 | pytest = "^7.4.2" 44 | spacy = "~3.6.1" 45 | 46 | # Fixed spaCy model dependencies 47 | en_core_web_sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl" } 48 | en_core_web_md = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl" } 49 | 50 | # Workaround for using up-to-date binary wheels for all Python versions 51 | numpy = [ 52 | { version = "~1.21.1", python = ">=3.7,<3.9" }, 53 | { version = "~1.26", python = ">=3.9,<3.12" } 54 | ] 55 | scipy = [ 56 | { version = "~1.7.3", python = ">=3.7,<3.9" }, 57 | { version = "~1.11.2", python = ">=3.9,<3.12" } 58 | ] 59 | scikit-learn = [ 60 | { version = "~1.0.2", python = ">=3.7,<3.8" }, 61 | { version = "~1.3.1", python = ">=3.8,<3.12" } 62 | ] 63 | pandas = [ 64 | { version = "~1.3.5", python = ">=3.7,<3.9" }, 65 | { version = "~2.1.1", python = ">=3.9,<3.12" } 66 | ] 67 | 68 | # TODO: Shall we use black? 69 | 70 | [tool.pytest.ini_options] 71 | testpaths = ["tests"] 72 | addopts = "-s -v --durations=0" 73 | cache_dir = ".cache/pytest" 74 | 75 | [build-system] 76 | requires = ["poetry-core"] 77 | build-backend = "poetry.core.masonry.api" 78 | -------------------------------------------------------------------------------- /skweak/__init__.py: -------------------------------------------------------------------------------- 1 | from . import base, doclevel, gazetteers, heuristics, aggregation, utils, spacy, voting, generative 2 | __version__ = "0.3.3" 3 | -------------------------------------------------------------------------------- /skweak/base.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from abc import abstractmethod 3 | from typing import Iterable, Optional, Sequence, Tuple 4 | 5 | from spacy.tokens import Doc, Span # type: ignore 6 | 7 | from . 
import utils 8 | 9 | ############################################ 10 | # Abstract class for all annotators 11 | ############################################ 12 | 13 | class AbstractAnnotator: 14 | """Base class for all annotation or aggregation sources 15 | employed in skweak""" 16 | 17 | def __init__(self, name: str): 18 | """Initialises the annotator with a name""" 19 | self.name = name 20 | 21 | @abstractmethod 22 | def __call__(self, doc: Doc) -> Doc: 23 | """Annotates a single Spacy Doc object""" 24 | 25 | raise NotImplementedError() 26 | 27 | def pipe(self, docs: Iterable[Doc]) -> Iterable[Doc]: 28 | """Annotates a stream of Spacy Doc objects""" 29 | 30 | # This is the default implementation, which should be replaced if 31 | # we have better ways of annotating large numbers of documents 32 | for doc in docs: 33 | yield self(doc) 34 | 35 | def annotate_docbin(self, docbin_input_path: str, 36 | docbin_output_path: Optional[str] = None, 37 | spacy_model_name: str = "en_core_web_md", 38 | cutoff: Optional[int] = None, nb_to_skip: int = 0): 39 | """Runs the annotator on the documents of a DocBin file, and write the output 40 | to docbin_output_path (or to the same file if it is set to None). The spacy 41 | model name must be the same as the one used to create the DocBin file in the 42 | first place. 43 | 44 | If cutoff is set, the annotation stops after the given number of documents. If 45 | nb_to_skip is set, the method skips a number of documents at the start. 46 | """ 47 | 48 | docs = utils.docbin_reader(docbin_input_path, spacy_model_name, 49 | cutoff=cutoff, nb_to_skip=nb_to_skip) 50 | new_docs = [] 51 | for doc in self.pipe(docs): 52 | new_docs.append(doc) 53 | if len(new_docs) % 1000 == 0: 54 | print("Number of processed documents:", len(new_docs)) 55 | 56 | docbin_output_path = docbin_output_path or docbin_input_path 57 | utils.docbin_writer(new_docs, docbin_output_path) 58 | 59 | 60 | #################################################################### 61 | # Type of annotators 62 | #################################################################### 63 | 64 | class SpanAnnotator(AbstractAnnotator): 65 | """Generic class for the annotation of token spans""" 66 | 67 | def __init__(self, name: str): 68 | """Initialises the annotator with a source name""" 69 | 70 | super(SpanAnnotator, self).__init__(name) 71 | 72 | # Set of other labelling sources that have priority 73 | self.incompatible_sources = [] 74 | 75 | # type:ignore 76 | def add_incompatible_sources(self, other_sources: Sequence[str]): 77 | """Specifies a list of sources that are not compatible with the current 78 | source and should take precedence over it in case of overlap""" 79 | 80 | self.incompatible_sources.extend(other_sources) 81 | 82 | def __call__(self, doc: Doc) -> Doc: 83 | 84 | # We start by clearing all existing annotations 85 | doc.spans[self.name] = [] 86 | 87 | # And we look at all suggested spans 88 | for start, end, label in self.find_spans(doc): 89 | 90 | # We only add the span if it is compatible with other sources 91 | if self._is_allowed_span(doc, start, end): 92 | span = Span(doc, start, end, label) 93 | doc.spans[self.name].append(span) 94 | 95 | return doc 96 | 97 | @abstractmethod 98 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 99 | """Generates (start, end, label) triplets corresponding to token-level 100 | spans associated with a given label. 
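# Illustrative sketch of a concrete SpanAnnotator: subclasses only need to
# implement find_spans and yield (start, end, label) triples over token
# indices. The class name, keyword parameter and "MISC" label are invented for
# the example; everything else relies on the SpanAnnotator API defined above.
class _KeywordAnnotator(SpanAnnotator):
    """Labels every occurrence of a given keyword with a fixed label."""

    def __init__(self, name: str, keyword: str, label: str = "MISC"):
        super(_KeywordAnnotator, self).__init__(name)
        self.keyword = keyword
        self.label = label

    def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]:
        for tok in doc:
            if tok.text == self.keyword:
                yield tok.i, tok.i + 1, self.label

# Calling _KeywordAnnotator("acme", "ACME", "ORG") on a Doc fills
# doc.spans["acme"] with one ORG span per occurrence of "ACME".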
""" 101 | 102 | raise NotImplementedError("Must implement find_spans method") 103 | 104 | def _is_allowed_span(self, doc, start, end): 105 | """Checks whether the span is allowed (given incompatibilities with other sources)""" 106 | 107 | for other_source in self.incompatible_sources: 108 | 109 | intervals = sorted((span.start, span.end) for span in 110 | doc.spans.get(other_source, [])) 111 | 112 | # Performs a binary search to efficiently detect overlapping spans 113 | start_search, end_search = utils._binary_search( 114 | start, end, intervals) 115 | for interval_start, interval_end in intervals[start_search:end_search]: 116 | if start < interval_end and end > interval_start: 117 | return False 118 | return True 119 | 120 | 121 | class TextAnnotator(AbstractAnnotator): 122 | """Abstract class for labelling functions used for text classification 123 | (the goal being to predict the label of a full document)""" 124 | 125 | def __call__(self, doc: Doc) -> Doc: 126 | 127 | # We start by clearing all existing annotations 128 | 129 | doc.spans[self.name] = [] 130 | 131 | result = self.get_label(doc) 132 | 133 | # We only add the annotation is the function returns a label 134 | if result is not None: 135 | span = Span(doc, 0, len(doc), result) 136 | doc.spans[self.name].append(span) 137 | 138 | return doc 139 | 140 | @abstractmethod 141 | def get_label(self, doc: Doc) -> Optional[str]: 142 | """Returns the label of the document as predicted by the function, 143 | or None if the labelling function "abstains" from giving a prediction""" 144 | raise NotImplementedError("Must implement get_label method") 145 | 146 | 147 | #################################################################### 148 | # Combination of annotators 149 | #################################################################### 150 | 151 | 152 | class CombinedAnnotator(AbstractAnnotator): 153 | """Annotator of entities in documents, combining several sub-annotators """ 154 | 155 | def __init__(self): 156 | super(CombinedAnnotator, self).__init__("") 157 | self.annotators = [] 158 | 159 | def __call__(self, doc: Doc) -> Doc: 160 | """Annotates a single document with the sub-annotators 161 | NB: avoid using this method for large collections of documents (as it is quite 162 | inefficient), and prefer the method pipe that runs on batches of documents. 
163 | """ 164 | 165 | for annotator in self.annotators: 166 | doc = annotator(doc) 167 | return doc 168 | 169 | def pipe(self, docs: Iterable[Doc]) -> Iterable[Doc]: 170 | """Annotates the stream of documents using the sub-annotators.""" 171 | 172 | # We duplicate the streams of documents 173 | streams = itertools.tee(docs, len(self.annotators)+1) 174 | 175 | # We create one pipe per annotator 176 | pipes = [annotator.pipe(stream) for annotator, stream in 177 | zip(self.annotators, streams[1:])] 178 | 179 | for doc in streams[0]: 180 | for pipe in pipes: 181 | try: 182 | next(pipe) 183 | except BaseException as e: 184 | print("ignoring document:", doc) 185 | raise e 186 | 187 | yield doc 188 | 189 | def add_annotator(self, annotator: AbstractAnnotator): 190 | """Adds an annotator to the list""" 191 | 192 | self.annotators.append(annotator) 193 | return self 194 | 195 | def add_annotators(self, *annotators: AbstractAnnotator): 196 | """Adds several annotators to the list""" 197 | 198 | for annotator in annotators: 199 | self.add_annotator(annotator) 200 | return self 201 | 202 | def get_annotator(self, annotator_name: str): 203 | """Returns the annotator identified by its name (and throws an 204 | exception if no annotator can be found)""" 205 | 206 | for annotator in self.annotators: 207 | if annotator.name == annotator_name: 208 | return annotator 209 | 210 | raise RuntimeError("Could not find annotator %s" % annotator_name) 211 | -------------------------------------------------------------------------------- /skweak/doclevel.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Dict, Iterable, List, Tuple 3 | 4 | from . import base, utils 5 | from .gazetteers import GazetteerAnnotator, Trie 6 | from spacy.tokens import Doc, Span # type: ignore 7 | 8 | class DocumentHistoryAnnotator(base.SpanAnnotator): 9 | """Annotation based on the document history: 10 | 1) if a person name has been mentioned in full (at least two consecutive tokens, 11 | most often first name followed by last name), then mark future occurrences of the 12 | last token (last name) as a PER as well. 13 | 2) if an organisation has been mentioned together with a legal type, mark all other 14 | occurrences (possibly without the legal type at the end) also as a COMPANY. 15 | """ 16 | 17 | def __init__(self, basename: str, other_name: str, labels: List[str], 18 | case_sensitive=True): 19 | """Creates a new annotator looking at the global document context, based on another 20 | annotation layer (typically a layer aggregating existing annotations). 
Only the 21 | labels specified in the argument will be taken into account.""" 22 | 23 | super(DocumentHistoryAnnotator, self).__init__(basename) 24 | self.other_name = other_name 25 | self.labels = labels 26 | self.case_sensitive = case_sensitive 27 | 28 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 29 | """Search for spans on one single document""" 30 | 31 | # Extract the first mentions of each entity 32 | first_observed = self.get_first_mentions(doc) 33 | 34 | # We construct tries based on the first mentions 35 | tries = {label: Trie() for label in self.labels} 36 | first_observed_bounds = set() 37 | for tokens, span in first_observed.items(): 38 | tries[span.label_].add(tokens) 39 | first_observed_bounds.add((span.start, span.end)) 40 | 41 | gazetteer = GazetteerAnnotator(self.name, tries, case_sensitive=self.case_sensitive, 42 | additional_checks=not self.case_sensitive) 43 | 44 | for start, end, label in gazetteer.find_spans(doc): 45 | if (start, end) not in first_observed_bounds: 46 | yield start, end, label 47 | 48 | return doc 49 | 50 | def get_first_mentions(self, doc) -> Dict[List[str], Span]: 51 | """Returns a set containing the first mentions of each entity as triples 52 | (start, end, label) according to the "other_name' layer. 53 | 54 | The first mentions also contains subsequences: for instance, a named entity 55 | "Pierre Lison" will also contain the first mentions of ['Pierre'] and ['Lison']. 56 | """ 57 | if self.other_name not in doc.spans: 58 | return {} 59 | 60 | first_observed = {} 61 | for span in doc.spans[self.other_name]: 62 | 63 | # NB: We only consider entities with at least two tokens 64 | if span.label_ not in self.labels or len(span) < 2: 65 | continue 66 | 67 | # We also extract subsequences 68 | for length in range(1, len(span)+1): 69 | for i in range(length, len(span)+1): 70 | 71 | start2 = span.start + i-length 72 | end2 = span.start + i 73 | subseq = tuple(tok.text for tok in doc[start2:end2]) 74 | 75 | # We ony consider first mentions 76 | if subseq in first_observed: 77 | continue 78 | 79 | # To avoid too many FPs, the mention must have at least 4 charactes 80 | if sum(len(tok) for tok in subseq) <4: 81 | continue 82 | 83 | # And if the span looks like a proper name, then at least one 84 | # token in the subsequence must look like a proper name too 85 | if (any(utils.is_likely_proper(tok) for tok in span) and not 86 | any(utils.is_likely_proper(tok) for tok in doc[start2:end2])): 87 | continue 88 | 89 | first_observed[subseq] = Span(doc, start2, end2, span.label_) 90 | 91 | return first_observed 92 | 93 | 94 | class DocumentMajorityAnnotator(base.SpanAnnotator): 95 | """Annotation based on majority label for the same entity string elsewhere in the 96 | document. The annotation creates two layers for each label, one for case-sensitive 97 | occurrences of the entity string in the document, and one for case-insensitive 98 | occurrences. 99 | """ 100 | 101 | def __init__(self, basename: str, other_name: str, case_sensitive=True): 102 | """Creates a new annotator that looks at (often aggregated) annotations from 103 | another layer, and annotates entities based on their majority label elsewhere 104 | in the document. 
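# Illustrative sketch: document-level annotators read the spans of another
# layer. The layer name "aggregated", the example text and the helper name are
# invented, and a trained pipeline such as en_core_web_md is assumed to be
# installed, since the heuristics in this module rely on linguistic features.
def _document_history_sketch():
    import spacy
    nlp = spacy.load("en_core_web_md")
    doc = nlp("Pierre Lison works at Norsk Regnesentral. Lison wrote the report.")
    doc.spans["aggregated"] = [Span(doc, 0, 2, "PERSON")]
    annotator = DocumentHistoryAnnotator("doc_history", "aggregated", ["PERSON"])
    doc = annotator(doc)
    # The later, stand-alone mention of "Lison" is now labelled PERSON as well.
    return [(s.start, s.end, s.label_) for s in doc.spans["doc_history"]]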
""" 105 | 106 | super(DocumentMajorityAnnotator, self).__init__(basename) 107 | self.other_name = other_name 108 | self.case_sensitive = case_sensitive 109 | 110 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 111 | """Generates span annotations for one single document based on 112 | majority labels""" 113 | 114 | # We search for the majority label for each entity string 115 | majority_labels = self.get_majority_labels(doc) 116 | 117 | # we build trie to easily search for these entities in the text 118 | tries = {label: Trie() 119 | for label in set(majority_labels.values())} 120 | for ent_tokens, label in majority_labels.items(): 121 | tries[label].add(list(ent_tokens)) 122 | 123 | gazetteer = GazetteerAnnotator(self.name, tries, self.case_sensitive, 124 | additional_checks=not self.case_sensitive) 125 | for start, end, label in gazetteer.find_spans(doc): 126 | yield start, end, label 127 | 128 | def get_majority_labels(self, doc: Doc) -> Dict[Tuple[str], str]: 129 | """Given a document, searches for the majority label for each entity string 130 | with at least self.min_counts number of occurrences. """ 131 | 132 | # Get the counts for each label per entity string 133 | # (and also for each form, to take various casings into account) 134 | label_counts = defaultdict(dict) 135 | form_counts = defaultdict(dict) 136 | spans = utils.get_spans_with_probs(doc, self.other_name) 137 | all_tokens_low = [tok.lower_ for tok in doc] 138 | checked = {} 139 | for span, prob in spans: 140 | 141 | # We only apply document majority for strings occurring more than once 142 | tokens_low = tuple(all_tokens_low[span.start:span.end]) 143 | if tokens_low not in checked: 144 | occurs_several_times = utils.at_least_nb_occurrences( 145 | tokens_low, all_tokens_low, 2) 146 | checked[tokens_low] = occurs_several_times 147 | else: 148 | occurs_several_times = checked[tokens_low] 149 | 150 | # If the string occurs more than once, update the counts 151 | if occurs_several_times: 152 | label_counts[tokens_low][span.label_] = \ 153 | label_counts[tokens_low].get(span.label_, 0) + prob 154 | tokens = tuple(tok.text for tok in span) 155 | form_counts[tokens_low][tokens] = form_counts[tokens_low].get( 156 | tokens, 0) + prob 157 | 158 | # Search for the most common label for each entity string 159 | majority_labels = {} 160 | for lower_tokens, labels_for_ent in label_counts.items(): 161 | majority_label = max( 162 | labels_for_ent, key=lambda x: labels_for_ent[x]) 163 | forms_for_ent = form_counts[lower_tokens] 164 | majority_form = max(forms_for_ent, key=lambda x: forms_for_ent[x]) 165 | 166 | majority_labels[majority_form] = majority_label 167 | 168 | return majority_labels 169 | -------------------------------------------------------------------------------- /skweak/gazetteers.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import re 4 | from typing import Dict, Iterable, List, Optional, Tuple 5 | 6 | from spacy.tokens import Doc, Span, Token # type: ignore 7 | 8 | from . import base, utils 9 | 10 | ############################################ 11 | # Gazetteer annotator 12 | ############################################ 13 | 14 | class GazetteerAnnotator(base.SpanAnnotator): 15 | """Annotation using a gazetteer, i.e. a large list of entity terms. The annotation can 16 | look at either case-sensitive and case-insensitive occurrences. The annotator relies 17 | on a token-level trie for efficient search. 
""" 18 | 19 | def __init__(self, name: str, tries: Dict[str, 'Trie'], case_sensitive: bool = True, 20 | lookahead: int = 10, additional_checks: bool=True): 21 | """Creates a new gazeteer, based on: 22 | - a trie 23 | - an output label associated with the trie 24 | - a flag indicating whether the gazetteer should be case-sensitive or not 25 | - the maximum size of the lookahead window 26 | - a flag indicating whether to do additional checks to reduce the 27 | number of false positives when searching for named entities""" 28 | 29 | super(GazetteerAnnotator, self).__init__(name) 30 | 31 | self.tries = tries 32 | self.case_sensitive = case_sensitive 33 | self.lookahead = lookahead 34 | self.additional_checks = additional_checks 35 | 36 | 37 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 38 | """Search for occurrences of entity terms in the spacy document""" 39 | 40 | # We extract the tokens (as list of strings) 41 | tokens = utils.get_tokens(doc) 42 | 43 | # We extract the (token-level) indices for next sentence boundaries 44 | next_sentence_boundaries = utils.get_next_sentence_boundaries(doc) 45 | 46 | i = 0 47 | while i < len(doc): 48 | 49 | tok = doc[i] 50 | 51 | # We create a lookahead window starting at the token 52 | lookahead_length = self._get_lookahead(tok, next_sentence_boundaries[i]) 53 | 54 | if lookahead_length: 55 | 56 | window = tokens[i:i+lookahead_length] 57 | matches = [] 58 | # We loop on all tries (one per label) 59 | for label, trie in self.tries.items(): 60 | 61 | # We search for the longest match 62 | match = trie.find_longest_match(window, self.case_sensitive) 63 | if match: 64 | # We check whether the match is valid 65 | if (not self.additional_checks or 66 | self._is_valid_match(doc[i:i+len(match)], match)): 67 | matches.append((match, label)) 68 | 69 | # We choose the longest match(es) 70 | if matches: 71 | max_length = max(len(match) for match, _ in matches) 72 | for match, label in matches: 73 | if len(match)==max_length: 74 | yield i, i+max_length, label 75 | 76 | # We skip the text until the end of the match 77 | i += max_length-1 78 | 79 | i += 1 80 | 81 | def _get_lookahead(self, token: Token, next_sentence_boundary: int) -> int: 82 | """Returns the longest possible span starting with the current token, and 83 | satisfying the three following criteria: 84 | - the maximum length of the span is self.lookahead 85 | - the span cannot start with a punctuation symbol or within a compound phrase 86 | - the span cannot cross sentence boundaries 87 | """ 88 | 89 | if token.is_punct: 90 | return 0 91 | elif (self.additional_checks and token.i > 0 and token.nbor(-1).dep_ == "compound" 92 | and token.nbor(-1).head == token): 93 | return 0 94 | 95 | return min(next_sentence_boundary-token.i, self.lookahead) 96 | 97 | def _is_valid_match(self, match_span: Span, ent_tokens: List[str]) -> bool: 98 | """Checks whether the match satisfies the following criteria: 99 | - the match does not end with a punctuation symbol or within a compound phrase 100 | (with a head that looks like a proper name) 101 | - if the actual tokens of the entity contains tokens in "title" case, the match 102 | must contain at least one token that looks like a proper name 103 | (to avoid too many false positives). 
104 | """ 105 | 106 | last_token = match_span[-1] 107 | if last_token.is_punct: 108 | return False 109 | elif match_span.end < len(match_span.doc): 110 | if (last_token.dep_ == "compound" and last_token.head.i > last_token.i 111 | and utils.is_likely_proper(last_token.head)): 112 | return False 113 | 114 | if (any(tok.istitle() for tok in ent_tokens) and 115 | not any(utils.is_likely_proper(tok) for tok in match_span)): 116 | return False 117 | return True 118 | 119 | 120 | ############################################ 121 | # Trie data structure (used for gazetteers) 122 | ############################################ 123 | 124 | class Trie: 125 | """Implementation of a trie for searching for occurrences of terms in a text. 126 | 127 | Internally, the trie is made of nodes expressed as (dict, bool) pairs, where the 128 | dictionary expressed possible edges (tokens) going out from the node, and the boolean 129 | indicates whether the node is terminal or not. 130 | """ 131 | 132 | def __init__(self, entries: List[List[str]] = None): 133 | """Creates a new trie. If provided, entries must be a list of tokenised entries""" 134 | 135 | self.start = {} 136 | self.len = 0 137 | 138 | if entries is not None: 139 | for entry in entries: 140 | self.add(entry) 141 | 142 | def find_longest_match(self, tokens: List[str], case_sensitive=True) -> List[str]: 143 | """Search for the longest match (that is, the longest element in the trie that matches 144 | a prefix of the provided tokens). The tokens must be expressed as a list of strings. 145 | The method returns the match as a list of tokens, which is empty is no match could 146 | be found. 147 | 148 | If case_sensitive is set to False, the method also checks for matches of alternative 149 | casing of the words (lowercase, uppercase and titled) 150 | """ 151 | 152 | edges = self.start 153 | prefix_length = 0 154 | matches = [] 155 | 156 | for i, token in enumerate(tokens): 157 | 158 | match = self._find_match(token, edges, case_sensitive) 159 | if match: 160 | edges, is_terminal = edges[match] 161 | matches.append(match) 162 | if is_terminal: 163 | prefix_length = i+1 164 | else: 165 | break 166 | 167 | return matches[:prefix_length] 168 | 169 | def _find_match(self, token: str, branch: Dict, case_sensitive: bool) -> Optional[str]: 170 | """Checks whether the token matches any edge in the branch. If yes, 171 | returns the match (which can be slightly different from the token if 172 | case_sensitive is set to False). 
Otherwise returns None.""" 173 | 174 | if not branch: 175 | return None 176 | elif case_sensitive: 177 | return token if token in branch else None 178 | elif token in branch: 179 | return token 180 | 181 | if not token.istitle(): 182 | titled = token.title() 183 | if titled in branch: 184 | return titled 185 | if not token.islower(): 186 | lowered = token.lower() 187 | if lowered in branch: 188 | return lowered 189 | if not token.isupper(): 190 | uppered = token.upper() 191 | if uppered in branch: 192 | return uppered 193 | 194 | return None 195 | 196 | def __contains__(self, tokens: List[str]) -> bool: 197 | """Returns whether the list of tokens are contained in the trie 198 | (in case-sensitive mode)""" 199 | 200 | return self.contains(tokens) 201 | 202 | def contains(self, tokens: List[str], case_sensitive=True) -> bool: 203 | """Returns whether the list of tokens are contained in the trie""" 204 | 205 | edges = self.start 206 | is_terminal = False 207 | for token in tokens: 208 | match = self._find_match(token, edges, case_sensitive) 209 | if not match: 210 | return False 211 | edges, is_terminal = edges[match] 212 | return is_terminal 213 | 214 | def add(self, tokens: List[str]): 215 | """Adds a new (tokens, value) pair to the trie""" 216 | 217 | # We add new edges to the trie 218 | edges = self.start 219 | for token in tokens[:-1]: 220 | 221 | # We create a sub-dictionary if it does not exist 222 | if token not in edges: 223 | newdict = {} 224 | edges[token] = (newdict, False) 225 | edges = newdict 226 | 227 | else: 228 | next_edges, is_terminal = edges[token] 229 | 230 | # If the current set of edges is None, map to a dictionary 231 | if next_edges is None: 232 | newdict = {} 233 | edges[token] = (newdict, is_terminal) 234 | edges = newdict 235 | else: 236 | edges = next_edges 237 | 238 | last_token = tokens[-1] 239 | if last_token not in edges: 240 | edges[last_token] = (None, True) 241 | else: 242 | edges[last_token] = (edges[last_token][0], True) 243 | 244 | self.len += 1 245 | 246 | def __len__(self) -> int: 247 | """Returns the total number of (tokens, value) pairs in the trie""" 248 | return self.len 249 | 250 | def __iter__(self): 251 | """Generates all elements from the trie""" 252 | 253 | for tokens in self._iter_from_edges(self.start): 254 | yield tokens 255 | 256 | def _iter_from_edges(self, edges): 257 | """Generates all elements from a branch in the trie""" 258 | 259 | for token, (sub_branch, is_terminal) in edges.items(): 260 | if is_terminal: 261 | yield [token] 262 | if sub_branch is not None: 263 | for tokens2 in self._iter_from_edges(sub_branch): 264 | yield [token, *tokens2] 265 | 266 | def __repr__(self) -> str: 267 | """Returns a representation of the trie as a flattened list""" 268 | 269 | return list(self).__repr__() 270 | 271 | 272 | ############################################ 273 | # Utility functions 274 | ############################################ 275 | 276 | 277 | def extract_json_data(json_file: str, cutoff: Optional[int] = None, 278 | spacy_model="en_core_web_md") -> Dict[str, Trie]: 279 | """Extract entities from a Json file and build trie from it (one per class). 
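# Illustrative sketch of the basic Trie operations: adding tokenised entries,
# membership tests and longest-prefix matching, including the case-insensitive
# mode described above. The entries are invented examples.
def _trie_sketch():
    trie = Trie()
    trie.add(["New", "York"])
    trie.add(["New", "York", "City"])
    print(["New", "York"] in trie)                               # True
    print(trie.contains(["new", "york"], case_sensitive=False))  # True
    # Longest entry matching a prefix of the token sequence:
    print(trie.find_longest_match(["New", "York", "City", "Marathon"]))
    # -> ['New', 'York', 'City']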
280 | 281 | If cutoff is set to a number, stops the extraction after a number of values 282 | for each class (useful for debugging purposes).""" 283 | 284 | print("Extracting data from", json_file) 285 | tries = {} 286 | tokeniser = None 287 | if json_file.endswith(".json.gz"): 288 | fd = gzip.open(json_file, "r") 289 | data = json.loads(fd.read().decode("utf-8")) 290 | fd.close() 291 | elif json_file.endswith(".json"): 292 | fd = open(json_file) 293 | data = json.load(fd) 294 | fd.close() 295 | else: 296 | raise RuntimeError(str(json_file) + " does not look like a JSON file") 297 | 298 | for neClass, names in data.items(): 299 | 300 | remaining = [] 301 | if cutoff is not None: 302 | names = names[:cutoff] 303 | print("Populating trie for class %s (number: %i)" % 304 | (neClass, len(names))) 305 | 306 | trie = Trie() 307 | for name in names: 308 | if type(name) == str: 309 | tokens = name.split(" ") 310 | 311 | # If the tokens contain special characters, we need to run spacy to 312 | # ensure we get the same tokenisation as in spacy-tokenised texts 313 | if any(tok for tok in tokens if not tok.isalpha() 314 | and not tok.isnumeric() and not re.match("[A-Z]\\.$", tok)): 315 | import spacy 316 | tokeniser = tokeniser or spacy.load( 317 | spacy_model).tokenizer 318 | tokens = [t.text for t in tokeniser(name)] 319 | 320 | if len(tokens) > 0: 321 | trie.add(tokens) 322 | 323 | # If the items are already tokenised, we can load the trie faster 324 | elif type(name) == list: 325 | if len(name) > 0: 326 | trie.add(name) 327 | 328 | tries[neClass] = trie 329 | return tries 330 | -------------------------------------------------------------------------------- /skweak/heuristics.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import (Callable, Collection, Dict, Iterable, Optional, Sequence, 3 | Set, Tuple) 4 | 5 | from spacy.tokens import Doc, Span, Token # type: ignore 6 | 7 | from .base import SpanAnnotator 8 | 9 | #################################################################### 10 | # Labelling sources based on heuristics / handcrafted rules 11 | #################################################################### 12 | 13 | 14 | class FunctionAnnotator(SpanAnnotator): 15 | """Annotation based on a heuristic function that generates (start,end,label) 16 | given a spacy document""" 17 | 18 | def __init__(self, name: str, 19 | function: Callable[[Doc], Iterable[Tuple[int, int, str]]], 20 | to_exclude: Sequence[str] = ()): 21 | """Create an annotator based on a function generating labelled spans given 22 | a Spacy Doc object. Spans that overlap with existing spans from sources 23 | listed in 'to_exclude' are ignored. """ 24 | 25 | super(FunctionAnnotator, self).__init__(name) 26 | self.find_spans = function 27 | self.add_incompatible_sources(to_exclude) 28 | 29 | 30 | class RegexAnnotator(SpanAnnotator): 31 | """Annotation based on a heuristic regular expression that generates 32 | (start,end,label) given a spacy document""" 33 | 34 | def __init__( 35 | self, 36 | name: str, 37 | pattern: str, 38 | tag: str, 39 | to_exclude: Sequence[str] = (), 40 | alignment_mode : str = "expand", 41 | ): 42 | """Create an annotator based on a regular expression generating labelled 43 | spans given a Spacy Doc object. The regex matches are tagged with the 44 | value of the 'tag' param. 
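# Illustrative sketch: a regex-based labelling function tagging four-digit
# years as DATE. The pattern, source name and example sentence are invented;
# a blank English pipeline suffices since the regular expression is applied to
# the raw document text.
def _regex_annotator_sketch():
    import spacy
    nlp = spacy.blank("en")
    annotator = RegexAnnotator("years", r"\b(19|20)\d{2}\b", "DATE")
    doc = annotator(nlp("The first version was released in 2021 ."))
    return [(s.start, s.end, s.label_) for s in doc.spans["years"]]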
Spans that overlap with existing spans 45 | from sources listed in 'to_exclude' are ignored.""" 46 | 47 | super().__init__(name) 48 | self.pattern = pattern 49 | self.tag = tag 50 | self.alignment_mode = alignment_mode 51 | self.add_incompatible_sources(to_exclude) 52 | 53 | 54 | @staticmethod 55 | def regex_search(pattern, string): 56 | 57 | prev_end = 0 58 | while True: 59 | match = re.search(pattern, string) 60 | if not match: 61 | break 62 | 63 | start, end = match.span() 64 | yield start + prev_end, end + prev_end 65 | prev_end += end 66 | string = string[end:] 67 | 68 | 69 | def find_spans(self, doc): 70 | 71 | for start, end in self.regex_search(self.pattern, doc.text): 72 | span = doc.char_span(start, end, self.tag, alignment_mode=self.alignment_mode) 73 | yield span.start, span.end, self.tag 74 | 75 | 76 | class TokenConstraintAnnotator(SpanAnnotator): 77 | """Annotator relying on a token-level constraint. Continuous spans that 78 | satisfy this constraint will be marked by the provided label.""" 79 | 80 | def __init__(self, name: str, constraint: Callable[[Token], bool], 81 | label: str, min_characters=3, 82 | gap_tokens:Optional[Set]=None): 83 | """Given a token-level constraint, a label name, and a minimum 84 | number of characters, annotates with the label all (maximal) 85 | contiguous spans whose tokens satisfy the constraint.""" 86 | 87 | super(TokenConstraintAnnotator, self).__init__(name) 88 | self.constraint = constraint 89 | self.label = label 90 | self.min_characters = min_characters 91 | 92 | # Hyphens should'nt stop a span 93 | self.gap_tokens = gap_tokens if gap_tokens is not None else {"-"} 94 | 95 | def add_gap_tokens(self, gap_tokens: Collection[str]): 96 | """Adds tokens (typically function words) that are allowed in the span 97 | even if they do not satisfy the constraint, provided they are surrounded 98 | by words that do satisfy the constraint. """ 99 | 100 | self.gap_tokens.update(gap_tokens) 101 | 102 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 103 | """ 104 | Searches for all spans whose tokens satisfy the constraint (and meet 105 | the minimum character length), and marks those with the provided label. 106 | """ 107 | 108 | i = 0 109 | while i < len(doc): 110 | tok = doc[i] 111 | # We search for the longest span that satisfy the constraint 112 | if self.constraint(tok): 113 | j = i+1 114 | while j < len(doc): 115 | # We check the constraint 116 | if self.constraint(doc[j]) and self._is_allowed_span(doc, i, j+1): 117 | j += 1 118 | 119 | # We also check whether the token is a gap word 120 | elif (doc[j].text in self.gap_tokens and j < len(doc)-1 121 | and self.constraint(doc[j+1]) 122 | and self._is_allowed_span(doc, i, j+2)): 123 | j += 2 124 | else: 125 | break 126 | 127 | # We check whether the span has a minimal length 128 | if len(doc[i:j].text) >= self.min_characters: 129 | yield i, j, self.label 130 | 131 | i = j 132 | else: 133 | i += 1 134 | 135 | 136 | class SpanConstraintAnnotator(SpanAnnotator): 137 | """Annotation by looking at text spans (from another source) 138 | that satisfy a span-level constraint""" 139 | 140 | def __init__(self, name: str, other_name: str, constraint: Callable[[Span], bool], 141 | label: Optional[str] = None): 142 | """Creates a new annotator that looks at the annotations from the 143 | other_name source, and adds them to this source if it satisfied a 144 | given constraint on spans. 
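# Illustrative sketch: chaining a token-level constraint with a span-level
# filter. The constraint (title-cased tokens), labels, source names and
# example sentence are invented; a blank English pipeline suffices because
# only token texts are inspected.
def _constraint_annotators_sketch():
    import spacy
    nlp = spacy.blank("en")
    proper = TokenConstraintAnnotator(
        "proper", lambda tok: tok.text.istitle(), "ENT")
    long_proper = SpanConstraintAnnotator(
        "long_proper", "proper", lambda span: len(span) >= 2)
    doc = long_proper(proper(nlp("Norsk Regnesentral is based in Oslo .")))
    # Only the two-token span "Norsk Regnesentral" passes the span constraint.
    return [(s.start, s.end, s.label_) for s in doc.spans["long_proper"]]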
If label is other than None, the method 145 | simply reuses the same label as the one specified by other_name.""" 146 | 147 | super(SpanConstraintAnnotator, self).__init__(name) 148 | self.other_name = other_name 149 | self.constraint = constraint 150 | self.label = label 151 | 152 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 153 | """Loops through the spans annotated by the other source, and, for each, checks 154 | whether they satisfy the provided constraint. If yes, adds the labelled span 155 | to the annotations for this source. """ 156 | 157 | if self.other_name not in doc.spans: 158 | return 159 | 160 | for span in doc.spans[self.other_name]: 161 | if self.constraint(span): 162 | yield span.start, span.end, (self.label or span.label_) 163 | 164 | 165 | class SpanEditorAnnotator(SpanAnnotator): 166 | """Annotation by editing/correcting text spans from another source 167 | based on a simple editing function""" 168 | 169 | def __init__(self, name: str, other_name: str, editor: Callable[[Span], Span], 170 | label: Optional[str] = None): 171 | """Creates a new annotator that looks at the annotations from the 172 | other_name source, and edits the span according to a given function. 173 | If label is other than None, the method simply reuses the same label 174 | as the one specified by other_name.""" 175 | 176 | super(SpanEditorAnnotator, self).__init__(name) 177 | self.other_name = other_name 178 | self.editor = editor 179 | self.label = label 180 | 181 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 182 | """Loops through the spans annotated by the other source and runs the 183 | editor function on it. """ 184 | 185 | if self.other_name not in doc.spans: 186 | return 187 | 188 | for span in doc.spans[self.other_name]: 189 | edited = self.editor(span) 190 | if edited is not None and edited.end > edited.start: 191 | yield edited.start, edited.end, (self.label or span.label_) 192 | 193 | 194 | #################################################################### 195 | # Other labelling sources 196 | #################################################################### 197 | 198 | class VicinityAnnotator(SpanAnnotator): 199 | """Annotator based on cue words located in the vicinity (window of 200 | surrounding words) of a given span. 
""" 201 | 202 | def __init__(self, name: str, cue_words: Dict[str, str], other_name: str, 203 | max_window: int = 8): 204 | """Creates a new annotator based on a set of cue words (each mapped 205 | to a given output label) along with the name of another labelling 206 | source from which span candidates will be extracted.""" 207 | 208 | super(VicinityAnnotator, self).__init__(name) 209 | 210 | self.cue_words = cue_words 211 | self.other_name = other_name 212 | self.max_window = max_window 213 | 214 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 215 | """Searches for spans that have a cue word in their vicinity - and if 216 | yes, tag the span with the label associated with the cue word.""" 217 | 218 | if self.other_name not in doc.spans: 219 | return 220 | 221 | # We loop on the span candidates from the other labelling source 222 | for span in doc.spans[self.other_name]: 223 | 224 | # Determine the boundaries of the context (based on the window) 225 | # NB: we do not wish to cross sentence boundaries 226 | left_bound = max(span.sent.start, span.start - self.max_window//2+1) 227 | right_bound = min(span.sent.end, span.end+self.max_window//2+1) 228 | 229 | for tok in doc[left_bound:right_bound]: 230 | for tok_form in {tok.text, tok.lower_, tok.lemma_}: 231 | if tok_form in self.cue_words: 232 | yield span.start, span.end, self.cue_words[tok_form] 233 | -------------------------------------------------------------------------------- /skweak/spacy.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | from typing import Dict, Iterable, List, Tuple 4 | 5 | import spacy 6 | from spacy.tokens import Doc, Span # type: ignore 7 | 8 | from .base import SpanAnnotator 9 | 10 | #################################################################### 11 | # Labelling source based on neural models 12 | #################################################################### 13 | 14 | 15 | class ModelAnnotator(SpanAnnotator): 16 | """Annotation based on a spacy NER model""" 17 | 18 | def __init__(self, name:str, model_path:str, 19 | disabled:List[str]=["parser", "tagger", "lemmatizer", "attribute_ruler"]): 20 | """Creates a new annotator based on a Spacy model. 
""" 21 | 22 | super(ModelAnnotator, self).__init__(name) 23 | self.model = spacy.load(model_path, disable=disabled) 24 | 25 | 26 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 27 | """Annotates one single document using the Spacy NER model""" 28 | 29 | # Create a new document (to avoid conflicting annotations) 30 | doc2 = self.create_new_doc(doc) 31 | # And run the model 32 | for _, proc in self.model.pipeline: 33 | doc2 = proc(doc2) 34 | # Add the annotation 35 | for ent in doc2.ents: 36 | yield ent.start, ent.end, ent.label_ 37 | 38 | def pipe(self, docs: Iterable[Doc]) -> Iterable[Doc]: 39 | """Annotates the stream of documents based on the Spacy model""" 40 | 41 | stream1, stream2 = itertools.tee(docs, 2) 42 | 43 | # Remove existing entities from the document 44 | stream2 = (self.create_new_doc(d) for d in stream2) 45 | 46 | # And run the model 47 | for _, proc in self.model.pipeline: 48 | stream2 = proc.pipe(stream2) 49 | 50 | for doc, doc_copy in zip(stream1, stream2): 51 | 52 | doc.spans[self.name] = [] 53 | 54 | # Add the annotation 55 | for ent in doc_copy.ents: 56 | doc.spans[self.name].append(Span(doc, ent.start, ent.end, ent.label_)) 57 | 58 | yield doc 59 | 60 | def create_new_doc(self, doc: Doc) -> Doc: 61 | """Create a new, empty Doc (but with the same tokenisation as before)""" 62 | 63 | return spacy.tokens.Doc(self.model.vocab, [tok.text for tok in doc], #type: ignore 64 | [tok.whitespace_ for tok in doc]) 65 | 66 | 67 | class TruecaseAnnotator(ModelAnnotator): 68 | """Spacy model annotator that preprocess all texts to convert them to a 69 | "truecased" representation (see below)""" 70 | 71 | def __init__(self, name:str, model_path:str, form_frequencies:str, 72 | disabled:List[str]=["parser", "tagger", "lemmatizer", "attribute_ruler"]): 73 | """Creates a new annotator based on a Spacy model, and a dictionary containing 74 | the most common case forms for a given word (to be able to truecase the document).""" 75 | 76 | super(TruecaseAnnotator, self).__init__(name, model_path, disabled) 77 | with open(form_frequencies) as fd: 78 | self.form_frequencies = json.load(fd) 79 | 80 | def create_new_doc(self, doc: Doc, min_prob: float = 0.25) -> Doc: 81 | """Performs truecasing of the tokens in the spacy document. Based on relative 82 | frequencies of word forms, tokens that 83 | (1) are made of letters, with a first letter in uppercase 84 | (2) and are not sentence start 85 | (3) and have a relative frequency below min_prob 86 | ... will be replaced by its most likely case (such as lowercase). 
""" 87 | 88 | if not self.form_frequencies: 89 | raise RuntimeError( 90 | "Cannot truecase without a dictionary of form frequencies") 91 | 92 | tokens = [] 93 | spaces = [] 94 | doctext = doc.text 95 | for tok in doc: 96 | toktext = tok.text 97 | 98 | # We only change casing for words in Title or UPPER 99 | if tok.is_alpha and toktext[0].isupper(): 100 | cond1 = tok.is_upper and len(toktext) > 2 # word in uppercase 101 | cond2 = toktext[0].isupper( 102 | ) and not tok.is_sent_start # titled word 103 | if cond1 or cond2: 104 | token_lc = toktext.lower() 105 | if token_lc in self.form_frequencies: 106 | frequencies = self.form_frequencies[token_lc] 107 | if frequencies.get(toktext, 0) < min_prob: 108 | alternative = sorted( 109 | frequencies.keys(), key=lambda x: frequencies[x])[-1] 110 | 111 | # We do not change from Title to to UPPER 112 | if not tok.is_title or not alternative.isupper(): 113 | toktext = alternative 114 | 115 | tokens.append(toktext) 116 | 117 | # Spacy needs to know whether the token is followed by a space 118 | if tok.i < len(doc)-1: 119 | spaces.append(doctext[tok.idx+len(tok)].isspace()) 120 | else: 121 | spaces.append(False) 122 | 123 | # Creates a new document with the tokenised words and space information 124 | doc2 = Doc(self.model.vocab, words=tokens, spaces=spaces) #type: ignore 125 | return doc2 126 | 127 | 128 | class LabelMapper(SpanAnnotator): 129 | """When using ModelAnnotators, e.g. spacy_lg models, often the 130 | labels introduced is not what one is looking for. This function takes in 131 | a dict of labels to replace and desired label to replace with, e.g. 132 | { 133 | ('FAC','GPE'):"LOC", 134 | ('NORP'):"ORG", 135 | ('DATE','EVENT', ..., 'WORK_OF_ART'): "MISC" 136 | } 137 | """ 138 | 139 | def __init__( 140 | self, 141 | name: str, 142 | mapping: Dict[Iterable[str], str], 143 | sources: Iterable[str], 144 | inplace: bool = True, 145 | ): 146 | """Creates a new annotator that looks at the labels of certain 147 | span groups (specified by 'sources') for each doc. If the label 148 | is found in the mapping dictionary, it is replaced accordingly. 149 | If the inplace flag is active, the labels are modified in their 150 | respective span groups. If inactive, creates a new span group 151 | for all relabelled spans.""" 152 | 153 | super().__init__(name) 154 | self.sources = sources 155 | self.inplace = inplace 156 | 157 | # populate mapping dict 158 | self.mapping = {} 159 | for k, v in mapping.items(): 160 | if isinstance(k, str): 161 | self.mapping[k] = v 162 | else: 163 | for key in k: 164 | self.mapping[key] = v 165 | 166 | 167 | def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]: 168 | """Loops through the spans annotated by the other source and runs the 169 | editor function on it. 
Unique because it doesn't return spans but instead 170 | edits the span groups in place!""" 171 | 172 | for source in set(self.sources).intersection(doc.spans): 173 | 174 | new_group = [] 175 | for span in doc.spans[source]: 176 | 177 | if span.label_ in self.mapping: 178 | 179 | span = Span( 180 | doc, 181 | span.start, 182 | span.end, 183 | self.mapping.get(span.label_) 184 | ) 185 | 186 | if self.inplace: 187 | new_group.append(span) 188 | else: 189 | yield span.start, span.end, span.label_ 190 | 191 | if self.inplace: 192 | doc.spans[source] = new_group 193 | -------------------------------------------------------------------------------- /skweak/voting.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Dict, List, Optional 3 | 4 | import numpy as np 5 | import pandas 6 | 7 | from .aggregation import (AbstractAggregator, MultilabelAggregatorMixin, 8 | SequenceAggregatorMixin, TextAggregatorMixin) 9 | 10 | warnings.simplefilter(action='ignore', category=FutureWarning) 11 | 12 | 13 | ############################################ 14 | # Majority voting 15 | ############################################ 16 | 17 | 18 | class MajorityVoterMixin(AbstractAggregator): 19 | """Implementation of a subset of methods from AbstractAggregator when 20 | the aggregation is performed for text/span classification. 21 | This class should not be instantiated directly.""" 22 | 23 | def __init__(self, initial_weights=None): 24 | """Do not call this initializer directly, and use the fully 25 | implemented classes (MajorityVoter, NaiveBayes, HMM, etc.) instead""" 26 | 27 | # initial_weights is a dictionary associating source names to numerical weights 28 | # in the range [0, +inf]. The default assumes weights = 1 for all functions. You 29 | # can disable a labelling function by giving it a weight of 0. """ 30 | 31 | self.weights = initial_weights if initial_weights else {} 32 | 33 | def aggregate(self, obs: pandas.DataFrame) -> pandas.DataFrame: 34 | """Takes as input a 2D dataframe of shape (nb_entries, nb_sources) 35 | associating each token/span to a set of observations from labelling 36 | sources, and returns a 2D dataframe of shape (nb_entries, nb_labels) 37 | assocating each entry to the probability of each output label. 38 | 39 | This probability is here computed based on making each source "vote" 40 | on its output label. The most likely label will thus be the one that 41 | is indicated by most sources. If underspecified labels are included, they 42 | are also part of the vote count. 
""" 43 | 44 | weights = np.array([self.weights.get(source, 1) for source in obs.columns]) 45 | 46 | # We count the votes for each label on all sources 47 | # (taking weights into account) 48 | def count_fun(x): 49 | return np.bincount(x[x>=0], weights=weights[x>=0], 50 | minlength=len(self.observed_labels)) 51 | label_votes = np.apply_along_axis(count_fun, 1, obs.values).astype(np.float32) 52 | 53 | # For token-level sequence labelling, we need to normalise the number 54 | # of "O" occurrences, since they both indicate the absence of 55 | # prediction, but are also a possible output 56 | if self.observed_labels[0]=="O": 57 | label_votes = self.normalise_o_labels(label_votes) 58 | 59 | # We transform the votes from observations into output labels, 60 | out_label_votes = label_votes.dot(self._get_vote_matrix()) 61 | 62 | # Normalisation 63 | total = np.expand_dims(out_label_votes.sum(axis=1), axis=1) 64 | probs = out_label_votes / (total + 1E-30) 65 | df = pandas.DataFrame(probs, index=obs.index, columns=self.out_labels) 66 | return df 67 | 68 | 69 | def normalise_o_labels(self, label_votes, power_base=3.0): 70 | """The normalised counts for the O labels are defined as B^(c-t), 71 | where c are the raw counts for the O labels, t are the total number of 72 | counts per data point, and B is a constant.""" 73 | 74 | # If an observation is not voting for anything, we consider it as "O" 75 | not_voting_obs = (self._get_vote_matrix().sum(axis=1) == 0) 76 | label_votes[:,0] += label_votes[:,not_voting_obs].sum(axis=1) 77 | label_votes[:,not_voting_obs] = 0 78 | 79 | # Do the normalisation 80 | diff = label_votes[:,0] - label_votes.sum(axis=1) 81 | label_votes[:,0] = power_base ** diff 82 | return label_votes 83 | 84 | 85 | 86 | ############################################ 87 | # Concrete majority voter aggregators 88 | ############################################ 89 | 90 | class MajorityVoter(MajorityVoterMixin,TextAggregatorMixin): 91 | """Aggregator for text classification based on majority voting""" 92 | 93 | def __init__(self, name:str, labels:List[str], 94 | initial_weights:Optional[Dict[str,float]]=None): 95 | """Creates a new aggregator for text classification using majority 96 | voting. For each unique span annotated by at least one labelling source, 97 | the class constructs a probability distribution over possible labels 98 | based on the number of labelling sources "voting" for that label. 99 | 100 | Arguments: 101 | - name is the aggregator name 102 | - labels is a list of output labels to aggregate. Labels that are not 103 | mentioned here are ignored. 104 | - initial_weights provides a numeric weight to labelling sources. 105 | If left unspecified, the class assumes uniform weights. 106 | """ 107 | AbstractAggregator.__init__(self, name, labels) 108 | MajorityVoterMixin.__init__(self,initial_weights) 109 | 110 | 111 | class SequentialMajorityVoter(MajorityVoterMixin,SequenceAggregatorMixin): 112 | """Aggregator for sequence labelling based on majority voting""" 113 | 114 | def __init__(self, name:str, labels:List[str], prefixes:str="BIO", 115 | initial_weights:Optional[Dict[str,float]]=None): 116 | """Creates a new aggregator for sequence labelling using majority 117 | voting. For each token annotated by at least one labelling source, 118 | the class constructs a probability distribution over possible labels 119 | based on the number of labelling sources "voting" for that label. 
120 | 121 | Arguments: 122 | - name is the aggregator name 123 | - labels is a list of output labels to aggregate. Labels that are not 124 | mentioned here are ignored. 125 | - prefixes is the tagging scheme to use, such as IO, BIO or BILUO 126 | - initial_weights provides a numeric weight to labelling sources. 127 | If left unspecified, the class assumes uniform weights. 128 | """ 129 | AbstractAggregator.__init__(self, name, labels) 130 | SequenceAggregatorMixin.__init__(self, prefixes) 131 | MajorityVoterMixin.__init__(self,initial_weights) 132 | 133 | 134 | 135 | class MultilabelMajorityVoter(MultilabelAggregatorMixin, MajorityVoterMixin, 136 | TextAggregatorMixin,AbstractAggregator): 137 | 138 | def __init__(self, name:str, labels:List[str], 139 | initial_weights:Optional[Dict[str,float]]=None): 140 | """Creates a new, multilabel aggregator for text classification using majority 141 | voting. For each unique span annotated by at least one labelling source, 142 | the class constructs a probability distribution over possible labels 143 | based on the number of labelling sources "voting" for that label. 144 | 145 | Arguments: 146 | - name is the aggregator name 147 | - labels is a list of output labels to aggregate. Labels that are not 148 | mentioned here are ignored. 149 | - initial_weights provides a numeric weight to labelling sources. 150 | If left unspecified, the class assumes uniform weights. 151 | 152 | The class allows multiple labels to be valid for each text. 153 | Labels that are incompatible with one another should be specified through 154 | the set_exclusive_labels method. 155 | """ 156 | AbstractAggregator.__init__(self, name, labels) 157 | MajorityVoterMixin.__init__(self, initial_weights=initial_weights) 158 | MultilabelAggregatorMixin.__init__(self, MajorityVoter, initial_weights=initial_weights) 159 | 160 | 161 | 162 | class MultilabelSequentialMajorityVoter(MultilabelAggregatorMixin, SequenceAggregatorMixin, 163 | AbstractAggregator): 164 | 165 | def __init__(self, name:str, labels:List[str], prefixes:str="BIO", 166 | initial_weights:Optional[Dict[str,float]]=None): 167 | """Creates a new, multilabel aggregator for sequence labelling 168 | using majority voting. For each token annotated by at least one 169 | labelling source, the class constructs a probability distribution 170 | over possible labels based on the number of labelling sources 171 | "voting" for that label. 172 | 173 | Arguments: 174 | - name is the aggregator name 175 | - labels is a list of output labels to aggregate. Labels that are not 176 | mentioned here are ignored. 177 | - prefixes is the tagging scheme to use, such as IO, BIO or BILUO 178 | - initial_weights provides a numeric weight to labelling sources. 179 | If left unspecified, the class assumes uniform weights. 180 | 181 | The class allows multiple labels to be valid for each token. 182 | Labels that are incompatible with one another should be specified through 183 | the set_exclusive_labels method. 184 | """ 185 | AbstractAggregator.__init__(self, name, labels) 186 | SequenceAggregatorMixin.__init__(self, prefixes) 187 | MultilabelAggregatorMixin.__init__(self, SequentialMajorityVoter, initial_weights=initial_weights) 188 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import test_doclevel, test_gazetteers, test_heuristics, test_aggregation, test_utils -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | import spacy 4 | 5 | @pytest.fixture(scope="session") 6 | def nlp(): 7 | import spacy 8 | return spacy.load("en_core_web_md") 9 | 10 | @pytest.fixture(scope="session") 11 | def nlp_small(): 12 | import spacy 13 | return spacy.load("en_core_web_sm") -------------------------------------------------------------------------------- /tests/test_doclevel.py: -------------------------------------------------------------------------------- 1 | 2 | import skweak 3 | import re 4 | from spacy.tokens import Span # type: ignore 5 | 6 | def test_subsequences(): 7 | text = ["This", "is", "a", "test", "."] 8 | subsequences = [["This"], ["is"], ["a"], ["test"], ["."], ["This", "is"], ["is", "a"], 9 | ["a", "test"], ["test", "."], ["This", "is", "a"], ["is", "a", "test"], 10 | ["a", "test", "."], ["This", "is", "a", "test"], ["is", "a", "test", "."]] 11 | assert sorted(skweak.utils.get_subsequences(text)) == sorted(subsequences + [text]) 12 | 13 | 14 | def test_history(nlp): 15 | text = re.sub("\\s+", " ", """This is a story about Pierre Lison and his work at 16 | Yetanothername Inc., which is just a name we invented. But of course, 17 | Lison did not really work for Yetanothername, because it is a fictious 18 | name, even when spelled like YETANOTHERNAME.""") 19 | doc = nlp(text) 20 | annotator1 = skweak.spacy.ModelAnnotator("spacy", "en_core_web_sm") 21 | annotator2 = skweak.doclevel.DocumentHistoryAnnotator("hist_cased", "spacy", ["PERSON", "ORG"]) 22 | annotator3 = skweak.doclevel.DocumentHistoryAnnotator("hist_uncased", "spacy", ["PERSON", "ORG"], 23 | case_sensitive=False) 24 | doc = annotator3(annotator2(annotator1(doc))) 25 | assert Span(doc, 5, 7, "PERSON") in doc.spans["spacy"] 26 | assert Span(doc, 11, 13, "ORG") in doc.spans["spacy"] 27 | assert Span(doc, 26, 27, "PERSON") in doc.spans["hist_cased"] 28 | assert Span(doc, 32, 33, "ORG") in doc.spans["hist_cased"] 29 | assert Span(doc, 32, 33, "ORG") in doc.spans["hist_uncased"] 30 | print("DEBUG", doc[45], doc[45].lemma_, doc[45].tag_) 31 | assert Span(doc, 45, 46, "ORG") in doc.spans["hist_uncased"] 32 | 33 | 34 | def test_majority(nlp): 35 | text = re.sub("\\s+", " ", """This is a story about Pierre Lison from Belgium. He 36 | is working as a researcher at the Norwegian Computing Center. The work 37 | of Pierre Lison includes among other weak supervision. He was born and 38 | studied in belgium but does not live in Belgium anymore. 
""") 39 | doc = nlp(text) 40 | annotator1 = skweak.spacy.ModelAnnotator("spacy", "en_core_web_md") 41 | annotator2 = skweak.doclevel.DocumentMajorityAnnotator("maj_cased", "spacy") 42 | annotator3 = skweak.doclevel.DocumentMajorityAnnotator("maj_uncased", "spacy", 43 | case_sensitive=False) 44 | doc = annotator3(annotator2(annotator1(doc))) 45 | assert Span(doc, 5, 7, "PERSON") in doc.spans["spacy"] 46 | assert Span(doc, 8, 9, "GPE") in doc.spans["spacy"] 47 | assert Span(doc, 17, 21, "ORG") in doc.spans["spacy"] 48 | assert Span(doc, 25, 27, "PERSON") in doc.spans["spacy"] 49 | assert Span(doc, 45, 46, "GPE") in doc.spans["spacy"] 50 | assert Span(doc, 5, 7, "PERSON") in doc.spans["maj_cased"] 51 | assert Span(doc, 25, 27, "PERSON") in doc.spans["maj_cased"] 52 | assert Span(doc, 8, 9, "GPE") in doc.spans["maj_cased"] 53 | assert Span(doc, 45, 46, "GPE") in doc.spans["maj_cased"] 54 | assert Span(doc, 8, 9, "GPE") in doc.spans["maj_uncased"] 55 | # assert Span(doc, 39, 40, "GPE") in doc.spans["maj_uncased"] 56 | assert Span(doc, 45, 46, "GPE") in doc.spans["maj_uncased"] 57 | 58 | 59 | def test_truecase(nlp): 60 | text = re.sub("\\s+", " ", """This is A STORY about Pierre LISON from BELGIUM. He IS 61 | WORKING as a RESEARCHER at the Norwegian COMPUTING Center. The WORK of 62 | Pierre LISON includes AMONG OTHER weak SUPERVISION. He WAS BORN AND 63 | studied in belgium BUT does NOT LIVE IN BELGIUM anymore.""") 64 | doc = nlp(text) 65 | annotator1 = skweak.spacy.TruecaseAnnotator("truecase", "en_core_web_sm", "data/form_frequencies.json") 66 | doc = annotator1(doc) 67 | assert Span(doc, 5, 7, "PERSON") in doc.spans["truecase"] 68 | assert Span(doc, 8, 9, "GPE") in doc.spans["truecase"] 69 | assert Span(doc, 18, 19, "NORP") in doc.spans["truecase"] 70 | assert Span(doc, 25, 27, "PERSON") in doc.spans["truecase"] 71 | assert Span(doc, 45, 46, "GPE") in doc.spans["truecase"] 72 | -------------------------------------------------------------------------------- /tests/test_gazetteers.py: -------------------------------------------------------------------------------- 1 | from skweak import gazetteers, utils 2 | import json, gzip 3 | from spacy.tokens import Span #type: ignore 4 | 5 | def test_trie1(): 6 | trie = gazetteers.Trie() 7 | trie.add(["Donald", "Trump"]) 8 | trie.add(["Donald", "Duck"]) 9 | trie.add(["Donald", "Duck", "Magazine"]) 10 | 11 | assert ["Donald", "Trump"] in trie 12 | assert ["Donald", "Duck"] in trie 13 | assert ["Donald", "Duck", "Magazine"] in trie 14 | assert ["Donald"] not in trie 15 | assert ["Trump"] not in trie 16 | assert ["Pierre"] not in trie 17 | assert trie.find_longest_match(["Donald", "Trump", "was", "the"]) == ["Donald", "Trump"] 18 | assert trie.find_longest_match(["Donald", "Duck", "was", "the"]) == ["Donald", "Duck"] 19 | assert trie.find_longest_match(["Donald", "Duck", "Magazine", "the"]) == ["Donald", "Duck", "Magazine"] 20 | 21 | assert trie.find_longest_match(["Donald"]) == [] 22 | assert trie.find_longest_match(["Pierre"]) == [] 23 | 24 | assert sorted(trie) == [["Donald", "Duck"], ["Donald", "Duck", "Magazine"], 25 | ["Donald", "Trump"]] 26 | 27 | 28 | def test_trie2(nlp, json_file="data/wikidata_small_tokenised.json.gz", cutoff=100): 29 | tries = gazetteers.extract_json_data(json_file, cutoff=cutoff) 30 | fd = gzip.open(json_file, "r") 31 | data = json.loads(fd.read().decode("utf-8")) 32 | fd.close() 33 | 34 | for neClass, names_for_class in data.items(): 35 | nb_names = 0 36 | trie = tries[neClass] 37 | for name in names_for_class: 38 | tokens = 
list(name) 39 | if len(tokens)==0: 40 | continue 41 | assert tokens in trie 42 | assert trie.find_longest_match(tokens) == tokens 43 | nb_names += 1 44 | if nb_names >= cutoff: 45 | break 46 | 47 | def test_trie_case_insensitive(): 48 | trie = gazetteers.Trie() 49 | trie.add(["Donald", "Trump"]) 50 | trie.add(["Donald", "Duck"]) 51 | trie.add(["Donald", "Duck", "Magazine"]) 52 | 53 | assert trie.find_longest_match(["Donald", "Trump", "was", "the"], 54 | case_sensitive=False) == ["Donald", "Trump"] 55 | assert trie.find_longest_match(["Donald", "trump", "was", "the"], 56 | case_sensitive=False) == ["Donald", "Trump"] 57 | assert trie.find_longest_match(["DONALD", "trump", "was", "the"], 58 | case_sensitive=False) == ["Donald", "Trump"] 59 | assert trie.find_longest_match(["Donald", "Duck", "Magazine", "the"], 60 | case_sensitive=False) == ["Donald", "Duck", "Magazine"] 61 | assert trie.find_longest_match(["Donald", "Duck", "magazine", "the"], 62 | case_sensitive=False) == ["Donald", "Duck", "Magazine"] 63 | 64 | assert trie.find_longest_match(["Donald"], case_sensitive=False) == [] 65 | 66 | def test_gazetteer(nlp): 67 | trie = gazetteers.Trie() 68 | trie.add(["Donald", "Trump"]) 69 | trie.add(["Donald", "Duck"]) 70 | trie.add(["Donald", "Duck", "Magazine"]) 71 | trie.add(["Apple"]) 72 | 73 | gazetteer = gazetteers.GazetteerAnnotator("test_gazetteer", {"ENT":trie}) 74 | doc1 = nlp("Donald Trump is now reading Donald Duck Magazine.") 75 | doc2 = nlp("Donald Trump (unrelated with Donald Duck) is now reading Donald Duck Magazine.") 76 | doc1, doc2 = gazetteer.pipe([doc1, doc2]) 77 | assert Span(doc1, 0, 2, "ENT") in doc1.spans["test_gazetteer"] 78 | assert Span(doc1, 5, 8, "ENT") in doc1.spans["test_gazetteer"] 79 | assert Span(doc2, 0, 2, "ENT") in doc2.spans["test_gazetteer"] 80 | assert Span(doc2, 5, 7, "ENT") in doc2.spans["test_gazetteer"] 81 | assert Span(doc2, 11, 14, "ENT") in doc2.spans["test_gazetteer"] 82 | 83 | gazetteer = gazetteers.GazetteerAnnotator("test_gazetteer", {"ENT":trie}, case_sensitive=False) 84 | doc1 = nlp("Donald Trump is now reading Donald Duck Magazine.") 85 | doc2 = nlp("Donald trump (unrelated with donald Duck) is now reading Donald Duck magazine.") 86 | 87 | doc3 = nlp("At Apple, we do not like to simply eat an apple.") 88 | doc1, doc2, doc3 = gazetteer.pipe([doc1, doc2, doc3]) 89 | assert Span(doc1, 0, 2, "ENT") in doc1.spans["test_gazetteer"] 90 | assert Span(doc1, 5, 8, "ENT") in doc1.spans["test_gazetteer"] 91 | assert Span(doc2, 0, 2, "ENT") in doc2.spans["test_gazetteer"] 92 | assert Span(doc2, 5, 7, "ENT") in doc2.spans["test_gazetteer"] 93 | assert Span(doc2, 11, 14, "ENT") in doc2.spans["test_gazetteer"] 94 | assert Span(doc3, 1, 2, "ENT") in doc3.spans["test_gazetteer"] 95 | 96 | 97 | def test_gazetteer2(nlp): 98 | 99 | class Trie2(gazetteers.Trie): 100 | def __init__(self): 101 | super(Trie2, self).__init__() 102 | self.nb_queries = 0 103 | 104 | def find_longest_match(self, tokens, case_sensitive=True): 105 | self.nb_queries += 1 106 | return super(Trie2, self).find_longest_match(tokens, case_sensitive) 107 | 108 | trie = Trie2() 109 | trie.add(["Donald", "Trump"]) 110 | trie.add(["Donald", "Duck"]) 111 | trie.add(["Donald", "Duck", "Magazine"]) 112 | 113 | gazetteer = gazetteers.GazetteerAnnotator("test_gazetteer", {"ENT":trie}) 114 | doc1 = nlp("Donald Trump is now reading Donald Duck Magazine.") 115 | gazetteer(doc1) 116 | assert trie.nb_queries == 5 117 | -------------------------------------------------------------------------------- 
/tests/test_heuristics.py: -------------------------------------------------------------------------------- 1 | 2 | import skweak 3 | import re 4 | from spacy.tokens import Span #type: ignore 5 | 6 | def time_generator(doc): 7 | i = 0 8 | while i < len(doc): 9 | tok = doc[i] 10 | 11 | if (i < len(doc)-1 and tok.text[0].isdigit() and 12 | doc[i+1].lower_ in {"am", "pm", "a.m.", "p.m.", "am.", "pm."}): 13 | yield i, i+2, "TIME" 14 | i += 1 15 | elif tok.text[0].isdigit() and re.match("\\d{1,2}\\:\\d{1,2}", tok.text): 16 | yield i, i+1, "TIME" 17 | i += 1 18 | i += 1 19 | 20 | def number_generator(doc): 21 | i = 0 22 | while i < len(doc): 23 | tok = doc[i] 24 | 25 | if re.search("\\d", tok.text): 26 | j = i+1 27 | if j < len(doc) and doc[j].lower_ in ["%", "percent", "pc.", "pc", "pct", 28 | "pct.", "percents", "percentage"]: 29 | j += 1 30 | yield i, j, "PERCENT" 31 | elif not re.search("[a-zA-Z]", tok.text): 32 | yield i, j, "CARDINAL" 33 | i = j-1 34 | i += 1 35 | 36 | def test_function(nlp): 37 | doc = nlp("I woke up at 07:30 this morning, being 95% reloaded, with 8 hours of sleep.") 38 | annotator1 = skweak.heuristics.FunctionAnnotator("time", time_generator) 39 | annotator2 = skweak.heuristics.FunctionAnnotator("number", number_generator) 40 | annotator2.add_incompatible_sources(["time"]) 41 | annotator = skweak.base.CombinedAnnotator() 42 | annotator.add_annotator(annotator1) 43 | annotator.add_annotator(annotator2) 44 | doc = annotator(doc) 45 | assert Span(doc, 4,5, "TIME") in doc.spans["time"] 46 | assert Span(doc, 9, 11, "PERCENT") in doc.spans["number"] 47 | assert Span(doc, 14, 15, "CARDINAL") in doc.spans["number"] 48 | 49 | 50 | def test_gap_tokens(nlp): 51 | doc = nlp("The Norwegian Computing Center's Employee Union is a long entity, much longer than Jean-Pierre.") 52 | annotator1 = skweak.heuristics.TokenConstraintAnnotator("test1", skweak.utils.is_likely_proper, "ENT") 53 | doc = annotator1(doc) 54 | assert Span(doc, 1, 4, "ENT") in doc.spans["test1"] 55 | assert Span(doc, 5, 7, "ENT") in doc.spans["test1"] 56 | assert Span(doc, 15, 18, "ENT") in doc.spans["test1"] 57 | annotator2 = skweak.heuristics.TokenConstraintAnnotator("test2", skweak.utils.is_likely_proper, "ENT") 58 | annotator2.add_gap_tokens(["'s", "-"]) 59 | doc = annotator2(doc) 60 | assert Span(doc, 1, 7, "ENT") in doc.spans["test2"] 61 | assert Span(doc, 15, 18, "ENT") in doc.spans["test2"] 62 | 63 | def test_span_annotator(nlp): 64 | doc = nlp("My name is Pierre Lison and I work at the Norwegian Computing Center.") 65 | annotator = skweak.heuristics.TokenConstraintAnnotator("proper", skweak.utils.is_likely_proper, "ENT") 66 | doc = annotator(doc) 67 | assert Span(doc, 3, 5, "ENT") in doc.spans["proper"] 68 | assert Span(doc, 10, 13, "ENT") in doc.spans["proper"] 69 | annotator2 = skweak.heuristics.SpanConstraintAnnotator("rare_proper", "proper", skweak.utils.is_infrequent) 70 | doc = annotator2(doc) 71 | # assert Span(doc, 3, 5, "ENT") in doc.spans["rare_proper"] 72 | 73 | 74 | def test_vicinity(nlp): 75 | doc = nlp("My name is Pierre Lison.") 76 | annotator1 = skweak.heuristics.TokenConstraintAnnotator("proper", skweak.utils.is_likely_proper, "ENT") 77 | annotator2 = skweak.heuristics.VicinityAnnotator("neighbours", {"name":"PERSON"}, "proper") 78 | annotator = skweak.base.CombinedAnnotator().add_annotators(annotator1, annotator2) 79 | doc = annotator(doc) 80 | assert Span(doc, 3, 5, "ENT") in doc.spans["proper"] 81 | assert Span(doc, 3, 5, "PERSON") in doc.spans["neighbours"] 82 | 83 | 84 | 85 | 86 | def 
test_model(nlp): 87 | doc = nlp("My name is Pierre Lison, I come from Belgium and I work at the Norwegian Computing Center.") 88 | 89 | annotator = skweak.spacy.ModelAnnotator("core_web_md", "en_core_web_md") 90 | doc = annotator(doc) 91 | assert Span(doc, 3, 5, "PERSON") in doc.spans["core_web_md"] 92 | assert Span(doc, 9, 10, "GPE") in doc.spans["core_web_md"] 93 | assert (Span(doc, 14, 18, "FAC") in doc.spans["core_web_md"] 94 | or Span(doc, 14, 18, "ORG") in doc.spans["core_web_md"]) 95 | 96 | doc.ents = () 97 | doc, *_ = annotator.pipe([doc]) 98 | assert Span(doc, 3, 5, "PERSON") in doc.spans["core_web_md"] 99 | assert Span(doc, 9, 10, "GPE") in doc.spans["core_web_md"] 100 | assert (Span(doc, 14, 18, "FAC") in doc.spans["core_web_md"] 101 | or Span(doc, 14, 18, "ORG") in doc.spans["core_web_md"]) 102 | 103 | doc.ents = () 104 | annotator1 = skweak.heuristics.TokenConstraintAnnotator("proper", skweak.utils.is_likely_proper, "ENT") 105 | annotator2 = skweak.heuristics.VicinityAnnotator("neighbours", {"name":"PERSON"}, "proper") 106 | annotator = skweak.base.CombinedAnnotator().add_annotators(annotator, annotator1, annotator2) 107 | doc, *_ = annotator.pipe([doc]) 108 | assert Span(doc, 3, 5, "PERSON") in doc.spans["core_web_md"] 109 | assert Span(doc, 9, 10, "GPE") in doc.spans["core_web_md"] 110 | assert (Span(doc, 14, 18, "FAC") in doc.spans["core_web_md"] 111 | or Span(doc, 14, 18, "ORG") in doc.spans["core_web_md"]) 112 | assert Span(doc, 3, 5, "ENT") in doc.spans["proper"] 113 | assert Span(doc, 9, 10, "ENT") in doc.spans["proper"] 114 | assert Span(doc, 15, 18, "ENT") in doc.spans["proper"] 115 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from skweak import utils 3 | import os 4 | from spacy.tokens import Span #type: ignore 5 | 6 | def test_likely_proper(nlp_small, nlp): 7 | for nlpx in [nlp_small, nlp]: 8 | doc = nlpx("This is a test. Please tell me that is works.") 9 | for tok in doc: 10 | assert not utils.is_likely_proper(tok) 11 | doc = nlpx("Pierre Lison is living in Oslo.") 12 | for i, tok in enumerate(doc): 13 | assert utils.is_likely_proper(tok) == (i in {0,1,5}) 14 | doc = nlpx("Short sentence. 
But here, Beyond can be an organisation.") 15 | for i, tok in enumerate(doc): 16 | assert utils.is_likely_proper(tok) == (i in {6}) 17 | 18 | doc = nlp_small("Buying an iPad makes you ekrjøewlkrj in the USA.") 19 | for i, tok in enumerate(doc): 20 | assert utils.is_likely_proper(tok) == (i in {2,8}) 21 | doc = nlp("Buying an iPad makes you ekrjøewlkrj in the USA.") 22 | for i, tok in enumerate(doc): 23 | assert utils.is_likely_proper(tok) == (i in {2,8,5}) 24 | 25 | 26 | def test_infrequent(nlp_small, nlp): 27 | doc = nlp_small("The Moscow Art Museum awaits you") 28 | assert not utils.is_infrequent(doc[:5]) 29 | doc = nlp("The Moscow Art Museum awaits you") 30 | assert utils.is_infrequent(doc[:5]) 31 | doc = nlp_small("completelyUnknownToken") 32 | assert not utils.is_infrequent(doc[:1]) 33 | doc = nlp("completelyUnknownToken") 34 | assert utils.is_infrequent(doc[:1]) 35 | 36 | def test_compound(nlp): 37 | doc = nlp("The White House focuses on risk assessment.") 38 | assert not utils.in_compound(doc[0]) 39 | assert utils.in_compound(doc[1]) 40 | assert utils.in_compound(doc[2]) 41 | assert not utils.in_compound(doc[3]) 42 | assert not utils.in_compound(doc[4]) 43 | assert utils.in_compound(doc[5]) 44 | assert utils.in_compound(doc[6]) 45 | assert not utils.in_compound(doc[7]) 46 | 47 | 48 | 49 | def test_get_spans(nlp_small): 50 | 51 | doc = nlp_small("This is just a small test for checking that the method works correctly") 52 | doc.spans["source1"] = [Span(doc, 0, 2, label="LABEL1"), 53 | Span(doc, 4, 5, label="LABEL2")] 54 | doc.spans["source2"] = [Span(doc, 0, 1, label="LABEL3"), 55 | Span(doc, 2, 6, label="LABEL2")] 56 | doc.spans["source4"] = [Span(doc, 0, 2, label="LABEL2")] 57 | doc.spans["source3"] = [Span(doc, 7, 9, label="LABEL2"), 58 | Span(doc, 1, 4, label="LABEL1")] 59 | 60 | assert set((span.start, span.end) for span in 61 | utils.get_spans(doc, ["source1", "source2"])) == {(0,2), (2,6)} 62 | assert set((span.start, span.end) for span in 63 | utils.get_spans(doc, ["source1", "source3"])) == {(1,4), (4,5), (7,9)} 64 | assert {(span.start, span.end):span.label_ for span in 65 | utils.get_spans(doc, ["source1", "source4"])} == {(0,2):"LABEL2", (4,5):"LABEL2"} 66 | assert set((span.start, span.end) for span in 67 | utils.get_spans(doc, ["source2", "source3"])) == {(0,1), (2,6), (7,9)} 68 | 69 | 70 | 71 | 72 | def test_replace_ner(nlp_small): 73 | doc = nlp_small("Pierre Lison is working at the Norwegian Computing Center.") 74 | assert doc.ents[0].text=="Pierre Lison" 75 | assert doc.ents[0].label_=="PERSON" 76 | doc.spans["test"] = [Span(doc, 6, 9, label="RESEARCH_ORG")] 77 | doc = utils.replace_ner_spans(doc, "test") 78 | assert doc.ents[0].text=="Norwegian Computing Center" 79 | assert doc.ents[0].label_=="RESEARCH_ORG" 80 | 81 | 82 | def test_docbins(nlp_small, temp_file="data/temporary_test.docbin"): 83 | doc = nlp_small("Pierre Lison is working at the Norwegian Computing Center.") 84 | doc2 = nlp_small("He is working on various NLP topics.") 85 | doc.spans["test"] = [Span(doc, 0, 2, label="PERSON")] 86 | utils.docbin_writer([doc, doc2], temp_file) 87 | doc3, doc4 = list(utils.docbin_reader(temp_file, "en_core_web_sm")) 88 | assert doc.text == doc3.text 89 | assert doc2.text == doc4.text 90 | assert [(e.text, e.label_) for e in doc.ents] == [(e.text, e.label_) for e in doc3.ents] 91 | assert doc.user_data == doc3.user_data 92 | os.remove(temp_file) 93 | 94 | 95 | 96 | def test_json(nlp_small, temp_file="data/temporary_test.json"): 97 | import spacy 98 | if 
int(spacy.__version__[0]) > 2: 99 | return 100 | 101 | doc = nlp_small("Pierre Lison is working at the Norwegian Computing Center.") 102 | doc2 = nlp_small("He is working on various NLP topics.") 103 | doc.spans["test"] = [Span(doc, 6, 9, label="RESEARCH_ORG")] 104 | doc2.spans["test"] = [] 105 | 106 | utils.json_writer([doc, doc2], temp_file, source="test") 107 | fd = open(temp_file, "r") 108 | assert "I-RESEARCH_ORG" in fd.read() 109 | fd.close() 110 | os.remove(temp_file) 111 | 112 | 113 | def test_valid_transitions(): 114 | assert utils.is_valid_start("O") 115 | assert utils.is_valid_start("B-ORG") 116 | assert not utils.is_valid_start("I-ORG") 117 | assert utils.is_valid_start("I-ORG", "IO") 118 | assert utils.is_valid_start("U-ORG", "BILUO") 119 | assert not utils.is_valid_start("L-ORG") 120 | 121 | assert utils.is_valid_transition("O","O") 122 | assert utils.is_valid_transition("O","B-ORG") 123 | assert utils.is_valid_transition("O","U-ORG") 124 | assert not utils.is_valid_transition("O","I-ORG") 125 | assert utils.is_valid_transition("O","I-ORG", "IO") 126 | assert not utils.is_valid_transition("O","L-ORG") 127 | 128 | assert utils.is_valid_transition("B-ORG","I-ORG") 129 | assert utils.is_valid_transition("B-ORG","L-ORG", "BILUO") 130 | assert not utils.is_valid_transition("B-ORG","I-GPE") 131 | assert not utils.is_valid_transition("B-ORG","B-ORG", "BILUO") 132 | assert utils.is_valid_transition("I-ORG", "B-ORG") 133 | assert not utils.is_valid_transition("I-ORG", "B-ORG", "BILUO") 134 | assert not utils.is_valid_transition("I-ORG", "O", "BILUO") 135 | assert utils.is_valid_transition("I-ORG", "O") 136 | assert utils.is_valid_transition("I-ORG", "O", "IO") 137 | assert utils.is_valid_transition("I-ORG", "U-GPE") 138 | assert not utils.is_valid_transition("I-ORG", "I-GPE") 139 | assert utils.is_valid_transition("I-ORG", "U-GPE") 140 | assert utils.is_valid_transition("I-ORG", "L-ORG", "BILUO") 141 | assert not utils.is_valid_transition("L-ORG", "L-ORG", "BILUO") 142 | assert not utils.is_valid_transition("L-ORG", "I-ORG", "BILUO") 143 | assert utils.is_valid_transition("U-ORG", "U-ORG") 144 | assert utils.is_valid_transition("U-ORG", "U-GPE") 145 | assert utils.is_valid_transition("U-ORG", "O") 146 | assert utils.is_valid_transition("L-ORG", "O", "BILUO") 147 | assert not utils.is_valid_transition("I-ORG", "O", "BILUO") 148 | --------------------------------------------------------------------------------
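Usage note: the following minimal sketch (not part of the repository files above) illustrates how the labelling sources defined in skweak/heuristics.py and skweak/spacy.py can be combined and then aggregated with the majority voters from skweak/voting.py. The example text, the regex pattern, the source names and the label set are illustrative assumptions; only the class and function signatures are taken from the files listed above.

import spacy
import skweak
from skweak.voting import SequentialMajorityVoter

nlp = spacy.load("en_core_web_sm")
doc = nlp("Pierre Lison joined the Norwegian Computing Center in 2015.")

# Labelling source 1: heuristic marking proper-noun-like spans as ENT
proper = skweak.heuristics.TokenConstraintAnnotator(
    "proper", skweak.utils.is_likely_proper, "ENT")

# Labelling source 2: regex marking four-digit years as DATE
years = skweak.heuristics.RegexAnnotator("years", r"(19|20)\d{2}", "DATE")

# Labelling source 3: a pretrained spaCy NER model
model = skweak.spacy.ModelAnnotator("core_web_sm", "en_core_web_sm")

# Apply the three (possibly conflicting) sources to the document
combined = skweak.base.CombinedAnnotator().add_annotators(proper, years, model)
doc = combined(doc)

# Aggregate their votes token by token with sequential majority voting;
# the aggregated spans are stored in doc.spans["maj_vote"]
voter = SequentialMajorityVoter(
    "maj_vote", labels=["PERSON", "ORG", "GPE", "DATE", "ENT"])
doc = voter(doc)
print(doc.spans["maj_vote"])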