├── .github └── workflows │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .vscode └── settings.json ├── CITATION.cff ├── LICENSE ├── README.md ├── concise_concepts ├── __init__.py ├── conceptualizer │ ├── Conceptualizer.py │ └── __init__.py └── examples │ ├── __init__.py │ ├── data.py │ ├── example_gensim_custom_model.py │ ├── example_gensim_custom_path.py │ ├── example_gensim_default.py │ └── example_spacy.py ├── img ├── example.png └── logo.png ├── poetry.lock ├── pyproject.toml ├── setup.cfg └── tests ├── __init__.py └── test_model_import.py /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [main] 9 | pull_request: 10 | branches: [main] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10"] 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install flake8 pytest pytest-cov 30 | python -m pip install poetry 31 | poetry export -f requirements.txt -o requirements.txt --without-hashes 32 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 33 | python -m spacy download en_core_web_md 34 | - name: Lint with flake8 35 | run: | 36 | # stop the build if there are Python syntax errors or undefined names 37 | flake8 . --count --max-complexity=18 --enable=W0614 --select=C,E,F,W,B,B950 --ignore=E203,E266,E501,W503 --exclude=.git,__pycache__,build,dist --max-line-length=119 --show-source --statistics 38 | - name: Test with pytest 39 | run: | 40 | pytest --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html 41 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [created] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | /test_spacy.py 131 | .model 132 | /concise_concepts/word2vec.model.vectors.npy 133 | /test.html 134 | 135 | # Downloaded models 136 | *.model 137 | *.model.* 138 | *.json 139 | test.py 140 | s2v_old -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: check-added-large-files 6 | - id: end-of-file-fixer 7 | - id: check-ast 8 | - id: check-case-conflict 9 | - id: check-docstring-first 10 | - id: check-merge-conflict 11 | - id: check-symlinks 12 | - id: check-toml 13 | - id: check-xml 14 | - id: check-yaml 15 | - id: destroyed-symlinks 16 | - id: detect-private-key 17 | - id: fix-encoding-pragma 18 | - repo: https://github.com/psf/black 19 | rev: 22.3.0 20 | hooks: 21 | - id: black 22 | - id: black-jupyter 23 | # Execute isort on all changed files (make sure the version is the same as in pyproject) 24 | - repo: https://github.com/pycqa/isort 25 | rev: 5.10.1 26 | hooks: 27 | - id: isort 28 | # Execute flake8 on all changed files (make sure the version is the same as in pyproject) 29 | - repo: https://github.com/pycqa/flake8 30 | rev: 4.0.1 31 | hooks: 32 | - id: flake8 33 | additional_dependencies: 34 | ["flake8-docstrings", "flake8-bugbear", "pep8-naming"] 35 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestEnabled": true, 3 | "python.linting.flake8Enabled": true, 4 | "python.formatting.provider": "black", 5 | "editor.rulers": [ 6 | 119 7 | ], 8 | "python.linting.enabled": true, 9 | } 10 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.0.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: David 5 | given-names: Berenstein 6 | title: "Concise Concepts - an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings." 
7 | version: 0.7.3 8 | date-released: 2022-12-31 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Pandora Intelligence 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Concise Concepts 2 | When you want to apply NER to concise concepts, it is really easy to come up with examples, but pretty difficult to train an entire pipeline. Concise Concepts uses few-shot NER based on word embedding similarity to get you going 3 | with ease! Now with entity scoring! 4 | 5 | 6 | [![Python package](https://github.com/Pandora-Intelligence/concise-concepts/actions/workflows/python-package.yml/badge.svg?branch=main)](https://github.com/Pandora-Intelligence/concise-concepts/actions/workflows/python-package.yml) 7 | [![Current Release Version](https://img.shields.io/github/release/pandora-intelligence/concise-concepts.svg?style=flat-square&logo=github)](https://github.com/pandora-intelligence/concise-concepts/releases) 8 | [![pypi Version](https://img.shields.io/pypi/v/concise-concepts.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/concise-concepts/) 9 | [![PyPi downloads](https://static.pepy.tech/personalized-badge/concise-concepts?period=total&units=international_system&left_color=grey&right_color=orange&left_text=pip%20downloads)](https://pypi.org/project/concise-concepts/) 10 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) 11 | 12 | 13 | ## Usage 14 | This library defines matching patterns based on the most similar words found in each group, which are used to fill a [spaCy EntityRuler](https://spacy.io/api/entityruler). To better understand the rule definition, I recommend playing around with the [spaCy Rule-based Matcher Explorer](https://demos.explosion.ai/matcher). 15 | 16 | ### Tutorials 17 | - [TechVizTheDataScienceGuy](https://www.youtube.com/c/TechVizTheDataScienceGuy) created a [nice tutorial](https://prakhar-mishra.medium.com/few-shot-named-entity-recognition-in-natural-language-processing-92d31f0d1143) on how to use it. 
18 | 19 | - [I](https://www.linkedin.com/in/david-berenstein-1bab11105/) created a [tutorial](https://www.rubrix.ml/blog/concise-concepts-rubrix/) in collaboration with Rubrix. 20 | 21 | The section [Matching Pattern Rules](#matching-pattern-rules) expands on the construction, analysis and customization of these matching patterns. 22 | 23 | 24 | # Install 25 | 26 | ``` 27 | pip install concise-concepts 28 | ``` 29 | 30 | # Quickstart 31 | 32 | Take a look at the [configuration section](#configuration) for more info. 33 | 34 | ## spaCy Pipeline Component 35 | 36 | Note that [custom embedding models](#custom-embedding-models) are passed via `model_path`. 37 | 38 | ```python 39 | import spacy 40 | from spacy import displacy 41 | 42 | data = { 43 | "fruit": ["apple", "pear", "orange"], 44 | "vegetable": ["broccoli", "spinach", "tomato"], 45 | "meat": ['beef', 'pork', 'turkey', 'duck'] 46 | } 47 | 48 | text = """ 49 | Heat the oil in a large pan and add the Onion, celery and carrots. 50 | Then, cook over a medium–low heat for 10 minutes, or until softened. 51 | Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes. 52 | Later, add some oranges and chickens. """ 53 | 54 | nlp = spacy.load("en_core_web_md", disable=["ner"]) 55 | 56 | nlp.add_pipe( 57 | "concise_concepts", 58 | config={ 59 | "data": data, 60 | "ent_score": True, # Entity Scoring section 61 | "verbose": True, 62 | "exclude_pos": ["VERB", "AUX"], 63 | "exclude_dep": ["DOBJ", "PCOMP"], 64 | "include_compound_words": False, 65 | "json_path": "./fruitful_patterns.json", 66 | "topn": (100, 500, 300) 67 | }, 68 | ) 69 | doc = nlp(text) 70 | 71 | options = { 72 | "colors": {"fruit": "darkorange", "vegetable": "limegreen", "meat": "salmon"}, 73 | "ents": ["fruit", "vegetable", "meat"], 74 | } 75 | 76 | ents = doc.ents 77 | for ent in ents: 78 | new_label = f"{ent.label_} ({ent._.ent_score:.0%})" 79 | options["colors"][new_label] = options["colors"].get(ent.label_.lower(), None) 80 | options["ents"].append(new_label) 81 | ent.label_ = new_label 82 | doc.ents = ents 83 | 84 | displacy.render(doc, style="ent", options=options) 85 | ``` 86 | ![](https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/example.png) 87 | 88 | ## Standalone 89 | 90 | This might be useful when iterating over few-shot training data without having to continuously reload larger models. 91 | Note that [custom embedding models](#custom-embedding-models) are passed via `model`. 92 | 93 | ```python 94 | import gensim.downloader 95 | import spacy 96 | 97 | from concise_concepts import Conceptualizer 98 | 99 | model = gensim.downloader.load("fasttext-wiki-news-subwords-300") 100 | nlp = spacy.load("en_core_web_sm") 101 | data = { 102 | "disease": ["cancer", "diabetes", "heart disease", "influenza", "pneumonia"], 103 | "symptom": ["headache", "fever", "cough", "nausea", "vomiting", "diarrhea"], 104 | } 105 | conceptualizer = Conceptualizer(nlp, data, model) 106 | conceptualizer.nlp("I have a headache and a fever.").ents 107 | 108 | data = { 109 | "disease": ["cancer", "diabetes"], 110 | "symptom": ["headache", "fever"], 111 | } 112 | conceptualizer = Conceptualizer(nlp, data, model) 113 | conceptualizer.nlp("I have a headache and a fever.").ents 114 | ``` 115 | 116 | # Configuration 117 | ## Matching Pattern Rules 118 | A general introduction to the usage of matching patterns can be found in the [usage section](#usage); the sketch below illustrates what a generated pattern entry looks like. 
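This is a minimal, illustrative sketch rather than actual output — the real entries are generated from the expanded word lists and depend on your data, the embedding model and the pipeline configuration:

```python
# Illustrative sketch of a single generated pattern entry (assumed values, not real output).
# Patterns like this are added to the spaCy EntityRuler and written to the JSON file
# configured via `json_path` (default: "./matching_patterns.json").
example_pattern = {
    "label": "FRUIT",                 # concept group name, upper-cased
    "pattern": [{"LEMMA": "apple"}],  # token pattern; TEXT is used when the pipeline has no lemmatizer
    "id": "apple_lemma_individual",   # ties the rule back to the expanded word
}
```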
119 | ### Customizing Matching Pattern Rules 120 | Even though the baseline parameters provide a decent result, the construction of these matching rules can be customized via the config passed to the spaCy pipeline. 121 | 122 | - `exclude_pos`: A list of POS tags to be excluded from the rule-based match. 123 | - `exclude_dep`: A list of dependencies to be excluded from the rule-based match. 124 | - `include_compound_words`: If True, it will include compound words in the entity. For example, if the entity is "New York", it will also include "New York City" as an entity. 125 | - `case_sensitive`: Whether to match the case of the words in the text. 126 | 127 | 128 | ### Analyze Matching Pattern Rules 129 | To encourage actually looking at the data and to support interpretability, the generated matching patterns are stored as `./matching_patterns.json` by default. This behavior can be changed by using the `json_path` variable via the config passed to the spaCy pipeline. 130 | 131 | 132 | ## Fuzzy matching using `spaczz` 133 | 134 | - `fuzzy`: A boolean value that determines whether to use fuzzy matching. 135 | 136 | ```python 137 | data = { 138 | "fruit": ["apple", "pear", "orange"], 139 | "vegetable": ["broccoli", "spinach", "tomato"], 140 | "meat": ["beef", "pork", "fish", "lamb"] 141 | } 142 | 143 | nlp.add_pipe("concise_concepts", config={"data": data, "fuzzy": True}) 144 | ``` 145 | 146 | ## Most Similar Word Expansion 147 | 148 | - `topn`: The number of most similar words to expand over for each class. 149 | 150 | ```python 151 | data = { 152 | "fruit": ["apple", "pear", "orange"], 153 | "vegetable": ["broccoli", "spinach", "tomato"], 154 | "meat": ["beef", "pork", "fish", "lamb"] 155 | } 156 | 157 | topn = [50, 50, 150] 158 | 159 | assert len(topn) == len(data) 160 | 161 | nlp.add_pipe("concise_concepts", config={"data": data, "topn": topn}) 162 | ``` 163 | 164 | ## Entity Scoring 165 | 166 | - `ent_score`: Use embedding-based word similarity to score entities against their groups. 167 | 168 | ```python 169 | import spacy 170 | 171 | data = { 172 | "ORG": ["Google", "Apple", "Amazon"], 173 | "GPE": ["Netherlands", "France", "China"], 174 | } 175 | 176 | text = """Sony was founded in Japan.""" 177 | 178 | nlp = spacy.load("en_core_web_lg") 179 | nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True, "case_sensitive": True}) 180 | doc = nlp(text) 181 | 182 | print([(ent.text, ent.label_, ent._.ent_score) for ent in doc.ents]) 183 | # output 184 | # 185 | # [('Sony', 'ORG', 0.5207586), ('Japan', 'GPE', 0.7371268)] 186 | ``` 187 | 188 | ## Custom Embedding Models 189 | 190 | - `model_path`: Use a custom `sense2vec.Sense2Vec`, `gensim.Word2Vec`, `gensim.FastText`, or `gensim.KeyedVectors` model, a pretrained model from the [gensim](https://radimrehurek.com/gensim/downloader.html) library, or a path to a local model. For using a `sense2vec.Sense2Vec` model, take a look [here](https://github.com/explosion/sense2vec#pretrained-vectors). 191 | - `model`: within [standalone usage](#standalone), it is possible to pass these models directly, as shown in the sketch below. 
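As a rough sketch of the `model` option (the chosen pretrained vectors and the data below are only placeholders for illustration), an already loaded gensim model can be handed to the standalone `Conceptualizer` directly, so the vectors are read once and reused across runs:

```python
import gensim.downloader
import spacy

from concise_concepts import Conceptualizer

data = {
    "fruit": ["apple", "pear", "orange"],
    "vegetable": ["broccoli", "spinach", "tomato"],
}

# Load the vectors once and reuse the same object for multiple Conceptualizer instances.
model = gensim.downloader.load("glove-wiki-gigaword-300")
nlp = spacy.load("en_core_web_sm")

conceptualizer = Conceptualizer(nlp, data, model=model)
print(conceptualizer.nlp("I ate an apple and some spinach.").ents)
```

For the spaCy pipeline component, `model_path` is set in the config instead, as in the example below.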
192 | 193 | ```python 194 | data = { 195 | "fruit": ["apple", "pear", "orange"], 196 | "vegetable": ["broccoli", "spinach", "tomato"], 197 | "meat": ["beef", "pork", "fish", "lamb"] 198 | } 199 | 200 | # model from https://radimrehurek.com/gensim/downloader.html or path to local file 201 | model_path = "glove-wiki-gigaword-300" 202 | 203 | nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) 204 | ```` 205 | -------------------------------------------------------------------------------- /concise_concepts/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from typing import List, Union 3 | 4 | from gensim.models import FastText, Word2Vec 5 | from gensim.models.keyedvectors import KeyedVectors 6 | from spacy.language import Language 7 | 8 | from .conceptualizer import Conceptualizer 9 | 10 | 11 | @Language.factory( 12 | "concise_concepts", 13 | default_config={ 14 | "data": None, 15 | "topn": None, 16 | "model_path": None, 17 | "word_delimiter": "_", 18 | "ent_score": False, 19 | "exclude_pos": [ 20 | "VERB", 21 | "AUX", 22 | "ADP", 23 | "DET", 24 | "CCONJ", 25 | "PUNCT", 26 | "ADV", 27 | "ADJ", 28 | "PART", 29 | "PRON", 30 | ], 31 | "exclude_dep": [], 32 | "include_compound_words": False, 33 | "fuzzy": False, 34 | "case_sensitive": False, 35 | "json_path": "./matching_patterns.json", 36 | "verbose": True, 37 | }, 38 | ) 39 | def make_concise_concepts( 40 | nlp: Language, 41 | name: str, 42 | data: Union[dict, list], 43 | topn: Union[list, None], 44 | model_path: Union[str, FastText, Word2Vec, KeyedVectors, None], 45 | word_delimiter: str, 46 | ent_score: bool, 47 | exclude_pos: List[str], 48 | exclude_dep: List[str], 49 | include_compound_words: bool, 50 | fuzzy: bool, 51 | case_sensitive: bool, 52 | json_path: str, 53 | verbose: bool, 54 | ): 55 | return Conceptualizer( 56 | nlp=nlp, 57 | data=data, 58 | topn=topn, 59 | model=model_path, 60 | word_delimiter=word_delimiter, 61 | ent_score=ent_score, 62 | exclude_pos=exclude_pos, 63 | exclude_dep=exclude_dep, 64 | include_compound_words=include_compound_words, 65 | fuzzy=fuzzy, 66 | case_sensitive=case_sensitive, 67 | json_path=json_path, 68 | verbose=verbose, 69 | name=name, 70 | ) 71 | -------------------------------------------------------------------------------- /concise_concepts/conceptualizer/Conceptualizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | import types 6 | from copy import deepcopy 7 | from pathlib import Path 8 | from typing import List, Union 9 | 10 | import gensim.downloader 11 | import spaczz # noqa: F401 12 | from gensim import matutils # utility fnc for pickling, common scipy operations etc 13 | from gensim.models import FastText, Word2Vec 14 | from gensim.models.keyedvectors import KeyedVectors 15 | from numpy import argmax, dot 16 | from sense2vec import Sense2Vec 17 | from spacy import Language, util 18 | from spacy.tokens import Doc, Span 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | POS_LIST = [ 23 | "ADJ", 24 | "ADP", 25 | "ADV", 26 | "AUX", 27 | "CONJ", 28 | "CCONJ", 29 | "DET", 30 | "INTJ", 31 | "NOUN", 32 | "NUM", 33 | "PART", 34 | "PRON", 35 | "PROPN", 36 | "PUNCT", 37 | "SCONJ", 38 | "SYM", 39 | "VERB", 40 | "X", 41 | "SPACE", 42 | ] 43 | 44 | 45 | class Conceptualizer: 46 | def __init__( 47 | self, 48 | nlp: Language, 49 | data: dict = {}, 50 | model: Union[str, FastText, 
KeyedVectors, Word2Vec] = None, 51 | topn: list = None, 52 | word_delimiter: str = "_", 53 | ent_score: bool = False, 54 | exclude_pos: list = None, 55 | exclude_dep: list = None, 56 | include_compound_words: bool = False, 57 | case_sensitive: bool = False, 58 | fuzzy: bool = False, 59 | json_path: str = "./matching_patterns.json", 60 | verbose: bool = True, 61 | name: str = "concise_concepts", 62 | ): 63 | """ 64 | The function takes in a dictionary of words and their synonyms, and then creates a new dictionary of words and 65 | their synonyms, but with the words in the new dictionary all in uppercase 66 | 67 | :param nlp: The spaCy model to use. 68 | :type nlp: Language 69 | :param name: The name of the entity. 70 | :type name: str 71 | :param data: A dictionary of the words you want to match. The keys are the classes you want to match, 72 | and the values are the words you want to expand over. 73 | :type data: dict 74 | :param topn: The number of words to be returned for each class. 75 | :type topn: list 76 | :param model_path: The path to the model you want to use. If you don't have a model, you can use the spaCy one. 77 | :param word_delimiter: The delimiter used to separate words in model the dictionary, defaults to _ (optional) 78 | :param ent_score: If True, the extension "ent_score" will be added to the Span object. This will be the score of 79 | the entity, defaults to False (optional) 80 | :param exclude_pos: A list of POS tags to exclude from the rule based match 81 | :param exclude_dep: list of dependencies to exclude from the rule based match 82 | :param include_compound_words: If True, it will include compound words in the entity. For example, 83 | if the entity is "New York", it will also include "New York City" as an entity, defaults to False (optional) 84 | :param case_sensitive: Whether to match the case of the words in the text, defaults to False (optional) 85 | """ 86 | assert data, ValueError("You must provide a dictionary of words to match") 87 | self.verbose = verbose 88 | self.log_cache = {"key": list(), "word": list(), "key_word": list()} 89 | if Span.has_extension("ent_score"): 90 | Span.remove_extension("ent_score") 91 | if ent_score: 92 | Span.set_extension("ent_score", default=None) 93 | self.ent_score = ent_score 94 | self.data = data 95 | self.name = name 96 | self.nlp = nlp 97 | self.fuzzy = fuzzy 98 | self.topn = topn 99 | self.model = model 100 | self.match_rule = {} 101 | self.set_exclude_pos(exclude_pos) 102 | self.set_exclude_dep(exclude_dep) 103 | self.json_path = json_path 104 | self.include_compound_words = include_compound_words 105 | self.case_sensitive = case_sensitive 106 | self.word_delimiter = word_delimiter 107 | if "lemmatizer" not in self.nlp.component_names: 108 | logger.warning( 109 | "No lemmatizer found in spacy pipeline. Consider adding it for matching" 110 | " on LEMMA instead of exact text." 111 | ) 112 | self.match_key = "TEXT" 113 | else: 114 | self.match_key = "LEMMA" 115 | 116 | for ruler in ["entity_ruler", "spaczz_ruler"]: 117 | if ruler in self.nlp.component_names: 118 | logger.warning( 119 | f"{ruler} already exists in the pipeline. 
Removing old rulers" 120 | ) 121 | self.nlp.remove_pipe(ruler) 122 | self.run() 123 | 124 | def set_exclude_dep(self, exclude_dep: list): 125 | if exclude_dep is None: 126 | exclude_dep = [] 127 | if exclude_dep: 128 | self.match_rule["DEP"] = {"NOT_IN": exclude_dep} 129 | 130 | def set_exclude_pos(self, exclude_pos: list): 131 | if exclude_pos is None: 132 | exclude_pos = [ 133 | "VERB", 134 | "AUX", 135 | "ADP", 136 | "DET", 137 | "CCONJ", 138 | "PUNCT", 139 | "ADV", 140 | "ADJ", 141 | "PART", 142 | "PRON", 143 | ] 144 | if exclude_pos: 145 | self.match_rule["POS"] = {"NOT_IN": exclude_pos} 146 | self.exclude_pos = exclude_pos 147 | else: 148 | self.exclude_pos = [] 149 | 150 | def run(self) -> None: 151 | self.check_validity_path() 152 | self.set_gensim_model() 153 | self.verify_data(self.verbose) 154 | self.determine_topn() 155 | self.expand_concepts() 156 | # settle words around overlapping concepts 157 | for _ in range(5): 158 | self.expand_concepts() 159 | self.infer_original_data() 160 | self.resolve_overlapping_concepts() 161 | self.infer_original_data() 162 | self.create_conceptual_patterns() 163 | self.set_concept_dict() 164 | 165 | if not self.ent_score: 166 | del self.kv 167 | 168 | self.data_upper = {k.upper(): v for k, v in self.data.items()} 169 | 170 | def check_validity_path(self) -> None: 171 | """ 172 | If the path is a file, create the parent directory if it doesn't exist. If the path is a directory, create the 173 | directory and set the path to the default file name 174 | """ 175 | if self.json_path: 176 | if Path(self.json_path).suffix: 177 | Path(self.json_path).parents[0].mkdir(parents=True, exist_ok=True) 178 | else: 179 | Path(self.json_path).mkdir(parents=True, exist_ok=True) 180 | old_path = str(self.json_path) 181 | self.json_path = Path(self.json_path) / "matching_patterns.json" 182 | logger.warning( 183 | f"Path ´{old_path} is a directory, not a file. Setting" 184 | f" ´json_path´to {self.json_path}" 185 | ) 186 | 187 | def determine_topn(self) -> None: 188 | """ 189 | If the user doesn't specify a topn value for each class, 190 | then the topn value for each class is set to 100 191 | """ 192 | if self.topn is None: 193 | self.topn_dict = {key: 100 for key in self.data} 194 | else: 195 | num_classes = len(self.data) 196 | assert ( 197 | len(self.topn) == num_classes 198 | ), f"Provide a topn integer for each of the {num_classes} classes." 199 | self.topn_dict = dict(zip(self.data, self.topn)) 200 | 201 | def set_gensim_model(self) -> None: 202 | """ 203 | If the model_path is not None, then we try to load the model from the path. 204 | If it's not a valid path, then we raise an exception. 
205 | If the model_path is None, then we load the model from the internal embeddings of the spacy model 206 | """ 207 | if isinstance(self.model, str): 208 | if self.model: 209 | available_models = gensim.downloader.info()["models"] 210 | if self.model in available_models: 211 | self.kv = gensim.downloader.load(self.model) 212 | else: 213 | try: 214 | self.kv = Sense2Vec().from_disk(self.model) 215 | except Exception as e0: 216 | try: 217 | self.kv = FastText.load(self.model).wv 218 | except Exception as e1: 219 | try: 220 | self.kv = Word2Vec.load(self.model).wv 221 | except Exception as e2: 222 | try: 223 | self.kv = KeyedVectors.load(self.model) 224 | except Exception as e3: 225 | try: 226 | self.kv = KeyedVectors.load_word2vec_format( 227 | self.model, binary=True 228 | ) 229 | except Exception as e4: 230 | raise Exception( 231 | "Not a valid model.Sense2Vec, FastText," 232 | f" Word2Vec, KeyedVectors.\n {e0}\n {e1}\n" 233 | f" {e2}\n {e3}\n {e4}" 234 | ) 235 | elif isinstance(self.model, (FastText, Word2Vec)): 236 | self.kv = self.model.wv 237 | elif isinstance(self.model, KeyedVectors): 238 | self.kv = self.model 239 | elif isinstance(self.model, Sense2Vec): 240 | self.kv = self.model 241 | else: 242 | wordList = [] 243 | vectorList = [] 244 | 245 | assert len( 246 | self.nlp.vocab.vectors 247 | ), "Choose a spaCy model with internal embeddings, e.g. md or lg." 248 | 249 | for key, vector in self.nlp.vocab.vectors.items(): 250 | wordList.append(self.nlp.vocab.strings[key]) 251 | vectorList.append(vector) 252 | 253 | self.kv = KeyedVectors(self.nlp.vocab.vectors_length) 254 | 255 | self.kv.add_vectors(wordList, vectorList) 256 | 257 | def verify_data(self, verbose: bool = True) -> None: 258 | """ 259 | It takes a dictionary of lists of words, and returns a dictionary of lists of words, 260 | where each word in the list is present in the word2vec model 261 | """ 262 | verified_data: dict[str, list[str]] = dict() 263 | for key, value in self.data.items(): 264 | verified_values = [] 265 | present_key = self._check_presence_vocab(key) 266 | if present_key: 267 | key = present_key 268 | if not present_key and verbose and key not in self.log_cache["key"]: 269 | logger.warning(f"key ´{key}´ not present in vector model") 270 | self.log_cache["key"].append(key) 271 | for word in value: 272 | present_word = self._check_presence_vocab(word) 273 | if present_word: 274 | verified_values.append(present_word) 275 | elif verbose and word not in self.log_cache["word"]: 276 | logger.warning( 277 | f"word ´{word}´ from key ´{key}´ not present in vector model" 278 | ) 279 | self.log_cache["word"].append(word) 280 | verified_data[key] = verified_values 281 | if not len(verified_values): 282 | msg = ( 283 | f"None of the entries for key {key} are present in the vector" 284 | " model. " 285 | ) 286 | if present_key: 287 | logger.warning( 288 | msg + f"Using {present_key} as word to expand over instead." 
289 | ) 290 | verified_data[key] = present_key 291 | else: 292 | raise Exception(msg) 293 | self.data = deepcopy(verified_data) 294 | self.original_data = deepcopy(verified_data) 295 | 296 | def expand_concepts(self) -> None: 297 | """ 298 | For each key in the data dictionary, find the topn most similar words to the key and the values in the data 299 | dictionary, and add those words to the values in the data dictionary 300 | """ 301 | for key in self.data: 302 | present_key = self._check_presence_vocab(key) 303 | if present_key: 304 | key_list = [present_key] 305 | else: 306 | key_list = [] 307 | if isinstance(self.kv, Sense2Vec): 308 | similar = self.kv.most_similar( 309 | self.data[key] + key_list, 310 | n=self.topn_dict[key], 311 | ) 312 | else: 313 | similar = self.kv.most_similar( 314 | self.data[key] + key_list, 315 | topn=self.topn_dict[key], 316 | ) 317 | self.data[key] = list({word for word, _ratio in similar}) 318 | 319 | def resolve_overlapping_concepts(self) -> None: 320 | """ 321 | It removes words from the data that are in other concepts, and then removes words that are not closest to the 322 | centroid of the concept 323 | """ 324 | for key in self.data: 325 | self.data[key] = [ 326 | word 327 | for word in self.data[key] 328 | if key == self.most_similar_to_given(word, list(self.data.keys())) 329 | ] 330 | 331 | def most_similar_to_given(self, key1, keys_list): 332 | """Get the `key` from `keys_list` most similar to `key1`.""" 333 | return keys_list[argmax([self.similarity(key1, key) for key in keys_list])] 334 | 335 | def similarity(self, w1, w2): 336 | """Compute cosine similarity between two keys. 337 | 338 | Parameters 339 | ---------- 340 | w1 : str 341 | Input key. 342 | w2 : str 343 | Input key. 344 | 345 | Returns 346 | ------- 347 | float 348 | Cosine similarity between `w1` and `w2`. 349 | 350 | """ 351 | return dot(matutils.unitvec(self.kv[w1]), matutils.unitvec(self.kv[w2])) 352 | 353 | def infer_original_data(self) -> None: 354 | """ 355 | It takes the original data and adds the new data to it, then removes the new data from the original data. 356 | """ 357 | for key in self.data: 358 | self.data[key] = list(set(self.data[key] + self.original_data[key])) 359 | 360 | for key_x in self.data: 361 | for key_y in self.data: 362 | if key_x != key_y: 363 | self.data[key_x] = [ 364 | word 365 | for word in self.data[key_x] 366 | if word not in self.original_data[key_y] 367 | ] 368 | 369 | def lemmatize_concepts(self) -> None: 370 | """ 371 | For each key in the data dictionary, 372 | the function takes the list of concepts associated with that key, and lemmatizes 373 | each concept. 374 | """ 375 | for key in self.data: 376 | self.data[key] = list( 377 | set([doc[0].lemma_ for doc in self.nlp.pipe(self.data[key])]) 378 | ) 379 | 380 | def create_conceptual_patterns(self) -> None: 381 | """ 382 | For each key in the data dictionary, 383 | create a pattern for each word in the list of words associated with that key. 384 | 385 | 386 | The pattern is a dictionary with three keys: 387 | 388 | 1. "lemma" 389 | 2. "POS" 390 | 3. "DEP" 391 | 392 | The value for each key is another dictionary with one key and one value. 393 | 394 | The key is either "regex" or "NOT_IN" or "IN". 395 | 396 | The value is either a regular expression or a list of strings. 397 | 398 | The regular expression is the word associated with the key in the data dictionary. 399 | 400 | The list of strings is either ["VERB"] or ["nsubjpass"] or ["amod", "compound"]. 
401 | 402 | The regular expression is case insensitive. 403 | 404 | The pattern is 405 | """ 406 | lemma_patterns = [] 407 | fuzzy_patterns = [] 408 | 409 | def add_patterns(input_dict: dict) -> None: 410 | """ 411 | It creates a list of dictionaries that can be used for a spaCy entity ruler 412 | 413 | :param input_dict: a dictionary 414 | :type input_dict: dict 415 | """ 416 | 417 | if isinstance(self.kv, Sense2Vec): 418 | input_dict = { 419 | key.split("|")[0]: [word.split("|")[0] for word in value] 420 | for key, value in input_dict.items() 421 | } 422 | for key in input_dict: 423 | words = input_dict[key] 424 | for word in words: 425 | if word != key: 426 | word_parts = self._split_word(word) 427 | op_pattern = { 428 | "TEXT": { 429 | "REGEX": "|".join([" ", "-", "_", "/"]), 430 | "OP": "*", 431 | } 432 | } 433 | partial_pattern_parts = [] 434 | lemma_pattern_parts = [] 435 | for partial_pattern in word_parts: 436 | word_part = partial_pattern 437 | if self.fuzzy: 438 | partial_pattern = { 439 | "FUZZY": word_part, 440 | } 441 | partial_pattern = {"TEXT": partial_pattern} 442 | lemma_pattern_parts.append({self.match_key: word_part}) 443 | lemma_pattern_parts.append(op_pattern) 444 | partial_pattern_parts.append(partial_pattern) 445 | partial_pattern_parts.append(op_pattern) 446 | 447 | pattern = { 448 | "label": key.upper(), 449 | "pattern": partial_pattern_parts[:-1], 450 | "id": f"{word}_individual", 451 | } 452 | 453 | # add fuzzy matching formatting if fuzzy matching is enabled 454 | fuzzy_patterns.append(pattern) 455 | 456 | # add lemmma matching 457 | if lemma_pattern_parts: 458 | lemma_pattern = { 459 | "label": key.upper(), 460 | "pattern": lemma_pattern_parts[:-1], 461 | "id": f"{word}_lemma_individual", 462 | } 463 | lemma_patterns.append(lemma_pattern) 464 | 465 | if self.include_compound_words: 466 | compound_rule = [ 467 | { 468 | "DEP": {"IN": ["amod", "compound"]}, 469 | "OP": "*", 470 | } 471 | ] 472 | partial_pattern_parts.append( 473 | { 474 | "label": key.upper(), 475 | "pattern": compound_rule 476 | + partial_pattern_parts[:-1] 477 | + compound_rule, 478 | "id": f"{word}_compound", 479 | } 480 | ) 481 | if lemma_pattern_parts: 482 | lemma_patterns.append( 483 | { 484 | "label": key.upper(), 485 | "pattern": compound_rule 486 | + lemma_pattern_parts[:-1] 487 | + compound_rule, 488 | "id": f"{word}_lemma_compound", 489 | } 490 | ) 491 | 492 | add_patterns(self.data) 493 | 494 | if self.json_path: 495 | with open(self.json_path, "w") as f: 496 | json.dump(lemma_patterns + fuzzy_patterns, f) 497 | 498 | config = {"overwrite_ents": True} 499 | if self.case_sensitive: 500 | config["phrase_matcher_attr"] = "LOWER" 501 | 502 | self.ruler = self.nlp.add_pipe("entity_ruler", config=config) 503 | self.ruler.add_patterns(lemma_patterns) 504 | 505 | # Add spaczz entity ruler if fuzzy 506 | if self.fuzzy: 507 | for pattern in fuzzy_patterns: 508 | pattern["type"] = "token" 509 | self.fuzzy_ruler = self.nlp.add_pipe("spaczz_ruler", config=config) 510 | self.fuzzy_ruler.add_patterns(fuzzy_patterns) 511 | 512 | def __call__(self, doc: Doc) -> Doc: 513 | """ 514 | It takes a doc object and assigns a score to each entity in the doc object 515 | 516 | :param doc: Doc 517 | :type doc: Doc 518 | """ 519 | if isinstance(doc, str): 520 | doc = self.nlp(doc) 521 | elif isinstance(doc, Doc): 522 | if self.ent_score: 523 | doc = self.assign_score_to_entities(doc) 524 | 525 | return doc 526 | 527 | def pipe(self, stream, batch_size=128) -> Doc: 528 | """ 529 | It takes a stream of documents, 
and for each document, 530 | it assigns a score to each entity in the document 531 | 532 | :param stream: a generator of documents 533 | :param batch_size: The number of documents to be processed at a time, defaults to 128 (optional) 534 | """ 535 | if isinstance(stream, str): 536 | stream = [stream] 537 | 538 | if not isinstance(stream, types.GeneratorType): 539 | stream = self.nlp.pipe(stream, batch_size=batch_size) 540 | 541 | for docs in util.minibatch(stream, size=batch_size): 542 | for doc in docs: 543 | if self.ent_score: 544 | doc = self.assign_score_to_entities(doc) 545 | yield doc 546 | 547 | def assign_score_to_entities(self, doc: Doc) -> Doc: 548 | """ 549 | The function takes a spaCy document as input and assigns a score to each entity in the document. The score is 550 | calculated using the word embeddings of the entity and the concept. 551 | The score is assigned to the entity using the 552 | `._.ent_score` attribute 553 | 554 | :param doc: Doc 555 | :type doc: Doc 556 | :return: The doc object with the entities and their scores. 557 | """ 558 | ents = doc.ents 559 | for ent in ents: 560 | if ent.label_ in self.data_upper: 561 | ent_text = ent.text 562 | 563 | # get word part representations 564 | if self._check_presence_vocab(ent_text): 565 | entity = [self._check_presence_vocab(ent_text)] 566 | else: 567 | entity = [] 568 | for part in self._split_word(ent_text): 569 | present_part = self._check_presence_vocab(part) 570 | if present_part: 571 | entity.append(present_part) 572 | 573 | # get concepts to match 574 | concept = self.concept_data.get(ent.label_, None) 575 | 576 | # compare set similarities 577 | if entity and concept: 578 | ent._.ent_score = self.kv.n_similarity(entity, concept) 579 | else: 580 | ent._.ent_score = 0 581 | if self.verbose: 582 | if f"{ent_text}_{concept}" not in self.log_cache["key_word"]: 583 | logger.warning( 584 | f"Entity ´{ent.text}´ and/or label ´{concept}´ not" 585 | " found in vector model. Nothing to compare to, so" 586 | " setting ent._.ent_score to 0." 587 | ) 588 | self.log_cache["key_word"].append(f"{ent_text}_{concept}") 589 | else: 590 | ent._.ent_score = 0 591 | if self.verbose: 592 | if ent.text not in self.log_cache["word"]: 593 | logger.warning( 594 | f"Entity ´{ent.text}´ not found in vector model. Nothing to" 595 | " compare to, so setting ent._.ent_score to 0." 596 | ) 597 | self.log_cache["word"].append(ent.text) 598 | doc.ents = ents 599 | return doc 600 | 601 | def set_concept_dict(self): 602 | self.concept_data = {k.upper(): v for k, v in self.data.items()} 603 | for ent_label in self.concept_data: 604 | concept = [] 605 | for word in self.concept_data[ent_label]: 606 | present_word = self._check_presence_vocab(word) 607 | if present_word: 608 | concept.append(present_word) 609 | self.concept_data[ent_label] = concept 610 | 611 | def _split_word(self, word: str) -> List[str]: 612 | """ 613 | It splits a word into a list of subwords, using the word delimiter 614 | 615 | :param word: str 616 | :type word: str 617 | :return: A list of strings or any. 618 | """ 619 | return re.split(f"[{re.escape(self.word_delimiter)}]+", word) 620 | 621 | def _check_presence_vocab(self, word: str) -> str: 622 | """ 623 | If the word is not lowercase and the case_sensitive flag is set to False, then check if the lowercase version of 624 | the word is in the vocabulary. If it is, return the lowercase version of the word. 
Otherwise, return the word 625 | itself 626 | 627 | :param word: The word to check for presence in the vocabulary 628 | :type word: str 629 | :return: The word itself if it is present in the vocabulary, otherwise the word with the highest probability of 630 | being the word that was intended. 631 | """ 632 | word = word.replace(" ", "_") 633 | if not word.islower() and not self.case_sensitive: 634 | present_word = self.__check_presence_vocab(word.lower()) 635 | if present_word: 636 | return present_word 637 | return self.__check_presence_vocab(word) 638 | 639 | def __check_presence_vocab(self, word: str) -> str: 640 | """ 641 | If the word is in the vocabulary, return the word. If not, replace spaces and dashes with the word delimiter and 642 | check if the new word is in the vocabulary. If so, return the new word 643 | 644 | :param word: str - the word to check 645 | :type word: str 646 | :return: The word or the check_word 647 | """ 648 | if isinstance(self.kv, Sense2Vec): 649 | return self.kv.get_best_sense(word, (set(POS_LIST) - set(self.exclude_pos))) 650 | else: 651 | if word in self.kv: 652 | return word 653 | -------------------------------------------------------------------------------- /concise_concepts/conceptualizer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .Conceptualizer import Conceptualizer 3 | 4 | __all__ = ["Conceptualizer"] 5 | -------------------------------------------------------------------------------- /concise_concepts/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/f31d1c3aa5a9a6908790ed04fc77edc18bf9221a/concise_concepts/examples/__init__.py -------------------------------------------------------------------------------- /concise_concepts/examples/data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | text = """ 3 | Heat the oil in a large pan and add the Onion, celery and carrots. 4 | Then, cook over a medium–low heat for 10 minutes, or until softened. 5 | Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes. 6 | Later, add some oranges, chickens. """ 7 | 8 | text_fuzzy = """ 9 | Heat the oil in a large pan and add the Onion, celery and carots. 10 | Then, cook over a medium–low heat for 10 minutes, or until softened. 11 | Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes. 12 | Later, add some oranges, chickens. """ 13 | 14 | data = { 15 | "fruit": ["apple", "pear", "orange"], 16 | "vegetable": ["broccoli", "spinach", "tomato"], 17 | "meat": ["chicken", "beef", "pork", "fish", "lamb"], 18 | } 19 | -------------------------------------------------------------------------------- /concise_concepts/examples/example_gensim_custom_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import spacy 3 | from gensim.models import Word2Vec 4 | from gensim.test.utils import common_texts 5 | 6 | import concise_concepts # noqa: F401 7 | 8 | data = {"human": ["trees"], "interface": ["computer"]} 9 | 10 | text = ( 11 | "believe me, it's the slowest mobile I saw. Don't go on screen and Battery, it is" 12 | " an extremely slow mobile phone and takes ages to open and navigate. Forget about" 13 | " heavy use, it can't handle normal regular use. 
I made a huge mistake but pls" 14 | " don't buy this mobile. It's only a few months and I am thinking to change it. Its" 15 | " dam SLOW SLOW SLOW." 16 | ) 17 | 18 | model = Word2Vec( 19 | sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4 20 | ) 21 | model.save("word2vec.model") 22 | model_path = "word2vec.model" 23 | 24 | nlp = spacy.blank("en") 25 | nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) 26 | -------------------------------------------------------------------------------- /concise_concepts/examples/example_gensim_custom_path.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import gensim.downloader as api 3 | import spacy 4 | 5 | import concise_concepts # noqa: F401 6 | 7 | from .data import data, text 8 | 9 | model_path = "word2vec.model" 10 | model = api.load("glove-twitter-25") 11 | model.save(model_path) 12 | nlp = spacy.blank("en") 13 | 14 | nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) 15 | 16 | doc = nlp(text) 17 | print([(ent.text, ent.label_) for ent in doc.ents]) 18 | -------------------------------------------------------------------------------- /concise_concepts/examples/example_gensim_default.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import spacy 3 | 4 | import concise_concepts # noqa: F401 5 | 6 | from .data import data, text 7 | 8 | model_path = "glove-twitter-25" 9 | 10 | nlp = spacy.blank("en") 11 | 12 | nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) 13 | 14 | doc = nlp(text) 15 | print([(ent.text, ent.label_) for ent in doc.ents]) 16 | -------------------------------------------------------------------------------- /concise_concepts/examples/example_spacy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import spacy 3 | 4 | import concise_concepts # noqa: F401 5 | 6 | from .data import data, text 7 | 8 | nlp = spacy.load("en_core_web_md") 9 | 10 | nlp.add_pipe("concise_concepts", config={"data": data}) 11 | 12 | doc = nlp(text) 13 | print([(ent.text, ent.label_) for ent in doc.ents]) 14 | -------------------------------------------------------------------------------- /img/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/f31d1c3aa5a9a6908790ed04fc77edc18bf9221a/img/example.png -------------------------------------------------------------------------------- /img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/f31d1c3aa5a9a6908790ed04fc77edc18bf9221a/img/logo.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "concise-concepts" 3 | version = "0.8.1" 4 | description = "This repository contains an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings. Now with entity confidence scores!" 
5 | authors = ["David Berenstein "] 6 | license = "MIT" 7 | readme = "README.md" 8 | homepage = "https://github.com/pandora-intelligence/concise-concepts" 9 | repository = "https://github.com/pandora-intelligence/concise-concepts" 10 | documentation = "https://github.com/pandora-intelligence/concise-concepts" 11 | keywords = ["spacy", "NER", "few-shot classification", "nlu"] 12 | classifiers = [ 13 | "Intended Audience :: Developers", 14 | "Intended Audience :: Science/Research", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent", 17 | "Programming Language :: Python :: 3.8", 18 | "Programming Language :: Python :: 3.9", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Topic :: Scientific/Engineering", 22 | "Topic :: Software Development" 23 | ] 24 | packages = [{include = "concise_concepts"}] 25 | 26 | 27 | [tool.poetry.dependencies] 28 | python = ">=3.8,<3.12" 29 | spacy = "^3" 30 | scipy = "^1.7" 31 | gensim = "^4" 32 | spaczz = "^0.5.4" 33 | sense2vec = "^2.0.1" 34 | 35 | [tool.poetry.plugins] 36 | 37 | [tool.poetry.plugins."spacy_factories"] 38 | "spacy" = "concise_concepts.__init__:make_concise_concepts" 39 | 40 | [tool.poetry.group.dev.dependencies] 41 | black = "^22" 42 | flake8 = "^5" 43 | pytest = "^7.1" 44 | pre-commit = "^2.20" 45 | pep8-naming = "^0.13" 46 | flake8-bugbear = "^22.9" 47 | flake8-docstrings = "^1.6" 48 | ipython = "^8.7.0" 49 | ipykernel = "^6.17.1" 50 | 51 | [build-system] 52 | requires = ["poetry-core>=1.0.0"] 53 | build-backend = "poetry.core.masonry.api" 54 | 55 | [tool.pytest.ini_options] 56 | testpaths = "tests" 57 | 58 | [tool.black] 59 | preview = true 60 | 61 | [tool.isort] 62 | profile = "black" 63 | src_paths = ["concise_concepts"] 64 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 119 3 | max-complexity = 18 4 | docstring-convention=google 5 | exclude = .git,__pycache__,build,dist 6 | select = C,E,F,W,B,B950 7 | ignore = 8 | E203,E266,E501,W503 9 | enable = 10 | W0614 11 | per-file-ignores = 12 | test_*.py: D 13 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/f31d1c3aa5a9a6908790ed04fc77edc18bf9221a/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_model_import.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | def test_spacy_embeddings(): 3 | from concise_concepts.examples import example_spacy # noqa: F401 4 | 5 | 6 | def test_gensim_default(): 7 | from concise_concepts.examples import example_gensim_default # noqa: F401 8 | 9 | 10 | def test_gensim_custom_path(): 11 | from concise_concepts.examples import example_gensim_custom_path # noqa: F401 12 | 13 | 14 | def test_gensim_custom_model(): 15 | from concise_concepts.examples import example_gensim_custom_model # noqa: F401 16 | 17 | 18 | def test_standalone_spacy(): 19 | import spacy 20 | 21 | from concise_concepts import Conceptualizer 22 | 23 | nlp = spacy.load("en_core_web_md") 24 | data = { 25 | "disease": ["cancer", "diabetes", "heart disease", "influenza", "pneumonia"], 26 | "symptom": ["headache", "fever", "cough", 
"nausea", "vomiting", "diarrhea"], 27 | } 28 | conceptualizer = Conceptualizer(nlp, data) 29 | assert ( 30 | list(conceptualizer.pipe(["I have a headache and a fever."]))[0].to_json() 31 | == list(conceptualizer.nlp.pipe(["I have a headache and a fever."]))[ 32 | 0 33 | ].to_json() 34 | ) 35 | assert ( 36 | conceptualizer("I have a headache and a fever.").to_json() 37 | == conceptualizer.nlp("I have a headache and a fever.").to_json() 38 | ) 39 | 40 | data = { 41 | "disease": ["cancer", "diabetes"], 42 | "symptom": ["headache", "fever"], 43 | } 44 | conceptualizer = Conceptualizer(nlp, data) 45 | 46 | 47 | def test_standalone_gensim(): 48 | import gensim 49 | import spacy 50 | 51 | from concise_concepts import Conceptualizer 52 | 53 | model_path = "glove-twitter-25" 54 | model = gensim.downloader.load(model_path) 55 | nlp = spacy.load("en_core_web_md") 56 | data = { 57 | "disease": ["cancer", "diabetes", "heart disease", "influenza", "pneumonia"], 58 | "symptom": ["headache", "fever", "cough", "nausea", "vomiting", "diarrhea"], 59 | } 60 | conceptualizer = Conceptualizer(nlp, data, model=model) 61 | print(list(conceptualizer.pipe(["I have a headache and a fever."]))[0].ents) 62 | print(list(conceptualizer.nlp.pipe(["I have a headache and a fever."]))[0].ents) 63 | print(conceptualizer("I have a headache and a fever.").ents) 64 | print(conceptualizer.nlp("I have a headache and a fever.").ents) 65 | 66 | 67 | def test_spaczz(): 68 | # -*- coding: utf-8 -*- 69 | import spacy 70 | 71 | import concise_concepts # noqa: F401 72 | from concise_concepts.examples.data import data, text, text_fuzzy 73 | 74 | nlp = spacy.load("en_core_web_md") 75 | 76 | nlp.add_pipe("concise_concepts", config={"data": data, "fuzzy": True}) 77 | 78 | assert len(nlp(text).ents) == len(nlp(text_fuzzy).ents) 79 | 80 | 81 | def test_sense2vec(): 82 | # -*- coding: utf-8 -*- 83 | import requests 84 | import spacy 85 | 86 | import concise_concepts # noqa: F401 87 | from concise_concepts.examples.data import data, text 88 | 89 | model_path = "s2v_old" 90 | # download .tar.gz file an URL 91 | # and extract it to a folder 92 | url = "https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz" 93 | r = requests.get(url, allow_redirects=True) 94 | open("s2v_reddit_2015_md.tar.gz", "wb").write(r.content) 95 | # extract tar.gz file 96 | filename = "s2v_reddit_2015_md.tar.gz" 97 | import tarfile 98 | 99 | tar = tarfile.open(filename, "r:gz") 100 | tar.extractall() 101 | tar.close() 102 | 103 | nlp = spacy.load("en_core_web_md") 104 | 105 | nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) 106 | 107 | assert len(nlp(text).ents) 108 | --------------------------------------------------------------------------------