├── .github └── workflows │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .vscode └── settings.json ├── CITATION.cff ├── LICENSE ├── README.md ├── concise_concepts ├── __init__.py ├── conceptualizer │ ├── Conceptualizer.py │ └── __init__.py └── examples │ ├── __init__.py │ ├── data.py │ ├── example_gensim_custom_model.py │ ├── example_gensim_custom_path.py │ ├── example_gensim_default.py │ └── example_spacy.py ├── img ├── example.png └── logo.png ├── poetry.lock ├── pyproject.toml ├── setup.cfg └── tests ├── __init__.py └── test_model_import.py /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [main] 9 | pull_request: 10 | branches: [main] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10"] 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install flake8 pytest pytest-cov 30 | python -m pip install poetry 31 | poetry export -f requirements.txt -o requirements.txt --without-hashes 32 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 33 | python -m spacy download en_core_web_md 34 | - name: Lint with flake8 35 | run: | 36 | # stop the build if there are Python syntax errors or undefined names 37 | flake8 . --count --max-complexity=18 --enable=W0614 --select=C,E,F,W,B,B950 --ignore=E203,E266,E501,W503 --exclude=.git,__pycache__,build,dist --max-line-length=119 --show-source --statistics 38 | - name: Test with pytest 39 | run: | 40 | pytest --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html 41 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [created] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | /test_spacy.py 131 | .model 132 | /concise_concepts/word2vec.model.vectors.npy 133 | /test.html 134 | 135 | # Downloaded models 136 | *.model 137 | *.model.* 138 | *.json 139 | test.py 140 | s2v_old -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: check-added-large-files 6 | - id: end-of-file-fixer 7 | - id: check-ast 8 | - id: check-case-conflict 9 | - id: check-docstring-first 10 | - id: check-merge-conflict 11 | - id: check-symlinks 12 | - id: check-toml 13 | - id: check-xml 14 | - id: check-yaml 15 | - id: destroyed-symlinks 16 | - id: detect-private-key 17 | - id: fix-encoding-pragma 18 | - repo: https://github.com/psf/black 19 | rev: 22.3.0 20 | hooks: 21 | - id: black 22 | - id: black-jupyter 23 | # Execute isort on all changed files (make sure the version is the same as in pyproject) 24 | - repo: https://github.com/pycqa/isort 25 | rev: 5.10.1 26 | hooks: 27 | - id: isort 28 | # Execute flake8 on all changed files (make sure the version is the same as in pyproject) 29 | - repo: https://github.com/pycqa/flake8 30 | rev: 4.0.1 31 | hooks: 32 | - id: flake8 33 | additional_dependencies: 34 | ["flake8-docstrings", "flake8-bugbear", "pep8-naming"] 35 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestEnabled": true, 3 | "python.linting.flake8Enabled": true, 4 | "python.formatting.provider": "black", 5 | "editor.rulers": [ 6 | 119 7 | ], 8 | "python.linting.enabled": true, 9 | } 10 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.0.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: David 5 | given-names: Berenstein 6 | title: "Concise Concepts - an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings." 
7 | version: 0.7.3 8 | date-released: 2022-12-31 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Pandora Intelligence 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Concise Concepts 2 | When you want to apply NER to concise concepts, it is really easy to come up with examples, but pretty difficult to train an entire pipeline. Concise Concepts uses few-shot NER based on word embedding similarity to get you going 3 | with ease! Now with entity scoring! 4 | 5 | 6 | [![Python package](https://github.com/Pandora-Intelligence/concise-concepts/actions/workflows/python-package.yml/badge.svg?branch=main)](https://github.com/Pandora-Intelligence/concise-concepts/actions/workflows/python-package.yml) 7 | [![Current Release Version](https://img.shields.io/github/release/pandora-intelligence/concise-concepts.svg?style=flat-square&logo=github)](https://github.com/pandora-intelligence/concise-concepts/releases) 8 | [![pypi Version](https://img.shields.io/pypi/v/concise-concepts.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/concise-concepts/) 9 | [![PyPi downloads](https://static.pepy.tech/personalized-badge/concise-concepts?period=total&units=international_system&left_color=grey&right_color=orange&left_text=pip%20downloads)](https://pypi.org/project/concise-concepts/) 10 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) 11 | 12 | 13 | ## Usage 14 | This library defines matching patterns based on the most similar words found in each group, which are used to fill a [spaCy EntityRuler](https://spacy.io/api/entityruler). To better understand the rule definition, I recommend playing around with the [spaCy Rule-based Matcher Explorer](https://demos.explosion.ai/matcher). 15 | 16 | ### Tutorials 17 | - [TechVizTheDataScienceGuy](https://www.youtube.com/c/TechVizTheDataScienceGuy) created a [nice tutorial](https://prakhar-mishra.medium.com/few-shot-named-entity-recognition-in-natural-language-processing-92d31f0d1143) on how to use it. 
18 | 19 | - [I](https://www.linkedin.com/in/david-berenstein-1bab11105/) created a [tutorial](https://www.rubrix.ml/blog/concise-concepts-rubrix/) in collaboration with Rubrix. 20 | 21 | The section [Matching Pattern Rules](#matching-pattern-rules) expands on the construction, analysis and customization of these matching patterns. 22 | 23 | 24 | # Install 25 | 26 | ``` 27 | pip install concise-concepts 28 | ``` 29 | 30 | # Quickstart 31 | 32 | Take a look at the [configuration section](#configuration) for more info. 33 | 34 | ## spaCy Pipeline Component 35 | 36 | Note that [custom embedding models](#custom-embedding-models) are passed via `model_path`. 37 | 38 | ```python 39 | import spacy 40 | from spacy import displacy 41 | 42 | data = { 43 | "fruit": ["apple", "pear", "orange"], 44 | "vegetable": ["broccoli", "spinach", "tomato"], 45 | "meat": ['beef', 'pork', 'turkey', 'duck'] 46 | } 47 | 48 | text = """ 49 | Heat the oil in a large pan and add the Onion, celery and carrots. 50 | Then, cook over a medium–low heat for 10 minutes, or until softened. 51 | Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes. 52 | Later, add some oranges and chickens. """ 53 | 54 | nlp = spacy.load("en_core_web_md", disable=["ner"]) 55 | 56 | nlp.add_pipe( 57 | "concise_concepts", 58 | config={ 59 | "data": data, 60 | "ent_score": True, # Entity Scoring section 61 | "verbose": True, 62 | "exclude_pos": ["VERB", "AUX"], 63 | "exclude_dep": ["DOBJ", "PCOMP"], 64 | "include_compound_words": False, 65 | "json_path": "./fruitful_patterns.json", 66 | "topn": (100, 500, 300) 67 | }, 68 | ) 69 | doc = nlp(text) 70 | 71 | options = { 72 | "colors": {"fruit": "darkorange", "vegetable": "limegreen", "meat": "salmon"}, 73 | "ents": ["fruit", "vegetable", "meat"], 74 | } 75 | 76 | ents = doc.ents 77 | for ent in ents: 78 | new_label = f"{ent.label_} ({ent._.ent_score:.0%})" 79 | options["colors"][new_label] = options["colors"].get(ent.label_.lower(), None) 80 | options["ents"].append(new_label) 81 | ent.label_ = new_label 82 | doc.ents = ents 83 | 84 | displacy.render(doc, style="ent", options=options) 85 | ``` 86 | ![](https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/example.png) 87 | 88 | ## Standalone 89 | 90 | This might be useful when iterating over few-shot training data without having to continuously reload larger models. 91 | Note that [custom embedding models](#custom-embedding-models) are passed via `model`. 92 | 93 | ```python 94 | import gensim.downloader 95 | import spacy 96 | 97 | from concise_concepts import Conceptualizer 98 | 99 | model = gensim.downloader.load("fasttext-wiki-news-subwords-300") 100 | nlp = spacy.load("en_core_web_sm") 101 | data = { 102 | "disease": ["cancer", "diabetes", "heart disease", "influenza", "pneumonia"], 103 | "symptom": ["headache", "fever", "cough", "nausea", "vomiting", "diarrhea"], 104 | } 105 | conceptualizer = Conceptualizer(nlp, data, model) 106 | conceptualizer.nlp("I have a headache and a fever.").ents 107 | 108 | data = { 109 | "disease": ["cancer", "diabetes"], 110 | "symptom": ["headache", "fever"], 111 | } 112 | conceptualizer = Conceptualizer(nlp, data, model) 113 | conceptualizer.nlp("I have a headache and a fever.").ents 114 | ``` 115 | 116 | # Configuration 117 | ## Matching Pattern Rules 118 | A general introduction to the usage of matching patterns can be found in the [usage section](#usage); the sketch below illustrates what a generated pattern entry looks like. 
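This is a minimal, illustrative sketch rather than actual output — the real entries are generated from the expanded word lists and depend on your data, the embedding model and the pipeline configuration:

```python
# Illustrative sketch of a single generated pattern entry (assumed values, not real output).
# Patterns like this are added to the spaCy EntityRuler and written to the JSON file
# configured via `json_path` (default: "./matching_patterns.json").
example_pattern = {
    "label": "FRUIT",                 # concept group name, upper-cased
    "pattern": [{"LEMMA": "apple"}],  # token pattern; TEXT is used when the pipeline has no lemmatizer
    "id": "apple_lemma_individual",   # ties the rule back to the expanded word
}
```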
119 | ### Customizing Matching Pattern Rules 120 | Even though the baseline parameters provide a decent result, the construction of these matching rules can be customized via the config passed to the spaCy pipeline. 121 | 122 | - `exclude_pos`: A list of POS tags to be excluded from the rule-based match. 123 | - `exclude_dep`: A list of dependencies to be excluded from the rule-based match. 124 | - `include_compound_words`: If True, it will include compound words in the entity. For example, if the entity is "New York", it will also include "New York City" as an entity. 125 | - `case_sensitive`: Whether to match the case of the words in the text. 126 | 127 | 128 | ### Analyze Matching Pattern Rules 129 | To encourage actually looking at the data and to support interpretability, the generated matching patterns are stored as `./matching_patterns.json` by default. This behavior can be changed by using the `json_path` variable via the config passed to the spaCy pipeline. 130 | 131 | 132 | ## Fuzzy matching using `spaczz` 133 | 134 | - `fuzzy`: A boolean value that determines whether to use fuzzy matching. 135 | 136 | ```python 137 | data = { 138 | "fruit": ["apple", "pear", "orange"], 139 | "vegetable": ["broccoli", "spinach", "tomato"], 140 | "meat": ["beef", "pork", "fish", "lamb"] 141 | } 142 | 143 | nlp.add_pipe("concise_concepts", config={"data": data, "fuzzy": True}) 144 | ``` 145 | 146 | ## Most Similar Word Expansion 147 | 148 | - `topn`: The number of most similar words to expand over for each class. 149 | 150 | ```python 151 | data = { 152 | "fruit": ["apple", "pear", "orange"], 153 | "vegetable": ["broccoli", "spinach", "tomato"], 154 | "meat": ["beef", "pork", "fish", "lamb"] 155 | } 156 | 157 | topn = [50, 50, 150] 158 | 159 | assert len(topn) == len(data) 160 | 161 | nlp.add_pipe("concise_concepts", config={"data": data, "topn": topn}) 162 | ``` 163 | 164 | ## Entity Scoring 165 | 166 | - `ent_score`: Use embedding-based word similarity to score entities against their groups. 167 | 168 | ```python 169 | import spacy 170 | 171 | data = { 172 | "ORG": ["Google", "Apple", "Amazon"], 173 | "GPE": ["Netherlands", "France", "China"], 174 | } 175 | 176 | text = """Sony was founded in Japan.""" 177 | 178 | nlp = spacy.load("en_core_web_lg") 179 | nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True, "case_sensitive": True}) 180 | doc = nlp(text) 181 | 182 | print([(ent.text, ent.label_, ent._.ent_score) for ent in doc.ents]) 183 | # output 184 | # 185 | # [('Sony', 'ORG', 0.5207586), ('Japan', 'GPE', 0.7371268)] 186 | ``` 187 | 188 | ## Custom Embedding Models 189 | 190 | - `model_path`: Use a custom `sense2vec.Sense2Vec`, `gensim.Word2Vec`, `gensim.FastText`, or `gensim.KeyedVectors` model, a pretrained model from the [gensim](https://radimrehurek.com/gensim/downloader.html) library, or a path to a local model. For using a `sense2vec.Sense2Vec` model, take a look [here](https://github.com/explosion/sense2vec#pretrained-vectors). 191 | - `model`: within [standalone usage](#standalone), it is possible to pass these models directly, as shown in the sketch below. 
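As a rough sketch of the `model` option (the chosen pretrained vectors and the data below are only placeholders for illustration), an already loaded gensim model can be handed to the standalone `Conceptualizer` directly, so the vectors are read once and reused across runs:

```python
import gensim.downloader
import spacy

from concise_concepts import Conceptualizer

data = {
    "fruit": ["apple", "pear", "orange"],
    "vegetable": ["broccoli", "spinach", "tomato"],
}

# Load the vectors once and reuse the same object for multiple Conceptualizer instances.
model = gensim.downloader.load("glove-wiki-gigaword-300")
nlp = spacy.load("en_core_web_sm")

conceptualizer = Conceptualizer(nlp, data, model=model)
print(conceptualizer.nlp("I ate an apple and some spinach.").ents)
```

For the spaCy pipeline component, `model_path` is set in the config instead, as in the example below.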
192 | 193 | ```python 194 | data = { 195 | "fruit": ["apple", "pear", "orange"], 196 | "vegetable": ["broccoli", "spinach", "tomato"], 197 | "meat": ["beef", "pork", "fish", "lamb"] 198 | } 199 | 200 | # model from https://radimrehurek.com/gensim/downloader.html or path to local file 201 | model_path = "glove-wiki-gigaword-300" 202 | 203 | nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) 204 | ```` 205 | -------------------------------------------------------------------------------- /concise_concepts/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from typing import List, Union 3 | 4 | from gensim.models import FastText, Word2Vec 5 | from gensim.models.keyedvectors import KeyedVectors 6 | from spacy.language import Language 7 | 8 | from .conceptualizer import Conceptualizer 9 | 10 | 11 | @Language.factory( 12 | "concise_concepts", 13 | default_config={ 14 | "data": None, 15 | "topn": None, 16 | "model_path": None, 17 | "word_delimiter": "_", 18 | "ent_score": False, 19 | "exclude_pos": [ 20 | "VERB", 21 | "AUX", 22 | "ADP", 23 | "DET", 24 | "CCONJ", 25 | "PUNCT", 26 | "ADV", 27 | "ADJ", 28 | "PART", 29 | "PRON", 30 | ], 31 | "exclude_dep": [], 32 | "include_compound_words": False, 33 | "fuzzy": False, 34 | "case_sensitive": False, 35 | "json_path": "./matching_patterns.json", 36 | "verbose": True, 37 | }, 38 | ) 39 | def make_concise_concepts( 40 | nlp: Language, 41 | name: str, 42 | data: Union[dict, list], 43 | topn: Union[list, None], 44 | model_path: Union[str, FastText, Word2Vec, KeyedVectors, None], 45 | word_delimiter: str, 46 | ent_score: bool, 47 | exclude_pos: List[str], 48 | exclude_dep: List[str], 49 | include_compound_words: bool, 50 | fuzzy: bool, 51 | case_sensitive: bool, 52 | json_path: str, 53 | verbose: bool, 54 | ): 55 | return Conceptualizer( 56 | nlp=nlp, 57 | data=data, 58 | topn=topn, 59 | model=model_path, 60 | word_delimiter=word_delimiter, 61 | ent_score=ent_score, 62 | exclude_pos=exclude_pos, 63 | exclude_dep=exclude_dep, 64 | include_compound_words=include_compound_words, 65 | fuzzy=fuzzy, 66 | case_sensitive=case_sensitive, 67 | json_path=json_path, 68 | verbose=verbose, 69 | name=name, 70 | ) 71 | -------------------------------------------------------------------------------- /concise_concepts/conceptualizer/Conceptualizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | import types 6 | from copy import deepcopy 7 | from pathlib import Path 8 | from typing import List, Union 9 | 10 | import gensim.downloader 11 | import spaczz # noqa: F401 12 | from gensim import matutils # utility fnc for pickling, common scipy operations etc 13 | from gensim.models import FastText, Word2Vec 14 | from gensim.models.keyedvectors import KeyedVectors 15 | from numpy import argmax, dot 16 | from sense2vec import Sense2Vec 17 | from spacy import Language, util 18 | from spacy.tokens import Doc, Span 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | POS_LIST = [ 23 | "ADJ", 24 | "ADP", 25 | "ADV", 26 | "AUX", 27 | "CONJ", 28 | "CCONJ", 29 | "DET", 30 | "INTJ", 31 | "NOUN", 32 | "NUM", 33 | "PART", 34 | "PRON", 35 | "PROPN", 36 | "PUNCT", 37 | "SCONJ", 38 | "SYM", 39 | "VERB", 40 | "X", 41 | "SPACE", 42 | ] 43 | 44 | 45 | class Conceptualizer: 46 | def __init__( 47 | self, 48 | nlp: Language, 49 | data: dict = {}, 50 | model: Union[str, FastText, 
KeyedVectors, Word2Vec] = None, 51 | topn: list = None, 52 | word_delimiter: str = "_", 53 | ent_score: bool = False, 54 | exclude_pos: list = None, 55 | exclude_dep: list = None, 56 | include_compound_words: bool = False, 57 | case_sensitive: bool = False, 58 | fuzzy: bool = False, 59 | json_path: str = "./matching_patterns.json", 60 | verbose: bool = True, 61 | name: str = "concise_concepts", 62 | ): 63 | """ 64 | The function takes in a dictionary of words and their synonyms, and then creates a new dictionary of words and 65 | their synonyms, but with the words in the new dictionary all in uppercase 66 | 67 | :param nlp: The spaCy model to use. 68 | :type nlp: Language 69 | :param name: The name of the entity. 70 | :type name: str 71 | :param data: A dictionary of the words you want to match. The keys are the classes you want to match, 72 | and the values are the words you want to expand over. 73 | :type data: dict 74 | :param topn: The number of words to be returned for each class. 75 | :type topn: list 76 | :param model_path: The path to the model you want to use. If you don't have a model, you can use the spaCy one. 77 | :param word_delimiter: The delimiter used to separate words in model the dictionary, defaults to _ (optional) 78 | :param ent_score: If True, the extension "ent_score" will be added to the Span object. This will be the score of 79 | the entity, defaults to False (optional) 80 | :param exclude_pos: A list of POS tags to exclude from the rule based match 81 | :param exclude_dep: list of dependencies to exclude from the rule based match 82 | :param include_compound_words: If True, it will include compound words in the entity. For example, 83 | if the entity is "New York", it will also include "New York City" as an entity, defaults to False (optional) 84 | :param case_sensitive: Whether to match the case of the words in the text, defaults to False (optional) 85 | """ 86 | assert data, ValueError("You must provide a dictionary of words to match") 87 | self.verbose = verbose 88 | self.log_cache = {"key": list(), "word": list(), "key_word": list()} 89 | if Span.has_extension("ent_score"): 90 | Span.remove_extension("ent_score") 91 | if ent_score: 92 | Span.set_extension("ent_score", default=None) 93 | self.ent_score = ent_score 94 | self.data = data 95 | self.name = name 96 | self.nlp = nlp 97 | self.fuzzy = fuzzy 98 | self.topn = topn 99 | self.model = model 100 | self.match_rule = {} 101 | self.set_exclude_pos(exclude_pos) 102 | self.set_exclude_dep(exclude_dep) 103 | self.json_path = json_path 104 | self.include_compound_words = include_compound_words 105 | self.case_sensitive = case_sensitive 106 | self.word_delimiter = word_delimiter 107 | if "lemmatizer" not in self.nlp.component_names: 108 | logger.warning( 109 | "No lemmatizer found in spacy pipeline. Consider adding it for matching" 110 | " on LEMMA instead of exact text." 111 | ) 112 | self.match_key = "TEXT" 113 | else: 114 | self.match_key = "LEMMA" 115 | 116 | for ruler in ["entity_ruler", "spaczz_ruler"]: 117 | if ruler in self.nlp.component_names: 118 | logger.warning( 119 | f"{ruler} already exists in the pipeline. 
Removing old rulers" 120 | ) 121 | self.nlp.remove_pipe(ruler) 122 | self.run() 123 | 124 | def set_exclude_dep(self, exclude_dep: list): 125 | if exclude_dep is None: 126 | exclude_dep = [] 127 | if exclude_dep: 128 | self.match_rule["DEP"] = {"NOT_IN": exclude_dep} 129 | 130 | def set_exclude_pos(self, exclude_pos: list): 131 | if exclude_pos is None: 132 | exclude_pos = [ 133 | "VERB", 134 | "AUX", 135 | "ADP", 136 | "DET", 137 | "CCONJ", 138 | "PUNCT", 139 | "ADV", 140 | "ADJ", 141 | "PART", 142 | "PRON", 143 | ] 144 | if exclude_pos: 145 | self.match_rule["POS"] = {"NOT_IN": exclude_pos} 146 | self.exclude_pos = exclude_pos 147 | else: 148 | self.exclude_pos = [] 149 | 150 | def run(self) -> None: 151 | self.check_validity_path() 152 | self.set_gensim_model() 153 | self.verify_data(self.verbose) 154 | self.determine_topn() 155 | self.expand_concepts() 156 | # settle words around overlapping concepts 157 | for _ in range(5): 158 | self.expand_concepts() 159 | self.infer_original_data() 160 | self.resolve_overlapping_concepts() 161 | self.infer_original_data() 162 | self.create_conceptual_patterns() 163 | self.set_concept_dict() 164 | 165 | if not self.ent_score: 166 | del self.kv 167 | 168 | self.data_upper = {k.upper(): v for k, v in self.data.items()} 169 | 170 | def check_validity_path(self) -> None: 171 | """ 172 | If the path is a file, create the parent directory if it doesn't exist. If the path is a directory, create the 173 | directory and set the path to the default file name 174 | """ 175 | if self.json_path: 176 | if Path(self.json_path).suffix: 177 | Path(self.json_path).parents[0].mkdir(parents=True, exist_ok=True) 178 | else: 179 | Path(self.json_path).mkdir(parents=True, exist_ok=True) 180 | old_path = str(self.json_path) 181 | self.json_path = Path(self.json_path) / "matching_patterns.json" 182 | logger.warning( 183 | f"Path ´{old_path} is a directory, not a file. Setting" 184 | f" ´json_path´to {self.json_path}" 185 | ) 186 | 187 | def determine_topn(self) -> None: 188 | """ 189 | If the user doesn't specify a topn value for each class, 190 | then the topn value for each class is set to 100 191 | """ 192 | if self.topn is None: 193 | self.topn_dict = {key: 100 for key in self.data} 194 | else: 195 | num_classes = len(self.data) 196 | assert ( 197 | len(self.topn) == num_classes 198 | ), f"Provide a topn integer for each of the {num_classes} classes." 199 | self.topn_dict = dict(zip(self.data, self.topn)) 200 | 201 | def set_gensim_model(self) -> None: 202 | """ 203 | If the model_path is not None, then we try to load the model from the path. 204 | If it's not a valid path, then we raise an exception. 
205 | If the model_path is None, then we load the model from the internal embeddings of the spacy model 206 | """ 207 | if isinstance(self.model, str): 208 | if self.model: 209 | available_models = gensim.downloader.info()["models"] 210 | if self.model in available_models: 211 | self.kv = gensim.downloader.load(self.model) 212 | else: 213 | try: 214 | self.kv = Sense2Vec().from_disk(self.model) 215 | except Exception as e0: 216 | try: 217 | self.kv = FastText.load(self.model).wv 218 | except Exception as e1: 219 | try: 220 | self.kv = Word2Vec.load(self.model).wv 221 | except Exception as e2: 222 | try: 223 | self.kv = KeyedVectors.load(self.model) 224 | except Exception as e3: 225 | try: 226 | self.kv = KeyedVectors.load_word2vec_format( 227 | self.model, binary=True 228 | ) 229 | except Exception as e4: 230 | raise Exception( 231 | "Not a valid model.Sense2Vec, FastText," 232 | f" Word2Vec, KeyedVectors.\n {e0}\n {e1}\n" 233 | f" {e2}\n {e3}\n {e4}" 234 | ) 235 | elif isinstance(self.model, (FastText, Word2Vec)): 236 | self.kv = self.model.wv 237 | elif isinstance(self.model, KeyedVectors): 238 | self.kv = self.model 239 | elif isinstance(self.model, Sense2Vec): 240 | self.kv = self.model 241 | else: 242 | wordList = [] 243 | vectorList = [] 244 | 245 | assert len( 246 | self.nlp.vocab.vectors 247 | ), "Choose a spaCy model with internal embeddings, e.g. md or lg." 248 | 249 | for key, vector in self.nlp.vocab.vectors.items(): 250 | wordList.append(self.nlp.vocab.strings[key]) 251 | vectorList.append(vector) 252 | 253 | self.kv = KeyedVectors(self.nlp.vocab.vectors_length) 254 | 255 | self.kv.add_vectors(wordList, vectorList) 256 | 257 | def verify_data(self, verbose: bool = True) -> None: 258 | """ 259 | It takes a dictionary of lists of words, and returns a dictionary of lists of words, 260 | where each word in the list is present in the word2vec model 261 | """ 262 | verified_data: dict[str, list[str]] = dict() 263 | for key, value in self.data.items(): 264 | verified_values = [] 265 | present_key = self._check_presence_vocab(key) 266 | if present_key: 267 | key = present_key 268 | if not present_key and verbose and key not in self.log_cache["key"]: 269 | logger.warning(f"key ´{key}´ not present in vector model") 270 | self.log_cache["key"].append(key) 271 | for word in value: 272 | present_word = self._check_presence_vocab(word) 273 | if present_word: 274 | verified_values.append(present_word) 275 | elif verbose and word not in self.log_cache["word"]: 276 | logger.warning( 277 | f"word ´{word}´ from key ´{key}´ not present in vector model" 278 | ) 279 | self.log_cache["word"].append(word) 280 | verified_data[key] = verified_values 281 | if not len(verified_values): 282 | msg = ( 283 | f"None of the entries for key {key} are present in the vector" 284 | " model. " 285 | ) 286 | if present_key: 287 | logger.warning( 288 | msg + f"Using {present_key} as word to expand over instead." 
289 | ) 290 | verified_data[key] = present_key 291 | else: 292 | raise Exception(msg) 293 | self.data = deepcopy(verified_data) 294 | self.original_data = deepcopy(verified_data) 295 | 296 | def expand_concepts(self) -> None: 297 | """ 298 | For each key in the data dictionary, find the topn most similar words to the key and the values in the data 299 | dictionary, and add those words to the values in the data dictionary 300 | """ 301 | for key in self.data: 302 | present_key = self._check_presence_vocab(key) 303 | if present_key: 304 | key_list = [present_key] 305 | else: 306 | key_list = [] 307 | if isinstance(self.kv, Sense2Vec): 308 | similar = self.kv.most_similar( 309 | self.data[key] + key_list, 310 | n=self.topn_dict[key], 311 | ) 312 | else: 313 | similar = self.kv.most_similar( 314 | self.data[key] + key_list, 315 | topn=self.topn_dict[key], 316 | ) 317 | self.data[key] = list({word for word, _ratio in similar}) 318 | 319 | def resolve_overlapping_concepts(self) -> None: 320 | """ 321 | It removes words from the data that are in other concepts, and then removes words that are not closest to the 322 | centroid of the concept 323 | """ 324 | for key in self.data: 325 | self.data[key] = [ 326 | word 327 | for word in self.data[key] 328 | if key == self.most_similar_to_given(word, list(self.data.keys())) 329 | ] 330 | 331 | def most_similar_to_given(self, key1, keys_list): 332 | """Get the `key` from `keys_list` most similar to `key1`.""" 333 | return keys_list[argmax([self.similarity(key1, key) for key in keys_list])] 334 | 335 | def similarity(self, w1, w2): 336 | """Compute cosine similarity between two keys. 337 | 338 | Parameters 339 | ---------- 340 | w1 : str 341 | Input key. 342 | w2 : str 343 | Input key. 344 | 345 | Returns 346 | ------- 347 | float 348 | Cosine similarity between `w1` and `w2`. 349 | 350 | """ 351 | return dot(matutils.unitvec(self.kv[w1]), matutils.unitvec(self.kv[w2])) 352 | 353 | def infer_original_data(self) -> None: 354 | """ 355 | It takes the original data and adds the new data to it, then removes the new data from the original data. 356 | """ 357 | for key in self.data: 358 | self.data[key] = list(set(self.data[key] + self.original_data[key])) 359 | 360 | for key_x in self.data: 361 | for key_y in self.data: 362 | if key_x != key_y: 363 | self.data[key_x] = [ 364 | word 365 | for word in self.data[key_x] 366 | if word not in self.original_data[key_y] 367 | ] 368 | 369 | def lemmatize_concepts(self) -> None: 370 | """ 371 | For each key in the data dictionary, 372 | the function takes the list of concepts associated with that key, and lemmatizes 373 | each concept. 374 | """ 375 | for key in self.data: 376 | self.data[key] = list( 377 | set([doc[0].lemma_ for doc in self.nlp.pipe(self.data[key])]) 378 | ) 379 | 380 | def create_conceptual_patterns(self) -> None: 381 | """ 382 | For each key in the data dictionary, 383 | create a pattern for each word in the list of words associated with that key. 384 | 385 | 386 | The pattern is a dictionary with three keys: 387 | 388 | 1. "lemma" 389 | 2. "POS" 390 | 3. "DEP" 391 | 392 | The value for each key is another dictionary with one key and one value. 393 | 394 | The key is either "regex" or "NOT_IN" or "IN". 395 | 396 | The value is either a regular expression or a list of strings. 397 | 398 | The regular expression is the word associated with the key in the data dictionary. 399 | 400 | The list of strings is either ["VERB"] or ["nsubjpass"] or ["amod", "compound"]. 
401 | 402 | The regular expression is case insensitive. 403 | 404 | The pattern is 405 | """ 406 | lemma_patterns = [] 407 | fuzzy_patterns = [] 408 | 409 | def add_patterns(input_dict: dict) -> None: 410 | """ 411 | It creates a list of dictionaries that can be used for a spaCy entity ruler 412 | 413 | :param input_dict: a dictionary 414 | :type input_dict: dict 415 | """ 416 | 417 | if isinstance(self.kv, Sense2Vec): 418 | input_dict = { 419 | key.split("|")[0]: [word.split("|")[0] for word in value] 420 | for key, value in input_dict.items() 421 | } 422 | for key in input_dict: 423 | words = input_dict[key] 424 | for word in words: 425 | if word != key: 426 | word_parts = self._split_word(word) 427 | op_pattern = { 428 | "TEXT": { 429 | "REGEX": "|".join([" ", "-", "_", "/"]), 430 | "OP": "*", 431 | } 432 | } 433 | partial_pattern_parts = [] 434 | lemma_pattern_parts = [] 435 | for partial_pattern in word_parts: 436 | word_part = partial_pattern 437 | if self.fuzzy: 438 | partial_pattern = { 439 | "FUZZY": word_part, 440 | } 441 | partial_pattern = {"TEXT": partial_pattern} 442 | lemma_pattern_parts.append({self.match_key: word_part}) 443 | lemma_pattern_parts.append(op_pattern) 444 | partial_pattern_parts.append(partial_pattern) 445 | partial_pattern_parts.append(op_pattern) 446 | 447 | pattern = { 448 | "label": key.upper(), 449 | "pattern": partial_pattern_parts[:-1], 450 | "id": f"{word}_individual", 451 | } 452 | 453 | # add fuzzy matching formatting if fuzzy matching is enabled 454 | fuzzy_patterns.append(pattern) 455 | 456 | # add lemmma matching 457 | if lemma_pattern_parts: 458 | lemma_pattern = { 459 | "label": key.upper(), 460 | "pattern": lemma_pattern_parts[:-1], 461 | "id": f"{word}_lemma_individual", 462 | } 463 | lemma_patterns.append(lemma_pattern) 464 | 465 | if self.include_compound_words: 466 | compound_rule = [ 467 | { 468 | "DEP": {"IN": ["amod", "compound"]}, 469 | "OP": "*", 470 | } 471 | ] 472 | partial_pattern_parts.append( 473 | { 474 | "label": key.upper(), 475 | "pattern": compound_rule 476 | + partial_pattern_parts[:-1] 477 | + compound_rule, 478 | "id": f"{word}_compound", 479 | } 480 | ) 481 | if lemma_pattern_parts: 482 | lemma_patterns.append( 483 | { 484 | "label": key.upper(), 485 | "pattern": compound_rule 486 | + lemma_pattern_parts[:-1] 487 | + compound_rule, 488 | "id": f"{word}_lemma_compound", 489 | } 490 | ) 491 | 492 | add_patterns(self.data) 493 | 494 | if self.json_path: 495 | with open(self.json_path, "w") as f: 496 | json.dump(lemma_patterns + fuzzy_patterns, f) 497 | 498 | config = {"overwrite_ents": True} 499 | if self.case_sensitive: 500 | config["phrase_matcher_attr"] = "LOWER" 501 | 502 | self.ruler = self.nlp.add_pipe("entity_ruler", config=config) 503 | self.ruler.add_patterns(lemma_patterns) 504 | 505 | # Add spaczz entity ruler if fuzzy 506 | if self.fuzzy: 507 | for pattern in fuzzy_patterns: 508 | pattern["type"] = "token" 509 | self.fuzzy_ruler = self.nlp.add_pipe("spaczz_ruler", config=config) 510 | self.fuzzy_ruler.add_patterns(fuzzy_patterns) 511 | 512 | def __call__(self, doc: Doc) -> Doc: 513 | """ 514 | It takes a doc object and assigns a score to each entity in the doc object 515 | 516 | :param doc: Doc 517 | :type doc: Doc 518 | """ 519 | if isinstance(doc, str): 520 | doc = self.nlp(doc) 521 | elif isinstance(doc, Doc): 522 | if self.ent_score: 523 | doc = self.assign_score_to_entities(doc) 524 | 525 | return doc 526 | 527 | def pipe(self, stream, batch_size=128) -> Doc: 528 | """ 529 | It takes a stream of documents, 
and for each document, 530 | it assigns a score to each entity in the document 531 | 532 | :param stream: a generator of documents 533 | :param batch_size: The number of documents to be processed at a time, defaults to 128 (optional) 534 | """ 535 | if isinstance(stream, str): 536 | stream = [stream] 537 | 538 | if not isinstance(stream, types.GeneratorType): 539 | stream = self.nlp.pipe(stream, batch_size=batch_size) 540 | 541 | for docs in util.minibatch(stream, size=batch_size): 542 | for doc in docs: 543 | if self.ent_score: 544 | doc = self.assign_score_to_entities(doc) 545 | yield doc 546 | 547 | def assign_score_to_entities(self, doc: Doc) -> Doc: 548 | """ 549 | The function takes a spaCy document as input and assigns a score to each entity in the document. The score is 550 | calculated using the word embeddings of the entity and the concept. 551 | The score is assigned to the entity using the 552 | `._.ent_score` attribute 553 | 554 | :param doc: Doc 555 | :type doc: Doc 556 | :return: The doc object with the entities and their scores. 557 | """ 558 | ents = doc.ents 559 | for ent in ents: 560 | if ent.label_ in self.data_upper: 561 | ent_text = ent.text 562 | 563 | # get word part representations 564 | if self._check_presence_vocab(ent_text): 565 | entity = [self._check_presence_vocab(ent_text)] 566 | else: 567 | entity = [] 568 | for part in self._split_word(ent_text): 569 | present_part = self._check_presence_vocab(part) 570 | if present_part: 571 | entity.append(present_part) 572 | 573 | # get concepts to match 574 | concept = self.concept_data.get(ent.label_, None) 575 | 576 | # compare set similarities 577 | if entity and concept: 578 | ent._.ent_score = self.kv.n_similarity(entity, concept) 579 | else: 580 | ent._.ent_score = 0 581 | if self.verbose: 582 | if f"{ent_text}_{concept}" not in self.log_cache["key_word"]: 583 | logger.warning( 584 | f"Entity ´{ent.text}´ and/or label ´{concept}´ not" 585 | " found in vector model. Nothing to compare to, so" 586 | " setting ent._.ent_score to 0." 587 | ) 588 | self.log_cache["key_word"].append(f"{ent_text}_{concept}") 589 | else: 590 | ent._.ent_score = 0 591 | if self.verbose: 592 | if ent.text not in self.log_cache["word"]: 593 | logger.warning( 594 | f"Entity ´{ent.text}´ not found in vector model. Nothing to" 595 | " compare to, so setting ent._.ent_score to 0." 596 | ) 597 | self.log_cache["word"].append(ent.text) 598 | doc.ents = ents 599 | return doc 600 | 601 | def set_concept_dict(self): 602 | self.concept_data = {k.upper(): v for k, v in self.data.items()} 603 | for ent_label in self.concept_data: 604 | concept = [] 605 | for word in self.concept_data[ent_label]: 606 | present_word = self._check_presence_vocab(word) 607 | if present_word: 608 | concept.append(present_word) 609 | self.concept_data[ent_label] = concept 610 | 611 | def _split_word(self, word: str) -> List[str]: 612 | """ 613 | It splits a word into a list of subwords, using the word delimiter 614 | 615 | :param word: str 616 | :type word: str 617 | :return: A list of strings or any. 618 | """ 619 | return re.split(f"[{re.escape(self.word_delimiter)}]+", word) 620 | 621 | def _check_presence_vocab(self, word: str) -> str: 622 | """ 623 | If the word is not lowercase and the case_sensitive flag is set to False, then check if the lowercase version of 624 | the word is in the vocabulary. If it is, return the lowercase version of the word. 
Otherwise, return the word 625 | itself 626 | 627 | :param word: The word to check for presence in the vocabulary 628 | :type word: str 629 | :return: The word itself if it is present in the vocabulary, otherwise the word with the highest probability of 630 | being the word that was intended. 631 | """ 632 | word = word.replace(" ", "_") 633 | if not word.islower() and not self.case_sensitive: 634 | present_word = self.__check_presence_vocab(word.lower()) 635 | if present_word: 636 | return present_word 637 | return self.__check_presence_vocab(word) 638 | 639 | def __check_presence_vocab(self, word: str) -> str: 640 | """ 641 | If the word is in the vocabulary, return the word. If not, replace spaces and dashes with the word delimiter and 642 | check if the new word is in the vocabulary. If so, return the new word 643 | 644 | :param word: str - the word to check 645 | :type word: str 646 | :return: The word or the check_word 647 | """ 648 | if isinstance(self.kv, Sense2Vec): 649 | return self.kv.get_best_sense(word, (set(POS_LIST) - set(self.exclude_pos))) 650 | else: 651 | if word in self.kv: 652 | return word 653 | -------------------------------------------------------------------------------- /concise_concepts/conceptualizer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .Conceptualizer import Conceptualizer 3 | 4 | __all__ = ["Conceptualizer"] 5 | -------------------------------------------------------------------------------- /concise_concepts/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/f31d1c3aa5a9a6908790ed04fc77edc18bf9221a/concise_concepts/examples/__init__.py -------------------------------------------------------------------------------- /concise_concepts/examples/data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | text = """ 3 | Heat the oil in a large pan and add the Onion, celery and carrots. 4 | Then, cook over a medium–low heat for 10 minutes, or until softened. 5 | Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes. 6 | Later, add some oranges, chickens. """ 7 | 8 | text_fuzzy = """ 9 | Heat the oil in a large pan and add the Onion, celery and carots. 10 | Then, cook over a medium–low heat for 10 minutes, or until softened. 11 | Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes. 12 | Later, add some oranges, chickens. """ 13 | 14 | data = { 15 | "fruit": ["apple", "pear", "orange"], 16 | "vegetable": ["broccoli", "spinach", "tomato"], 17 | "meat": ["chicken", "beef", "pork", "fish", "lamb"], 18 | } 19 | -------------------------------------------------------------------------------- /concise_concepts/examples/example_gensim_custom_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import spacy 3 | from gensim.models import Word2Vec 4 | from gensim.test.utils import common_texts 5 | 6 | import concise_concepts # noqa: F401 7 | 8 | data = {"human": ["trees"], "interface": ["computer"]} 9 | 10 | text = ( 11 | "believe me, it's the slowest mobile I saw. Don't go on screen and Battery, it is" 12 | " an extremely slow mobile phone and takes ages to open and navigate. Forget about" 13 | " heavy use, it can't handle normal regular use. 
I made a huge mistake but pls" 14 | " don't buy this mobile. It's only a few months and I am thinking to change it. Its" 15 | " dam SLOW SLOW SLOW." 16 | ) 17 | 18 | model = Word2Vec( 19 | sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4 20 | ) 21 | model.save("word2vec.model") 22 | model_path = "word2vec.model" 23 | 24 | nlp = spacy.blank("en") 25 | nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) 26 | -------------------------------------------------------------------------------- /concise_concepts/examples/example_gensim_custom_path.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import gensim.downloader as api 3 | import spacy 4 | 5 | import concise_concepts # noqa: F401 6 | 7 | from .data import data, text 8 | 9 | model_path = "word2vec.model" 10 | model = api.load("glove-twitter-25") 11 | model.save(model_path) 12 | nlp = spacy.blank("en") 13 | 14 | nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) 15 | 16 | doc = nlp(text) 17 | print([(ent.text, ent.label_) for ent in doc.ents]) 18 | -------------------------------------------------------------------------------- /concise_concepts/examples/example_gensim_default.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import spacy 3 | 4 | import concise_concepts # noqa: F401 5 | 6 | from .data import data, text 7 | 8 | model_path = "glove-twitter-25" 9 | 10 | nlp = spacy.blank("en") 11 | 12 | nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) 13 | 14 | doc = nlp(text) 15 | print([(ent.text, ent.label_) for ent in doc.ents]) 16 | -------------------------------------------------------------------------------- /concise_concepts/examples/example_spacy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import spacy 3 | 4 | import concise_concepts # noqa: F401 5 | 6 | from .data import data, text 7 | 8 | nlp = spacy.load("en_core_web_md") 9 | 10 | nlp.add_pipe("concise_concepts", config={"data": data}) 11 | 12 | doc = nlp(text) 13 | print([(ent.text, ent.label_) for ent in doc.ents]) 14 | -------------------------------------------------------------------------------- /img/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/f31d1c3aa5a9a6908790ed04fc77edc18bf9221a/img/example.png -------------------------------------------------------------------------------- /img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/f31d1c3aa5a9a6908790ed04fc77edc18bf9221a/img/logo.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "concise-concepts" 3 | version = "0.8.1" 4 | description = "This repository contains an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings. Now with entity confidence scores!" 
5 | authors = ["David Berenstein "] 6 | license = "MIT" 7 | readme = "README.md" 8 | homepage = "https://github.com/pandora-intelligence/concise-concepts" 9 | repository = "https://github.com/pandora-intelligence/concise-concepts" 10 | documentation = "https://github.com/pandora-intelligence/concise-concepts" 11 | keywords = ["spacy", "NER", "few-shot classification", "nlu"] 12 | classifiers = [ 13 | "Intended Audience :: Developers", 14 | "Intended Audience :: Science/Research", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent", 17 | "Programming Language :: Python :: 3.8", 18 | "Programming Language :: Python :: 3.9", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Topic :: Scientific/Engineering", 22 | "Topic :: Software Development" 23 | ] 24 | packages = [{include = "concise_concepts"}] 25 | 26 | 27 | [tool.poetry.dependencies] 28 | python = ">=3.8,<3.12" 29 | spacy = "^3" 30 | scipy = "^1.7" 31 | gensim = "^4" 32 | spaczz = "^0.5.4" 33 | sense2vec = "^2.0.1" 34 | 35 | [tool.poetry.plugins] 36 | 37 | [tool.poetry.plugins."spacy_factories"] 38 | "spacy" = "concise_concepts.__init__:make_concise_concepts" 39 | 40 | [tool.poetry.group.dev.dependencies] 41 | black = "^22" 42 | flake8 = "^5" 43 | pytest = "^7.1" 44 | pre-commit = "^2.20" 45 | pep8-naming = "^0.13" 46 | flake8-bugbear = "^22.9" 47 | flake8-docstrings = "^1.6" 48 | ipython = "^8.7.0" 49 | ipykernel = "^6.17.1" 50 | 51 | [build-system] 52 | requires = ["poetry-core>=1.0.0"] 53 | build-backend = "poetry.core.masonry.api" 54 | 55 | [tool.pytest.ini_options] 56 | testpaths = "tests" 57 | 58 | [tool.black] 59 | preview = true 60 | 61 | [tool.isort] 62 | profile = "black" 63 | src_paths = ["concise_concepts"] 64 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 119 3 | max-complexity = 18 4 | docstring-convention=google 5 | exclude = .git,__pycache__,build,dist 6 | select = C,E,F,W,B,B950 7 | ignore = 8 | E203,E266,E501,W503 9 | enable = 10 | W0614 11 | per-file-ignores = 12 | test_*.py: D 13 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidberenstein1957/concise-concepts/f31d1c3aa5a9a6908790ed04fc77edc18bf9221a/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_model_import.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | def test_spacy_embeddings(): 3 | from concise_concepts.examples import example_spacy # noqa: F401 4 | 5 | 6 | def test_gensim_default(): 7 | from concise_concepts.examples import example_gensim_default # noqa: F401 8 | 9 | 10 | def test_gensim_custom_path(): 11 | from concise_concepts.examples import example_gensim_custom_path # noqa: F401 12 | 13 | 14 | def test_gensim_custom_model(): 15 | from concise_concepts.examples import example_gensim_custom_model # noqa: F401 16 | 17 | 18 | def test_standalone_spacy(): 19 | import spacy 20 | 21 | from concise_concepts import Conceptualizer 22 | 23 | nlp = spacy.load("en_core_web_md") 24 | data = { 25 | "disease": ["cancer", "diabetes", "heart disease", "influenza", "pneumonia"], 26 | "symptom": ["headache", "fever", "cough", 
"nausea", "vomiting", "diarrhea"], 27 | } 28 | conceptualizer = Conceptualizer(nlp, data) 29 | assert ( 30 | list(conceptualizer.pipe(["I have a headache and a fever."]))[0].to_json() 31 | == list(conceptualizer.nlp.pipe(["I have a headache and a fever."]))[ 32 | 0 33 | ].to_json() 34 | ) 35 | assert ( 36 | conceptualizer("I have a headache and a fever.").to_json() 37 | == conceptualizer.nlp("I have a headache and a fever.").to_json() 38 | ) 39 | 40 | data = { 41 | "disease": ["cancer", "diabetes"], 42 | "symptom": ["headache", "fever"], 43 | } 44 | conceptualizer = Conceptualizer(nlp, data) 45 | 46 | 47 | def test_standalone_gensim(): 48 | import gensim 49 | import spacy 50 | 51 | from concise_concepts import Conceptualizer 52 | 53 | model_path = "glove-twitter-25" 54 | model = gensim.downloader.load(model_path) 55 | nlp = spacy.load("en_core_web_md") 56 | data = { 57 | "disease": ["cancer", "diabetes", "heart disease", "influenza", "pneumonia"], 58 | "symptom": ["headache", "fever", "cough", "nausea", "vomiting", "diarrhea"], 59 | } 60 | conceptualizer = Conceptualizer(nlp, data, model=model) 61 | print(list(conceptualizer.pipe(["I have a headache and a fever."]))[0].ents) 62 | print(list(conceptualizer.nlp.pipe(["I have a headache and a fever."]))[0].ents) 63 | print(conceptualizer("I have a headache and a fever.").ents) 64 | print(conceptualizer.nlp("I have a headache and a fever.").ents) 65 | 66 | 67 | def test_spaczz(): 68 | # -*- coding: utf-8 -*- 69 | import spacy 70 | 71 | import concise_concepts # noqa: F401 72 | from concise_concepts.examples.data import data, text, text_fuzzy 73 | 74 | nlp = spacy.load("en_core_web_md") 75 | 76 | nlp.add_pipe("concise_concepts", config={"data": data, "fuzzy": True}) 77 | 78 | assert len(nlp(text).ents) == len(nlp(text_fuzzy).ents) 79 | 80 | 81 | def test_sense2vec(): 82 | # -*- coding: utf-8 -*- 83 | import requests 84 | import spacy 85 | 86 | import concise_concepts # noqa: F401 87 | from concise_concepts.examples.data import data, text 88 | 89 | model_path = "s2v_old" 90 | # download .tar.gz file an URL 91 | # and extract it to a folder 92 | url = "https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz" 93 | r = requests.get(url, allow_redirects=True) 94 | open("s2v_reddit_2015_md.tar.gz", "wb").write(r.content) 95 | # extract tar.gz file 96 | filename = "s2v_reddit_2015_md.tar.gz" 97 | import tarfile 98 | 99 | tar = tarfile.open(filename, "r:gz") 100 | tar.extractall() 101 | tar.close() 102 | 103 | nlp = spacy.load("en_core_web_md") 104 | 105 | nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) 106 | 107 | assert len(nlp(text).ents) 108 | --------------------------------------------------------------------------------