├── holmes_extractor
├── lang
│ ├── __init__.py
│ ├── de
│ │ ├── __init__.py
│ │ └── data
│ │ │ ├── __init__.py
│ │ │ └── derivation.csv
│ └── en
│ │ ├── __init__.py
│ │ └── data
│ │ ├── __init__.py
│ │ └── derivation.csv
├── word_matching
│ ├── __init__.py
│ ├── entity.py
│ ├── embedding.py
│ ├── direct.py
│ ├── question.py
│ ├── entity_embedding.py
│ ├── general.py
│ ├── derivation.py
│ └── ontology.py
├── about.py
├── config.cfg
├── __init__.py
└── errors.py
├── MANIFEST.in
├── docs
├── holmes_thumbnail.png
└── ontology_example.png
├── pyproject.toml
├── LICENSE
├── examples
├── example_chatbot_DE_insurance.py
├── example_chatbot_EN_insurance.py
├── example_search_DE_law.py
├── example_search_DE_literature.py
├── example_supervised_topic_model_EN.py
├── example_search_EN_literature.py
├── example_chatbot_EN_insurance_ontology.owl
└── example_chatbot_DE_insurance_ontology.owl
├── tests
├── common
│ ├── test_ontology2.owl
│ ├── test_cpu_gpu.py
│ ├── test_serialization.py
│ └── test_manager.py
├── de
│ ├── test_ontology.owl
│ ├── test_questions_DE.py
│ └── test_doc_examples_DE.py
└── en
│ ├── test_ontology.owl
│ └── test_doc_examples_EN.py
├── setup.cfg
├── .gitignore
├── .github
└── workflows
│ └── test-holmes.yml
└── SHORTREADME.md
/holmes_extractor/lang/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/holmes_extractor/lang/de/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/holmes_extractor/lang/en/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/holmes_extractor/lang/de/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/holmes_extractor/lang/en/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/holmes_extractor/word_matching/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/holmes_extractor/about.py:
--------------------------------------------------------------------------------
# Single source of truth for the package version (the same number is declared
# in setup.cfg and checked during document deserialization).
__version__ = "4.0.0"
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include SHORTREADME.md
2 | global-include *.cfg
3 | global-include *.csv
4 | global-include LICENSE
5 |
--------------------------------------------------------------------------------
/docs/holmes_thumbnail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msg-systems/holmes-extractor/HEAD/docs/holmes_thumbnail.png
--------------------------------------------------------------------------------
/docs/ontology_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msg-systems/holmes-extractor/HEAD/docs/ontology_example.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
7 |
--------------------------------------------------------------------------------
/holmes_extractor/config.cfg:
--------------------------------------------------------------------------------
1 | [vector_nlps]
2 | # Names of models for which a second model is used as a source of vocabularies and vectors.
3 | en_core_web_trf = en_core_web_lg
4 |
--------------------------------------------------------------------------------
/holmes_extractor/__init__.py:
--------------------------------------------------------------------------------
from .about import __version__
from .manager import Manager
from .ontology import Ontology
import os

# NOTE(review): presumably set so the HuggingFace tokenizers library keeps its
# parallelism behaviour predictable across the worker processes Holmes spawns
# -- confirm; the documented values are lower-case "true"/"false".
os.environ["TOKENIZERS_PARALLELISM"] = "True"
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2019-2021 msg systems ag, 2022 ExplosionAI GmbH, AstraZeneca
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/examples/example_chatbot_DE_insurance.py:
--------------------------------------------------------------------------------
import os
import holmes_extractor as holmes

# Demonstrates chatbot-style structural matching against German search
# phrases backed by an insurance ontology.
if __name__ in ('__main__', 'example_chatbot_DE_insurance'):
    script_directory = os.path.dirname(os.path.realpath(__file__))
    ontology = holmes.Ontology(os.sep.join((
        script_directory, 'example_chatbot_DE_insurance_ontology.owl')))
    holmes_manager = holmes.Manager(model='de_core_news_lg', ontology=ontology, number_of_workers=2)
    # Register each chatbot search phrase in turn.
    for search_phrase in (
            'Jemand benötigt eine Versicherung',
            'Ein ENTITYPER schließt eine Versicherung ab',
            'ENTITYPER benötigt eine Versicherung',
            'Eine Versicherung für einen Zeitraum',
            'Eine Versicherung fängt an',
            'Jemand zahlt voraus'):
        holmes_manager.register_search_phrase(search_phrase)

    holmes_manager.start_chatbot_mode_console()
    # e.g. 'Richard Hudson und Max Mustermann brauchen eine Krankenversicherung für die nächsten fünf Jahre'
--------------------------------------------------------------------------------
/examples/example_chatbot_EN_insurance.py:
--------------------------------------------------------------------------------
import os
import holmes_extractor as holmes

# Demonstrates chatbot-style structural matching against English search
# phrases backed by an insurance ontology.
if __name__ in ('__main__', 'example_chatbot_EN_insurance'):
    script_directory = os.path.dirname(os.path.realpath(__file__))
    ontology = holmes.Ontology(os.sep.join((
        script_directory, 'example_chatbot_EN_insurance_ontology.owl')))
    holmes_manager = holmes.Manager(
        model='en_core_web_lg', ontology=ontology, number_of_workers=2)
    # Register each chatbot search phrase in turn.
    for search_phrase in (
            'Somebody requires insurance',
            'An ENTITYPERSON takes out insurance',
            'A company buys payment insurance',
            'An ENTITYPERSON needs insurance',
            'Insurance for a period',
            'An insurance begins',
            'Somebody prepays',
            'Somebody makes an insurance payment'):
        holmes_manager.register_search_phrase(search_phrase)

    holmes_manager.start_chatbot_mode_console()
    # e.g. 'Richard Hudson and John Doe require health insurance for the next five years'
--------------------------------------------------------------------------------
/tests/common/test_ontology2.owl:
--------------------------------------------------------------------------------
1 |
2 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/examples/example_search_DE_law.py:
--------------------------------------------------------------------------------
import urllib.request

# You will need to install bs4 (python -m pip install bs4)
from bs4 import BeautifulSoup
import holmes_extractor as holmes

def download_and_register(url, label):
    """Download the HTML page at *url*, extract its plain text and register it
    with the module-level *holmes_manager* under *label*.

    Args:
        url: address of the HTML document to download.
        label: label under which the document is registered with Holmes.
    """
    print('Downloading', label)
    # Download the content.  The context manager closes the HTTP connection
    # once BeautifulSoup has consumed the response; the original code left the
    # connection open.
    with urllib.request.urlopen(url) as page:
        # Extract the raw text from the HTML document
        soup = BeautifulSoup(page, 'html.parser')
    # Register the document with Holmes
    print('Parsing and registering', label)
    holmes_manager.parse_and_register_document(soup.get_text(), label)

if __name__ in ('__main__', 'example_search_DE_law'):
    # Start the Holmes Manager with the German model
    holmes_manager = holmes.Manager(model='de_core_news_lg', number_of_workers=2)
    download_and_register('https://www.gesetze-im-internet.de/vvg_2008/BJNR263110007.html', 'VVG_2008')
    # This may take several minutes
    download_and_register('https://www.gesetze-im-internet.de/vag_2016/BJNR043410015.html', 'VAG')
    holmes_manager.start_topic_matching_search_mode_console(initial_question_word_embedding_match_threshold=0.7)

    # Example queries:
    #
    # Der Versicherer darf den Vertrag fristlos kündigen, wenn der Versicherungsnehmer beim Abschluss des Vertrags die vorvertragliche Anzeigepflicht verletzt hat.
    # Der Versicherer darf Leistungen verweigern.
    # Der Versicherer darf die Prämie anpassen.
    # Eine Richtlinie einer ENTITYORG
--------------------------------------------------------------------------------
/holmes_extractor/errors.py:
--------------------------------------------------------------------------------
class HolmesError(Exception):
    """Base class for every exception raised by the Holmes library.

    Args:
        text: human-readable description of the error.
    """

    def __init__(self, text):
        # Forward the message to Exception so that *args* is populated: with
        # empty *args* (as in the original), unpickling the exception fails
        # because __init__ would be re-invoked without arguments.
        super().__init__(text)
        self.text = text

    def __str__(self):
        return self.text
7 |
8 |
class SearchPhraseContainsNegationError(HolmesError):
    """Raised when a registered search phrase contains a negation."""
    pass


class SearchPhraseContainsConjunctionError(HolmesError):
    """Raised when a registered search phrase contains a conjunction."""
    pass


class SearchPhraseContainsCoreferringPronounError(HolmesError):
    """Raised when a registered search phrase contains a coreferring pronoun."""
    pass


class SearchPhraseWithoutMatchableWordsError(HolmesError):
    """Raised when a search phrase contains no matchable words."""
    pass


class SearchPhraseContainsMultipleClausesError(HolmesError):
    """Raised when a search phrase consists of more than one clause."""
    pass


class DuplicateDocumentError(HolmesError):
    """Raised when a document is registered under an already-used label."""
    pass


class NoSearchPhraseError(HolmesError):
    """Raised when an operation requires search phrases but none are registered."""
    pass


class NoDocumentError(HolmesError):
    """Raised when an operation requires documents but none are registered."""
    pass


class WrongModelDeserializationError(HolmesError):
    """Raised when deserializing a document produced with a different model."""
    pass


class WrongVersionDeserializationError(HolmesError):
    """Raised when deserializing a document produced by an incompatible version."""
    pass


class DocumentTooBigError(HolmesError):
    """Raised when a document exceeds the maximum supported size."""
    pass


class FewerThanTwoClassificationsError(HolmesError):
    """Raised when supervised training is attempted with fewer than two labels."""
    pass


class NoPhraseletsAfterFilteringError(HolmesError):
    """Raised when filtering leaves no phraselets to work with."""
    pass


class EmbeddingThresholdLessThanRelationThresholdError(HolmesError):
    """Raised when an embedding threshold is lower than the relation threshold."""
    pass


class IncompatibleAnalyzeDerivationalMorphologyDeserializationError(HolmesError):
    """Raised when a serialized document's derivational-morphology setting is
    incompatible with the current configuration."""
    pass


class MultiprocessingParsingNotSupportedError(HolmesError):
    """Raised when parsing is attempted in a context where multiprocessing is
    not supported."""
    pass


class OntologyObjectSharedBetweenManagersError(HolmesError):
    """Raised when one Ontology instance is shared between several Managers."""
    pass
75 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = holmes-extractor
3 | version = 4.0.0
4 | description = Information extraction from English and German texts based on predicate logic
5 | long_description = file: SHORTREADME.md
6 | long_description_content_type = text/markdown
7 | url = https://github.com/explosion/holmes-extractor
8 | author = Richard Paul Hudson, Explosion AI
9 | author_email = richard@explosion.ai
10 | license = MIT
11 | keywords = nlp, information-extraction, spacy, spacy-extension, python, machine-learning, ontology, semantics
12 | classifiers =
13 | Development Status :: 5 - Production/Stable
14 | Intended Audience :: Developers
15 | Intended Audience :: Financial and Insurance Industry
16 | Intended Audience :: Healthcare Industry
17 | Intended Audience :: Information Technology
18 | Intended Audience :: Legal Industry
19 | Intended Audience :: Other Audience
20 | Intended Audience :: Education
21 | Intended Audience :: Science/Research
22 | License :: OSI Approved :: MIT License
23 | Natural Language :: English
24 | Natural Language :: German
25 | Programming Language :: Python :: 3
26 | Programming Language :: Python :: 3.6
27 | Programming Language :: Python :: 3.7
28 | Programming Language :: Python :: 3.8
29 | Programming Language :: Python :: 3.9
30 | Programming Language :: Python :: 3.10
31 | Topic :: Scientific/Engineering :: Artificial Intelligence
32 | Topic :: Scientific/Engineering :: Information Analysis
33 | Topic :: Text Processing :: Linguistic
34 |
35 | [options]
36 | include_package_data = True
37 | python_requires = >=3.6,<3.11
38 | install_requires =
39 | spacy>=3.1.0,<3.4.0
40 | coreferee>=1.2.0
41 | rdflib
42 | [options.package_data]
43 | * = *.cfg, *.csv
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # Visual Studio Code
132 | .vscode
133 |
--------------------------------------------------------------------------------
/.github/workflows/test-holmes.yml:
--------------------------------------------------------------------------------
1 | name: Holmes Matrix Test
2 | on:
3 | workflow_dispatch:
4 | push:
5 |
6 | jobs:
7 | test-holmes:
8 | strategy:
9 | matrix:
10 | os: [macos-latest, windows-latest, ubuntu-latest]
11 | python_version: ['3.6', '3.7', '3.8', '3.9', '3.10']
12 | spacy_version: ['3.3.0']
13 | click_version: ['8.0.1']
14 | include:
15 | - os: 'ubuntu-latest'
16 | python_version: '3.9'
17 | spacy_version: '3.2.4'
18 | click_version: '8.0.1'
19 | - os: 'ubuntu-latest'
20 | python_version: '3.9'
21 | spacy_version: '3.1.6'
22 | click_version: '7.1.2'
23 | runs-on: ${{ matrix.os }}
24 | steps:
25 |
26 | - name: Increase swap file size on Windows
27 | if: ${{ matrix.os == 'windows-latest' }}
28 | uses: al-cheb/configure-pagefile-action@v1.2
29 | with:
30 | minimum-size: 64GB
31 | maximum-size: 64GB
32 |
33 | - name: Checkout repository code
34 | uses: actions/checkout@v3
35 | with:
36 | ref: master
37 |
38 | - name: Initialize Python
39 | uses: actions/setup-python@v3
40 | with:
41 | python-version: ${{ matrix.python_version }}
42 |
43 | - name: Install dependencies
44 | run: |
45 | python -m pip install --upgrade pip setuptools wheel
46 | pip install spacy==${{ matrix.spacy_version }} pytest spacy-lookups-data
47 |
48 | # see https://github.com/explosion/spaCy/issues/10564
49 | pip uninstall click -y
50 | pip install "click==${{ matrix.click_version }}"
51 |
52 | - name: Install spaCy models
53 | run: |
54 | python -m spacy download en_core_web_sm
55 | python -m spacy download en_core_web_lg
56 | python -m spacy download en_core_web_trf
57 | python -m spacy download de_core_news_lg
58 | python -m spacy download pl_core_news_md
59 |
60 | - name: Install Coreferee
61 | run: |
62 | pip install coreferee
63 | python -m coreferee install en
64 | python -m coreferee install de
65 | python -m coreferee install pl
66 |
67 | - name: Install Holmes
68 | run: |
69 | cd ${{ github.workspace }}
70 | pip install .
71 |
72 | - name: Test Holmes
73 | run: |
74 | python -m pytest tests/de/test_doc_examples_DE.py
75 | python -m pytest tests/de/test_phraselet_production_DE.py
76 | python -m pytest tests/de/test_questions_DE.py
77 | python -m pytest tests/de/test_semantics_DE.py
78 | python -m pytest tests/de/test_structural_matching_DE.py
79 | python -m pytest tests/de/test_structural_matching_with_coreference_DE.py
80 | python -m pytest tests/de/test_supervised_topic_classification_DE.py
81 | python -m pytest tests/de/test_topic_matching_DE.py
82 | python -m pytest tests/en/test_doc_examples_EN.py
83 | python -m pytest tests/en/test_phraselet_production_EN.py
84 | python -m pytest tests/en/test_questions_EN.py
85 | python -m pytest tests/en/test_semantics_EN.py
86 | python -m pytest tests/en/test_structural_matching_EN.py
87 | python -m pytest tests/en/test_structural_matching_with_coreference_EN.py
88 | python -m pytest tests/en/test_supervised_topic_classification_EN.py
89 | python -m pytest tests/en/test_topic_matching_EN.py
90 | python -m pytest tests/common/test_manager.py
91 | python -m pytest tests/common/test_cpu_gpu.py
92 | python -m pytest tests/common/test_errors.py
93 | python -m pytest tests/common/test_ontology.py
94 | python -m pytest tests/common/test_serialization.py
95 | python -m pytest tests/common/test_word_level_matching.py
96 | python -m pytest tests/common/test_multithreading.py
--------------------------------------------------------------------------------
/tests/de/test_ontology.owl:
--------------------------------------------------------------------------------
1 |
2 |
9 |
10 |
11 |
12 |
13 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/holmes_extractor/word_matching/entity.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Optional, List
2 | from spacy.tokens import Token, Doc
3 | from .general import WordMatch, WordMatchingStrategy
4 | from ..parsing import MultiwordSpan, CorpusWordPosition, SearchPhrase
5 |
6 |
class EntityWordMatchingStrategy(WordMatchingStrategy):
    """Matches entity placeholders within search phrases (e.g. 'ENTITYPERSON',
    or the generic 'ENTITYNOUN') against named entities in documents."""

    WORD_MATCH_TYPE_LABEL = "entity"

    @staticmethod
    def _get_explanation(search_phrase_display_word: str) -> str:
        """Return the human-readable explanation attached to entity matches."""
        return "".join(
            ("Has an entity label matching ", search_phrase_display_word.upper(), ".")
        )

    def match_multiwords(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_multiwords: List[MultiwordSpan],
    ) -> Optional[WordMatch]:
        """Match *search_phrase_token* against a document multiword.

        Returns a WordMatch built from the first multiword in
        *document_multiwords* all of whose tokens satisfy the entity
        placeholder on *search_phrase_token*; returns None when the search
        phrase token carries no entity placeholder or no multiword qualifies.
        """
        entity_placeholder = self.semantic_matching_helper.get_entity_placeholder(
            search_phrase_token
        )
        if entity_placeholder is None:
            return None

        for multiword in document_multiwords:
            # Reject the multiword as soon as any of its tokens fails the
            # placeholder check.
            if any(
                1
                for i in multiword.token_indexes
                if not self._entity_placeholder_matches(
                    entity_placeholder, document_token.doc[i]
                )
            ):
                continue
            return WordMatch(
                search_phrase_token=search_phrase_token,
                search_phrase_word=entity_placeholder,
                document_token=document_token,
                first_document_token=document_token.doc[multiword.token_indexes[0]],
                last_document_token=document_token.doc[multiword.token_indexes[-1]],
                document_subword=None,
                document_word=multiword.text,
                word_match_type=self.WORD_MATCH_TYPE_LABEL,
                explanation=self._get_explanation(entity_placeholder),
            )
        return None

    def match_token(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
    ) -> Optional[WordMatch]:
        """Match *search_phrase_token* against a single document token.

        Returns a WordMatch when *document_token* satisfies the entity
        placeholder on *search_phrase_token*, otherwise None (also None when
        the search phrase token carries no entity placeholder).
        """
        entity_placeholder = self.semantic_matching_helper.get_entity_placeholder(
            search_phrase_token
        )
        if entity_placeholder is None:
            return None

        if self._entity_placeholder_matches(entity_placeholder, document_token):
            return WordMatch(
                search_phrase_token=search_phrase_token,
                search_phrase_word=entity_placeholder,
                document_token=document_token,
                first_document_token=document_token,
                last_document_token=document_token,
                document_subword=None,
                document_word=document_token.text.lower(),
                word_match_type=self.WORD_MATCH_TYPE_LABEL,
                explanation=self._get_explanation(entity_placeholder),
            )
        return None

    def add_reverse_dict_entries(
        self,
        reverse_dict: Dict[str, List[CorpusWordPosition]],
        doc: Doc,
        document_label: str,
    ) -> None:
        """Register reverse-dictionary entries for entity-labelled tokens.

        For each qualifying token an entry is added under its
        'ENTITY<label>' key and, where one exists, under the lower-cased text
        of the entity-defined multiword containing the token.
        """
        for token in doc:
            # parent check is necessary so we only find multiword entities once per
            # search phrase. sibling_marker_deps applies to siblings which would
            # otherwise be excluded because the main sibling would normally also match the
            # entity root word.
            if len(token.ent_type_) > 0 and (
                token.dep_ == "ROOT"
                or token.dep_ in self.semantic_matching_helper.sibling_marker_deps
                or token.ent_type_ != token.head.ent_type_
            ):
                entity_label = "".join(("ENTITY", token.ent_type_))
                self.add_reverse_dict_entry(
                    reverse_dict,
                    entity_label,
                    document_label,
                    token.i,
                    None,
                )
                entity_defined_multiword = (
                    self.semantic_matching_helper.get_entity_defined_multiword(token)
                )
                if entity_defined_multiword is not None:
                    self.add_reverse_dict_entry(
                        reverse_dict,
                        entity_defined_multiword.text.lower(),
                        document_label,
                        token.i,
                        None,
                    )

    def _entity_placeholder_matches(
        self, entity_placeholder: str, document_token: Token
    ) -> bool:
        """Return True if *document_token* satisfies *entity_placeholder*.

        A token matches either when its entity type equals the placeholder
        minus the leading 'ENTITY' prefix (the [6:] slice), or when the
        placeholder is the generic 'ENTITYNOUN' and the token is a noun.
        """
        # The lemma check is needed because some German spaCy models sometimes
        # classify whitespace as entities.
        return (
            document_token.ent_type_ == entity_placeholder[6:]
            and len(document_token._.holmes.lemma.strip()) > 0
        ) or (
            entity_placeholder == "ENTITYNOUN"
            and document_token.pos_ in self.semantic_matching_helper.noun_pos
        )
128 |
--------------------------------------------------------------------------------
/tests/common/test_cpu_gpu.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from thinc.api import prefer_gpu, require_cpu
3 | import holmes_extractor as holmes
4 |
class CpuGpuTest(unittest.TestCase):
    """Verifies that matching behaves consistently when documents and search
    phrases are processed under different thinc device settings (CPU vs GPU).

    Each test switches the preferred device between the two processing steps.
    """

    def _train_animal_classifier(self, holmes_manager):
        """Train and return a tiny supervised topic classifier.

        With so little training data the NN does not consistently learn
        correctly, so training is retried up to 20 times until the classifier
        labels a sanity-check sentence as 'animal'.  If no attempt succeeds,
        the calling test fails via the assertion in the else branch.
        """
        sttb = holmes_manager.get_supervised_topic_training_basis(one_hot=False)
        sttb.parse_and_register_training_document("An animal", "animal", "d4")
        sttb.parse_and_register_training_document("A computer", "computers", "d5")
        sttb.prepare()
        stc = None
        for _ in range(20):
            trainer = sttb.train(
                minimum_occurrences=0,
                cv_threshold=0,
                max_epochs=1000,
                learning_rate=0.0001,
                convergence_threshold=0,
            )
            stc = trainer.classifier()
            if list(stc.parse_and_classify("You are an animal.").keys())[0] == "animal":
                break
        else:
            # BUG FIX: the original checked 'if i == 20' after
            # 'for i in range(20)', which can never be true (i ends at 19), so
            # the failure branch was dead code.  The for/else clause runs
            # exactly when no training attempt broke out of the loop.
            self.assertEqual(
                list(stc.parse_and_classify("You are an animal.").keys())[0],
                "animal",
            )
        return stc

    def test_document_based_structural_matching_cpu_gpu(self):
        require_cpu()
        holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2)
        holmes_manager.parse_and_register_document(
            document_text="The dog chased the cat.", label='pets')
        prefer_gpu()
        holmes_manager.register_search_phrase("A dog chases a cat")
        self.assertEqual(len(holmes_manager.match()), 1)

    def test_document_based_structural_matching_gpu_cpu(self):
        prefer_gpu()
        holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2)
        holmes_manager.parse_and_register_document(
            document_text="The dog chased the cat.", label='pets')
        require_cpu()
        holmes_manager.register_search_phrase("A dog chases a cat")
        self.assertEqual(len(holmes_manager.match()), 1)

    def test_search_phrase_based_structural_matching_cpu_gpu(self):
        require_cpu()
        holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2)
        holmes_manager.register_search_phrase("A dog chases a cat")
        prefer_gpu()
        holmes_manager.parse_and_register_document(
            document_text="The dog chased the cat.", label='pets')
        self.assertEqual(len(holmes_manager.match()), 1)

    def test_search_phrase_based_structural_matching_gpu_cpu(self):
        prefer_gpu()
        holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2)
        holmes_manager.register_search_phrase("A dog chases a cat")
        require_cpu()
        holmes_manager.parse_and_register_document(
            document_text="The dog chased the cat.", label='pets')
        self.assertEqual(len(holmes_manager.match()), 1)

    def test_topic_matching_cpu_gpu(self):
        require_cpu()
        holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2)
        holmes_manager.parse_and_register_document(
            document_text="The dog chased the cat.", label='pets')
        prefer_gpu()
        topic_matches = holmes_manager.topic_match_documents_against("A dog chases a cat")
        self.assertEqual(len(topic_matches), 1)

    def test_topic_matching_gpu_cpu(self):
        prefer_gpu()
        holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2)
        holmes_manager.parse_and_register_document(
            document_text="The dog chased the cat.", label='pets')
        require_cpu()
        topic_matches = holmes_manager.topic_match_documents_against("A dog chases a cat")
        self.assertEqual(len(topic_matches), 1)

    def test_supervised_document_classification_cpu_gpu(self):
        require_cpu()
        holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2)
        stc = self._train_animal_classifier(holmes_manager)
        prefer_gpu()
        self.assertTrue(
            list(stc.parse_and_classify("You are an animal.").keys())[0] == "animal")
        self.assertIsNone(
            stc.parse_and_classify("My name is Charles and I like sewing.")
        )

    def test_supervised_document_classification_gpu_cpu(self):
        prefer_gpu()
        holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2)
        stc = self._train_animal_classifier(holmes_manager)
        require_cpu()
        self.assertTrue(
            list(stc.parse_and_classify("You are an animal.").keys())[0] == "animal")
        self.assertIsNone(
            stc.parse_and_classify("My name is Charles and I like sewing.")
        )
--------------------------------------------------------------------------------
/holmes_extractor/word_matching/embedding.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | from spacy.tokens import Token
3 | from .general import WordMatch, WordMatchingStrategy
4 | from ..parsing import SemanticMatchingHelper, Subword, SearchPhrase
5 |
6 |
7 | class EmbeddingWordMatchingStrategy(WordMatchingStrategy):
8 |
9 | WORD_MATCH_TYPE_LABEL = "embedding"
10 |
    @staticmethod
    def _get_explanation(similarity: float, search_phrase_display_word: str) -> str:
        """Return the human-readable explanation for an embedding match,
        quoting *similarity* as an integer percentage."""
        printable_similarity = str(int(similarity * 100))
        return "".join(
            (
                "Has a word embedding that is ",
                printable_similarity,
                "% similar to ",
                search_phrase_display_word.upper(),
                ".",
            )
        )
23 |
    def __init__(
        self,
        semantic_matching_helper: SemanticMatchingHelper,
        perform_coreference_resolution: bool,
        overall_similarity_threshold: float,
        initial_question_word_overall_similarity_threshold: float,
    ):
        """Store the similarity thresholds and delegate the remaining
        arguments to WordMatchingStrategy.

        *initial_question_word_overall_similarity_threshold* is applied when
        the search phrase involves an initial question word; when it is None,
        _check_for_word_match falls back to *overall_similarity_threshold*.
        """
        self.overall_similarity_threshold = overall_similarity_threshold
        self.initial_question_word_overall_similarity_threshold = (
            initial_question_word_overall_similarity_threshold
        )
        super().__init__(semantic_matching_helper, perform_coreference_resolution)
36 |
    def match_token(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
    ) -> Optional[WordMatch]:
        """Attempt an embedding-based match against a whole document token
        (no subword); delegates to _check_for_word_match."""
        return self._check_for_word_match(
            search_phrase, search_phrase_token, document_token, None
        )
47 |
48 | def match_subword(
49 | self,
50 | search_phrase: SearchPhrase,
51 | search_phrase_token: Token,
52 | document_token: Token,
53 | document_subword: Subword,
54 | ) -> Optional[WordMatch]:
55 |
56 | return self._check_for_word_match(
57 | search_phrase, search_phrase_token, document_token, document_subword
58 | )
59 |
60 | def _check_for_word_match(
61 | self,
62 | search_phrase: SearchPhrase,
63 | search_phrase_token: Token,
64 | document_token: Token,
65 | document_subword: Optional[Subword],
66 | ) -> Optional[WordMatch]:
67 | if (
68 | search_phrase_token.i
69 | in search_phrase.matchable_non_entity_tokens_to_vectors.keys()
70 | and self.semantic_matching_helper.embedding_matching_permitted(
71 | search_phrase_token
72 | )
73 | ):
74 | search_phrase_vector = search_phrase.matchable_non_entity_tokens_to_vectors[
75 | search_phrase_token.i
76 | ]
77 | if search_phrase_vector is None:
78 | return None
79 | if document_subword is not None:
80 | if not self.semantic_matching_helper.embedding_matching_permitted(
81 | document_subword
82 | ):
83 | return None
84 | document_vector = document_subword.vector
85 | document_word = document_subword.lemma
86 | else:
87 | if not self.semantic_matching_helper.embedding_matching_permitted(
88 | document_token
89 | ):
90 | return None
91 | document_vector = document_token._.holmes.vector
92 | document_word = document_token.lemma_
93 | if (
94 | (
95 | search_phrase_token._.holmes.is_initial_question_word
96 | or search_phrase_token._.holmes.has_initial_question_word_in_phrase
97 | )
98 | and self.initial_question_word_overall_similarity_threshold is not None
99 | ):
100 | working_overall_similarity_threshold = (
101 | self.initial_question_word_overall_similarity_threshold
102 | )
103 | else:
104 | working_overall_similarity_threshold = self.overall_similarity_threshold
105 | single_token_similarity_threshold = (
106 | working_overall_similarity_threshold
107 | ** len(search_phrase.matchable_non_entity_tokens_to_vectors)
108 | )
109 | if document_vector is not None:
110 | similarity_measure = self.semantic_matching_helper.cosine_similarity(
111 | search_phrase_vector, document_vector
112 | )
113 | if similarity_measure > single_token_similarity_threshold:
114 | if (
115 | not search_phrase.topic_match_phraselet
116 | and len(search_phrase_token._.holmes.lemma.split()) > 1
117 | ):
118 | search_phrase_display_word = search_phrase_token.lemma_
119 | else:
120 | search_phrase_display_word = search_phrase_token._.holmes.lemma
121 | word_match = WordMatch(
122 | search_phrase_token=search_phrase_token,
123 | search_phrase_word=search_phrase_display_word,
124 | document_token=document_token,
125 | first_document_token=document_token,
126 | last_document_token=document_token,
127 | document_subword=document_subword,
128 | document_word=document_word,
129 | word_match_type=self.WORD_MATCH_TYPE_LABEL,
130 | explanation=self._get_explanation(
131 | similarity_measure, search_phrase_display_word
132 | ),
133 | )
134 | word_match.similarity_measure = similarity_measure
135 | return word_match
136 | return None
137 |
--------------------------------------------------------------------------------
/examples/example_search_DE_literature.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import urllib.request
4 | from multiprocessing import cpu_count
5 | # You will need to install bs4 (python -m pip install bs4)
6 | from bs4 import BeautifulSoup
7 | import holmes_extractor as holmes
8 | # You will need to install falcon (python -m pip install falcon)
9 | import falcon
10 |
# Run both as a script and as the WSGI module name used by waitress (see below).
if __name__ in ('__main__', 'example_search_DE_literature'):

    # NOTE: deliberately left invalid — the script raises a SyntaxError until a
    # path string is supplied here.
    working_directory = # REPLACE WITH PATH TO WORKING DIRECTORY IN SINGLE OR DOUBLE QUOTES
    # File extension used for serialized (parsed) story documents.
    HOLMES_EXTENSION = 'hdc'
    # Presence of this file signals that all stories were already downloaded and parsed.
    flag_filename = os.sep.join((working_directory, 'STORY_PARSING_COMPLETE'))

    print('Initializing Holmes (this may take some time) ...')
    # Start the Holmes manager with the German model
    holmes_manager = holmes.Manager(
        model='de_core_news_lg')
21 |
    def process_documents_from_front_page(front_page_uri, front_page_label):
        """ Download and save all the stories from a front page."""

        front_page = urllib.request.urlopen(front_page_uri)
        front_page_soup = BeautifulSoup(front_page, 'html.parser')
        document_texts = []
        labels = []
        # For each story ...
        for anchor in front_page_soup.find_all('a'):
            # Relative links without a leading slash point at individual stories;
            # absolute links and root-relative links are navigation.
            if not anchor['href'].startswith('/') and not anchor['href'].startswith('https'):
                this_document_url = '/'.join((front_page_uri, anchor['href']))
                print('Downloading story', anchor.contents[0], 'from front page', front_page_label)
                # Get the HTML document for the story
                this_document = urllib.request.urlopen(this_document_url)
                # Extract the raw text from the HTML document
                this_document_soup = BeautifulSoup(this_document, 'html.parser')
                this_document_text = this_document_soup.prettify()
                # NOTE(review): the two statements below appear garbled — the split
                # markers (presumably HTML tags delimiting the story body) have been
                # stripped, and split() cannot take a string as its second argument.
                # Restore the original delimiters from upstream before running.
                this_document_text = this_document_text.split('', 1)[1]
                this_document_text = this_document_text.split('', ' ')
                # Remove any carriage returns and line feeds from the raw text
                # NOTE(review): the final replace below looks like a garbled
                # non-breaking-space replacement — confirm against upstream.
                this_document_text = this_document_text.replace(
                    '\n', ' ').replace('\r', ' ').replace(' ', ' ')
                # Replace multiple spaces with single spaces
                this_document_text = ' '.join(this_document_text.split())
                # Create a document label from the front page label and the story name
                this_document_label = ' - '.join((front_page_label, anchor.contents[0]))
                document_texts.append(this_document_text)
                labels.append(this_document_label)
        # Parse all downloaded stories in parallel, one process per CPU core.
        parsed_documents = holmes_manager.nlp.pipe(document_texts, n_process=cpu_count())
        for index, parsed_document in enumerate(parsed_documents):
            label = labels[index]
            print('Saving', label)
            output_filename = os.sep.join((working_directory, label))
            output_filename = '.'.join((output_filename, HOLMES_EXTENSION))
            with open(output_filename, "wb") as file:
                file.write(parsed_document.to_bytes())
59 |
60 | def load_documents_from_working_directory():
61 | serialized_documents = {}
62 | for file in os.listdir(working_directory):
63 | if file.endswith(HOLMES_EXTENSION):
64 | print('Loading', file)
65 | label = file[:-4]
66 | long_filename = os.sep.join((working_directory, file))
67 | with open(long_filename, "rb") as file:
68 | contents = file.read()
69 | serialized_documents[label] = contents
70 | print('Indexing documents (this may take some time) ...')
71 | holmes_manager.register_serialized_documents(serialized_documents)
72 |
73 | if os.path.exists(working_directory):
74 | if not os.path.isdir(working_directory):
75 | raise RuntimeError(' '.join((working_directory, 'must be a directory')))
76 | else:
77 | os.mkdir(working_directory)
78 |
79 | if os.path.isfile(flag_filename):
80 | load_documents_from_working_directory()
81 | else:
82 | process_documents_from_front_page(
83 | "https://maerchen.com/grimm/", 'Gebrüder Grimm')
84 | process_documents_from_front_page(
85 | "https://maerchen.com/grimm2/", 'Gebrüder Grimm')
86 | process_documents_from_front_page(
87 | "https://maerchen.com/andersen/", 'Hans Christian Andersen')
88 | process_documents_from_front_page(
89 | "https://maerchen.com/bechstein/", 'Ludwig Bechstein')
90 | process_documents_from_front_page(
91 | "https://maerchen.com/wolf/", 'Johann Wilhelm Wolf')
92 | # Generate flag file to indicate files can be reloaded on next run
93 | open(flag_filename, 'a').close()
94 | load_documents_from_working_directory()
95 |
96 | #Comment following line in to activate interactive console
97 | #holmes_manager.start_topic_matching_search_mode_console(only_one_result_per_document=True)
98 |
    # The following code starts a RESTful HTTP service to perform topic searches. It is deployed
    # as a WSGI application. An example of how to start it - issued from the directory that
    # contains the script - is
102 |
103 | # python -m waitress example_search_DE_literature:application
104 |
105 | # You will need to install waitress (python -m pip install waitress)
106 |
107 | class RestHandler():
108 | def on_get(self, req, resp):
109 | resp.text = \
110 | json.dumps(holmes_manager.topic_match_documents_against(
111 | req.params['entry'][0:200], only_one_result_per_document=True))
112 | resp.cache_control = ["s-maxage=31536000"]
113 |
114 | application = falcon.App()
115 | application.add_route('/german', RestHandler())
116 |
--------------------------------------------------------------------------------
/SHORTREADME.md:
--------------------------------------------------------------------------------
1 | **Holmes** is a Python 3 library (v3.6—v3.10) running on top of
2 | [spaCy](https://spacy.io/) (v3.1—v3.3) that supports a number of use cases
3 | involving information extraction from English and German texts. In all use cases, the information
4 | extraction is based on analysing the semantic relationships expressed by the component parts of
5 | each sentence:
6 |
7 | - In the [chatbot](#getting-started) use case, the system is configured using one or more **search phrases**.
8 | Holmes then looks for structures whose meanings correspond to those of these search phrases within
9 | a searched **document**, which in this case corresponds to an individual snippet of text or speech
10 | entered by the end user. Within a match, each word with its own meaning (i.e. that does not merely fulfil a grammatical function) in the search phrase
11 | corresponds to one or more such words in the document. Both the fact that a search phrase was matched and any structured information the search phrase extracts can be used to drive the chatbot.
12 |
13 | - The [structural extraction](#structural-extraction) use case uses exactly the same
14 | [structural matching](#how-it-works-structural-matching) technology as the chatbot use
15 | case, but searching takes place with respect to a pre-existing document or documents that are typically much
16 | longer than the snippets analysed in the chatbot use case, and the aim is to extract and store structured information. For example, a set of business articles could be searched to find all the places where one company is said to be planning to
17 | take over a second company. The identities of the companies concerned could then be stored in a database.
18 |
19 | - The [topic matching](#topic-matching) use case aims to find passages in a document or documents whose meaning
20 | is close to that of another document, which takes on the role of the **query document**, or to that of a **query phrase** entered ad-hoc by the user. Holmes extracts a number of small **phraselets** from the query phrase or
21 | query document, matches the documents being searched against each phraselet, and conflates the results to find
22 | the most relevant passages within the documents. Because there is no strict requirement that every
23 | word with its own meaning in the query document match a specific word or words in the searched documents, more matches are found
24 | than in the structural extraction use case, but the matches do not contain structured information that can be
25 | used in subsequent processing. The topic matching use case is demonstrated by [a website allowing searches within
26 | six Charles Dickens novels (for English) and around 350 traditional stories (for German)](https://holmes-demo.explosion.services/).
27 |
28 | - The [supervised document classification](#supervised-document-classification) use case uses training data to
29 | learn a classifier that assigns one or more **classification labels** to new documents based on what they are about.
30 | It classifies a new document by matching it against phraselets that were extracted from the training documents in the
31 | same way that phraselets are extracted from the query document in the topic matching use case. The technique is
32 | inspired by bag-of-words-based classification algorithms that use n-grams, but aims to derive n-grams whose component
33 | words are related semantically rather than that just happen to be neighbours in the surface representation of a language.
34 |
35 | In all four use cases, the **individual words** are matched using a [number of strategies](#word-level-matching-strategies).
36 | To work out whether two grammatical structures that contain individually matching words correspond logically and
37 | constitute a match, Holmes transforms the syntactic parse information provided by the [spaCy](https://spacy.io/) library
38 | into semantic structures that allow texts to be compared using predicate logic. As a user of Holmes, you do not need to
39 | understand the intricacies of how this works, although there are some
40 | [important tips](#writing-effective-search-phrases) around writing effective search phrases for the chatbot and
41 | structural extraction use cases that you should try and take on board.
42 |
43 | Holmes aims to offer generalist solutions that can be used more or less out of the box with
44 | relatively little tuning, tweaking or training and that are rapidly applicable to a wide range of use cases.
45 | At its core lies a logical, programmed, rule-based system that describes how syntactic representations in each
46 | language express semantic relationships. Although the supervised document classification use case does incorporate a
47 | neural network and although the spaCy library upon which Holmes builds has itself been pre-trained using machine
48 | learning, the essentially rule-based nature of Holmes means that the chatbot, structural extraction and topic matching use
49 | cases can be put to use out of the box without any training and that the supervised document classification use case
50 | typically requires relatively little training data, which is a great advantage because pre-labelled training data is
51 | not available for many real-world problems.
52 |
53 | Holmes has a long and complex history and we are now able to publish it under the MIT license thanks to the goodwill and openness of several companies. I, Richard Hudson, wrote the versions up to 3.0.0 while working at [msg systems](https://www.msg.group/en), a large international software consultancy based near Munich. In late 2021, I changed employers and now work for [Explosion](https://explosion.ai/), the creators of [spaCy](https://spacy.io/) and [Prodigy](https://prodi.gy/). Elements of the Holmes library are covered by a [US patent](https://patents.google.com/patent/US8155946B2/en) that I myself wrote in the early 2000s while working at a startup called Definiens that has since been acquired by [AstraZeneca](https://www.astrazeneca.com/). With the kind permission of both AstraZeneca and msg systems, I am now maintaining Holmes at Explosion and can offer it for the first time under a permissive license: anyone can now use Holmes under the terms of the MIT
54 | license without having to worry about the patent.
55 |
56 | For more information, please see the [main documentation on Github](https://github.com/explosion/holmes-extractor).
57 |
--------------------------------------------------------------------------------
/tests/common/test_serialization.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import holmes_extractor as holmes
4 |
# Directory containing this test module.
script_directory = os.path.dirname(os.path.realpath(__file__))
# English manager shared by all tests; a single registered search phrase is
# matched against the documents each test registers.
holmes_manager = holmes.Manager('en_core_web_trf', number_of_workers=2)
holmes_manager.register_search_phrase("A dog chases a cat")
# German manager used by the subword serialization test.
german_holmes_manager = holmes.Manager('de_core_news_lg', number_of_workers=2)
9 |
10 |
class SerializationTest(unittest.TestCase):
    """Checks that parsed documents behave identically after a
    serialization/deserialization round trip."""

    def test_matching_with_holmes_manager_document_after_serialization(self):
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document("The cat was chased by the dog", 'pets')
        # Serializing must not disturb the registered document.
        doc_bytes = holmes_manager.serialize_document('pets')
        self.assertEqual(len(holmes_manager.match()), 1)

    def test_matching_with_reserialized_holmes_manager_document(self):
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document("The cat was chased by the dog", 'pets')
        doc_bytes = holmes_manager.serialize_document('pets')
        holmes_manager.remove_all_documents()
        holmes_manager.register_serialized_document(doc_bytes, 'pets')
        self.assertEqual(len(holmes_manager.match()), 1)

    def test_matching_with_multiple_reserialized_holmes_manager_document(self):
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document("The cat was chased by the dog", 'pets')
        doc_bytes = holmes_manager.serialize_document('pets')
        docs_by_label = {'pets': doc_bytes, 'pets2': doc_bytes}
        holmes_manager.remove_all_documents()
        holmes_manager.register_serialized_documents(docs_by_label)
        # The same document registered twice yields one match per copy.
        self.assertEqual(len(holmes_manager.match()), 2)

    def test_serialization_with_coreference(self):
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document("I saw a cat. It was chased by the dog", 'pets')
        doc_bytes = holmes_manager.serialize_document('pets')
        holmes_manager.remove_all_documents()
        holmes_manager.register_serialized_document(doc_bytes, 'pets')
        # Coreference information ("It" -> "cat") must survive the round trip.
        self.assertEqual(len(holmes_manager.match()), 1)

    def test_matching_with_both_documents(self):
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document("The cat was chased by the dog", 'pets')
        doc_bytes = holmes_manager.serialize_document('pets')
        holmes_manager.register_serialized_document(doc_bytes, 'pets2')
        self.assertEqual(len(holmes_manager.match()), 2)

    def test_document_to_serialize_does_not_exist(self):
        holmes_manager.remove_all_documents()
        # Serializing an unknown label returns None rather than raising.
        self.assertIsNone(holmes_manager.serialize_document('pets'))

    def test_parent_token_indexes(self):
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document("Houses in the village.", 'village')
        doc_bytes = holmes_manager.serialize_document('village')
        holmes_manager.register_serialized_document(doc_bytes, 'village2')
        old_doc = holmes_manager.get_document('village')
        new_doc = holmes_manager.get_document('village2')
        # Dependency bookkeeping must be identical before and after the round trip.
        for doc in (old_doc, new_doc):
            self.assertEqual(
                doc[0]._.holmes.string_representation_of_children(), '1:prep; 3:pobjp')
            self.assertEqual(
                doc[3]._.holmes.string_representation_of_parents(), '0:pobjp; 1:pobj')
            self.assertEqual(
                doc[3]._.holmes.coreference_linked_parent_dependencies,
                [[0, 'pobjp'], [1, 'pobj']])

    def test_subwords(self):
        german_holmes_manager.remove_all_documents()
        german_holmes_manager.parse_and_register_document("Bundesoberbehörde.", 'bo')
        doc_bytes = german_holmes_manager.serialize_document('bo')
        german_holmes_manager.register_serialized_document(doc_bytes, 'bo2')
        # German compound subwords must be preserved on both copies.
        for label in ('bo', 'bo2'):
            subwords = german_holmes_manager.get_document(label)[0]._.holmes.subwords
            self.assertEqual(subwords[0].text, 'bundes')
            self.assertEqual(subwords[0].lemma, 'bund')
            self.assertEqual(subwords[1].text, 'oberbehörde')
            self.assertEqual(subwords[1].lemma, 'oberbehörde')

    def test_derived_lemma(self):
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document("A lot of information.", 'information')
        doc_bytes = holmes_manager.serialize_document('information')
        holmes_manager.register_serialized_document(doc_bytes, 'information2')
        # Derived lemmas must be preserved on both copies.
        for label in ('information', 'information2'):
            doc = holmes_manager.get_document(label)
            self.assertEqual(doc[3]._.holmes.derived_lemma, 'inform')
121 |
--------------------------------------------------------------------------------
/holmes_extractor/word_matching/direct.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, List, Dict
2 | from spacy.tokens import Token, Doc
3 | from .general import WordMatch, WordMatchingStrategy
4 | from ..parsing import (
5 | MultiwordSpan,
6 | CorpusWordPosition,
7 | Subword,
8 | SearchPhrase,
9 | )
10 |
11 |
class DirectWordMatchingStrategy(WordMatchingStrategy):
    """Word-matching strategy that matches tokens, subwords and multiwords
    whose direct matching representations are string-identical."""

    WORD_MATCH_TYPE_LABEL = "direct"

    @staticmethod
    def _get_explanation(search_phrase_display_word: str) -> str:
        """Build the human-readable explanation for a direct match."""
        return "Matches " + search_phrase_display_word.upper() + " directly."

    def match_multiwords(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_multiwords: List[MultiwordSpan],
    ) -> Optional[WordMatch]:
        """Return a WordMatch when a document multiword shares a direct
        matching representation with the search phrase token, else None."""
        # Single-word search phrase tokens never take part in multiword matching.
        if len(search_phrase_token._.holmes.lemma.split()) == 1:
            return None
        for sp_repr in search_phrase_token._.holmes.direct_matching_reprs:
            for multiword in document_multiwords:
                for doc_repr in multiword.direct_matching_reprs:
                    if sp_repr != doc_repr:
                        continue
                    containing_doc = document_token.doc
                    return WordMatch(
                        search_phrase_token=search_phrase_token,
                        search_phrase_word=sp_repr,
                        document_token=document_token,
                        first_document_token=containing_doc[
                            multiword.token_indexes[0]
                        ],
                        last_document_token=containing_doc[
                            multiword.token_indexes[-1]
                        ],
                        document_subword=None,
                        document_word=doc_repr,
                        word_match_type=self.WORD_MATCH_TYPE_LABEL,
                        explanation=self._get_explanation(
                            search_phrase_token._.holmes.lemma
                        ),
                    )
        return None

    def match_token(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
    ) -> Optional[WordMatch]:
        """Return a WordMatch when the two tokens share a direct matching
        representation, else None."""
        for sp_repr in search_phrase_token._.holmes.direct_matching_reprs:
            for doc_repr in document_token._.holmes.direct_matching_reprs:
                if sp_repr != doc_repr:
                    continue
                return WordMatch(
                    search_phrase_token=search_phrase_token,
                    search_phrase_word=sp_repr,
                    document_token=document_token,
                    first_document_token=document_token,
                    last_document_token=document_token,
                    document_subword=None,
                    document_word=doc_repr,
                    word_match_type=self.WORD_MATCH_TYPE_LABEL,
                    extracted_word=self.get_extracted_word_for_token(
                        document_token, doc_repr
                    ),
                    explanation=self._get_explanation(
                        search_phrase_token._.holmes.lemma
                    ),
                )
        return None

    def match_subword(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_subword: Subword,
    ) -> Optional[WordMatch]:
        """Return a WordMatch when the document subword shares a direct
        matching representation with the search phrase token, else None."""
        for sp_repr in search_phrase_token._.holmes.direct_matching_reprs:
            for doc_repr in document_subword.direct_matching_reprs:
                if sp_repr != doc_repr:
                    continue
                return WordMatch(
                    search_phrase_token=search_phrase_token,
                    search_phrase_word=sp_repr,
                    document_token=document_token,
                    first_document_token=document_token,
                    last_document_token=document_token,
                    document_subword=document_subword,
                    document_word=doc_repr,
                    word_match_type=self.WORD_MATCH_TYPE_LABEL,
                    explanation=self._get_explanation(
                        search_phrase_token._.holmes.lemma
                    ),
                )
        return None

    def add_words_matching_search_phrase_root_token(
        self, search_phrase: SearchPhrase
    ) -> None:
        """Register every direct matching representation of the root token."""
        for matching_repr in search_phrase.root_token._.holmes.direct_matching_reprs:
            search_phrase.add_word_information(matching_repr)

    def add_reverse_dict_entries(
        self,
        reverse_dict: Dict[str, List[CorpusWordPosition]],
        doc: Doc,
        document_label: str,
    ) -> None:
        """Index each token's and subword's direct matching representations
        under their lower-cased forms."""
        for token in doc:
            for matching_repr in token._.holmes.direct_matching_reprs:
                self.add_reverse_dict_entry(
                    reverse_dict,
                    matching_repr.lower(),
                    document_label,
                    token.i,
                    None,
                )
            for subword in token._.holmes.subwords:
                for matching_repr in subword.direct_matching_reprs:
                    self.add_reverse_dict_entry(
                        reverse_dict,
                        matching_repr.lower(),
                        document_label,
                        token.i,
                        subword.index,
                    )
144 |
--------------------------------------------------------------------------------
/examples/example_supervised_topic_model_EN.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import urllib.request
4 | import zipfile
5 | from thinc.api import prefer_gpu
6 | import holmes_extractor as holmes
7 |
8 | working_directory = # REPLACE WITH PATH TO WORKING DIRECTORY IN SINGLE OR DOUBLE QUOTES
9 |
10 | if __name__ in ("__main__", "example_supervised_topic_model_EN"):
11 |
12 | def is_training_data(document_number):
13 | # We use any documents with numbers ending in 8,9,0 for test and all other documents for
14 | # training.
15 | return document_number[-1:] not in ("8", "9", "0")
16 |
17 | def get_document_filename_info(filename):
18 | # e.g. 'bbc/business/001.txt'
19 | category = filename.split("/")[1]
20 | document_number = filename.split("/")[2].split(".")[0]
21 | return category, document_number
22 |
23 | def evaluate_classifier(zip_filename, classifier):
24 | correct_classification_counter = (
25 | wrong_classification_counter
26 | ) = no_classification_counter = correct_as_additional_classification_counter = 0
27 | with zipfile.ZipFile(zip_filename) as bbc_zipfile:
28 | for filename in (
29 | filename
30 | for filename in sorted(bbc_zipfile.namelist())
31 | if filename.lower().endswith(".txt")
32 | and not filename.endswith("README.TXT")
33 | ):
34 | category, document_number = get_document_filename_info(filename)
35 | if not is_training_data(document_number):
36 | with bbc_zipfile.open(filename, "r") as test_doc:
37 | test_contents = str(test_doc.read())
38 | test_contents = test_contents.replace("\n", " ").replace(
39 | "\r", " "
40 | )
41 | classification_dict = classifier.parse_and_classify(test_contents)
42 | if classification_dict is None:
43 | suggested_categories = []
44 | else:
45 | suggested_categories = [
46 | c
47 | for c in classification_dict
48 | if classification_dict[c] > 0.2
49 | ]
50 | if len(suggested_categories) == 0:
51 | no_classification_counter += 1
52 | elif suggested_categories[0] == category:
53 | correct_classification_counter += 1
54 | elif category in suggested_categories:
55 | correct_as_additional_classification_counter += 1
56 | else:
57 | wrong_classification_counter += 1
58 | print(
59 | "".join(
60 | (
61 | filename,
62 | ": actual category ",
63 | category,
64 | "; suggested categories ",
65 | str(suggested_categories),
66 | )
67 | )
68 | )
69 | print()
70 | print("Totals:")
71 | print(correct_classification_counter, "correct classifications;")
72 | print(no_classification_counter, "unclassified documents;")
73 | print(wrong_classification_counter, "incorrect classifications;")
74 | print(
75 | correct_as_additional_classification_counter,
76 | "incorrect classifications where the "
77 | "correct classification was returned as an additional classification.",
78 | )
79 |
80 | def train_model(working_directory, zip_filename):
81 | training_basis = holmes_manager.get_supervised_topic_training_basis()
82 | with zipfile.ZipFile(zip_filename) as bbc_zipfile:
83 | for filename in (
84 | filename
85 | for filename in sorted(bbc_zipfile.namelist())
86 | if filename.lower().endswith(".txt")
87 | and not filename.endswith("README.TXT")
88 | ):
89 | category, document_number = get_document_filename_info(filename)
90 | if is_training_data(document_number):
91 | with bbc_zipfile.open(filename, "r") as training_doc:
92 | training_contents = str(training_doc.read())
93 | training_contents = training_contents.replace(
94 | "\n", " "
95 | ).replace("\r", " ")
96 | training_basis.parse_and_register_training_document(
97 | training_contents, category, filename
98 | )
99 | training_basis.prepare()
100 | prefer_gpu()
101 | classifier = training_basis.train().classifier()
102 | output_filename = os.sep.join((working_directory, "sdc-model"))
103 | with open(output_filename, "wb") as file:
104 | file.write(classifier.serialize_model())
105 | evaluate_classifier(zip_filename, classifier)
106 |
107 | holmes_manager = holmes.Manager("en_core_web_lg", number_of_workers=1)
108 |
109 | if os.path.exists(working_directory):
110 | if not os.path.isdir(working_directory):
111 | raise RuntimeError(" ".join((working_directory, "must be a directory")))
112 | else:
113 | os.mkdir(working_directory)
114 | zip_filename = os.sep.join((working_directory, "bbc-fulltext.zip"))
115 | if not os.path.exists(zip_filename):
116 | url = "http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip"
117 | with urllib.request.urlopen(url) as response, open(
118 | zip_filename, "wb"
119 | ) as out_file:
120 | shutil.copyfileobj(response, out_file)
121 | model_filename = os.sep.join((working_directory, "sdc-model"))
122 | if not os.path.exists(model_filename):
123 | train_model(working_directory, zip_filename)
124 | else:
125 | print(
126 | "Reloading existing trained model. "
127 | "Delete model from working directory to repeat training."
128 | )
129 | with open(model_filename, "rb") as model_file:
130 | classifier = holmes_manager.deserialize_supervised_topic_classifier(
131 | model_file.read()
132 | )
133 | evaluate_classifier(zip_filename, classifier)
134 |
--------------------------------------------------------------------------------
/holmes_extractor/word_matching/question.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Dict
2 | from spacy.tokens import Token
3 | from thinc.types import Floats1d
4 | from .general import WordMatch, WordMatchingStrategy
5 | from ..parsing import SearchPhrase, SemanticMatchingHelper, Subword
6 |
7 |
class QuestionWordMatchingStrategy(WordMatchingStrategy):
    """Matches initial question words within search phrases to document tokens
    and subwords. The check itself is delegated to
    *SemanticMatchingHelper.question_word_matches()*; embedding information is
    supplied to that check whenever a document vector is available."""

    WORD_MATCH_TYPE_LABEL = "question"

    @staticmethod
    def _get_explanation(search_phrase_display_word: str) -> str:
        """Returns the human-readable explanation naming the question word."""
        return "".join(
            ("Matches the question word ", search_phrase_display_word.upper(), ".")
        )

    def __init__(
        self,
        semantic_matching_helper: SemanticMatchingHelper,
        perform_coreference_resolution: bool,
        initial_question_word_overall_similarity_threshold: float,
        entity_label_to_vector_dict: Dict[str, Floats1d],
    ):
        # Overall similarity threshold applied when embedding information is
        # available; converted to a per-token threshold within the match
        # methods by raising it to the power of the number of matchable
        # non-entity search phrase tokens.
        self.initial_question_word_overall_similarity_threshold = (
            initial_question_word_overall_similarity_threshold
        )
        # Maps entity labels to vectors for embedding-based checking.
        self.entity_label_to_vector_dict = entity_label_to_vector_dict
        super().__init__(semantic_matching_helper, perform_coreference_resolution)

    def match_token(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
    ) -> Optional[WordMatch]:
        """Attempts to match *search_phrase_token*, where it is an initial
        question word, to *document_token*. Returns the resulting *WordMatch*,
        or *None* where there is no match."""

        if search_phrase_token._.holmes.is_initial_question_word:
            document_vector = document_token._.holmes.vector
            if document_vector is not None:
                # Embedding-based check with a per-token similarity threshold.
                question_word_matches = (
                    self.semantic_matching_helper.question_word_matches(
                        search_phrase_token,
                        document_token,
                        None,
                        document_vector,
                        self.entity_label_to_vector_dict,
                        self.initial_question_word_overall_similarity_threshold
                        ** len(search_phrase.matchable_non_entity_tokens_to_vectors),
                    )
                )
            else:
                # No document vector, so no embedding information can be used.
                question_word_matches = (
                    self.semantic_matching_helper.question_word_matches(
                        search_phrase_token, document_token, None, None, None, None
                    )
                )
            if question_word_matches:
                first_document_token_index = (
                    last_document_token_index
                ) = document_token.i
                if (
                    document_token.pos_ in self.semantic_matching_helper.noun_pos
                    and len(document_token.ent_type_) > 0
                ):
                    # The matched token is a noun within a named entity:
                    # expand the matched span to cover the contiguous run of
                    # noun tokens around it.
                    while first_document_token_index >= 1:
                        if (
                            document_token.doc[first_document_token_index - 1].pos_
                            in self.semantic_matching_helper.noun_pos
                        ):
                            first_document_token_index = first_document_token_index - 1
                        else:
                            break
                    while last_document_token_index + 1 < len(document_token.doc):
                        if (
                            document_token.doc[last_document_token_index + 1].pos_
                            in self.semantic_matching_helper.noun_pos
                        ):
                            last_document_token_index = last_document_token_index + 1
                        else:
                            break
                return WordMatch(
                    search_phrase_token=search_phrase_token,
                    search_phrase_word=search_phrase_token._.holmes.lemma,
                    document_token=document_token,
                    first_document_token=document_token.doc[first_document_token_index],
                    last_document_token=document_token.doc[last_document_token_index],
                    document_subword=None,
                    document_word=document_token._.holmes.lemma,
                    word_match_type=self.WORD_MATCH_TYPE_LABEL,
                    explanation=self._get_explanation(
                        search_phrase_token._.holmes.lemma
                    ),
                )
        return None

    def match_subword(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_subword: Subword,
    ) -> Optional[WordMatch]:
        """Attempts to match *search_phrase_token*, where it is an initial
        question word, to *document_subword*. Returns the resulting
        *WordMatch*, or *None* where there is no match."""

        if (
            document_subword.is_head
        ):  # question words should not match a head subword but the whole word or multiword:
            return None
        if search_phrase_token._.holmes.is_initial_question_word:
            document_vector = document_subword.vector
            if document_vector is not None:
                # Embedding-based check with a per-token similarity threshold.
                question_word_matches = (
                    self.semantic_matching_helper.question_word_matches(
                        search_phrase_token,
                        document_token,
                        document_subword.index,
                        document_vector,
                        self.entity_label_to_vector_dict,
                        self.initial_question_word_overall_similarity_threshold
                        ** len(search_phrase.matchable_non_entity_tokens_to_vectors),
                    )
                )
            else:
                # No subword vector, so no embedding information can be used.
                question_word_matches = (
                    self.semantic_matching_helper.question_word_matches(
                        search_phrase_token,
                        document_token,
                        document_subword.index,
                        None,
                        None,
                        None,
                    )
                )
            if question_word_matches:
                return WordMatch(
                    search_phrase_token=search_phrase_token,
                    search_phrase_word=search_phrase_token._.holmes.lemma,
                    document_token=document_token,
                    first_document_token=document_token,
                    last_document_token=document_token,
                    document_subword=document_subword,
                    document_word=document_subword.lemma,
                    word_match_type=self.WORD_MATCH_TYPE_LABEL,
                    # Bug fix: the explanation text reads "Matches the question
                    # word X", so it must name the question word (the search
                    # phrase side), consistent with match_token(); it
                    # previously displayed the document subword's lemma.
                    explanation=self._get_explanation(
                        search_phrase_token._.holmes.lemma
                    ),
                )
        return None
147 |
--------------------------------------------------------------------------------
/tests/common/test_manager.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import holmes_extractor as holmes
3 | from holmes_extractor.errors import NoDocumentError
4 |
# Manager backed by the transformer-based English model; coreference
# resolution is switched off for these tests and two worker processes are used.
holmes_manager = holmes.Manager(
    'en_core_web_trf', perform_coreference_resolution=False, number_of_workers=2)

# Second manager backed by the large statistical English model, used by the
# nlp.pipe() tests at the bottom of ManagerTest.
lg_holmes_manager = holmes.Manager(
    'en_core_web_lg', perform_coreference_resolution=False, number_of_workers=2)
10 |
11 | class ManagerTest(unittest.TestCase):
12 |
13 | def _register_multiple_documents_and_search_phrases(self):
14 | holmes_manager.remove_all_search_phrases()
15 | holmes_manager.remove_all_documents()
16 | holmes_manager.parse_and_register_document(
17 | document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets')
18 | holmes_manager.parse_and_register_document(
19 | document_text="Everything I know suggests that lions enjoy eating gnu", label='safari')
20 | holmes_manager.register_search_phrase(
21 | "A dog chases a cat", label="test")
22 | holmes_manager.register_search_phrase(
23 | "A lion eats a gnu", label="test")
24 | holmes_manager.register_search_phrase(
25 | "irrelevancy", label="alpha")
26 | return
27 |
28 | def test_multiple(self):
29 | self._register_multiple_documents_and_search_phrases()
30 | self.assertEqual(len(holmes_manager.match()), 2)
31 |
32 | def test_remove_all_search_phrases(self):
33 | self._register_multiple_documents_and_search_phrases()
34 | holmes_manager.remove_all_search_phrases()
35 | holmes_manager.register_search_phrase("A dog chases a cat")
36 | self.assertEqual(len(holmes_manager.match()), 1)
37 |
38 | def test_remove_all_documents(self):
39 | self._register_multiple_documents_and_search_phrases()
40 | holmes_manager.remove_all_documents()
41 | holmes_manager.parse_and_register_document(
42 | document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets')
43 | self.assertEqual(len(holmes_manager.match()), 1)
44 |
45 | def test_remove_all_documents_with_label(self):
46 | self._register_multiple_documents_and_search_phrases()
47 | holmes_manager.remove_all_documents()
48 | holmes_manager.parse_and_register_document(
49 | document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets11')
50 | holmes_manager.parse_and_register_document(
51 | document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets12')
52 | holmes_manager.parse_and_register_document(
53 | document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets21')
54 | holmes_manager.parse_and_register_document(
55 | document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets22')
56 | self.assertEqual(len(holmes_manager.match()), 4)
57 | holmes_manager.remove_all_documents('pets22')
58 | self.assertEqual(len(holmes_manager.match()), 3)
59 | holmes_manager.remove_all_documents('pets1')
60 | self.assertEqual(len(holmes_manager.match()), 1)
61 | holmes_manager.remove_all_documents('pets')
62 | with self.assertRaises(NoDocumentError) as context:
63 | holmes_manager.match()
64 |
65 | def test_remove_document(self):
66 | self._register_multiple_documents_and_search_phrases()
67 | holmes_manager.parse_and_register_document(
68 | document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets2')
69 | self.assertEqual(len(holmes_manager.match()), 3)
70 | holmes_manager.remove_document(label='pets')
71 | holmes_manager.remove_document(label='safari')
72 | matches = holmes_manager.match()
73 | self.assertEqual(len(matches), 1)
74 | self.assertEqual(matches[0]['document'], 'pets2')
75 |
76 | def test_match_search_phrases_against(self):
77 | self._register_multiple_documents_and_search_phrases()
78 | self.assertEqual(len(holmes_manager.match(document_text=
79 | "All the time I am testing here, dogs keep on chasing cats.")), 1)
80 |
81 | def test_match_documents_against(self):
82 | self._register_multiple_documents_and_search_phrases()
83 | self.assertEqual(len(holmes_manager.match(search_phrase_text=
84 | "A lion eats a gnu.")), 1)
85 |
86 | def test_match_documents_and_search_phrases_against(self):
87 | self._register_multiple_documents_and_search_phrases()
88 | self.assertEqual(len(holmes_manager.match(search_phrase_text= "burn",
89 | document_text="Burn. Everything I know suggests that lions enjoy eating gnu")), 1)
90 | holmes_manager.remove_all_documents()
91 | holmes_manager.remove_all_search_phrases()
92 | self.assertEqual(len(holmes_manager.match(search_phrase_text= "burn",
93 | document_text="Burn. Everything I know suggests that lions enjoy eating gnu")), 1)
94 |
95 | def test_get_labels(self):
96 | self._register_multiple_documents_and_search_phrases()
97 | self.assertEqual(holmes_manager.list_search_phrase_labels(),
98 | ['alpha', 'test'])
99 |
100 | def test_get_document(self):
101 | self._register_multiple_documents_and_search_phrases()
102 | self.assertEqual(holmes_manager.get_document('safari')[5]._.holmes.lemma,
103 | 'lion')
104 |
105 | def test_remove_all_search_phrases_with_label(self):
106 | holmes_manager.remove_all_search_phrases()
107 | holmes_manager.register_search_phrase("testa", label="test1")
108 | holmes_manager.register_search_phrase("testb", label="test1")
109 | holmes_manager.register_search_phrase("testc", label="test2")
110 | holmes_manager.register_search_phrase("testd", label="test2")
111 | holmes_manager.remove_all_search_phrases_with_label("test2")
112 | holmes_manager.remove_all_search_phrases_with_label("testb")
113 | self.assertEqual(holmes_manager.list_search_phrase_labels(),
114 | ['test1'])
115 | self.assertEqual(len(holmes_manager.match(document_text=
116 | "testa")), 1)
117 | self.assertEqual(len(holmes_manager.match(document_text=
118 | "testb")), 1)
119 | self.assertEqual(len(holmes_manager.match(document_text=
120 | "testc")), 0)
121 | self.assertEqual(len(holmes_manager.match(document_text=
122 | "testd")), 0)
123 |
124 | def test_pipe_with_single_process(self):
125 | docs = lg_holmes_manager.nlp.pipe(['document1', 'document2'])
126 | self.assertEqual(str(next(docs)), 'document1')
127 |
128 |
129 | def test_pipe_with_multiple_processes(self):
130 | docs = lg_holmes_manager.nlp.pipe(['document1', 'document2'], n_process=2)
131 | self.assertEqual(str(next(docs)), 'document1')
132 | self.assertEqual(str(next(docs)), 'document2')
133 |
--------------------------------------------------------------------------------
/examples/example_search_EN_literature.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import json
4 | import urllib.request
5 | # You will need to install bs4 (python -m pip install bs4)
6 | from bs4 import BeautifulSoup
7 | import holmes_extractor as holmes
8 | # You will need to install falcon (python -m pip install falcon)
9 | import falcon
10 |
# The guard lets the script run both directly and when imported by a WSGI
# server under the module name 'example_search_EN_literature' (see the
# waitress instructions further down).
if __name__ in ('__main__', 'example_search_EN_literature'):

    # NOTE: the next assignment is intentionally incomplete -- the script
    # cannot run until a working directory path has been filled in.
    working_directory = # REPLACE WITH PATH TO WORKING DIRECTORY IN SINGLE OR DOUBLE QUOTES
    # File extension used for serialized parsed chapters.
    HOLMES_EXTENSION = 'hdc'
    # The flag file's presence marks that parsing completed on a previous run.
    flag_filename = os.sep.join((working_directory, 'STORY_PARSING_COMPLETE'))
    print('Initializing Holmes (this may take some time) ...')

    script_directory = os.path.dirname(os.path.realpath(__file__))
    ontology = holmes.Ontology(os.sep.join((
        script_directory, 'example_search_EN_literature_ontology.owl')))

    # Start the Holmes manager with the English model
    holmes_manager = holmes.Manager(
        model='en_core_web_trf', ontology=ontology)
25 |
    def extract_chapters_from_book(book_uri, title):
        """ Download and save the chapters from a book.

        book_uri -- the URI of the plain-text version of the book.
        title -- the title fragment used when building chapter labels.
        """

        print()
        print(title)
        print()
        book = urllib.request.urlopen(book_uri).read().decode()
        # Strip page headers/footers and title lines that would otherwise
        # pollute the chapter text.
        # NOTE(review): these patterns are not raw strings, so '\|' is an
        # invalid escape sequence (a warning today, an error in future Python
        # versions) -- consider converting them to r'...' literals.
        book = re.sub("\\nPage \|.+?Rowling \\n", "", book)
        book = re.sub("\\nP a g e \|.+?Rowling \\n", "", book)
        book = re.sub("\\nPage \|.+?\\n", "", book)
        book = book.replace("Harry Potter and the Half Blood Prince - J.K. Rowling", "")
        book = book.replace("Harry Potter and the Goblet of Fire - J.K. Rowling", "")
        book = book.replace("Harry Potter and the Deathly Hallows - J.K. Rowling", "")
        book = book[1:]
        # Locate chapter headings: runs of capital letters after blank lines,
        # with lookarounds excluding known false positives (e.g. shouted text
        # within the story).
        chapter_headings = [heading for heading in re.finditer("(?<=((\\n\\n\\n\\n)|(\* \\n\\n)))((?!.*(WEASLEY WILL MAKE SURE)|(DO NOT OPEN THE PARCEL)|(HEADMISTRESS OF HOGWARTS))[A-Z][A-Z\-’., ]+)(\\n{1,2}((?!.*(WHO\-MUST))[A-Z\-’., ]+))?(?=(\\n\\n([^\\n]|(\\n\\n((“Harry!”)|(Harry’s)|(Ron’s)|(“Hagrid)|(Three o’clock))))))", book)]
        chapter_counter = 1
        labels = []
        chapter_texts = []
        # NOTE(review): chapter_dict is never used and could be removed.
        chapter_dict = {}
        for chapter_heading in chapter_headings:
            # Label of the form: Book <title> Ch <n> ‘<HEADING>’
            label = ''.join((
                'Book ', title, ' Ch ', str(chapter_counter), " ‘",
                chapter_heading.group().replace('\n', '').strip(), "’"))
            labels.append(label)
            if chapter_counter == len(chapter_headings): # last chapter
                content = book[chapter_heading.end():]
            else:
                # chapter_counter is 1-based, so it already indexes the
                # following heading; the chapter runs up to that heading.
                content = book[chapter_heading.end():chapter_headings[chapter_counter].start()]
            content = content.replace('\n', '')
            if content.endswith('& '):
                content = content[:-2]
            chapter_texts.append(content)
            print('Extracted', label)
            chapter_counter += 1
        # Parse all chapters in one pipe, then serialize each parsed chapter
        # to its own .hdc file in the working directory.
        parsed_chapters = holmes_manager.nlp.pipe(chapter_texts)
        for index, parsed_chapter in enumerate(parsed_chapters):
            label = labels[index]
            print('Saving', label)
            output_filename = os.sep.join((working_directory, label))
            output_filename = '.'.join((output_filename, HOLMES_EXTENSION))
            with open(output_filename, "wb") as file:
                file.write(parsed_chapter.to_bytes())
68 |
69 | def load_documents_from_working_directory():
70 | serialized_documents = {}
71 | for file in os.listdir(working_directory):
72 | if file.endswith(HOLMES_EXTENSION):
73 | print('Loading', file)
74 | label = file[:-4]
75 | long_filename = os.sep.join((working_directory, file))
76 | with open(long_filename, "rb") as file:
77 | contents = file.read()
78 | serialized_documents[label] = contents
79 | print('Indexing documents (this may take some time) ...')
80 | holmes_manager.register_serialized_documents(serialized_documents)
81 |
    # Ensure the working directory exists; refuse to run if the configured
    # path exists but is not a directory.
    if os.path.exists(working_directory):
        if not os.path.isdir(working_directory):
            raise RuntimeError(' '.join((working_directory, 'must be a directory')))
    else:
        os.mkdir(working_directory)

    # The flag file is only created after all books have been parsed and
    # saved, so its presence means the serialized chapters can be reloaded.
    if os.path.isfile(flag_filename):
        load_documents_from_working_directory()
    else:
        extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt", "1 ‘The Philosopher\'s Stone’")
        extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%202%20-%20The%20Chamber%20of%20Secrets.txt", "2 ‘The Chamber of Secrets’")
        extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%203%20-%20The%20Prisoner%20of%20Azkaban.txt", "3 ‘The Prisoner of Azkaban’")
        extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%204%20-%20The%20Goblet%20of%20Fire.txt", "4 ‘The Goblet of Fire’")
        extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%205%20-%20The%20Order%20of%20the%20Phoenix.txt", "5 ‘The Order of the Phoenix’")
        extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%206%20-%20The%20Half%20Blood%20Prince.txt", "6 ‘The Half Blood Prince’")
        extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%207%20-%20The%20Deathly%20Hallows.txt", "7 ‘The Deathly Hallows’")
        # Generate flag file to indicate files can be reloaded on next run
        open(flag_filename, 'a').close()
        load_documents_from_working_directory()

    # Comment the following line in to activate the interactive console
    #holmes_manager.start_topic_matching_search_mode_console()

    # The following code starts a RESTful Http service to perform topic searches. It is deployed as
    # as WSGI application. An example of how to start it - issued from the directory that
    # contains the script - is

    # python -m waitress example_search_EN_literature:application

    # You will need to install waitress (python -m pip install waitress)

    class RestHandler():
        """Falcon resource answering GET requests with topic match results."""
        def on_get(self, req, resp):
            # The query ('entry' parameter) is truncated to 200 characters
            # before topic matching.
            resp.text = \
                json.dumps(holmes_manager.topic_match_documents_against(
                    req.params['entry'][0:200]))
            # Allow shared caches to store responses for up to one year.
            resp.cache_control = ["s-maxage=31536000"]

    application = falcon.App()
    application.add_route('/english', RestHandler())
122 |
--------------------------------------------------------------------------------
/holmes_extractor/word_matching/entity_embedding.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, List, Dict
2 | from spacy.tokens import Token
3 | from thinc.types import Floats1d
4 | from .general import WordMatch, WordMatchingStrategy
5 | from ..parsing import SearchPhrase, MultiwordSpan, SemanticMatchingHelper
6 |
7 |
class EntityEmbeddingWordMatchingStrategy(WordMatchingStrategy):
    """Matches a search phrase token to a document token by comparing the
    search phrase token's word embedding with the vector assigned to the
    document token's named-entity label in *entity_label_to_vector_dict*."""

    WORD_MATCH_TYPE_LABEL = "entity_embedding"

    @staticmethod
    def _get_explanation(similarity: float, search_phrase_display_word: str) -> str:
        """Returns the human-readable explanation for an entity-embedding
        match, quoting *similarity* as a percentage."""
        printable_similarity = str(int(similarity * 100))
        return "".join(
            (
                "Has an entity label that is ",
                printable_similarity,
                "% similar to the word embedding corresponding to ",
                search_phrase_display_word.upper(),
                ".",
            )
        )

    def __init__(
        self,
        semantic_matching_helper: SemanticMatchingHelper,
        perform_coreference_resolution: bool,
        overall_similarity_threshold: float,
        initial_question_word_overall_similarity_threshold: float,
        entity_label_to_vector_dict: Dict[str, Floats1d],
    ):
        # Threshold applied to ordinary search phrases.
        self.overall_similarity_threshold = overall_similarity_threshold
        # Threshold applied instead when the search phrase token involves an
        # initial question word (see _check_for_word_match()).
        self.initial_question_word_overall_similarity_threshold = (
            initial_question_word_overall_similarity_threshold
        )
        # Maps entity labels to representative vectors.
        self.entity_label_to_vector_dict = entity_label_to_vector_dict
        super().__init__(semantic_matching_helper, perform_coreference_resolution)

    def match_multiwords(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_multiwords: List[MultiwordSpan],
    ) -> Optional[WordMatch]:
        """Attempts to match *search_phrase_token* against the entity labels
        of multiword spans headed by *document_token*; returns the first
        successful *WordMatch*, or *None*."""

        # Only proceed where the search phrase token has an associated vector
        # entry and embedding matching is permitted for it.
        if (
            search_phrase_token.i
            in search_phrase.matchable_non_entity_tokens_to_vectors.keys()
            and self.semantic_matching_helper.embedding_matching_permitted(
                search_phrase_token
            )
        ):
            search_phrase_vector = search_phrase.matchable_non_entity_tokens_to_vectors[
                search_phrase_token.i
            ]
            if (
                search_phrase_vector is None
                or not self.semantic_matching_helper.embedding_matching_permitted(
                    document_token
                )
            ):
                return None
            for document_multiword in document_multiwords:
                # Every token in the multiword must share the head token's
                # (non-empty) entity type for the span to count as one entity.
                if document_token.ent_type_ != "" and all(
                    document_token.doc[i].ent_type_ == document_token.ent_type_
                    for i in document_multiword.token_indexes
                ):
                    potential_word_match = self._check_for_word_match(
                        search_phrase=search_phrase,
                        search_phrase_token=search_phrase_token,
                        search_phrase_vector=search_phrase_vector,
                        document_token=document_token,
                        first_document_token=document_token.doc[
                            document_multiword.token_indexes[0]
                        ],
                        last_document_token=document_token.doc[
                            document_multiword.token_indexes[-1]
                        ],
                    )
                    if potential_word_match is not None:
                        return potential_word_match

        return None

    def match_token(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
    ) -> Optional[WordMatch]:
        """Attempts to match *search_phrase_token* against the entity label of
        *document_token*; returns a *WordMatch* or *None*."""

        # Only proceed where the search phrase token has an associated vector
        # entry and embedding matching is permitted for it.
        if (
            search_phrase_token.i
            in search_phrase.matchable_non_entity_tokens_to_vectors.keys()
            and self.semantic_matching_helper.embedding_matching_permitted(
                search_phrase_token
            )
        ):
            search_phrase_vector = search_phrase.matchable_non_entity_tokens_to_vectors[
                search_phrase_token.i
            ]
            if (
                search_phrase_vector is None
                or not self.semantic_matching_helper.embedding_matching_permitted(
                    document_token
                )
            ):
                return None
            # Only document tokens that belong to a named entity are relevant.
            if document_token.ent_type_ != "":
                return self._check_for_word_match(
                    search_phrase=search_phrase,
                    search_phrase_token=search_phrase_token,
                    search_phrase_vector=search_phrase_vector,
                    document_token=document_token,
                    first_document_token=document_token,
                    last_document_token=document_token,
                )
        return None

    def _check_for_word_match(
        self,
        *,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        search_phrase_vector: Floats1d,
        document_token: Token,
        first_document_token: Token,
        last_document_token: Token,
    ) -> Optional[WordMatch]:
        """Compares *search_phrase_vector* with the vector for
        *document_token*'s entity label; builds and returns a *WordMatch*
        where the similarity exceeds the per-token threshold, otherwise
        returns *None*."""
        # Initial question words get their own, separately configured
        # threshold where one has been supplied.
        if (
            search_phrase_token._.holmes.is_initial_question_word
            or search_phrase_token._.holmes.has_initial_question_word_in_phrase
        ) and self.initial_question_word_overall_similarity_threshold is not None:
            working_overall_similarity_threshold = (
                self.initial_question_word_overall_similarity_threshold
            )
        else:
            working_overall_similarity_threshold = self.overall_similarity_threshold
        # The overall threshold raised to the power of the number of
        # embedding-relevant search phrase tokens gives the threshold each
        # single token must reach.
        single_token_similarity_threshold = working_overall_similarity_threshold ** len(
            search_phrase.matchable_non_entity_tokens_to_vectors
        )

        similarity_measure = self.semantic_matching_helper.token_matches_ent_type(
            search_phrase_vector,
            self.entity_label_to_vector_dict,
            (document_token.ent_type_,),
            single_token_similarity_threshold,
        )
        if similarity_measure > 0:
            # For multiword lemmas in full search phrases the spaCy lemma is
            # displayed; otherwise the Holmes lemma.
            if (
                not search_phrase.topic_match_phraselet
                and len(search_phrase_token._.holmes.lemma.split()) > 1
            ):
                search_phrase_display_word = search_phrase_token.lemma_
            else:
                search_phrase_display_word = search_phrase_token._.holmes.lemma
            word_match = WordMatch(
                search_phrase_token=search_phrase_token,
                search_phrase_word=search_phrase_display_word,
                document_token=document_token,
                first_document_token=first_document_token,
                last_document_token=last_document_token,
                document_subword=None,
                document_word=document_token.lemma_,
                word_match_type=self.WORD_MATCH_TYPE_LABEL,
                explanation=self._get_explanation(
                    similarity_measure, search_phrase_display_word
                ),
            )
            # similarity_measure is not an __init__ parameter, so it is set
            # on the instance after construction.
            word_match.similarity_measure = similarity_measure
            return word_match
        return None
175 |
--------------------------------------------------------------------------------
/holmes_extractor/word_matching/general.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, List, Dict
2 | from spacy.tokens import Token, Doc
3 | from ..parsing import (
4 | CorpusWordPosition,
5 | MultiwordSpan,
6 | SemanticMatchingHelper,
7 | Subword,
8 | Index,
9 | SearchPhrase,
10 | )
11 |
12 |
class WordMatchingStrategy:
    """Parent class for all word matching strategies. Each strategy only implements those methods that are relevant to it."""

    def __init__(
        self,
        semantic_matching_helper: SemanticMatchingHelper,
        perform_coreference_resolution: bool,
    ):
        # Shared helper supplying language-specific matching services.
        self.semantic_matching_helper = semantic_matching_helper
        # Whether coreference information should influence extracted words.
        self.perform_coreference_resolution = perform_coreference_resolution

    def match_multiwords(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_multiwords: List[MultiwordSpan],
    ) -> Optional["WordMatch"]:
        """Attempts to match a search phrase token to a list of multiwords headed by a document token and ordered by decreasing size."""
        pass

    def match_token(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
    ) -> Optional["WordMatch"]:
        """Attempts to match a search phrase token to a document token."""
        pass

    def match_subword(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_subword: Subword,
    ) -> Optional["WordMatch"]:
        """Attempts to match a search phrase token to a document subword (currently only relevant for German)."""
        pass

    def add_words_matching_search_phrase_root_token(
        self, search_phrase: SearchPhrase
    ) -> None:
        """Determines words that match a search phrase root token and notifies the *SearchPhrase* object of them."""
        pass

    def add_reverse_dict_entries(
        self, doc: Doc, document_label: str, reverse_dict: Dict[str, List[CorpusWordPosition]]
    ) -> None:
        """Determines words that match each token within a document and adds corresponding entries to the reverse dictionary."""
        pass

    @staticmethod
    def add_reverse_dict_entry(
        reverse_dict: Dict[str, List[CorpusWordPosition]],
        key_word: str,
        document_label: str,
        token_index: int,
        subword_index: Optional[int],
    ) -> None:
        """Adds a single entry to the reverse dictionary. Called by implementing classes.

        subword_index may be *None* when the entry refers to a whole token
        rather than to a subword.
        """
        index = Index(token_index, subword_index)
        corpus_word_position = CorpusWordPosition(document_label, index)
        # setdefault() replaces the original membership test, avoiding a
        # double dictionary lookup while preserving the no-duplicates
        # guarantee for each key's position list.
        positions = reverse_dict.setdefault(key_word, [])
        if corpus_word_position not in positions:
            positions.append(corpus_word_position)

    def get_extracted_word_for_token(self, token: Token, document_word: str) -> str:
        """Gets the extracted word for a token. If the token is part of a coreference chain, the extracted word is the most specific
        term within that chain; otherwise it is the same as the document word.
        """
        extracted_word = document_word
        if (
            self.perform_coreference_resolution
            and token._.holmes.most_specific_coreferring_term_index is not None
        ):
            most_specific_token = token.doc[
                token._.holmes.most_specific_coreferring_term_index
            ]
            if token._.holmes.lemma != most_specific_token._.holmes.lemma:
                if most_specific_token._.holmes.multiword_spans is not None:
                    # Where multiword spans exist, a span's text is returned
                    # directly.
                    for multiword_span in most_specific_token._.holmes.multiword_spans:
                        extracted_word = multiword_span.text
                        return extracted_word
                extracted_word = most_specific_token.text.lower()
        return extracted_word
101 |
102 |
class WordMatch:
    """A match between a searched phrase word and a document word.

    Properties:

    search_phrase_token -- the spaCy token from the search phrase.
    search_phrase_word -- the word that matched from the search phrase.
    document_token -- the spaCy token from the document.
    first_document_token -- the first token that matched from the document, which will equal
        *document_token* except with multiword matches.
    last_document_token -- the last token that matched from the document, which will equal
        *document_token* except with multiword matches.
    document_subword -- the subword from the token that matched, or *None* if the match was
        with the whole token.
    document_word -- the word or subword that matched structurally from the document.
    word_match_type -- the label of the strategy that produced the match, e.g. *direct*,
        *entity*, *embedding*, *entity_embedding*, *question* or *derivation*.
    depth -- the vertical difference in the ontology from *search_phrase_word* to *document_word*
        (can be negative).
    extracted_word -- the most specific term that corresponded to *document_word* within the
        coreference chain.
    explanation -- a human-readable explanation of how the word match was determined designed
        e.g. for use as a tooltip.
    similarity_measure -- for type *embedding*, the similarity between the two tokens,
        otherwise 1.0.
    involves_coreference -- *True* if *document_token* and *structurally_matched_document_token*
        are different.
    """

    def __init__(
        self,
        *,
        search_phrase_token: Token,
        search_phrase_word: str,
        document_token: Token,
        first_document_token: Token,
        last_document_token: Token,
        # Fixed annotation: may be None for whole-token matches.
        document_subword: Optional[Subword],
        document_word: str,
        word_match_type: str,
        depth: int = 0,
        # Fixed annotation: None means 'default to document_word'.
        extracted_word: Optional[str] = None,
        explanation: str
    ):

        self.search_phrase_token = search_phrase_token
        self.search_phrase_word = search_phrase_word
        self.document_token = document_token
        self.first_document_token = first_document_token
        self.last_document_token = last_document_token
        self.document_subword = document_subword
        self.document_word = document_word
        self.word_match_type = word_match_type
        self.is_negated = False  # will be set by StructuralMatcher
        self.is_uncertain = False  # will be set by StructuralMatcher
        self.structurally_matched_document_token = (
            None  # will be set by StructuralMatcher
        )
        self.extracted_word = (
            extracted_word if extracted_word is not None else document_word
        )
        self.depth = depth
        self.similarity_measure = 1.0
        self.explanation = explanation

    @property
    def involves_coreference(self) -> bool:
        """True where the matched token differs from the structurally matched one."""
        return self.document_token != self.structurally_matched_document_token

    def get_document_index(self) -> Index:
        """Returns the *Index* (token index plus optional subword index) of this match."""
        if self.document_subword is not None:
            subword_index = self.document_subword.index
        else:
            subword_index = None
        return Index(self.document_token.i, subword_index)
177 |
--------------------------------------------------------------------------------
/holmes_extractor/word_matching/derivation.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Optional, List
2 | from spacy.tokens import Token, Doc
3 | from .general import WordMatch, WordMatchingStrategy
4 | from ..parsing import CorpusWordPosition, MultiwordSpan, Subword, SearchPhrase
5 |
6 |
7 | class DerivationWordMatchingStrategy(WordMatchingStrategy):
8 |
9 | WORD_MATCH_TYPE_LABEL = "derivation"
10 |
11 | @staticmethod
12 | def _get_explanation(search_phrase_display_word: str) -> str:
13 | return "".join(
14 | ("Has a common stem with ", search_phrase_display_word.upper(), ".")
15 | )
16 |
    def match_multiwords(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_multiwords: List[MultiwordSpan],
    ) -> Optional[WordMatch]:
        """Attempts a derivation-based match between *search_phrase_token*,
        where its lemma is a multiword, and the multiword spans headed by
        *document_token*. Returns the first *WordMatch* found, or *None*."""

        # Only relevant where the search phrase lemma is itself a multiword.
        if len(search_phrase_token._.holmes.lemma.split()) == 1:
            return None
        # At least one side must supply derivation representations; pure
        # direct-to-direct matching is presumably left to the direct strategy.
        if search_phrase_token._.holmes.derivation_matching_reprs is None and not any(
            m for m in document_multiwords if m.derivation_matching_reprs is not None
        ):
            return None

        for multiword in document_multiwords:

            search_phrase_reprs = []
            document_reprs = []

            # Each side contributes its derivation representations (where
            # present) together with its direct representations, so a derived
            # form on either side can match the other side. The build order
            # below determines which pair is found first.
            if search_phrase_token._.holmes.derivation_matching_reprs is not None:
                search_phrase_reprs.extend(
                    search_phrase_token._.holmes.derivation_matching_reprs
                )
            document_reprs.extend(multiword.direct_matching_reprs)
            if multiword.derivation_matching_reprs is not None:
                document_reprs.extend(multiword.derivation_matching_reprs)
            search_phrase_reprs.extend(
                search_phrase_token._.holmes.direct_matching_reprs
            )

            # Return on the first pair of equal representations.
            for search_phrase_representation in search_phrase_reprs:
                for document_representation in document_reprs:
                    if search_phrase_representation == document_representation:
                        search_phrase_display_word = search_phrase_token._.holmes.lemma
                        return WordMatch(
                            search_phrase_token=search_phrase_token,
                            search_phrase_word=search_phrase_representation,
                            document_token=document_token,
                            first_document_token=document_token.doc[
                                multiword.token_indexes[0]
                            ],
                            last_document_token=document_token.doc[
                                multiword.token_indexes[-1]
                            ],
                            document_subword=None,
                            document_word=document_representation,
                            word_match_type=self.WORD_MATCH_TYPE_LABEL,
                            explanation=self._get_explanation(
                                search_phrase_display_word
                            ),
                        )
        return None
70 |
71 | def match_token(
72 | self,
73 | search_phrase: SearchPhrase,
74 | search_phrase_token: Token,
75 | document_token: Token,
76 | ) -> Optional[WordMatch]:
77 |
78 | search_phrase_reprs = []
79 | document_reprs = []
80 |
81 | if search_phrase_token._.holmes.derivation_matching_reprs is not None:
82 | search_phrase_reprs.extend(
83 | search_phrase_token._.holmes.derivation_matching_reprs
84 | )
85 | document_reprs.extend(document_token._.holmes.direct_matching_reprs)
86 | if document_token._.holmes.derivation_matching_reprs is not None:
87 | document_reprs.extend(document_token._.holmes.derivation_matching_reprs)
88 | search_phrase_reprs.extend(
89 | search_phrase_token._.holmes.direct_matching_reprs
90 | )
91 |
92 | for search_phrase_representation in search_phrase_reprs:
93 | for document_representation in document_reprs:
94 | if search_phrase_representation == document_representation:
95 | search_phrase_display_word = search_phrase_token._.holmes.lemma
96 | return WordMatch(
97 | search_phrase_token=search_phrase_token,
98 | search_phrase_word=search_phrase_representation,
99 | document_token=document_token,
100 | first_document_token=document_token,
101 | last_document_token=document_token,
102 | document_subword=None,
103 | document_word=document_representation,
104 | word_match_type=self.WORD_MATCH_TYPE_LABEL,
105 | extracted_word=self.get_extracted_word_for_token(
106 | document_token, document_representation
107 | ),
108 | explanation=self._get_explanation(search_phrase_display_word),
109 | )
110 | return None
111 |
112 | def match_subword(
113 | self,
114 | search_phrase: SearchPhrase,
115 | search_phrase_token: Token,
116 | document_token: Token,
117 | document_subword: Subword,
118 | ) -> Optional[WordMatch]:
119 |
120 | search_phrase_reprs = []
121 | document_reprs = []
122 |
123 | if search_phrase_token._.holmes.derivation_matching_reprs is not None:
124 | search_phrase_reprs.extend(
125 | search_phrase_token._.holmes.derivation_matching_reprs
126 | )
127 | document_reprs.extend(document_subword.direct_matching_reprs)
128 | if document_subword.derivation_matching_reprs is not None:
129 | document_reprs.extend(document_subword.derivation_matching_reprs)
130 | search_phrase_reprs.extend(
131 | search_phrase_token._.holmes.direct_matching_reprs
132 | )
133 |
134 | for search_phrase_representation in search_phrase_reprs:
135 | for document_representation in document_reprs:
136 | if search_phrase_representation == document_representation:
137 | search_phrase_display_word = search_phrase_token._.holmes.lemma
138 | return WordMatch(
139 | search_phrase_token=search_phrase_token,
140 | search_phrase_word=search_phrase_representation,
141 | document_token=document_token,
142 | first_document_token=document_token,
143 | last_document_token=document_token,
144 | document_subword=document_subword,
145 | document_word=document_representation,
146 | word_match_type=self.WORD_MATCH_TYPE_LABEL,
147 | explanation=self._get_explanation(search_phrase_display_word),
148 | )
149 | return None
150 |
151 | def add_words_matching_search_phrase_root_token(
152 | self, search_phrase: SearchPhrase
153 | ) -> None:
154 | if (
155 | search_phrase.root_token._.holmes.derived_lemma
156 | != search_phrase.root_token._.holmes.lemma
157 | ):
158 | search_phrase.add_word_information(
159 | search_phrase.root_token._.holmes.derived_lemma,
160 | )
161 |
162 | def add_reverse_dict_entries(
163 | self,
164 | reverse_dict: Dict[str, List[CorpusWordPosition]],
165 | doc: Doc,
166 | document_label: str,
167 | ) -> None:
168 | for token in doc:
169 | if token._.holmes.derived_lemma != token._.holmes.lemma:
170 | self.add_reverse_dict_entry(
171 | reverse_dict,
172 | token._.holmes.derived_lemma.lower(),
173 | document_label,
174 | token.i,
175 | None,
176 | )
177 | for subword in token._.holmes.subwords:
178 | if subword.derived_lemma != subword.lemma:
179 | self.add_reverse_dict_entry(
180 | reverse_dict,
181 | subword.derived_lemma.lower(),
182 | document_label,
183 | token.i,
184 | subword.index,
185 | )
186 |
--------------------------------------------------------------------------------
/holmes_extractor/word_matching/ontology.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, List, Dict, Union
2 | from holmes_extractor.ontology import Ontology
3 | from spacy.tokens import Token, Doc
4 | from .general import WordMatch, WordMatchingStrategy
5 | from ..parsing import (
6 | HolmesDictionary,
7 | CorpusWordPosition,
8 | MultiwordSpan,
9 | SemanticMatchingHelper,
10 | Subword,
11 | SearchPhrase,
12 | )
13 |
14 |
class OntologyWordMatchingStrategy(WordMatchingStrategy):
    """Word matching strategy driven by an :class:`Ontology`.

    A search phrase word matches a document word when the ontology records a
    relationship between one of the search phrase word's representations and
    one of the document word's representations; the relationship's depth
    determines the explanation text.

    The patent US8155946 associated with this code has been made available under the MIT licence,
    with kind permission from AstraZeneca.
    """

    WORD_MATCH_TYPE_LABEL = "ontology"

    # Maps clamped ontology depths to the phrase used in explanations;
    # negative depths name more general terms, positive depths more specific.
    ONTOLOGY_DEPTHS_TO_NAMES = {
        -4: "an ancestor",
        -3: "a great-grandparent",
        -2: "a grandparent",
        -1: "a parent",
        0: "a synonym",
        1: "a child",
        2: "a grandchild",
        3: "a great-grandchild",
        4: "a descendant",
    }

    def _get_explanation(self, search_phrase_display_word: str, depth: int) -> str:
        """Return the human-readable explanation for a match at *depth*."""
        # Clamp to the range covered by ONTOLOGY_DEPTHS_TO_NAMES.
        depth = min(depth, 4)
        depth = max(depth, -4)
        return "".join(
            (
                "Is ",
                self.ONTOLOGY_DEPTHS_TO_NAMES[depth],
                " of ",
                search_phrase_display_word.upper(),
                " in the ontology.",
            )
        )

    def __init__(
        self,
        semantic_matching_helper: SemanticMatchingHelper,
        perform_coreference_resolution: bool,
        ontology: Ontology,
        analyze_derivational_morphology: bool,
        ontology_reverse_derivational_dict: Optional[Dict[str, str]],
    ):
        """
        ontology -- the ontology to match against.
        analyze_derivational_morphology -- whether derivation representations
            are additionally taken into account when building the lists of
            representations to match.
        ontology_reverse_derivational_dict -- stored on the instance; not
            accessed by any method visible in this class.
        """
        self.ontology = ontology
        self.analyze_derivational_morphology = analyze_derivational_morphology
        self.ontology_reverse_derivational_dict = ontology_reverse_derivational_dict
        super().__init__(semantic_matching_helper, perform_coreference_resolution)

    def match_multiwords(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_multiwords: List[MultiwordSpan],
    ) -> Optional[WordMatch]:
        """Return an ontology match between *search_phrase_token* and the
        first matching entry of *document_multiwords*, or ``None``."""
        for search_phrase_representation in self._get_reprs(
            search_phrase_token._.holmes
        ):
            for multiword in document_multiwords:
                entry = self.ontology.matches(
                    search_phrase_representation, self._get_reprs(multiword)
                )
                if entry is not None:
                    search_phrase_display_word = search_phrase_token._.holmes.lemma
                    return WordMatch(
                        search_phrase_token=search_phrase_token,
                        search_phrase_word=search_phrase_representation,
                        document_token=document_token,
                        first_document_token=document_token.doc[
                            multiword.token_indexes[0]
                        ],
                        last_document_token=document_token.doc[
                            multiword.token_indexes[-1]
                        ],
                        document_subword=None,
                        document_word=entry.word,
                        word_match_type=self.WORD_MATCH_TYPE_LABEL,
                        depth=entry.depth,
                        explanation=self._get_explanation(
                            search_phrase_display_word, entry.depth
                        ),
                    )
        return None

    def match_token(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
    ) -> Optional[WordMatch]:
        """Return an ontology match between *search_phrase_token* and
        *document_token*, or ``None``."""
        for search_phrase_representation in self._get_reprs(
            search_phrase_token._.holmes
        ):
            entry = self.ontology.matches(
                search_phrase_representation, self._get_reprs(document_token._.holmes)
            )
            if entry is not None:
                search_phrase_display_word = search_phrase_token._.holmes.lemma
                return WordMatch(
                    search_phrase_token=search_phrase_token,
                    search_phrase_word=search_phrase_representation,
                    document_token=document_token,
                    first_document_token=document_token,
                    last_document_token=document_token,
                    document_subword=None,
                    document_word=entry.word,
                    word_match_type=self.WORD_MATCH_TYPE_LABEL,
                    extracted_word=self.get_extracted_word_for_token(
                        document_token, entry.word
                    ),
                    depth=entry.depth,
                    explanation=self._get_explanation(
                        search_phrase_display_word, entry.depth
                    ),
                )
        return None

    def match_subword(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_subword: Subword,
    ) -> Optional[WordMatch]:
        """Return an ontology match between *search_phrase_token* and
        *document_subword* within *document_token*, or ``None``."""
        for search_phrase_representation in self._get_reprs(
            search_phrase_token._.holmes
        ):
            entry = self.ontology.matches(
                search_phrase_representation, self._get_reprs(document_subword)
            )
            if entry is not None:
                search_phrase_display_word = search_phrase_token._.holmes.lemma
                return WordMatch(
                    search_phrase_token=search_phrase_token,
                    search_phrase_word=search_phrase_representation,
                    document_token=document_token,
                    first_document_token=document_token,
                    last_document_token=document_token,
                    document_subword=document_subword,
                    document_word=entry.word,
                    word_match_type=self.WORD_MATCH_TYPE_LABEL,
                    depth=entry.depth,
                    explanation=self._get_explanation(
                        search_phrase_display_word, entry.depth
                    ),
                )
        return None

    def add_words_matching_search_phrase_root_token(
        self, search_phrase: SearchPhrase
    ) -> None:
        """Register with *search_phrase* all ontology entries matching any
        representation of its root token."""
        search_phrase_reprs = search_phrase.root_token._.holmes.direct_matching_reprs[:]
        if (
            self.analyze_derivational_morphology
            and search_phrase.root_token._.holmes.derivation_matching_reprs is not None
        ):
            search_phrase_reprs.extend(
                search_phrase.root_token._.holmes.derivation_matching_reprs
            )
        for word in search_phrase_reprs:
            for entry in self.ontology.get_matching_entries(word):
                # "representation" rather than shadowing the builtin repr().
                for representation in entry.reprs:
                    search_phrase.add_word_information(representation)

    def add_reverse_dict_entries(
        self,
        reverse_dict: Dict[str, List[CorpusWordPosition]],
        doc: Doc,
        document_label: str,
    ) -> None:
        """Index every token of *doc* that heads an ontology-defined multiword
        under each of the multiword's lowercased representations."""
        for token in doc:
            odw = self.semantic_matching_helper.get_ontology_defined_multiword(
                token, self.ontology
            )
            if odw is not None:
                for representation in odw.direct_matching_reprs:
                    self.add_reverse_dict_entry(
                        reverse_dict,
                        representation.lower(),
                        document_label,
                        token.i,
                        None,
                    )
                if (
                    self.analyze_derivational_morphology
                    and odw.derivation_matching_reprs is not None
                ):
                    for representation in odw.derivation_matching_reprs:
                        self.add_reverse_dict_entry(
                            reverse_dict,
                            representation.lower(),
                            document_label,
                            token.i,
                            None,
                        )

    def _get_reprs(
        self, repr_bearer: Union[HolmesDictionary, Subword, MultiwordSpan]
    ) -> List[str]:
        """Return the representations of *repr_bearer* to look up in the
        ontology, including derivation representations where configured.

        Always returns a fresh list. BUG FIX: the previous implementation
        aliased ``repr_bearer.direct_matching_reprs`` and extended it in
        place, so every call permanently appended the derivation
        representations to the bearer's own list, accumulating duplicates
        across calls.
        """
        reprs = list(repr_bearer.direct_matching_reprs)
        if (
            self.analyze_derivational_morphology
            and repr_bearer.derivation_matching_reprs is not None
        ):
            reprs.extend(repr_bearer.derivation_matching_reprs)
        return reprs
222 |
--------------------------------------------------------------------------------
/holmes_extractor/lang/de/data/derivation.csv:
--------------------------------------------------------------------------------
1 | abbau,abbauen
abonnement,abonnieren
3 | abbruch,abbrechen
4 | abfahrt,abfahren
5 | abflug,abfliegen
6 | abgabe,abgeben
7 | ablauf,ablaufen
8 | abnahme,abnehmen
9 | abreise,abreisen
10 | absage,absagen
11 | abschluss,abschließen
12 | abschrift,abschreiben
13 | absicht,beabsichtigen
14 | abstieg,absteigen
15 | abwehr,abwehren
16 | adoption,adoptieren,adoptiert
17 | akzeptanz,akzeptieren
18 | amputation,amputieren
19 | analyse,analysieren
20 | anbau,anbauen
21 | anfang,anfangen
22 | angabe,angeben
23 | angebot,anbieten
24 | angriff,angreifen
25 | ankunft,ankommen
26 | anlage,anlegen
27 | annahme,annehmen
28 | anprobe,anprobieren
29 | anreise,anreisen
30 | anruf,anrufen
31 | anschluss,anschließen
32 | ansporn,anspornen
33 | anstieg,ansteigen
34 | anstoß,anstoßen
35 | anstrich,anstreichen
36 | antrieb,antreiben
37 | antwort,antworten
38 | anzeige,anzeigen
39 | arbeit,arbeiten
40 | arrangement,arrangieren
41 | assimilation,assimilieren
42 | attacke,attackieren
43 | ärger,ärgern
44 | audit,auditieren,auditierung
45 | aufbau,aufbauen
46 | aufbruch,aufbrechen
47 | aufgabe,aufgeben
48 | aufnahme,aufnehmen
49 | aufsicht,beaufsichtigen
50 | aufstieg,aufsteigen
51 | auftrag,beauftragen
52 | aufwand,aufwenden
53 | ausbau,ausbauen
54 | ausdruck,ausdrücken
55 | ausfall,ausfallen
56 | ausgabe,ausgeben
57 | ausgang,ausgehen
58 | ausgleich,ausgleichen
59 | ausleihe,ausleihen
60 | ausschluss,ausschließen
61 | aussprache,aussprechen
62 | ausstieg,aussteigen
63 | austausch,austauschen
64 | auswahl,auswählen
65 | bau,bauen
66 | bedrängen,bedrängnis
67 | befehl,befehlen
68 | beginn,beginnen
69 | beichte,beichten
70 | beistand,beistehen
71 | beitrag,beitragen
72 | beitritt,beitreten
73 | bekennen,bekenntnis
74 | beleg,belegen
75 | bericht,berichten
76 | beschluss,beschließen
77 | beschwerde,beschweren
78 | besitz,besitzen
79 | besuch,besuchen
80 | beten,gebet
81 | betrieb,betreiben
82 | betrug,betrügen
83 | beweis,beweisen
84 | biss,beißen
85 | bitte,bitten
86 | blamage,blamieren
87 | blick,blicken
88 | blitz,blitzen
89 | blockade,blockieren
90 | blüte,blühen
91 | boykott,boykottieren
92 | brand,brennen
93 | brüllen,gebrüll
94 | bummel,bummeln
95 | dank,bedanken
96 | dank,danken
97 | dauer,dauern
98 | debatte,debattieren
99 | deklaration,deklarieren
100 | dekoration,dekorieren
101 | dementi,dementieren
102 | demonstration,demonstrieren
103 | demontage,demontieren
104 | denken,gedanke
105 | deportation,deportieren
106 | desertation,desertieren
107 | desinfektion,desinfizieren
108 | destillation,destillieren
109 | diagnose,diagnostizieren
110 | dienen,dienst
111 | diskussion,diskutieren
112 | dokumentation,dokumentieren
113 | donner,donnern
114 | dopen,doping
115 | druck,drucken
116 | duft,duften
117 | dusche,duschen
118 | ehre,ehren
119 | eile,eilen
120 | einfall,einfallen
121 | eingabe,eingeben
122 | eingriff,eingreifen
123 | einkauf,einkaufen
124 | einnahme,einnehmen
125 | einsatz,einsetzen
126 | einsehen,einsicht
127 | einstieg,einsteigen
128 | einsturz,einstürzen
129 | einwurf,einwerfen
130 | einzug,einziehen
131 | emigration,emigrieren
132 | empfang,empfangen
133 | ende,enden
134 | engagement,engagieren
135 | entnahme,entnehmen
136 | entschluss,entschließen
137 | entwurf,entwerfen
138 | ereignen,ereignis
139 | erhalt,erhalten
140 | erkennen,erkenntnis
141 | erlass,erlassen
142 | erlauben,erlaubnis
143 | erleben,erlebnis
144 | ernte,ernten
145 | erschweren,erschwernis
146 | erwerb,erwerben
147 | existenz,existieren
148 | experiment,experimentieren
149 | explosion,explodieren
150 | export,exportieren
151 | extraktion,extrahieren
152 | fahrt,fahren
153 | fall,fallen
154 | fang,fangen
155 | faszination,faszinieren
156 | feier,feiern
157 | festnahme,festnehmen
158 | flirt,flirten
159 | flucht,fliehen
160 | flucht,flüchten
161 | flug,fliegen
162 | folge,folgen
163 | fortschritt,fortschreiten
164 | frage,fragen
165 | freigabe,freigeben
166 | freude,freuen
167 | frost,frieren
168 | frustration,frustrieren
169 | frühstück,frühstücken
170 | fund,finden
171 | furcht,fürchten
172 | fühlen,gefühl
173 | gabe,geben
174 | garantie,garantieren
175 | geruch,riechen
176 | gesang,singen
177 | geschmack,schmecken
178 | glanz,glänzen
179 | glaube,glauben
180 | glückwunsch,beglückwünschen
181 | gratulation,gratulieren
182 | griff,greifen
183 | gruß,grüßen
184 | guss,gießen
185 | hagel,hageln
186 | halt,halten
187 | harmonie,harmonieren
188 | hass,hassen
189 | hauch,hauchen
190 | heirat,heiraten
191 | herrschen,herrschaft
192 | hetze,hetzen
193 | hilfe,helfen
194 | hinweis,hinweisen
195 | identifikation,identifizieren
196 | ignoranz,ignorieren
197 | illustration,illustrieren
198 | immigration,immigrieren
199 | import,importieren
200 | infektion,infizieren
201 | information,informieren
202 | inhalt,beinhalten
203 | inspiration,inspirieren
204 | installation,installieren
205 | integration,integrieren
206 | interesse,interessieren
207 | interpretation,interpretieren
208 | interview,interviewen
209 | investieren,investition
210 | irritation,irritieren
211 | jagd,jagen
212 | joggen,jogging
213 | jubel,jubeln
214 | kampf,kämpfen
215 | kauf,kaufen
216 | kennen,kenntnis
217 | klage,klagen,beklagen
218 | klang,klingen
219 | kollision,kollidieren
220 | kombination,kombinieren
221 | kommunikation,kommunizieren
222 | komponieren,komposition
223 | konfrontation,konfrontieren
224 | konstruieren,konstruktion
225 | kontraktion,kontrahieren
226 | kontrolle,kontrollieren
227 | konzentration,konzentrieren
228 | kopie,kopieren
229 | korrektur,korrigieren
230 | korrespondenz,korrespondieren
231 | kritik,kritisieren
232 | kummer,bekümmern
233 | kuss,küssen
234 | langeweile,langweilen
235 | lauf,laufen
236 | lehre,lehren
237 | leihen,verleih,ausleihe
238 | liebe,lieben
239 | lob,loben
240 | lüge,lügen
241 | managen,management
242 | mangel,mangeln
243 | marsch,marschieren
244 | massage,massieren
245 | miete,mieten
246 | mitarbeit,mitarbeiten
247 | mitfühlen,mitgefühl
248 | mitschrift,mitschreiben
249 | montage,montieren
250 | müde,müdigkeit
251 | nachfolge,nachfolgen
252 | nachfrage,nachfragen
253 | nachlass,nachlassen
254 | nachweis,nachweisen
255 | neid,beneiden
256 | notiz,notieren
257 | operation,operieren
258 | opfer,opfern
259 | patrouille,patrouillieren
260 | pflege,pflegen
261 | picknick,picknicken
262 | plädoyer,plädieren
263 | politur,polieren
264 | pose,posieren
265 | predigt,predigen
266 | privileg,privilegieren
267 | probe,proben,probieren
268 | produktion,produzieren
269 | protest,protestieren
270 | protokoll,protokollieren
271 | provokation,provozieren
272 | qual,quälen
273 | quatschen,gequatsche
274 | rache,rächen
275 | rat,raten
276 | raub,rauben
277 | reaktion,reagieren
278 | rebellion,rebellieren
279 | rede,reden
280 | reduktion,reduzieren
281 | reform,reformieren
282 | regen,regnen
283 | regeneration,regenerieren
284 | reise,reisen
285 | reiz,reizen
286 | reklamation,reklamieren
287 | reparatur,reparieren
288 | respekt,respektieren
289 | restauration,restaurieren
290 | reue,bereuen
291 | revision,revidieren
292 | risiko,riskieren
293 | riss,reißen
294 | ritt,reiten
295 | rotation,rotieren
296 | ruf,rufen
297 | ruhe,ruhen
298 | ruin,ruinieren
299 | rückgabe,zurückgeben
300 | rückgriff,zurückgreifen
301 | rückkehr,zurückkehren
302 | rücktritt,zurücktreten
303 | rückzug,zurückziehen
304 | sabotage,sabotieren
305 | schau,schauen
306 | schauder,schaudern
307 | schein,scheinen
308 | scherz,scherzen
309 | schikane,schikanieren
310 | schimmer,schimmern
311 | schimpfen,geschimpfe
312 | schlaf,schlafen
313 | schlag,schlagen
314 | schmerz,schmerzen
315 | schmuggel,schmuggeln
316 | schnee,schneien
317 | schrei,schreien
318 | schrift,schreiben
319 | schritt,schreiten
320 | schuss,schießen
321 | schutz,schützen,beschützen
322 | schwatz,schwatzen
323 | schweiß,schwitzen
324 | schwindel,schwindeln
schwur,schwören
326 | schwung,schwingen
327 | sehen,sicht
328 | seufzen,seufzer
329 | sieg,siegen,besiegen
330 | sorge,sorgen
331 | spazieren,spaziergang
332 | spekulation,spekulieren
333 | spende,spenden
334 | spiel,spielen
335 | spionage,spionieren
336 | spott,spotten
337 | sprung,springen
338 | stagnation,stagnieren
339 | start,starten
340 | stau,stauen
341 | stimulation,stimulieren
342 | stopp,stoppen
343 | stoß,stoßen
344 | streik,streiken
345 | streit,streiten
346 | studium,studieren
347 | sturm,stürmen
348 | sturz,stürzen
349 | suche,suchen
350 | sünde,sündigen
süß,süßigkeit
352 | tanz,tanzen
353 | tat,tun
354 | taufe,taufen
355 | tausch,tauschen
356 | teilnahme,teilnehmen
357 | telefonat,telefonieren
358 | test,testen
359 | training,trainieren
360 | transport,transportieren
361 | trauer,trauern
362 | traum,träumen
363 | tritt,treten
364 | triumph,triumphieren
365 | trost,trösten
366 | überfall,überfallen
367 | übergabe,übergeben
368 | umbau,umbauen
369 | umgang,umgehen
370 | umkehr,umkehren
371 | umstieg,umsteigen
372 | umtausch,umtauschen
373 | umzug,umziehen
374 | unterricht,unterrichten
375 | unterschrift,unterschreiben
376 | urteil,urteilen
377 | variation,variieren
378 | verbot,verbieten
379 | verbrauch,verbrauchen
380 | verbund,verbinden
381 | verdienen,verdienst
382 | vergabe,vergeben
383 | vergleich,vergleichen
384 | verhör,verhören
385 | verkauf,verkaufen
386 | verlauf,verlaufen
387 | verlust,verlieren
388 | verrat,verraten
389 | versand,versenden
390 | verschleiß,verschleißen
391 | verschluss,verschließen
392 | versteck,verstecken
393 | verstehen,verständnis
394 | versuch,versuchen
395 | versäumen,versäumnis
396 | verzehr,verzehren
397 | verzicht,verzichten
398 | voraussage,voraussagen
399 | vorgabe,vorgeben
400 | vorhersage,vorhersagen
401 | vorkommen,vorkommnis
402 | vorschlag,vorschlagen
403 | vorschrift,vorschreiben
404 | vortrag,vortragen
405 | wachsen,wachstum
406 | wagen,wagnis
407 | wahl,wählen
408 | wandel,wandeln
409 | wechsel,wechseln
410 | weggang,weggehen
411 | wegnahme,wegnehmen
412 | weiterfahrt,weiterfahren
413 | weitergabe,weitergeben
414 | wende,wenden
415 | wette,wetten
416 | widerruf,widerrufen
417 | widerspruch,widersprechen
418 | widerstand,widerstehen
419 | wiegen,gewicht
420 | wille,wollen
421 | wunsch,wünschen
422 | wurf,werfen
423 | wäsche,waschen
424 | zensur,zensieren
425 | zitation,zitieren
426 | zug,ziehen
427 | zunahme,zunehmen
428 | zusammenarbeit,zusammenarbeiten
429 | zusammenbau,zusammenbauen
430 | zusammenstoß,zusammenstoßen
431 | zwang,zwingen
432 | zweifel,bezweifeln
433 | zweifel,zweifeln
434 |
--------------------------------------------------------------------------------
/holmes_extractor/lang/en/data/derivation.csv:
--------------------------------------------------------------------------------
1 | abandon,abandonment
2 | able,ability
3 | abort,abortion
4 | abstract,abstraction
5 | abuse,abusive
6 | accept,acceptance
7 | accident,accidental
8 | accompany,accompaniment
9 | accomplish,accomplishment
10 | accountable,accountability
11 | accuracy,accurate
12 | accuse,accusation
13 | achieve,achievement
14 | acknowledge,acknowledgement
15 | act,action,activity
16 | adapt,adaptation
17 | add,addition,additional
18 | adjust,adjustment
19 | admire,admiration
20 | adopt,adoption
21 | advertise,advertize,advertisement
22 | advice,advise
23 | affect,effect
24 | agree,agreement
25 | alcohol,alcoholic
26 | allow,allowance
27 | alter,alteration
28 | amaze,amazing,amazement
29 | ambiguity,ambiguous,ambiguousness
30 | amuse,amusing,amusement
31 | analyse,analyze,analysis
32 | anger,angry
33 | announce,announcement
34 | anonymity,anonymous
35 | apology,apologize,apologetic
36 | appear,appearance
37 | applaud,applause
38 | appoint,appointment
39 | approve,approval
40 | argue,argument
41 | arrange,arrangement
42 | assert,assertion
43 | assess,assessment
44 | assure,assurance
45 | astonish,astonishing,astonishment
46 | attach,attachment
47 | attain,attainment
48 | attract,attraction
49 | attribute,attribution
50 | avoid,avoidance
51 | base,basic,basis
52 | beast,beastly
53 | behave,behavior,behaviour,behavioral
54 | belief,believe
55 | breath,breathe
56 | bury,burial
57 | capable,capability
58 | cease,cessation
59 | ceremony,ceremonial
60 | certain,certainty
61 | charm,charming
62 | cite,citation
63 | clean,cleanliness
64 | clear,clarity
65 | clinic,clinical
66 | collaboration,collaborative
67 | collect,collection
68 | combine,combination
69 | commerce,commercial
70 | commit,commitment
71 | compare,comparison
72 | compete,competition
73 | compile,compilation
74 | complete,completion
75 | compliant,compliance
76 | compose,composition
77 | comprehend,comprehension
78 | conclude,conclusion
79 | confirm,confirmation
80 | conform,conformity
81 | confront,confrontation
82 | confuse,confusion
83 | connect,connection
84 | consequent,consequence
85 | conservatism,conservative
86 | consider,consideration
87 | consistent,consistency
88 | constrain,constraint
89 | construct,construction
90 | consult,consultation
91 | continue,continuation
92 | contradict,contradiction
93 | contribute,contribution
94 | controversy,controversial
95 | convene,convention
96 | convenient,convenience
97 | cooperate,cooperative
98 | correct,correction
99 | correlate,correlative
100 | correspond,correspondence
101 | cover,coverage
102 | critical,criticise,criticism,criticize
103 | cruel,cruelty
104 | day,daily
105 | deceit,deceive,deception,deceptive
106 | decide,decision
107 | declare,declaration,declarative
108 | deep,depth
109 | defend,defence,defense,defensive
110 | define,definition
111 | deny,denial
112 | depend,dependence
113 | depress,depression,depressive
114 | describe,description
115 | despair,desperation
116 | destroy,destruction,destructive
117 | detach,detachment
118 | detect,detection
119 | deter,deterrent,deterrence
120 | determine,determination
121 | develop,development,developmental
122 | devote,devotion
123 | diagnose,diagnosis
124 | dictator,dictatorial
125 | die,dead,death
126 | differ,different,difference
127 | digest,digestion,digestive
128 | dimension,dimensional
129 | disagree,disagreement
130 | disappoint,disappointing,disappointment
131 | disaster,disastrous
132 | discourage,discouragement
133 | discuss,discussion
134 | dishonest,dishonesty
135 | dismiss,dismissal
136 | disobey,disobedient,disobedience
137 | dispose,disposal
138 | disrespect,disrespectful
139 | dissatisfy,dissatisfaction
140 | distant,distance
141 | distinct,distinction,distinctive
142 | distort,distortion
143 | distract,distracting,distraction
144 | disturb,disturbing,disturbance
145 | diverse,diversity
146 | divide,division
147 | domestic,domesticate
148 | dominant,dominate,dominance
149 | doubt,doubtful
150 | ease,easy
151 | edit,edition
152 | efficient,efficiency
153 | elect,election
154 | embarrass,embarrassment
155 | emerge,emergence
156 | emit,emission
157 | emphasis,emphatic,emphasise,emphasize
158 | employ,employment
159 | enclose,enclosure
160 | encourage,encouragement
161 | endure,endurance
162 | energy,energize,energetic
163 | enforce,enforcement
164 | engage,engagement
165 | enhance,enhancement
166 | enjoy,enjoyment
167 | enlarge,enlargement
168 | enormity,enormous
169 | enter,entrance
170 | entertain,entertainment
171 | entitle,entitlement
172 | envy,envious
173 | equal,equality
174 | equip,equipment
175 | evolve,evolution
176 | examine,examination
177 | excel,excellent,excellence
178 | excess,excessive
179 | excite,excitement
180 | execute,execution
181 | exhibit,exhibition
182 | exist,existence
183 | expand,expansion
184 | expanse,expansive
185 | expect,expectation
186 | expend,expenditure
187 | expense,expensive
188 | expire,expiry,expiration
189 | explain,explanation
190 | explode,explosion,explosive
191 | exploit,exploitation
192 | explore,exploration
193 | express,expression
194 | expel,expulsion
195 | extract,extraction
196 | fail,failure
197 | familiar,familiarise,familiarity,familiarize
198 | fear,fearful
199 | feasible,feasibility
200 | fiction,fictional
201 | finance,financial
202 | fly,flight
203 | forgive,forgiveness
204 | frequent,frequency
205 | fur,furry
206 | generous,generosity
207 | gift,give
208 | glass,glassy
209 | govern,government
210 | grand,grandeur
211 | grateful,gratitude
212 | guilt,guilty
213 | hard,hardship
214 | haste,hasty
215 | hierarchy,hierarchical
216 | high,height
217 | hinder,hindrance
218 | history,historical
219 | honest,honesty
220 | hope,hopeful
221 | hostile,hostility
222 | humid,humidity
223 | hunger,hungry
224 | hypothesis,hypothetical
225 | ice,icy
226 | identify,identity,identification
227 | ideology,ideological
228 | imagine,imagination
229 | impatient,impatience
230 | important,importance
231 | impress,impression
232 | imprison,imprisonment
233 | improbable,improbability
234 | improve,improvement
235 | impure,impurity
236 | incapable,incapability
237 | incident,incidence,incidental
238 | include,inclusion
239 | inconsistent,inconsistency
240 | independent,independence
241 | indifferent,indifference
242 | infeasible,infeasibility
243 | infect,infection
244 | infinite,infinity
245 | inform,information
246 | inhibit,inhibition
247 | injure,injury
248 | innocent,innocence
249 | insist,insistent,insistence
250 | inspect,inspection
251 | instant,instance
252 | institution,institutional
253 | instruct,instruction
254 | integral,integrate
255 | intelligent,intelligence
256 | intend,intention
257 | intense,intensity
258 | interrupt,interruption
259 | intervene,intervention
260 | introduce,introduction
261 | invade,invasion
262 | invent,invention
263 | invite,invitation
264 | involve,involvement
265 | liable,liability
266 | logic,logical
267 | loose,loosen
268 | lose,loss
269 | loyal,loyalty
270 | magic,magical
271 | maintain,maintenance
272 | manage,management
273 | manipulate,manipulative
274 | marry,marriage
275 | mass,massive
276 | maximal,maximum
277 | measure,measurement
278 | minimal,minimum
279 | mix,mixture
280 | modern,modernity
281 | modest,modesty
282 | month,monthly
283 | music,musical
284 | necessary,necessity,necessitate
285 | neglect,negligent,negligence
286 | nerve,nervous
287 | noble,nobility
288 | norm,normal,normality
289 | obey,obedient,obedience
290 | oblige,obligation,obligatory
291 | offend,offence,offense
292 | omit,omission
293 | option,optional
294 | package,packaging
295 | patient,patience
296 | patriot,patriotic,patriotism
297 | peace,peaceful
298 | peculiar,peculiarity
299 | perfect,perfection
300 | perform,performance
301 | permit,permission
302 | persist,persistent,persistence
303 | persuade,persuasion
304 | poem,poetic
305 | poor,poverty
306 | possess,possession,possessive
307 | possible,possibility
308 | post,postal
309 | practical,practicality
310 | practice,practise
311 | precise,precision
312 | prefer,preference
313 | prejudice,prejudicial
314 | prepare,preparation
315 | present,presence
316 | preserve,preservation,preservative
317 | presume,presumption
318 | presuppose,presupposition
319 | pretend,pretence
320 | prevalent,prevalence
321 | prevent,prevention
322 | probable,probability
323 | produce,production
324 | progress,progression
325 | prohibit,prohibition,prohibitory
326 | project,projection
327 | promote,promotion
328 | proof,prove
329 | propose,proposal
330 | protect,protection,protective
331 | publicise,publicize,publication
332 | punish,punishment
333 | pure,purity
334 | rare,rarity
335 | react,reaction
336 | reappear,reappearance
337 | reassure,reassurance
338 | rebel,rebellious
339 | receipt,receive
340 | recognise,recognize,recognition
341 | reconcile,reconciliation
342 | reconsider,reconsideration
343 | recruit,recruitment
344 | refer,referral
345 | refresh,refreshment
346 | refuse,refusal
347 | reinforce,reinforcement
348 | relax,relaxation
349 | relief,relieve
350 | reluctant,reluctance
351 | rely,reliance
352 | represent,representation
353 | reproduce,reproduction
354 | require,requirement
355 | reside,residence,residential
356 | resign,resignation
357 | resist,resistance
358 | resolve,resolution
359 | respect,respectful
360 | responsible,responsibility
361 | restrain,restraint
362 | restrict,restriction,restrictive
363 | reverse,reversal
364 | rigor,rigour,rigorous
365 | rival,rivalry
366 | rose,rosy
367 | satisfy,satisfaction
368 | secret,secrecy
369 | sector,sectoral
370 | sequence,sequential
371 | serve,service
372 | settle,settlement
373 | sex,sexual
374 | sign,signature
375 | sincere,sincerity
376 | solve,solution
377 | speak,speech
378 | sphere,spherical
379 | spite,spiteful
380 | spontaneity,spontaneous
381 | strong,strength
382 | stupid,stupidity
383 | substance,substantial
384 | succeed,success
385 | suggest,suggestion
386 | summer,summery
387 | superior,superiority
388 | suppose,supposition
389 | survive,survival
390 | suspend,suspension
391 | talent,talented
392 | tempt,temptation
393 | tense,tension
394 | thirst,thirsty
395 | threat,threaten
396 | transmit,transmission
397 | treat,treatment
398 | true,truth
399 | trivia,trivial
400 | unable,inability
401 | uncertain,uncertainty
402 | unimportant,unimportance
403 | unite,unity
404 | use,usage
405 | vary,variation
406 | virtue,virtuous
407 | warm,warmth
408 | waste,wastage
409 | week,weekly
410 | weigh,weight
411 | wide,width
412 | winter,wintery
413 | wood,wooden
414 | wool,wooly,woolen,woolly,woollen
415 | year,yearly
416 | young,youth
417 |
--------------------------------------------------------------------------------
/tests/de/test_questions_DE.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import holmes_extractor as holmes
3 | from holmes_extractor.topic_matching import TopicMatcher
4 |
# Shared Manager instance reused by every test in this module; each check
# clears registered documents before use. number_of_workers=1 presumably keeps
# the runs single-process/deterministic — TODO confirm against Manager docs.
manager = holmes.Manager(model='de_core_news_lg', number_of_workers=1)
6 |
class GermanInitialQuestionsTest(unittest.TestCase):
    """Topic-matching tests for German initial question words (was, wer, wen, wo, wann ...).

    All tests share the module-level ``manager``; ``_check_equals`` clears the
    registered documents first, so the tests stay independent even though the
    manager is reused. The expected scores are tied to the loaded spaCy model
    (note the ``skipIf`` guards on ``manager.nlp.meta['version']``).
    """

    def _check_equals(self, text_to_match, document_text, highest_score, answer_start, answer_end,
        word_embedding_match_threshold=0.42, initial_question_word_embedding_match_threshold=0.42,
        use_frequency_factor=True, *, alternative_highest_score=None):
        """Match *text_to_match* against *document_text*; verify score and answer span.

        Args:
            text_to_match: the question (or control statement) used as the query.
            document_text: the single document registered for this check.
            highest_score: expected integer part of the top topic match's score.
            answer_start: expected start offset of the first answer, or ``None``
                when no answer is expected.
            answer_end: expected end offset of the first answer (ignored when
                *answer_start* is ``None``).
            word_embedding_match_threshold: forwarded to the matcher.
            initial_question_word_embedding_match_threshold: forwarded to the matcher.
            use_frequency_factor: forwarded to the matcher.
            alternative_highest_score: keyword-only; a second acceptable score
                for cases where model versions produce different results.
        """
        manager.remove_all_documents()
        manager.parse_and_register_document(document_text)
        topic_matches = manager.topic_match_documents_against(text_to_match,
            word_embedding_match_threshold=
            word_embedding_match_threshold,
            initial_question_word_embedding_match_threshold=initial_question_word_embedding_match_threshold,
            initial_question_word_answer_score=40,
            relation_score=20,
            reverse_only_relation_score=15, single_word_score=10, single_word_any_tag_score=5,
            different_match_cutoff_score=10,
            relation_matching_frequency_threshold=0.0,
            embedding_matching_frequency_threshold=0.0,
            use_frequency_factor=use_frequency_factor)
        # Either one exact expected score, or one of two acceptable scores.
        if alternative_highest_score is None:
            self.assertEqual(int(topic_matches[0]['score']), highest_score)
        else:
            self.assertIn(int(topic_matches[0]['score']), (highest_score, alternative_highest_score))
        # answer_start=None means "no answer should have been extracted".
        if answer_start is not None:
            self.assertEqual(topic_matches[0]['answers'][0][0], answer_start)
            self.assertEqual(topic_matches[0]['answers'][0][1], answer_end)
        else:
            self.assertEqual(len(topic_matches[0]['answers']), 0)

    # --- Subword (German compound-noun) matching -------------------------------

    def test_basic_matching_with_subword(self):
        self._check_equals("Was betrachtet man?", 'Informationsbetrachtung', 45, 0, 11)

    def test_governed_interrogative_pronoun_with_subword(self):
        self._check_equals("Welche Information betrachtet man?", 'Informationsbetrachtung', 55, 0, 11)

    def test_governed_interrogative_pronoun_with_subword_control(self):
        self._check_equals("Die Information betrachtet man.", 'Informationsbetrachtung', 35, None, None)

    def test_governed_interrogative_pronoun_with_complex_subword(self):
        self._check_equals("Welche Information betrachtet man?",
            'Extraktionsinformationsbetrachtung', 55, 0, 22)

    def test_governed_interrogative_pronoun_with_complex_subword_control(self):
        self._check_equals("Die Information betrachtet man.",
            'Extraktionsinformationsbetrachtung', 35, None, None)

    def test_governed_interrogative_pronoun_with_subword_and_coreference(self):
        self._check_equals("Welchen Löwen betrachten wir.", 'Es gab einen Extraktionslöwen. Leute haben ihn betrachtet', 54, 13, 29)

    def test_governed_interrogative_pronoun_with_subword_and_coreference_control(self):
        self._check_equals("Den Löwen betrachten wir.", 'Es gab einen Extraktionslöwen. Leute haben ihn betrachtet', 34, None, None)

    def test_governed_interrogative_pronoun_with_subword_and_embedding_matching(self):
        self._check_equals("Welchen Hund betrachten wir?", 'Leute betrachteten die Informationskatze', 25, 23, 40)

    def test_governed_interrogative_pronoun_with_subword_and_embedding_matching_control(self):
        self._check_equals("Den Hund betrachten wir.", 'Leute betrachteten den Informationskatze', 15, None, None)

    # --- "was" as predicate: checks the full topic-match dictionary ------------

    def test_check_was_predicate_positive_case(self):
        manager.remove_all_documents()
        manager.parse_and_register_document("Das ist ein Haus.", 'q')
        topic_matches = manager.topic_match_documents_against("Was ist das?")
        self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'Das ist ein Haus.', 'text_to_match': 'Was ist das?', 'rank': '1', 'index_within_document': 1, 'subword_index': None, 'start_index': 1, 'end_index': 3, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 'sentences_character_end_index': 17, 'score': 620.0, 'word_infos': [[4, 7, 'relation', True, 'Matches SEIN directly.'], [12, 16, 'relation', False, 'Matches the question word WAS.']], 'answers': [[8, 16]]}])

    # --- Individual question words ---------------------------------------------

    def test_check_wer_positive_case(self):
        self._check_equals('Wer schaute in die Sonne?', 'Die Person schaute in die Sonne', 127, 0, 10)

    def test_check_wer_wrong_syntax(self):
        self._check_equals('Wer schaute in die Sonne?', 'Die Sonne schaute in den Mann', 19, None, None)

    def test_check_wer_wrong_noun(self):
        self._check_equals('Wer schaute in die Sonne?', 'Das Gebäude schaute in die Sonne', 70, None, None)

    @unittest.skipIf(manager.nlp.meta['version'] == '3.2.0', 'Version fluke')
    def test_check_wen_positive_case(self):
        self._check_equals('Wen sah das Gebäude?', 'Das Gebäude sah die Person', 54, 16, 26, alternative_highest_score=104)

    def test_check_wen_wrong_syntax(self):
        self._check_equals('Wen sah das Gebäude?', 'Das Gebäude sah das Gebäude', 34, None, None)

    def test_check_was_acc(self):
        self._check_equals('Was sah das Gebäude?', 'Das Gebäude sah das Gebäude', 104, 16, 27)

    def test_check_wem_positive_case(self):
        self._check_equals('wem gibst du es?', 'Ich gebe es der Person', 45, 12, 22)

    def test_check_wo_positive_case(self):
        self._check_equals('Wo wohnst du?', 'Ich wohne in einem Haus', 45, 10, 23)

    def test_check_wo_positive_case_definite_preposition(self):
        self._check_equals('Wo wohnst du?', 'Ich wohne im Haus', 45, 10, 17)

    def test_check_wo_wrong_case_definite_preposition(self):
        self._check_equals('Wo wohnst du?', 'Ich wohne ins Haus', 5, None, None)

    def test_check_wo_wrong_case(self):
        self._check_equals('Wo wohnst du?', 'Ich wohne in ein Haus', 5, None, None)

    def test_check_wohin_positive_case(self):
        self._check_equals('Wohin fährst du?', 'Ich fahre in ein Haus', 45, 10, 21)

    def test_check_wohin_positive_case_definite_preposition(self):
        self._check_equals('Wohin fährst du?', 'Ich fahre ins Haus', 45, 10, 18)

    def test_check_wohin_wrong_case_definite_preposition(self):
        self._check_equals('Wohin fährst du?', 'Ich fahre im Haus', 5, None, None)

    def test_check_womit_positive_case(self):
        self._check_equals('Womit fährst du?', 'Ich fahre mit meinem Auto', 45, 10, 25)

    def test_check_womit_other_preposition(self):
        self._check_equals('Womit fährst du?', 'Ich fahre ohne mein Auto', 5, None, None)

    @unittest.skipIf(manager.nlp.meta['version'] == '3.2.0', 'Version fluke')
    def test_check_wann_noun(self):
        self._check_equals('Wann fährst du?', 'Ich fahre nächste Woche', 45, 10, 23)

    def test_check_wann_preposition(self):
        self._check_equals('Wann fährst du?', 'Ich fahre in zwei Wochen', 45, 10, 24)

    def test_check_wann_wrong_preposition(self):
        self._check_equals('Wann fährst du?', 'Ich fahre wegen des Problems', 5, None, None)

    def test_check_wann_adverb(self):
        self._check_equals('Wann fährst du?', 'Ich fahre morgen', 45, 10, 16)

    def test_check_wann_verb_phrase(self):
        self._check_equals('Wann fährst du?', 'Ich fahre, wenn du mitkommst.', 45, 11, 28)

    def test_check_wie_preposition(self):
        self._check_equals('Wie fährst du?', 'Ich fahre mit dem Auto', 45, 10, 22)

    def test_check_wie_wrong_preposition(self):
        self._check_equals('Wie fährst du?', 'Ich fahre wegen des Problems', 5, None, None)

    def test_check_wie_adverb(self):
        self._check_equals('Wie fährst du?', 'Ich fahre langsam', 45, 10, 17)

    def test_check_wie_indem_phrase(self):
        self._check_equals('Wie fährst du?', 'Ich fahre, indem ich per Anhalter fahre', 45, 11, 39)

    def test_check_wie_other_phrase(self):
        self._check_equals('Wie fährst du?', 'Ich fahre, weil ich per Anhalter fahre', 5, None, None)

    def test_check_woher_preposition(self):
        self._check_equals('Woher denkst Du es?', 'Ich denke es wegen der Evidenz', 45, 13, 30)

    def test_check_woher_wrong_preposition(self):
        self._check_equals('Woher denkst Du es?', 'Ich denke es trotz der Evidenz', 5, None, None)

    def test_check_woher_weil(self):
        self._check_equals('Woher denkst Du es?', 'Ich denke es, weil es stimmt', 45, 14, 28)

    def test_check_woher_wrong_conjunction(self):
        self._check_equals('Woher denkst Du es?', 'Ich denke es, obwohl es nicht stimmt', 5, None, None)

    def test_check_warum_preposition(self):
        self._check_equals('Warum machst Du es?', 'Ich mache es wegen der Evidenz', 45, 13, 30)

    def test_check_warum_wrong_preposition(self):
        self._check_equals('Warum machst Du es?', 'Ich mache es trotz der Evidenz', 5, None, None)

    def test_check_warum_weil(self):
        self._check_equals('Warum machst Du es?', 'Ich mache es, weil es stimmt', 45, 14, 28)

    def test_check_warum_weil_sein(self):
        self._check_equals('Warum machst Du es?', 'Ich mache es, weil es gut ist', 45, 14, 29)

    def test_check_warum_damit(self):
        self._check_equals('Wieso machst Du es?', 'Ich mache es, damit Du kommst', 45, 14, 29)

    def test_check_warum_wrong_conjunction(self):
        self._check_equals('Woher machst Du es?', 'Ich mache es, obwohl es nicht stimmt', 5, None, None)

    # --- Question words in the document must not become answers ----------------

    def test_question_word_is_not_treated_as_answer(self):
        self._check_equals('Wohin geht der Mann?', 'Wohin geht der Mann', 73, None, None)

    def test_non_initial_question_word_is_not_treated_as_answer(self):
        self._check_equals('Wohin geht der Mann?', 'Und wohin geht der Mann', 73, None, None)
185 |
--------------------------------------------------------------------------------
/examples/example_chatbot_EN_insurance_ontology.owl:
--------------------------------------------------------------------------------
1 |
2 |
9 |
10 |
11 |
12 |
13 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
--------------------------------------------------------------------------------
/tests/en/test_ontology.owl:
--------------------------------------------------------------------------------
1 |
2 |
9 |
10 |
11 |
12 |
13 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
--------------------------------------------------------------------------------
/examples/example_chatbot_DE_insurance_ontology.owl:
--------------------------------------------------------------------------------
1 |
2 |
9 |
10 |
11 |
12 |
13 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
--------------------------------------------------------------------------------
/tests/de/test_doc_examples_DE.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import holmes_extractor as holmes
3 |
# Shared manager plus the two search phrases exercised by the tests below.
# ENTITYPER appears to be a Holmes entity placeholder (see the match
# explanation "Has an entity label matching ENTITYPER." in the expected output).
holmes_manager = holmes.Manager(model="de_core_news_lg", number_of_workers=1)
holmes_manager.register_search_phrase("Ein großer Hund jagt eine Katze")
holmes_manager.register_search_phrase("Ein ENTITYPER geht in die Stadt")
7 |
8 |
# NOTE(review): the class name looks copy-pasted from the English test module —
# these are the *German* documentation examples; consider renaming to
# GermanDocumentationExamplesTest in a dedicated change (the name is part of
# test discovery/reporting, so it is left untouched here).
class EnglishDocumentationExamplesTest(unittest.TestCase):
    """German documentation examples: phrasings that should (not) match the
    registered search phrases, plus one full expected-match structure."""

    # Each phrasing below should yield exactly one match for
    # "Ein großer Hund jagt eine Katze".
    positive_examples = (
        "Der große Hund hat die Katze ständig gejagt",
        "Der große Hund, der müde war, jagte die Katze",
        "Die Katze wurde vom großen Hund gejagt",
        "Die Katze wurde immer wieder durch den großen Hund gejagt",
        "Der große Hund wollte die Katze jagen",
        "Der große Hund entschied sich, die Katze zu jagen",
        "Die Katze, die der große Hund gejagt hatte, hatte Angst",
        "Dass der große Hund die Katze jagte, war ein Problem",
        "Es gab einen großen Hund, der eine Katze jagte",
        "Die Katzenjagd durch den großen Hund",
        "Es gab einmal einen großen Hund, und er jagte eine Katze",
        "Es gab einen großen Hund. Er hieß Fido. Er jagte meine Katze",
        "Es erschien ein Hund. Er jagte eine Katze. Er war sehr groß.",
        "Die Katze schlich sich in unser Wohnzimmer zurück, weil ein großer Hund sie draußen gejagt hatte",
        "Unser großer Hund war aufgeregt, weil er eine Katze gejagt hatte",
    )

    def test_positive_examples(self):
        """Every positive example produces exactly one match."""
        for positive_example in self.positive_examples:
            with self.subTest():
                assert len(holmes_manager.match(document_text=positive_example)) == 1

    # Each phrasing below must NOT match any registered search phrase.
    negative_examples = (
        "Der Hund jagte eine große Katze",
        "Die Katze jagte den großen Hund",
        "Der große Hund und die Katze jagten",
        "Der große Hund jagte eine Maus aber die Katze war müde",
        "Der große Hund wurde ständig von der Katze gejagt",
        "Der große Hund entschloss sich, von der Katze gejagt zu werden",
        "Die Hundejagd durch den große Katze",
    )

    def test_negative_examples(self):
        """Every negative example produces zero matches."""
        for negative_example in self.negative_examples:
            with self.subTest():
                assert len(holmes_manager.match(document_text=negative_example)) == 0

    def test_complex_example(self):
        """Coreference example: two persons are matched against the
        "Ein ENTITYPER geht in die Stadt" phrase, yielding one match per
        person; the full match dictionaries are asserted verbatim."""
        matches = holmes_manager.match(
            document_text="Letzte Woche sah ich Richard Hudson und Max Mustermann. Sie wollten nicht mehr in die Stadt gehen."
        )
        self.assertEqual(
            matches,
            [
                {
                    "search_phrase_label": "Ein ENTITYPER geht in die Stadt",
                    "search_phrase_text": "Ein ENTITYPER geht in die Stadt",
                    "document": "",
                    "index_within_document": 17,
                    "sentences_within_document": "Letzte Woche sah ich Richard Hudson und Max Mustermann. Sie wollten nicht mehr in die Stadt gehen.",
                    "negated": True,
                    "uncertain": True,
                    "involves_coreference": True,
                    "overall_similarity_measure": 1.0,
                    "word_matches": [
                        {
                            "search_phrase_token_index": 1,
                            "search_phrase_word": "ENTITYPER",
                            "document_token_index": 5,
                            "first_document_token_index": 4,
                            "last_document_token_index": 5,
                            "structurally_matched_document_token_index": 10,
                            "document_subword_index": None,
                            "document_subword_containing_token_index": None,
                            "document_word": "richard hudson",
                            "document_phrase": "Richard Hudson",
                            "match_type": "entity",
                            "negated": False,
                            "uncertain": True,
                            "similarity_measure": 1.0,
                            "involves_coreference": True,
                            "extracted_word": "richard hudson",
                            "depth": 0,
                            "explanation": "Has an entity label matching ENTITYPER.",
                        },
                        {
                            "search_phrase_token_index": 2,
                            "search_phrase_word": "gehen",
                            "document_token_index": 17,
                            "first_document_token_index": 17,
                            "last_document_token_index": 17,
                            "structurally_matched_document_token_index": 17,
                            "document_subword_index": None,
                            "document_subword_containing_token_index": None,
                            "document_word": "gehen",
                            "document_phrase": "gehen",
                            "match_type": "direct",
                            "negated": True,
                            "uncertain": False,
                            "similarity_measure": 1.0,
                            "involves_coreference": False,
                            "extracted_word": "gehen",
                            "depth": 0,
                            "explanation": "Matches GEHEN directly.",
                        },
                        {
                            "search_phrase_token_index": 3,
                            "search_phrase_word": "in",
                            "document_token_index": 14,
                            "first_document_token_index": 14,
                            "last_document_token_index": 14,
                            "structurally_matched_document_token_index": 14,
                            "document_subword_index": None,
                            "document_subword_containing_token_index": None,
                            "document_word": "in",
                            "document_phrase": "in",
                            "match_type": "direct",
                            "negated": True,
                            "uncertain": True,
                            "similarity_measure": 1.0,
                            "involves_coreference": False,
                            "extracted_word": "in",
                            "depth": 0,
                            "explanation": "Matches IN directly.",
                        },
                        {
                            "search_phrase_token_index": 5,
                            "search_phrase_word": "stadt",
                            "document_token_index": 16,
                            "first_document_token_index": 16,
                            "last_document_token_index": 16,
                            "structurally_matched_document_token_index": 16,
                            "document_subword_index": None,
                            "document_subword_containing_token_index": None,
                            "document_word": "stadt",
                            "document_phrase": "die Stadt",
                            "match_type": "direct",
                            "negated": True,
                            "uncertain": False,
                            "similarity_measure": 1.0,
                            "involves_coreference": False,
                            "extracted_word": "stadt",
                            "depth": 0,
                            "explanation": "Matches STADT directly.",
                        },
                    ],
                },
                {
                    "search_phrase_label": "Ein ENTITYPER geht in die Stadt",
                    "search_phrase_text": "Ein ENTITYPER geht in die Stadt",
                    "document": "",
                    "index_within_document": 17,
                    "sentences_within_document": "Letzte Woche sah ich Richard Hudson und Max Mustermann. Sie wollten nicht mehr in die Stadt gehen.",
                    "negated": True,
                    "uncertain": True,
                    "involves_coreference": True,
                    "overall_similarity_measure": 1.0,
                    "word_matches": [
                        {
                            "search_phrase_token_index": 1,
                            "search_phrase_word": "ENTITYPER",
                            "document_token_index": 8,
                            "first_document_token_index": 7,
                            "last_document_token_index": 8,
                            "structurally_matched_document_token_index": 10,
                            "document_subword_index": None,
                            "document_subword_containing_token_index": None,
                            "document_word": "max mustermann",
                            "document_phrase": "Max Mustermann",
                            "match_type": "entity",
                            "negated": False,
                            "uncertain": True,
                            "similarity_measure": 1.0,
                            "involves_coreference": True,
                            "extracted_word": "max mustermann",
                            "depth": 0,
                            "explanation": "Has an entity label matching ENTITYPER.",
                        },
                        {
                            "search_phrase_token_index": 2,
                            "search_phrase_word": "gehen",
                            "document_token_index": 17,
                            "first_document_token_index": 17,
                            "last_document_token_index": 17,
                            "structurally_matched_document_token_index": 17,
                            "document_subword_index": None,
                            "document_subword_containing_token_index": None,
                            "document_word": "gehen",
                            "document_phrase": "gehen",
                            "match_type": "direct",
                            "negated": True,
                            "uncertain": False,
                            "similarity_measure": 1.0,
                            "involves_coreference": False,
                            "extracted_word": "gehen",
                            "depth": 0,
                            "explanation": "Matches GEHEN directly.",
                        },
                        {
                            "search_phrase_token_index": 3,
                            "search_phrase_word": "in",
                            "document_token_index": 14,
                            "first_document_token_index": 14,
                            "last_document_token_index": 14,
                            "structurally_matched_document_token_index": 14,
                            "document_subword_index": None,
                            "document_subword_containing_token_index": None,
                            "document_word": "in",
                            "document_phrase": "in",
                            "match_type": "direct",
                            "negated": True,
                            "uncertain": True,
                            "similarity_measure": 1.0,
                            "involves_coreference": False,
                            "extracted_word": "in",
                            "depth": 0,
                            "explanation": "Matches IN directly.",
                        },
                        {
                            "search_phrase_token_index": 5,
                            "search_phrase_word": "stadt",
                            "document_token_index": 16,
                            "first_document_token_index": 16,
                            "last_document_token_index": 16,
                            "structurally_matched_document_token_index": 16,
                            "document_subword_index": None,
                            "document_subword_containing_token_index": None,
                            "document_word": "stadt",
                            "document_phrase": "die Stadt",
                            "match_type": "direct",
                            "negated": True,
                            "uncertain": False,
                            "similarity_measure": 1.0,
                            "involves_coreference": False,
                            "extracted_word": "stadt",
                            "depth": 0,
                            "explanation": "Matches STADT directly.",
                        },
                    ],
                },
            ],
        )
--------------------------------------------------------------------------------
/tests/en/test_doc_examples_EN.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import holmes_extractor as holmes
3 |
# Shared manager plus the three search phrases exercised by the tests below.
# ENTITYPERSON appears to be a Holmes entity placeholder (see the match
# explanation "Has an entity label matching ENTITYPERSON." in the expected output).
holmes_manager = holmes.Manager(model="en_core_web_lg", number_of_workers=1)
holmes_manager.register_search_phrase("A big dog chases a cat")
holmes_manager.register_search_phrase("An ENTITYPERSON goes into town")
holmes_manager.register_search_phrase("A company gives permission to publish something")
8 |
9 |
10 | class EnglishDocumentationExamplesTest(unittest.TestCase):
11 |
12 | positive_examples = (
13 | "A big dog chased a cat",
14 | "The big dog would not stop chasing the cat",
15 | "The big dog who was tired chased the cat",
16 | "The cat was chased by the big dog",
17 | "The cat always used to be chased by the big dog",
18 | "The big dog was going to chase the cat",
19 | "The big dog decided to chase the cat",
20 | "The cat was afraid of being chased by the big dog",
21 | "I saw a cat-chasing big dog",
22 | "The cat the big dog chased was scared",
23 | "The big dog chasing the cat was a problem",
24 | "There was a big dog that was chasing a cat",
25 | "The cat chase by the big dog",
26 | "There was a big dog and it was chasing a cat.",
27 | "I saw a big dog. My cat was afraid of being chased by the dog.",
28 | "There was a big dog. His name was Fido. He was chasing my cat.",
29 | "A dog appeared. It was chasing a cat. It was very big.",
30 | "The cat sneaked back into our lounge because a big dog had been chasing her.",
31 | "Our big dog was excited because he had been chasing a cat.",
32 | )
33 |
34 | def test_positive_examples(self):
35 | for positive_example in self.positive_examples:
36 | with self.subTest():
37 | assert len(holmes_manager.match(document_text=positive_example)) == 1
38 |
39 | negative_examples = (
40 | "The dog chased a big cat",
41 | "The big dog and the cat chased about",
42 | "The big dog chased a mouse but the cat was tired",
43 | "The big dog always used to be chased by the cat",
44 | "The big dog the cat chased was scared",
45 | "Our big dog was upset because he had been chased by a cat.",
46 | "The dog chase of the big cat",
47 | )
48 |
49 | def test_negative_examples(self):
50 | for negative_example in self.negative_examples:
51 | with self.subTest():
52 | assert len(holmes_manager.match(document_text=negative_example)) == 0
53 |
54 | def test_complex_example(self):
55 | matches = holmes_manager.match(
56 | document_text="I met Richard Hudson and John Doe last week. They didn't want to go into town."
57 | )
58 | self.assertEqual(
59 | matches,
60 | [
61 | {
62 | "search_phrase_label": "An ENTITYPERSON goes into town",
63 | "search_phrase_text": "An ENTITYPERSON goes into town",
64 | "document": "",
65 | "index_within_document": 15,
66 | "sentences_within_document": "I met Richard Hudson and John Doe last week. They didn't want to go into town.",
67 | "negated": True,
68 | "uncertain": True,
69 | "involves_coreference": True,
70 | "overall_similarity_measure": 1.0,
71 | "word_matches": [
72 | {
73 | "search_phrase_token_index": 1,
74 | "search_phrase_word": "ENTITYPERSON",
75 | "document_token_index": 3,
76 | "first_document_token_index": 2,
77 | "last_document_token_index": 3,
78 | "structurally_matched_document_token_index": 10,
79 | "document_subword_index": None,
80 | "document_subword_containing_token_index": None,
81 | "document_word": "richard hudson",
82 | "document_phrase": "Richard Hudson",
83 | "match_type": "entity",
84 | "negated": False,
85 | "uncertain": True,
86 | "similarity_measure": 1.0,
87 | "involves_coreference": True,
88 | "extracted_word": "richard hudson",
89 | "depth": 0,
90 | "explanation": "Has an entity label matching ENTITYPERSON.",
91 | },
92 | {
93 | "search_phrase_token_index": 2,
94 | "search_phrase_word": "go",
95 | "document_token_index": 15,
96 | "first_document_token_index": 15,
97 | "last_document_token_index": 15,
98 | "structurally_matched_document_token_index": 15,
99 | "document_subword_index": None,
100 | "document_subword_containing_token_index": None,
101 | "document_word": "go",
102 | "document_phrase": "go",
103 | "match_type": "direct",
104 | "negated": True,
105 | "uncertain": False,
106 | "similarity_measure": 1.0,
107 | "involves_coreference": False,
108 | "extracted_word": "go",
109 | "depth": 0,
110 | "explanation": "Matches GO directly.",
111 | },
112 | {
113 | "search_phrase_token_index": 3,
114 | "search_phrase_word": "into",
115 | "document_token_index": 16,
116 | "first_document_token_index": 16,
117 | "last_document_token_index": 16,
118 | "structurally_matched_document_token_index": 16,
119 | "document_subword_index": None,
120 | "document_subword_containing_token_index": None,
121 | "document_word": "into",
122 | "document_phrase": "into",
123 | "match_type": "direct",
124 | "negated": True,
125 | "uncertain": False,
126 | "similarity_measure": 1.0,
127 | "involves_coreference": False,
128 | "extracted_word": "into",
129 | "depth": 0,
130 | "explanation": "Matches INTO directly.",
131 | },
132 | {
133 | "search_phrase_token_index": 4,
134 | "search_phrase_word": "town",
135 | "document_token_index": 17,
136 | "first_document_token_index": 17,
137 | "last_document_token_index": 17,
138 | "structurally_matched_document_token_index": 17,
139 | "document_subword_index": None,
140 | "document_subword_containing_token_index": None,
141 | "document_word": "town",
142 | "document_phrase": "town",
143 | "match_type": "direct",
144 | "negated": True,
145 | "uncertain": False,
146 | "similarity_measure": 1.0,
147 | "involves_coreference": False,
148 | "extracted_word": "town",
149 | "depth": 0,
150 | "explanation": "Matches TOWN directly.",
151 | },
152 | ],
153 | },
154 | {
155 | "search_phrase_label": "An ENTITYPERSON goes into town",
156 | "search_phrase_text": "An ENTITYPERSON goes into town",
157 | "document": "",
158 | "index_within_document": 15,
159 | "sentences_within_document": "I met Richard Hudson and John Doe last week. They didn't want to go into town.",
160 | "negated": True,
161 | "uncertain": True,
162 | "involves_coreference": True,
163 | "overall_similarity_measure": 1.0,
164 | "word_matches": [
165 | {
166 | "search_phrase_token_index": 1,
167 | "search_phrase_word": "ENTITYPERSON",
168 | "document_token_index": 6,
169 | "first_document_token_index": 5,
170 | "last_document_token_index": 6,
171 | "structurally_matched_document_token_index": 10,
172 | "document_subword_index": None,
173 | "document_subword_containing_token_index": None,
174 | "document_word": "john doe",
175 | "document_phrase": "John Doe",
176 | "match_type": "entity",
177 | "negated": False,
178 | "uncertain": True,
179 | "similarity_measure": 1.0,
180 | "involves_coreference": True,
181 | "extracted_word": "john doe",
182 | "depth": 0,
183 | "explanation": "Has an entity label matching ENTITYPERSON.",
184 | },
185 | {
186 | "search_phrase_token_index": 2,
187 | "search_phrase_word": "go",
188 | "document_token_index": 15,
189 | "first_document_token_index": 15,
190 | "last_document_token_index": 15,
191 | "structurally_matched_document_token_index": 15,
192 | "document_subword_index": None,
193 | "document_subword_containing_token_index": None,
194 | "document_word": "go",
195 | "document_phrase": "go",
196 | "match_type": "direct",
197 | "negated": True,
198 | "uncertain": False,
199 | "similarity_measure": 1.0,
200 | "involves_coreference": False,
201 | "extracted_word": "go",
202 | "depth": 0,
203 | "explanation": "Matches GO directly.",
204 | },
205 | {
206 | "search_phrase_token_index": 3,
207 | "search_phrase_word": "into",
208 | "document_token_index": 16,
209 | "first_document_token_index": 16,
210 | "last_document_token_index": 16,
211 | "structurally_matched_document_token_index": 16,
212 | "document_subword_index": None,
213 | "document_subword_containing_token_index": None,
214 | "document_word": "into",
215 | "document_phrase": "into",
216 | "match_type": "direct",
217 | "negated": True,
218 | "uncertain": False,
219 | "similarity_measure": 1.0,
220 | "involves_coreference": False,
221 | "extracted_word": "into",
222 | "depth": 0,
223 | "explanation": "Matches INTO directly.",
224 | },
225 | {
226 | "search_phrase_token_index": 4,
227 | "search_phrase_word": "town",
228 | "document_token_index": 17,
229 | "first_document_token_index": 17,
230 | "last_document_token_index": 17,
231 | "structurally_matched_document_token_index": 17,
232 | "document_subword_index": None,
233 | "document_subword_containing_token_index": None,
234 | "document_word": "town",
235 | "document_phrase": "town",
236 | "match_type": "direct",
237 | "negated": True,
238 | "uncertain": False,
239 | "similarity_measure": 1.0,
240 | "involves_coreference": False,
241 | "extracted_word": "town",
242 | "depth": 0,
243 | "explanation": "Matches TOWN directly.",
244 | },
245 | ],
246 | },
247 | ],
248 | )
249 |
250 | def test_extracted_word_example(self):
251 | matches = holmes_manager.match(
252 | document_text="We discussed AstraZeneca. The company had given us permission to publish this library under the MIT license."
253 | )
254 | self.assertEqual(len(matches), 1)
255 | self.assertEqual(matches[0]["word_matches"][0]["extracted_word"], "astrazeneca")
256 |
--------------------------------------------------------------------------------