├── holmes_extractor ├── lang │ ├── __init__.py │ ├── de │ │ ├── __init__.py │ │ └── data │ │ │ ├── __init__.py │ │ │ └── derivation.csv │ └── en │ │ ├── __init__.py │ │ └── data │ │ ├── __init__.py │ │ └── derivation.csv ├── word_matching │ ├── __init__,py │ ├── entity.py │ ├── embedding.py │ ├── direct.py │ ├── question.py │ ├── entity_embedding.py │ ├── general.py │ ├── derivation.py │ └── ontology.py ├── about.py ├── config.cfg ├── __init__.py └── errors.py ├── MANIFEST.in ├── docs ├── holmes_thumbnail.png └── ontology_example.png ├── pyproject.toml ├── LICENSE ├── examples ├── example_chatbot_DE_insurance.py ├── example_chatbot_EN_insurance.py ├── example_search_DE_law.py ├── example_search_DE_literature.py ├── example_supervised_topic_model_EN.py ├── example_search_EN_literature.py ├── example_chatbot_EN_insurance_ontology.owl └── example_chatbot_DE_insurance_ontology.owl ├── tests ├── common │ ├── test_ontology2.owl │ ├── test_cpu_gpu.py │ ├── test_serialization.py │ └── test_manager.py ├── de │ ├── test_ontology.owl │ ├── test_questions_DE.py │ └── test_doc_examples_DE.py └── en │ ├── test_ontology.owl │ └── test_doc_examples_EN.py ├── setup.cfg ├── .gitignore ├── .github └── workflows │ └── test-holmes.yml └── SHORTREADME.md /holmes_extractor/lang/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /holmes_extractor/lang/de/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /holmes_extractor/lang/en/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /holmes_extractor/lang/de/data/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /holmes_extractor/lang/en/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /holmes_extractor/word_matching/__init__,py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /holmes_extractor/about.py: -------------------------------------------------------------------------------- 1 | __version__ = "4.0.0" -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include SHORTREADME.md 2 | global-include *.cfg 3 | global-include *.csv 4 | global-include LICENSE 5 | -------------------------------------------------------------------------------- /docs/holmes_thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msg-systems/holmes-extractor/HEAD/docs/holmes_thumbnail.png -------------------------------------------------------------------------------- /docs/ontology_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msg-systems/holmes-extractor/HEAD/docs/ontology_example.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | -------------------------------------------------------------------------------- /holmes_extractor/config.cfg: -------------------------------------------------------------------------------- 1 | [vector_nlps] 2 
| # Names of models for which a second model is used as a source of vocabularies and vectors. 3 | en_core_web_trf = en_core_web_lg 4 | -------------------------------------------------------------------------------- /holmes_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from .about import __version__ 2 | from .manager import Manager 3 | from .ontology import Ontology 4 | import os 5 | os.environ["TOKENIZERS_PARALLELISM"] = "True" 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019-2021 msg systems ag, 2022 ExplosionAI GmbH, AstraZeneca 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /examples/example_chatbot_DE_insurance.py: -------------------------------------------------------------------------------- 1 | import os 2 | import holmes_extractor as holmes 3 | 4 | if __name__ in ('__main__', 'example_chatbot_DE_insurance'): 5 | script_directory = os.path.dirname(os.path.realpath(__file__)) 6 | ontology = holmes.Ontology(os.sep.join(( 7 | script_directory, 'example_chatbot_DE_insurance_ontology.owl'))) 8 | holmes_manager = holmes.Manager(model='de_core_news_lg', ontology=ontology, number_of_workers=2) 9 | holmes_manager.register_search_phrase('Jemand benötigt eine Versicherung') 10 | holmes_manager.register_search_phrase('Ein ENTITYPER schließt eine Versicherung ab') 11 | holmes_manager.register_search_phrase('ENTITYPER benötigt eine Versicherung') 12 | holmes_manager.register_search_phrase('Eine Versicherung für einen Zeitraum') 13 | holmes_manager.register_search_phrase('Eine Versicherung fängt an') 14 | holmes_manager.register_search_phrase('Jemand zahlt voraus') 15 | 16 | holmes_manager.start_chatbot_mode_console() 17 | # e.g. 
'Richard Hudson und Max Mustermann brauchen eine Krankenversicherung für die nächsten fünf Jahre' 18 | -------------------------------------------------------------------------------- /examples/example_chatbot_EN_insurance.py: -------------------------------------------------------------------------------- 1 | import os 2 | import holmes_extractor as holmes 3 | 4 | if __name__ in ('__main__', 'example_chatbot_EN_insurance'): 5 | script_directory = os.path.dirname(os.path.realpath(__file__)) 6 | ontology = holmes.Ontology(os.sep.join(( 7 | script_directory, 'example_chatbot_EN_insurance_ontology.owl'))) 8 | holmes_manager = holmes.Manager( 9 | model='en_core_web_lg', ontology=ontology, number_of_workers=2) 10 | holmes_manager.register_search_phrase('Somebody requires insurance') 11 | holmes_manager.register_search_phrase('An ENTITYPERSON takes out insurance') 12 | holmes_manager.register_search_phrase('A company buys payment insurance') 13 | holmes_manager.register_search_phrase('An ENTITYPERSON needs insurance') 14 | holmes_manager.register_search_phrase('Insurance for a period') 15 | holmes_manager.register_search_phrase('An insurance begins') 16 | holmes_manager.register_search_phrase('Somebody prepays') 17 | holmes_manager.register_search_phrase('Somebody makes an insurance payment') 18 | 19 | holmes_manager.start_chatbot_mode_console() 20 | # e.g. 
'Richard Hudson and John Doe require health insurance for the next five years' 21 | -------------------------------------------------------------------------------- /tests/common/test_ontology2.owl: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /examples/example_search_DE_law.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | 3 | # You will need to install bs4 (python -m pip install bs4) 4 | from bs4 import BeautifulSoup 5 | import holmes_extractor as holmes 6 | 7 | def download_and_register(url, label): 8 | print('Downloading', label) 9 | # Download the content 10 | page = urllib.request.urlopen(url) 11 | # Extract the raw text from the HTML document 12 | soup = BeautifulSoup(page, 'html.parser') 13 | # Register the document with Holmes 14 | print('Parsing and registering', label) 15 | holmes_manager.parse_and_register_document(soup.get_text(), label) 16 | 17 | if __name__ in ('__main__', 'example_search_DE_law'): 18 | # Start the Holmes Manager with the German model 19 | holmes_manager = holmes.Manager(model='de_core_news_lg', number_of_workers=2) 20 | download_and_register('https://www.gesetze-im-internet.de/vvg_2008/BJNR263110007.html', 'VVG_2008') 21 | # This may take several minutes 22 | download_and_register('https://www.gesetze-im-internet.de/vag_2016/BJNR043410015.html', 'VAG') 23 | holmes_manager.start_topic_matching_search_mode_console(initial_question_word_embedding_match_threshold=0.7) 24 | 25 | # Example queries: 26 | # 27 | # Der Versicherer darf den Vertrag fristlos kündigen, wenn der Versicherungsnehmer beim Abschluss des Vertrags die vorvertragliche Anzeigepflicht verletzt hat. 28 | # Der Versicherer darf Leistungen verweigern. 29 | # Der Versicherer darf die Prämie anpassen. 
30 | # Eine Richtlinie einer ENTITYORG 31 | -------------------------------------------------------------------------------- /holmes_extractor/errors.py: -------------------------------------------------------------------------------- 1 | class HolmesError(Exception): 2 | def __init__(self, text): 3 | self.text = text 4 | 5 | def __str__(self): 6 | return self.text 7 | 8 | 9 | class SearchPhraseContainsNegationError(HolmesError): 10 | pass 11 | 12 | 13 | class SearchPhraseContainsConjunctionError(HolmesError): 14 | pass 15 | 16 | 17 | class SearchPhraseContainsCoreferringPronounError(HolmesError): 18 | pass 19 | 20 | 21 | class SearchPhraseWithoutMatchableWordsError(HolmesError): 22 | pass 23 | 24 | 25 | class SearchPhraseContainsMultipleClausesError(HolmesError): 26 | pass 27 | 28 | 29 | class DuplicateDocumentError(HolmesError): 30 | pass 31 | 32 | 33 | class NoSearchPhraseError(HolmesError): 34 | pass 35 | 36 | 37 | class NoDocumentError(HolmesError): 38 | pass 39 | 40 | 41 | class WrongModelDeserializationError(HolmesError): 42 | pass 43 | 44 | 45 | class WrongVersionDeserializationError(HolmesError): 46 | pass 47 | 48 | 49 | class DocumentTooBigError(HolmesError): 50 | pass 51 | 52 | 53 | class FewerThanTwoClassificationsError(HolmesError): 54 | pass 55 | 56 | 57 | class NoPhraseletsAfterFilteringError(HolmesError): 58 | pass 59 | 60 | 61 | class EmbeddingThresholdLessThanRelationThresholdError(HolmesError): 62 | pass 63 | 64 | 65 | class IncompatibleAnalyzeDerivationalMorphologyDeserializationError(HolmesError): 66 | pass 67 | 68 | 69 | class MultiprocessingParsingNotSupportedError(HolmesError): 70 | pass 71 | 72 | 73 | class OntologyObjectSharedBetweenManagersError(HolmesError): 74 | pass 75 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = holmes-extractor 3 | version = 4.0.0 4 | description = Information 
extraction from English and German texts based on predicate logic 5 | long_description = file: SHORTREADME.md 6 | long_description_content_type = text/markdown 7 | url = https://github.com/explosion/holmes-extractor 8 | author = Richard Paul Hudson, Explosion AI 9 | author_email = richard@explosion.ai 10 | license = MIT 11 | keywords= nlp, information-extraction, spacy, spacy-extension, python, machine-learning, ontology, semantics 12 | classifiers = 13 | Development Status :: 5 - Production/Stable 14 | Intended Audience :: Developers 15 | Intended Audience :: Financial and Insurance Industry 16 | Intended Audience :: Healthcare Industry 17 | Intended Audience :: Information Technology 18 | Intended Audience :: Legal Industry 19 | Intended Audience :: Other Audience 20 | Intended Audience :: Education 21 | Intended Audience :: Science/Research 22 | License :: OSI Approved :: MIT License 23 | Natural Language :: English 24 | Natural Language :: German 25 | Programming Language :: Python :: 3 26 | Programming Language :: Python :: 3.6 27 | Programming Language :: Python :: 3.7 28 | Programming Language :: Python :: 3.8 29 | Programming Language :: Python :: 3.9 30 | Programming Language :: Python :: 3.10 31 | Topic :: Scientific/Engineering :: Artificial Intelligence 32 | Topic :: Scientific/Engineering :: Information Analysis 33 | Topic :: Text Processing :: Linguistic 34 | 35 | [options] 36 | include_package_data = True 37 | python_requires = >=3.6,<3.11 38 | install_requires = 39 | spacy>=3.1.0,<3.4.0 40 | coreferee>=1.2.0 41 | rdflib 42 | [options.package_data] 43 | * = *.cfg, *.csv 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | 
dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Visual Studio Code 132 | .vscode 133 | -------------------------------------------------------------------------------- /.github/workflows/test-holmes.yml: -------------------------------------------------------------------------------- 1 | name: Holmes Matrix Test 2 | on: 3 | workflow_dispatch: 4 | push: 5 | 6 | jobs: 7 | test-holmes: 8 | strategy: 9 | matrix: 10 | os: [macos-latest, windows-latest, ubuntu-latest] 11 | python_version: ['3.6', '3.7', '3.8', '3.9', '3.10'] 12 | spacy_version: ['3.3.0'] 13 | click_version: ['8.0.1'] 14 | include: 15 | - os: 'ubuntu-latest' 16 | python_version: '3.9' 17 | spacy_version: '3.2.4' 18 | click_version: '8.0.1' 19 | - os: 'ubuntu-latest' 20 | python_version: '3.9' 21 | spacy_version: '3.1.6' 22 | click_version: '7.1.2' 23 | runs-on: ${{ matrix.os }} 24 | steps: 25 | 26 | - name: Increase swap file size on Windows 27 | if: ${{ matrix.os == 'windows-latest' }} 28 | uses: al-cheb/configure-pagefile-action@v1.2 29 | with: 30 | minimum-size: 64GB 31 | maximum-size: 64GB 32 | 33 | - name: Checkout repository code 34 | uses: actions/checkout@v3 35 | with: 36 | ref: master 37 | 38 | - name: Initialize Python 39 | uses: actions/setup-python@v3 40 | with: 41 | python-version: ${{ matrix.python_version }} 42 | 43 | - name: Install dependencies 44 | run: | 45 | python -m pip install --upgrade pip setuptools wheel 46 | pip install spacy==${{ 
matrix.spacy_version }} pytest spacy-lookups-data 47 | 48 | # see https://github.com/explosion/spaCy/issues/10564 49 | pip uninstall click -y 50 | pip install "click==${{ matrix.click_version }}" 51 | 52 | - name: Install spaCy models 53 | run: | 54 | python -m spacy download en_core_web_sm 55 | python -m spacy download en_core_web_lg 56 | python -m spacy download en_core_web_trf 57 | python -m spacy download de_core_news_lg 58 | python -m spacy download pl_core_news_md 59 | 60 | - name: Install Coreferee 61 | run: | 62 | pip install coreferee 63 | python -m coreferee install en 64 | python -m coreferee install de 65 | python -m coreferee install pl 66 | 67 | - name: Install Holmes 68 | run: | 69 | cd ${{ github.workspace }} 70 | pip install . 71 | 72 | - name: Test Holmes 73 | run: | 74 | python -m pytest tests/de/test_doc_examples_DE.py 75 | python -m pytest tests/de/test_phraselet_production_DE.py 76 | python -m pytest tests/de/test_questions_DE.py 77 | python -m pytest tests/de/test_semantics_DE.py 78 | python -m pytest tests/de/test_structural_matching_DE.py 79 | python -m pytest tests/de/test_structural_matching_with_coreference_DE.py 80 | python -m pytest tests/de/test_supervised_topic_classification_DE.py 81 | python -m pytest tests/de/test_topic_matching_DE.py 82 | python -m pytest tests/en/test_doc_examples_EN.py 83 | python -m pytest tests/en/test_phraselet_production_EN.py 84 | python -m pytest tests/en/test_questions_EN.py 85 | python -m pytest tests/en/test_semantics_EN.py 86 | python -m pytest tests/en/test_structural_matching_EN.py 87 | python -m pytest tests/en/test_structural_matching_with_coreference_EN.py 88 | python -m pytest tests/en/test_supervised_topic_classification_EN.py 89 | python -m pytest tests/en/test_topic_matching_EN.py 90 | python -m pytest tests/common/test_manager.py 91 | python -m pytest tests/common/test_cpu_gpu.py 92 | python -m pytest tests/common/test_errors.py 93 | python -m pytest tests/common/test_ontology.py 94 | python 
-m pytest tests/common/test_serialization.py 95 | python -m pytest tests/common/test_word_level_matching.py 96 | python -m pytest tests/common/test_multithreading.py -------------------------------------------------------------------------------- /tests/de/test_ontology.owl: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | 11 | 12 | 13 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /holmes_extractor/word_matching/entity.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, List 2 | from spacy.tokens import Token, Doc 3 | from .general import WordMatch, WordMatchingStrategy 4 | from ..parsing import MultiwordSpan, CorpusWordPosition, SearchPhrase 5 | 6 | 7 | class EntityWordMatchingStrategy(WordMatchingStrategy): 8 | 9 | WORD_MATCH_TYPE_LABEL = "entity" 10 | 11 | @staticmethod 12 | def _get_explanation(search_phrase_display_word: str) -> str: 13 | return "".join( 14 | ("Has an entity label matching ", search_phrase_display_word.upper(), ".") 15 | ) 16 | 17 | def match_multiwords( 18 | self, 19 | search_phrase: SearchPhrase, 20 | search_phrase_token: Token, 21 | document_token: Token, 22 | document_multiwords: List[MultiwordSpan], 23 | ) -> Optional[WordMatch]: 24 | 25 | entity_placeholder = self.semantic_matching_helper.get_entity_placeholder( 26 | search_phrase_token 27 | ) 28 | if entity_placeholder is None: 29 | return None 30 | 31 | 
for multiword in document_multiwords: 32 | if any( 33 | 1 34 | for i in multiword.token_indexes 35 | if not self._entity_placeholder_matches( 36 | entity_placeholder, document_token.doc[i] 37 | ) 38 | ): 39 | continue 40 | return WordMatch( 41 | search_phrase_token=search_phrase_token, 42 | search_phrase_word=entity_placeholder, 43 | document_token=document_token, 44 | first_document_token=document_token.doc[multiword.token_indexes[0]], 45 | last_document_token=document_token.doc[multiword.token_indexes[-1]], 46 | document_subword=None, 47 | document_word=multiword.text, 48 | word_match_type=self.WORD_MATCH_TYPE_LABEL, 49 | explanation=self._get_explanation(entity_placeholder), 50 | ) 51 | return None 52 | 53 | def match_token( 54 | self, 55 | search_phrase: SearchPhrase, 56 | search_phrase_token: Token, 57 | document_token: Token, 58 | ) -> Optional[WordMatch]: 59 | 60 | entity_placeholder = self.semantic_matching_helper.get_entity_placeholder( 61 | search_phrase_token 62 | ) 63 | if entity_placeholder is None: 64 | return None 65 | 66 | if self._entity_placeholder_matches(entity_placeholder, document_token): 67 | return WordMatch( 68 | search_phrase_token=search_phrase_token, 69 | search_phrase_word=entity_placeholder, 70 | document_token=document_token, 71 | first_document_token=document_token, 72 | last_document_token=document_token, 73 | document_subword=None, 74 | document_word=document_token.text.lower(), 75 | word_match_type=self.WORD_MATCH_TYPE_LABEL, 76 | explanation=self._get_explanation(entity_placeholder), 77 | ) 78 | return None 79 | 80 | def add_reverse_dict_entries( 81 | self, 82 | reverse_dict: Dict[str, List[CorpusWordPosition]], 83 | doc: Doc, 84 | document_label: str, 85 | ) -> None: 86 | for token in doc: 87 | # parent check is necessary so we only find multiword entities once per 88 | # search phrase. 
sibling_marker_deps applies to siblings which would 89 | # otherwise be excluded because the main sibling would normally also match the 90 | # entity root word. 91 | if len(token.ent_type_) > 0 and ( 92 | token.dep_ == "ROOT" 93 | or token.dep_ in self.semantic_matching_helper.sibling_marker_deps 94 | or token.ent_type_ != token.head.ent_type_ 95 | ): 96 | entity_label = "".join(("ENTITY", token.ent_type_)) 97 | self.add_reverse_dict_entry( 98 | reverse_dict, 99 | entity_label, 100 | document_label, 101 | token.i, 102 | None, 103 | ) 104 | entity_defined_multiword = ( 105 | self.semantic_matching_helper.get_entity_defined_multiword(token) 106 | ) 107 | if entity_defined_multiword is not None: 108 | self.add_reverse_dict_entry( 109 | reverse_dict, 110 | entity_defined_multiword.text.lower(), 111 | document_label, 112 | token.i, 113 | None, 114 | ) 115 | 116 | def _entity_placeholder_matches( 117 | self, entity_placeholder: str, document_token: Token 118 | ) -> bool: 119 | return ( 120 | document_token.ent_type_ == entity_placeholder[6:] 121 | and len(document_token._.holmes.lemma.strip()) > 0 122 | ) or ( 123 | entity_placeholder == "ENTITYNOUN" 124 | and document_token.pos_ in self.semantic_matching_helper.noun_pos 125 | ) 126 | # len(document_token._.holmes.lemma.strip()) > 0: some German spaCy models sometimes 127 | # classifies whitespace as entities. 
128 | -------------------------------------------------------------------------------- /tests/common/test_cpu_gpu.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from thinc.api import prefer_gpu, require_cpu 3 | import holmes_extractor as holmes 4 | 5 | class CpuGpuTest(unittest.TestCase): 6 | 7 | def test_document_based_structural_matching_cpu_gpu(self): 8 | require_cpu() 9 | holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2) 10 | holmes_manager.parse_and_register_document( 11 | document_text="The dog chased the cat.", label='pets') 12 | prefer_gpu() 13 | holmes_manager.register_search_phrase("A dog chases a cat") 14 | self.assertEqual(len(holmes_manager.match()), 1) 15 | 16 | def test_document_based_structural_matching_gpu_cpu(self): 17 | prefer_gpu() 18 | holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2) 19 | holmes_manager.parse_and_register_document( 20 | document_text="The dog chased the cat.", label='pets') 21 | require_cpu() 22 | holmes_manager.register_search_phrase("A dog chases a cat") 23 | self.assertEqual(len(holmes_manager.match()), 1) 24 | 25 | def test_search_phrase_based_structural_matching_cpu_gpu(self): 26 | require_cpu() 27 | holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2) 28 | holmes_manager.register_search_phrase("A dog chases a cat") 29 | prefer_gpu() 30 | holmes_manager.parse_and_register_document( 31 | document_text="The dog chased the cat.", label='pets') 32 | self.assertEqual(len(holmes_manager.match()), 1) 33 | 34 | def test_search_phrase_based_structural_matching_gpu_cpu(self): 35 | prefer_gpu() 36 | holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2) 37 | holmes_manager.register_search_phrase("A dog chases a cat") 38 | require_cpu() 39 | holmes_manager.parse_and_register_document( 40 | document_text="The dog chased the cat.", label='pets') 41 | self.assertEqual(len(holmes_manager.match()), 1) 
42 | 43 | def test_topic_matching_cpu_gpu(self): 44 | require_cpu() 45 | holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2) 46 | holmes_manager.parse_and_register_document( 47 | document_text="The dog chased the cat.", label='pets') 48 | prefer_gpu() 49 | topic_matches = holmes_manager.topic_match_documents_against("A dog chases a cat") 50 | self.assertEqual(len(topic_matches), 1) 51 | 52 | def test_topic_matching_gpu_cpu(self): 53 | prefer_gpu() 54 | holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2) 55 | holmes_manager.parse_and_register_document( 56 | document_text="The dog chased the cat.", label='pets') 57 | require_cpu() 58 | topic_matches = holmes_manager.topic_match_documents_against("A dog chases a cat") 59 | self.assertEqual(len(topic_matches), 1) 60 | 61 | def test_supervised_document_classification_cpu_gpu(self): 62 | require_cpu() 63 | holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2) 64 | sttb = holmes_manager.get_supervised_topic_training_basis( 65 | one_hot=False 66 | ) 67 | sttb.parse_and_register_training_document("An animal", "animal", "d4") 68 | sttb.parse_and_register_training_document("A computer", "computers", "d5") 69 | sttb.prepare() 70 | # With so little training data, the NN does not consistently learn correctly 71 | for i in range(20): 72 | trainer = sttb.train( 73 | minimum_occurrences=0, 74 | cv_threshold=0, 75 | max_epochs=1000, 76 | learning_rate=0.0001, 77 | convergence_threshold=0, 78 | ) 79 | stc = trainer.classifier() 80 | if ( 81 | list(stc.parse_and_classify("You are an animal.").keys())[0] == "animal" 82 | ): 83 | break 84 | if i == 20: 85 | self.assertTrue( 86 | list(stc.parse_and_classify("You are an animal.").keys())[0] == "animal" 87 | ) 88 | 89 | prefer_gpu() 90 | self.assertTrue( 91 | list(stc.parse_and_classify("You are an animal.").keys())[0] == "animal") 92 | self.assertIsNone( 93 | stc.parse_and_classify("My name is Charles and I like sewing.") 94 | ) 95 | 
96 | def test_supervised_document_classification_gpu_cpu(self): 97 | prefer_gpu() 98 | holmes_manager = holmes.Manager('en_core_web_sm', number_of_workers=2) 99 | sttb = holmes_manager.get_supervised_topic_training_basis( 100 | one_hot=False 101 | ) 102 | sttb.parse_and_register_training_document("An animal", "animal", "d4") 103 | sttb.parse_and_register_training_document("A computer", "computers", "d5") 104 | sttb.prepare() 105 | # With so little training data, the NN does not consistently learn correctly 106 | for i in range(20): 107 | trainer = sttb.train( 108 | minimum_occurrences=0, 109 | cv_threshold=0, 110 | max_epochs=1000, 111 | learning_rate=0.0001, 112 | convergence_threshold=0, 113 | ) 114 | stc = trainer.classifier() 115 | if ( 116 | list(stc.parse_and_classify("You are an animal.").keys())[0] == "animal" 117 | ): 118 | break 119 | if i == 20: 120 | self.assertTrue( 121 | list(stc.parse_and_classify("You are an animal.").keys())[0] == "animal" 122 | ) 123 | 124 | require_cpu() 125 | self.assertTrue( 126 | list(stc.parse_and_classify("You are an animal.").keys())[0] == "animal") 127 | self.assertIsNone( 128 | stc.parse_and_classify("My name is Charles and I like sewing.") 129 | ) 130 | -------------------------------------------------------------------------------- /holmes_extractor/word_matching/embedding.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from spacy.tokens import Token 3 | from .general import WordMatch, WordMatchingStrategy 4 | from ..parsing import SemanticMatchingHelper, Subword, SearchPhrase 5 | 6 | 7 | class EmbeddingWordMatchingStrategy(WordMatchingStrategy): 8 | 9 | WORD_MATCH_TYPE_LABEL = "embedding" 10 | 11 | @staticmethod 12 | def _get_explanation(similarity: float, search_phrase_display_word: str) -> str: 13 | printable_similarity = str(int(similarity * 100)) 14 | return "".join( 15 | ( 16 | "Has a word embedding that is ", 17 | printable_similarity, 18 | 
"% similar to ", 19 | search_phrase_display_word.upper(), 20 | ".", 21 | ) 22 | ) 23 | 24 | def __init__( 25 | self, 26 | semantic_matching_helper: SemanticMatchingHelper, 27 | perform_coreference_resolution: bool, 28 | overall_similarity_threshold: float, 29 | initial_question_word_overall_similarity_threshold: float, 30 | ): 31 | self.overall_similarity_threshold = overall_similarity_threshold 32 | self.initial_question_word_overall_similarity_threshold = ( 33 | initial_question_word_overall_similarity_threshold 34 | ) 35 | super().__init__(semantic_matching_helper, perform_coreference_resolution) 36 | 37 | def match_token( 38 | self, 39 | search_phrase: SearchPhrase, 40 | search_phrase_token: Token, 41 | document_token: Token, 42 | ) -> Optional[WordMatch]: 43 | 44 | return self._check_for_word_match( 45 | search_phrase, search_phrase_token, document_token, None 46 | ) 47 | 48 | def match_subword( 49 | self, 50 | search_phrase: SearchPhrase, 51 | search_phrase_token: Token, 52 | document_token: Token, 53 | document_subword: Subword, 54 | ) -> Optional[WordMatch]: 55 | 56 | return self._check_for_word_match( 57 | search_phrase, search_phrase_token, document_token, document_subword 58 | ) 59 | 60 | def _check_for_word_match( 61 | self, 62 | search_phrase: SearchPhrase, 63 | search_phrase_token: Token, 64 | document_token: Token, 65 | document_subword: Optional[Subword], 66 | ) -> Optional[WordMatch]: 67 | if ( 68 | search_phrase_token.i 69 | in search_phrase.matchable_non_entity_tokens_to_vectors.keys() 70 | and self.semantic_matching_helper.embedding_matching_permitted( 71 | search_phrase_token 72 | ) 73 | ): 74 | search_phrase_vector = search_phrase.matchable_non_entity_tokens_to_vectors[ 75 | search_phrase_token.i 76 | ] 77 | if search_phrase_vector is None: 78 | return None 79 | if document_subword is not None: 80 | if not self.semantic_matching_helper.embedding_matching_permitted( 81 | document_subword 82 | ): 83 | return None 84 | document_vector = 
document_subword.vector 85 | document_word = document_subword.lemma 86 | else: 87 | if not self.semantic_matching_helper.embedding_matching_permitted( 88 | document_token 89 | ): 90 | return None 91 | document_vector = document_token._.holmes.vector 92 | document_word = document_token.lemma_ 93 | if ( 94 | ( 95 | search_phrase_token._.holmes.is_initial_question_word 96 | or search_phrase_token._.holmes.has_initial_question_word_in_phrase 97 | ) 98 | and self.initial_question_word_overall_similarity_threshold is not None 99 | ): 100 | working_overall_similarity_threshold = ( 101 | self.initial_question_word_overall_similarity_threshold 102 | ) 103 | else: 104 | working_overall_similarity_threshold = self.overall_similarity_threshold 105 | single_token_similarity_threshold = ( 106 | working_overall_similarity_threshold 107 | ** len(search_phrase.matchable_non_entity_tokens_to_vectors) 108 | ) 109 | if document_vector is not None: 110 | similarity_measure = self.semantic_matching_helper.cosine_similarity( 111 | search_phrase_vector, document_vector 112 | ) 113 | if similarity_measure > single_token_similarity_threshold: 114 | if ( 115 | not search_phrase.topic_match_phraselet 116 | and len(search_phrase_token._.holmes.lemma.split()) > 1 117 | ): 118 | search_phrase_display_word = search_phrase_token.lemma_ 119 | else: 120 | search_phrase_display_word = search_phrase_token._.holmes.lemma 121 | word_match = WordMatch( 122 | search_phrase_token=search_phrase_token, 123 | search_phrase_word=search_phrase_display_word, 124 | document_token=document_token, 125 | first_document_token=document_token, 126 | last_document_token=document_token, 127 | document_subword=document_subword, 128 | document_word=document_word, 129 | word_match_type=self.WORD_MATCH_TYPE_LABEL, 130 | explanation=self._get_explanation( 131 | similarity_measure, search_phrase_display_word 132 | ), 133 | ) 134 | word_match.similarity_measure = similarity_measure 135 | return word_match 136 | return None 137 
| -------------------------------------------------------------------------------- /examples/example_search_DE_literature.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import urllib.request 4 | from multiprocessing import cpu_count 5 | # You will need to install bs4 (python -m pip install bs4) 6 | from bs4 import BeautifulSoup 7 | import holmes_extractor as holmes 8 | # You will need to install falcon (python -m pip install falcon) 9 | import falcon 10 | 11 | if __name__ in ('__main__', 'example_search_DE_literature'): 12 | 13 | working_directory = # REPLACE WITH PATH TO WORKING DIRECTORY IN SINGLE OR DOUBLE QUOTES 14 | HOLMES_EXTENSION = 'hdc' 15 | flag_filename = os.sep.join((working_directory, 'STORY_PARSING_COMPLETE')) 16 | 17 | print('Initializing Holmes (this may take some time) ...') 18 | # Start the Holmes manager with the German model 19 | holmes_manager = holmes.Manager( 20 | model='de_core_news_lg') 21 | 22 | def process_documents_from_front_page(front_page_uri, front_page_label): 23 | """ Download and save all the stories from a front page.""" 24 | 25 | front_page = urllib.request.urlopen(front_page_uri) 26 | front_page_soup = BeautifulSoup(front_page, 'html.parser') 27 | document_texts = [] 28 | labels = [] 29 | # For each story ... 
30 | for anchor in front_page_soup.find_all('a'): 31 | if not anchor['href'].startswith('/') and not anchor['href'].startswith('https'): 32 | this_document_url = '/'.join((front_page_uri, anchor['href'])) 33 | print('Downloading story', anchor.contents[0], 'from front page', front_page_label) 34 | # Get the HTML document for the story 35 | this_document = urllib.request.urlopen(this_document_url) 36 | # Extract the raw text from the HTML document 37 | this_document_soup = BeautifulSoup(this_document, 'html.parser') 38 | this_document_text = this_document_soup.prettify() 39 | this_document_text = this_document_text.split('', 1)[1] 40 | this_document_text = this_document_text.split('', ' ') 42 | # Remove any carriage returns and line feeds from the raw text 43 | this_document_text = this_document_text.replace( 44 | '\n', ' ').replace('\r', ' ').replace(' ', ' ') 45 | # Replace multiple spaces with single spaces 46 | this_document_text = ' '.join(this_document_text.split()) 47 | # Create a document label from the front page label and the story name 48 | this_document_label = ' - '.join((front_page_label, anchor.contents[0])) 49 | document_texts.append(this_document_text) 50 | labels.append(this_document_label) 51 | parsed_documents = holmes_manager.nlp.pipe(document_texts, n_process=cpu_count()) 52 | for index, parsed_document in enumerate(parsed_documents): 53 | label = labels[index] 54 | print('Saving', label) 55 | output_filename = os.sep.join((working_directory, label)) 56 | output_filename = '.'.join((output_filename, HOLMES_EXTENSION)) 57 | with open(output_filename, "wb") as file: 58 | file.write(parsed_document.to_bytes()) 59 | 60 | def load_documents_from_working_directory(): 61 | serialized_documents = {} 62 | for file in os.listdir(working_directory): 63 | if file.endswith(HOLMES_EXTENSION): 64 | print('Loading', file) 65 | label = file[:-4] 66 | long_filename = os.sep.join((working_directory, file)) 67 | with open(long_filename, "rb") as file: 68 | 
contents = file.read() 69 | serialized_documents[label] = contents 70 | print('Indexing documents (this may take some time) ...') 71 | holmes_manager.register_serialized_documents(serialized_documents) 72 | 73 | if os.path.exists(working_directory): 74 | if not os.path.isdir(working_directory): 75 | raise RuntimeError(' '.join((working_directory, 'must be a directory'))) 76 | else: 77 | os.mkdir(working_directory) 78 | 79 | if os.path.isfile(flag_filename): 80 | load_documents_from_working_directory() 81 | else: 82 | process_documents_from_front_page( 83 | "https://maerchen.com/grimm/", 'Gebrüder Grimm') 84 | process_documents_from_front_page( 85 | "https://maerchen.com/grimm2/", 'Gebrüder Grimm') 86 | process_documents_from_front_page( 87 | "https://maerchen.com/andersen/", 'Hans Christian Andersen') 88 | process_documents_from_front_page( 89 | "https://maerchen.com/bechstein/", 'Ludwig Bechstein') 90 | process_documents_from_front_page( 91 | "https://maerchen.com/wolf/", 'Johann Wilhelm Wolf') 92 | # Generate flag file to indicate files can be reloaded on next run 93 | open(flag_filename, 'a').close() 94 | load_documents_from_working_directory() 95 | 96 | #Comment following line in to activate interactive console 97 | #holmes_manager.start_topic_matching_search_mode_console(only_one_result_per_document=True) 98 | 99 | # The following code starts a RESTful Http service to perform topic searches. It is deployed as 100 | # as WSGI application. 
An example of how to start it - issued from the directory that 101 | # contains the script - is 102 | 103 | # python -m waitress example_search_DE_literature:application 104 | 105 | # You will need to install waitress (python -m pip install waitress) 106 | 107 | class RestHandler(): 108 | def on_get(self, req, resp): 109 | resp.text = \ 110 | json.dumps(holmes_manager.topic_match_documents_against( 111 | req.params['entry'][0:200], only_one_result_per_document=True)) 112 | resp.cache_control = ["s-maxage=31536000"] 113 | 114 | application = falcon.App() 115 | application.add_route('/german', RestHandler()) 116 | -------------------------------------------------------------------------------- /SHORTREADME.md: -------------------------------------------------------------------------------- 1 | **Holmes** is a Python 3 library (v3.6—v3.10) running on top of 2 | [spaCy](https://spacy.io/) (v3.1—v3.3) that supports a number of use cases 3 | involving information extraction from English and German texts. In all use cases, the information 4 | extraction is based on analysing the semantic relationships expressed by the component parts of 5 | each sentence: 6 | 7 | - In the [chatbot](#getting-started) use case, the system is configured using one or more **search phrases**. 8 | Holmes then looks for structures whose meanings correspond to those of these search phrases within 9 | a searched **document**, which in this case corresponds to an individual snippet of text or speech 10 | entered by the end user. Within a match, each word with its own meaning (i.e. that does not merely fulfil a grammatical function) in the search phrase 11 | corresponds to one or more such words in the document. Both the fact that a search phrase was matched and any structured information the search phrase extracts can be used to drive the chatbot. 
12 | 13 | - The [structural extraction](#structural-extraction) use case uses exactly the same 14 | [structural matching](#how-it-works-structural-matching) technology as the chatbot use 15 | case, but searching takes place with respect to a pre-existing document or documents that are typically much 16 | longer than the snippets analysed in the chatbot use case, and the aim is to extract and store structured information. For example, a set of business articles could be searched to find all the places where one company is said to be planning to 17 | take over a second company. The identities of the companies concerned could then be stored in a database. 18 | 19 | - The [topic matching](#topic-matching) use case aims to find passages in a document or documents whose meaning 20 | is close to that of another document, which takes on the role of the **query document**, or to that of a **query phrase** entered ad-hoc by the user. Holmes extracts a number of small **phraselets** from the query phrase or 21 | query document, matches the documents being searched against each phraselet, and conflates the results to find 22 | the most relevant passages within the documents. Because there is no strict requirement that every 23 | word with its own meaning in the query document match a specific word or words in the searched documents, more matches are found 24 | than in the structural extraction use case, but the matches do not contain structured information that can be 25 | used in subsequent processing. The topic matching use case is demonstrated by [a website allowing searches within 26 | six Charles Dickens novels (for English) and around 350 traditional stories (for German)](https://holmes-demo.explosion.services/). 27 | 28 | - The [supervised document classification](#supervised-document-classification) use case uses training data to 29 | learn a classifier that assigns one or more **classification labels** to new documents based on what they are about. 
30 | It classifies a new document by matching it against phraselets that were extracted from the training documents in the 31 | same way that phraselets are extracted from the query document in the topic matching use case. The technique is 32 | inspired by bag-of-words-based classification algorithms that use n-grams, but aims to derive n-grams whose component 33 | words are related semantically rather than that just happen to be neighbours in the surface representation of a language. 34 | 35 | In all four use cases, the **individual words** are matched using a [number of strategies](#word-level-matching-strategies). 36 | To work out whether two grammatical structures that contain individually matching words correspond logically and 37 | constitute a match, Holmes transforms the syntactic parse information provided by the [spaCy](https://spacy.io/) library 38 | into semantic structures that allow texts to be compared using predicate logic. As a user of Holmes, you do not need to 39 | understand the intricacies of how this works, although there are some 40 | [important tips](#writing-effective-search-phrases) around writing effective search phrases for the chatbot and 41 | structural extraction use cases that you should try and take on board. 42 | 43 | Holmes aims to offer generalist solutions that can be used more or less out of the box with 44 | relatively little tuning, tweaking or training and that are rapidly applicable to a wide range of use cases. 45 | At its core lies a logical, programmed, rule-based system that describes how syntactic representations in each 46 | language express semantic relationships. 
Although the supervised document classification use case does incorporate a 47 | neural network and although the spaCy library upon which Holmes builds has itself been pre-trained using machine 48 | learning, the essentially rule-based nature of Holmes means that the chatbot, structural extraction and topic matching use 49 | cases can be put to use out of the box without any training and that the supervised document classification use case 50 | typically requires relatively little training data, which is a great advantage because pre-labelled training data is 51 | not available for many real-world problems. 52 | 53 | Holmes has a long and complex history and we are now able to publish it under the MIT license thanks to the goodwill and openness of several companies. I, Richard Hudson, wrote the versions up to 3.0.0 while working at [msg systems](https://www.msg.group/en), a large international software consultancy based near Munich. In late 2021, I changed employers and now work for [Explosion](https://explosion.ai/), the creators of [spaCy](https://spacy.io/) and [Prodigy](https://prodi.gy/). Elements of the Holmes library are covered by a [US patent](https://patents.google.com/patent/US8155946B2/en) that I myself wrote in the early 2000s while working at a startup called Definiens that has since been acquired by [AstraZeneca](https://www.astrazeneca.com/). With the kind permission of both AstraZeneca and msg systems, I am now maintaining Holmes at Explosion and can offer it for the first time under a permissive license: anyone can now use Holmes under the terms of the MIT 54 | license without having to worry about the patent. 55 | 56 | For more information, please see the [main documentation on Github](https://github.com/explosion/holmes-extractor). 
57 | -------------------------------------------------------------------------------- /tests/common/test_serialization.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import holmes_extractor as holmes 4 | 5 | script_directory = os.path.dirname(os.path.realpath(__file__)) 6 | holmes_manager = holmes.Manager('en_core_web_trf', number_of_workers=2) 7 | holmes_manager.register_search_phrase("A dog chases a cat") 8 | german_holmes_manager = holmes.Manager('de_core_news_lg', number_of_workers=2) 9 | 10 | 11 | class SerializationTest(unittest.TestCase): 12 | 13 | def test_matching_with_holmes_manager_document_after_serialization(self): 14 | holmes_manager.remove_all_documents() 15 | holmes_manager.parse_and_register_document( 16 | "The cat was chased by the dog", 'pets') 17 | serialized_doc = holmes_manager.serialize_document('pets') 18 | self.assertEqual(len(holmes_manager.match()), 1) 19 | 20 | def test_matching_with_reserialized_holmes_manager_document(self): 21 | holmes_manager.remove_all_documents() 22 | holmes_manager.parse_and_register_document( 23 | "The cat was chased by the dog", 'pets') 24 | serialized_doc = holmes_manager.serialize_document('pets') 25 | holmes_manager.remove_all_documents() 26 | holmes_manager.register_serialized_document( 27 | serialized_doc, 'pets') 28 | self.assertEqual(len(holmes_manager.match()), 1) 29 | 30 | def test_matching_with_multiple_reserialized_holmes_manager_document(self): 31 | holmes_manager.remove_all_documents() 32 | holmes_manager.parse_and_register_document( 33 | "The cat was chased by the dog", 'pets') 34 | serialized_doc = holmes_manager.serialize_document('pets') 35 | working_dict = {'pets': serialized_doc, 'pets2': serialized_doc} 36 | holmes_manager.remove_all_documents() 37 | holmes_manager.register_serialized_documents(working_dict) 38 | self.assertEqual(len(holmes_manager.match()), 2) 39 | 40 | def test_serialization_with_coreference(self): 
41 | holmes_manager.remove_all_documents() 42 | holmes_manager.parse_and_register_document( 43 | "I saw a cat. It was chased by the dog", 'pets') 44 | serialized_doc = holmes_manager.serialize_document('pets') 45 | holmes_manager.remove_all_documents() 46 | holmes_manager.register_serialized_document( 47 | serialized_doc, 'pets') 48 | self.assertEqual(len(holmes_manager.match()), 1) 49 | 50 | def test_matching_with_both_documents(self): 51 | holmes_manager.remove_all_documents() 52 | holmes_manager.parse_and_register_document( 53 | "The cat was chased by the dog", 'pets') 54 | serialized_doc = holmes_manager.serialize_document('pets') 55 | holmes_manager.register_serialized_document( 56 | serialized_doc, 'pets2') 57 | self.assertEqual(len(holmes_manager.match()), 2) 58 | 59 | def test_document_to_serialize_does_not_exist(self): 60 | holmes_manager.remove_all_documents() 61 | serialized_doc = holmes_manager.serialize_document('pets') 62 | self.assertEqual(serialized_doc, None) 63 | 64 | def test_parent_token_indexes(self): 65 | holmes_manager.remove_all_documents() 66 | holmes_manager.parse_and_register_document( 67 | "Houses in the village.", 'village') 68 | serialized_doc = holmes_manager.serialize_document('village') 69 | holmes_manager.register_serialized_document( 70 | serialized_doc, 'village2') 71 | old_doc = holmes_manager.get_document( 72 | 'village') 73 | new_doc = holmes_manager.get_document( 74 | 'village2') 75 | self.assertEqual(old_doc[0]._.holmes.string_representation_of_children(), 76 | '1:prep; 3:pobjp') 77 | self.assertEqual(old_doc[3]._.holmes.string_representation_of_parents(), 78 | '0:pobjp; 1:pobj') 79 | self.assertEqual(old_doc[3]._.holmes.coreference_linked_parent_dependencies, [ 80 | [0, 'pobjp'], [1, 'pobj']]) 81 | self.assertEqual(new_doc[0]._.holmes.string_representation_of_children(), 82 | '1:prep; 3:pobjp') 83 | self.assertEqual(new_doc[3]._.holmes.coreference_linked_parent_dependencies, [ 84 | [0, 'pobjp'], [1, 'pobj']]) 85 | 
self.assertEqual(new_doc[3]._.holmes.string_representation_of_parents(), 86 | '0:pobjp; 1:pobj') 87 | 88 | def test_subwords(self): 89 | german_holmes_manager.remove_all_documents() 90 | german_holmes_manager.parse_and_register_document( 91 | "Bundesoberbehörde.", 'bo') 92 | serialized_doc = german_holmes_manager.serialize_document('bo') 93 | german_holmes_manager.register_serialized_document( 94 | serialized_doc, 'bo2') 95 | old_doc = german_holmes_manager.get_document('bo') 96 | new_doc = german_holmes_manager.get_document( 97 | 'bo2') 98 | self.assertEqual(old_doc[0]._.holmes.subwords[0].text, 'bundes') 99 | self.assertEqual(old_doc[0]._.holmes.subwords[0].lemma, 'bund') 100 | self.assertEqual(old_doc[0]._.holmes.subwords[1].text, 'oberbehörde') 101 | self.assertEqual(old_doc[0]._.holmes.subwords[1].lemma, 'oberbehörde') 102 | self.assertEqual(new_doc[0]._.holmes.subwords[0].text, 'bundes') 103 | self.assertEqual(new_doc[0]._.holmes.subwords[0].lemma, 'bund') 104 | self.assertEqual(new_doc[0]._.holmes.subwords[1].text, 'oberbehörde') 105 | self.assertEqual(new_doc[0]._.holmes.subwords[1].lemma, 'oberbehörde') 106 | 107 | def test_derived_lemma(self): 108 | holmes_manager.remove_all_documents() 109 | holmes_manager.parse_and_register_document( 110 | "A lot of information.", 'information') 111 | serialized_doc = holmes_manager.serialize_document( 112 | 'information') 113 | holmes_manager.register_serialized_document( 114 | serialized_doc, 'information2') 115 | old_doc = holmes_manager.get_document( 116 | 'information') 117 | new_doc = holmes_manager.get_document( 118 | 'information2') 119 | self.assertEqual(old_doc[3]._.holmes.derived_lemma, 'inform') 120 | self.assertEqual(new_doc[3]._.holmes.derived_lemma, 'inform') 121 | -------------------------------------------------------------------------------- /holmes_extractor/word_matching/direct.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, 
Dict 2 | from spacy.tokens import Token, Doc 3 | from .general import WordMatch, WordMatchingStrategy 4 | from ..parsing import ( 5 | MultiwordSpan, 6 | CorpusWordPosition, 7 | Subword, 8 | SearchPhrase, 9 | ) 10 | 11 | 12 | class DirectWordMatchingStrategy(WordMatchingStrategy): 13 | 14 | WORD_MATCH_TYPE_LABEL = "direct" 15 | 16 | @staticmethod 17 | def _get_explanation(search_phrase_display_word: str) -> str: 18 | return "".join(("Matches ", search_phrase_display_word.upper(), " directly.")) 19 | 20 | def match_multiwords( 21 | self, 22 | search_phrase: SearchPhrase, 23 | search_phrase_token: Token, 24 | document_token: Token, 25 | document_multiwords: List[MultiwordSpan], 26 | ) -> Optional[WordMatch]: 27 | 28 | if len(search_phrase_token._.holmes.lemma.split()) == 1: 29 | return None 30 | for ( 31 | search_phrase_representation 32 | ) in search_phrase_token._.holmes.direct_matching_reprs: 33 | for multiword in document_multiwords: 34 | for document_representation in multiword.direct_matching_reprs: 35 | if search_phrase_representation == document_representation: 36 | search_phrase_display_word = search_phrase_token._.holmes.lemma 37 | return WordMatch( 38 | search_phrase_token=search_phrase_token, 39 | search_phrase_word=search_phrase_representation, 40 | document_token=document_token, 41 | first_document_token=document_token.doc[ 42 | multiword.token_indexes[0] 43 | ], 44 | last_document_token=document_token.doc[ 45 | multiword.token_indexes[-1] 46 | ], 47 | document_subword=None, 48 | document_word=document_representation, 49 | word_match_type=self.WORD_MATCH_TYPE_LABEL, 50 | explanation=self._get_explanation( 51 | search_phrase_display_word 52 | ), 53 | ) 54 | return None 55 | 56 | def match_token( 57 | self, 58 | search_phrase: SearchPhrase, 59 | search_phrase_token: Token, 60 | document_token: Token, 61 | ) -> Optional[WordMatch]: 62 | 63 | for ( 64 | search_phrase_representation 65 | ) in search_phrase_token._.holmes.direct_matching_reprs: 66 | for ( 67 | 
document_representation 68 | ) in document_token._.holmes.direct_matching_reprs: 69 | if search_phrase_representation == document_representation: 70 | search_phrase_display_word = search_phrase_token._.holmes.lemma 71 | return WordMatch( 72 | search_phrase_token=search_phrase_token, 73 | search_phrase_word=search_phrase_representation, 74 | document_token=document_token, 75 | first_document_token=document_token, 76 | last_document_token=document_token, 77 | document_subword=None, 78 | document_word=document_representation, 79 | word_match_type=self.WORD_MATCH_TYPE_LABEL, 80 | extracted_word=self.get_extracted_word_for_token( 81 | document_token, document_representation 82 | ), 83 | explanation=self._get_explanation(search_phrase_display_word), 84 | ) 85 | return None 86 | 87 | def match_subword( 88 | self, 89 | search_phrase: SearchPhrase, 90 | search_phrase_token: Token, 91 | document_token: Token, 92 | document_subword: Subword, 93 | ) -> Optional[WordMatch]: 94 | 95 | for ( 96 | search_phrase_representation 97 | ) in search_phrase_token._.holmes.direct_matching_reprs: 98 | for document_representation in document_subword.direct_matching_reprs: 99 | if search_phrase_representation == document_representation: 100 | search_phrase_display_word = search_phrase_token._.holmes.lemma 101 | return WordMatch( 102 | search_phrase_token=search_phrase_token, 103 | search_phrase_word=search_phrase_representation, 104 | document_token=document_token, 105 | first_document_token=document_token, 106 | last_document_token=document_token, 107 | document_subword=document_subword, 108 | document_word=document_representation, 109 | word_match_type=self.WORD_MATCH_TYPE_LABEL, 110 | explanation=self._get_explanation(search_phrase_display_word), 111 | ) 112 | return None 113 | 114 | def add_words_matching_search_phrase_root_token( 115 | self, search_phrase: SearchPhrase 116 | ) -> None: 117 | for word in search_phrase.root_token._.holmes.direct_matching_reprs: 118 | 
search_phrase.add_word_information(word) 119 | 120 | def add_reverse_dict_entries( 121 | self, 122 | reverse_dict: Dict[str, List[CorpusWordPosition]], 123 | doc: Doc, 124 | document_label: str, 125 | ) -> None: 126 | for token in doc: 127 | for representation in token._.holmes.direct_matching_reprs: 128 | self.add_reverse_dict_entry( 129 | reverse_dict, 130 | representation.lower(), 131 | document_label, 132 | token.i, 133 | None, 134 | ) 135 | for subword in token._.holmes.subwords: 136 | for representation in subword.direct_matching_reprs: 137 | self.add_reverse_dict_entry( 138 | reverse_dict, 139 | representation.lower(), 140 | document_label, 141 | token.i, 142 | subword.index, 143 | ) 144 | -------------------------------------------------------------------------------- /examples/example_supervised_topic_model_EN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import urllib.request 4 | import zipfile 5 | from thinc.api import prefer_gpu 6 | import holmes_extractor as holmes 7 | 8 | working_directory = # REPLACE WITH PATH TO WORKING DIRECTORY IN SINGLE OR DOUBLE QUOTES 9 | 10 | if __name__ in ("__main__", "example_supervised_topic_model_EN"): 11 | 12 | def is_training_data(document_number): 13 | # We use any documents with numbers ending in 8,9,0 for test and all other documents for 14 | # training. 15 | return document_number[-1:] not in ("8", "9", "0") 16 | 17 | def get_document_filename_info(filename): 18 | # e.g. 
'bbc/business/001.txt' 19 | category = filename.split("/")[1] 20 | document_number = filename.split("/")[2].split(".")[0] 21 | return category, document_number 22 | 23 | def evaluate_classifier(zip_filename, classifier): 24 | correct_classification_counter = ( 25 | wrong_classification_counter 26 | ) = no_classification_counter = correct_as_additional_classification_counter = 0 27 | with zipfile.ZipFile(zip_filename) as bbc_zipfile: 28 | for filename in ( 29 | filename 30 | for filename in sorted(bbc_zipfile.namelist()) 31 | if filename.lower().endswith(".txt") 32 | and not filename.endswith("README.TXT") 33 | ): 34 | category, document_number = get_document_filename_info(filename) 35 | if not is_training_data(document_number): 36 | with bbc_zipfile.open(filename, "r") as test_doc: 37 | test_contents = str(test_doc.read()) 38 | test_contents = test_contents.replace("\n", " ").replace( 39 | "\r", " " 40 | ) 41 | classification_dict = classifier.parse_and_classify(test_contents) 42 | if classification_dict is None: 43 | suggested_categories = [] 44 | else: 45 | suggested_categories = [ 46 | c 47 | for c in classification_dict 48 | if classification_dict[c] > 0.2 49 | ] 50 | if len(suggested_categories) == 0: 51 | no_classification_counter += 1 52 | elif suggested_categories[0] == category: 53 | correct_classification_counter += 1 54 | elif category in suggested_categories: 55 | correct_as_additional_classification_counter += 1 56 | else: 57 | wrong_classification_counter += 1 58 | print( 59 | "".join( 60 | ( 61 | filename, 62 | ": actual category ", 63 | category, 64 | "; suggested categories ", 65 | str(suggested_categories), 66 | ) 67 | ) 68 | ) 69 | print() 70 | print("Totals:") 71 | print(correct_classification_counter, "correct classifications;") 72 | print(no_classification_counter, "unclassified documents;") 73 | print(wrong_classification_counter, "incorrect classifications;") 74 | print( 75 | correct_as_additional_classification_counter, 76 | "incorrect 
classifications where the " 77 | "correct classification was returned as an additional classification.", 78 | ) 79 | 80 | def train_model(working_directory, zip_filename): 81 | training_basis = holmes_manager.get_supervised_topic_training_basis() 82 | with zipfile.ZipFile(zip_filename) as bbc_zipfile: 83 | for filename in ( 84 | filename 85 | for filename in sorted(bbc_zipfile.namelist()) 86 | if filename.lower().endswith(".txt") 87 | and not filename.endswith("README.TXT") 88 | ): 89 | category, document_number = get_document_filename_info(filename) 90 | if is_training_data(document_number): 91 | with bbc_zipfile.open(filename, "r") as training_doc: 92 | training_contents = str(training_doc.read()) 93 | training_contents = training_contents.replace( 94 | "\n", " " 95 | ).replace("\r", " ") 96 | training_basis.parse_and_register_training_document( 97 | training_contents, category, filename 98 | ) 99 | training_basis.prepare() 100 | prefer_gpu() 101 | classifier = training_basis.train().classifier() 102 | output_filename = os.sep.join((working_directory, "sdc-model")) 103 | with open(output_filename, "wb") as file: 104 | file.write(classifier.serialize_model()) 105 | evaluate_classifier(zip_filename, classifier) 106 | 107 | holmes_manager = holmes.Manager("en_core_web_lg", number_of_workers=1) 108 | 109 | if os.path.exists(working_directory): 110 | if not os.path.isdir(working_directory): 111 | raise RuntimeError(" ".join((working_directory, "must be a directory"))) 112 | else: 113 | os.mkdir(working_directory) 114 | zip_filename = os.sep.join((working_directory, "bbc-fulltext.zip")) 115 | if not os.path.exists(zip_filename): 116 | url = "http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip" 117 | with urllib.request.urlopen(url) as response, open( 118 | zip_filename, "wb" 119 | ) as out_file: 120 | shutil.copyfileobj(response, out_file) 121 | model_filename = os.sep.join((working_directory, "sdc-model")) 122 | if not os.path.exists(model_filename): 123 | 
class QuestionWordMatchingStrategy(WordMatchingStrategy):
    """Matches initial question words within search phrases (e.g. 'who', 'when')
    against document tokens or subwords, delegating the actual comparison to
    *SemanticMatchingHelper.question_word_matches()*.
    """

    WORD_MATCH_TYPE_LABEL = "question"

    @staticmethod
    def _get_explanation(search_phrase_display_word: str) -> str:
        """Returns the human-readable tooltip text for a question word match."""
        return "".join(
            ("Matches the question word ", search_phrase_display_word.upper(), ".")
        )

    def __init__(
        self,
        semantic_matching_helper: SemanticMatchingHelper,
        perform_coreference_resolution: bool,
        initial_question_word_overall_similarity_threshold: float,
        entity_label_to_vector_dict: Dict[str, Floats1d],
    ):
        self.initial_question_word_overall_similarity_threshold = (
            initial_question_word_overall_similarity_threshold
        )
        self.entity_label_to_vector_dict = entity_label_to_vector_dict
        super().__init__(semantic_matching_helper, perform_coreference_resolution)

    def _question_word_matches(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        subword_index: Optional[int],
        document_vector,
    ):
        """Checks whether *search_phrase_token* matches *document_token* (or the
        subword at *subword_index*) as a question word.

        When a document vector is available, embedding-based matching is enabled
        with a per-token similarity threshold derived by raising the overall
        threshold to the power of the number of matchable non-entity tokens;
        otherwise only non-embedding matching is attempted.
        """
        if document_vector is not None:
            return self.semantic_matching_helper.question_word_matches(
                search_phrase_token,
                document_token,
                subword_index,
                document_vector,
                self.entity_label_to_vector_dict,
                self.initial_question_word_overall_similarity_threshold
                ** len(search_phrase.matchable_non_entity_tokens_to_vectors),
            )
        return self.semantic_matching_helper.question_word_matches(
            search_phrase_token, document_token, subword_index, None, None, None
        )

    def _expand_to_noun_phrase(self, document_token: Token):
        """Returns the (first, last) token indexes of the contiguous run of
        noun-POS tokens around *document_token* within its document."""
        first_document_token_index = last_document_token_index = document_token.i
        noun_pos = self.semantic_matching_helper.noun_pos
        while (
            first_document_token_index >= 1
            and document_token.doc[first_document_token_index - 1].pos_ in noun_pos
        ):
            first_document_token_index -= 1
        while (
            last_document_token_index + 1 < len(document_token.doc)
            and document_token.doc[last_document_token_index + 1].pos_ in noun_pos
        ):
            last_document_token_index += 1
        return first_document_token_index, last_document_token_index

    def match_token(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
    ) -> Optional[WordMatch]:
        """Attempts to match *search_phrase_token* to *document_token* as a
        question word, returning a *WordMatch* on success and *None* otherwise."""
        if not search_phrase_token._.holmes.is_initial_question_word:
            return None
        if not self._question_word_matches(
            search_phrase,
            search_phrase_token,
            document_token,
            None,
            document_token._.holmes.vector,
        ):
            return None
        first_document_token_index = last_document_token_index = document_token.i
        # Where the matched token is a noun that is part of a named entity,
        # expand the match to cover the surrounding contiguous noun phrase.
        if (
            document_token.pos_ in self.semantic_matching_helper.noun_pos
            and len(document_token.ent_type_) > 0
        ):
            (
                first_document_token_index,
                last_document_token_index,
            ) = self._expand_to_noun_phrase(document_token)
        return WordMatch(
            search_phrase_token=search_phrase_token,
            search_phrase_word=search_phrase_token._.holmes.lemma,
            document_token=document_token,
            first_document_token=document_token.doc[first_document_token_index],
            last_document_token=document_token.doc[last_document_token_index],
            document_subword=None,
            document_word=document_token._.holmes.lemma,
            word_match_type=self.WORD_MATCH_TYPE_LABEL,
            explanation=self._get_explanation(search_phrase_token._.holmes.lemma),
        )

    def match_subword(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_subword: Subword,
    ) -> Optional[WordMatch]:
        """Attempts to match *search_phrase_token* to *document_subword* as a
        question word, returning a *WordMatch* on success and *None* otherwise."""
        if (
            document_subword.is_head
        ):  # question words should not match a head subword but the whole word or multiword:
            return None
        if not search_phrase_token._.holmes.is_initial_question_word:
            return None
        if not self._question_word_matches(
            search_phrase,
            search_phrase_token,
            document_token,
            document_subword.index,
            document_subword.vector,
        ):
            return None
        return WordMatch(
            search_phrase_token=search_phrase_token,
            search_phrase_word=search_phrase_token._.holmes.lemma,
            document_token=document_token,
            first_document_token=document_token,
            last_document_token=document_token,
            document_subword=document_subword,
            document_word=document_subword.lemma,
            word_match_type=self.WORD_MATCH_TYPE_LABEL,
            explanation=self._get_explanation(document_subword.lemma),
        )
class ManagerTest(unittest.TestCase):
    """Tests for the document / search phrase registration and matching API of
    *holmes_extractor.Manager*."""

    def _register_multiple_documents_and_search_phrases(self):
        """Resets the shared manager and registers the two documents and three
        search phrases used by most of the tests below."""
        holmes_manager.remove_all_search_phrases()
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document(
            document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets')
        holmes_manager.parse_and_register_document(
            document_text="Everything I know suggests that lions enjoy eating gnu", label='safari')
        holmes_manager.register_search_phrase(
            "A dog chases a cat", label="test")
        holmes_manager.register_search_phrase(
            "A lion eats a gnu", label="test")
        holmes_manager.register_search_phrase(
            "irrelevancy", label="alpha")

    def test_multiple(self):
        """Both registered documents match one search phrase each."""
        self._register_multiple_documents_and_search_phrases()
        self.assertEqual(len(holmes_manager.match()), 2)

    def test_remove_all_search_phrases(self):
        self._register_multiple_documents_and_search_phrases()
        holmes_manager.remove_all_search_phrases()
        holmes_manager.register_search_phrase("A dog chases a cat")
        self.assertEqual(len(holmes_manager.match()), 1)

    def test_remove_all_documents(self):
        self._register_multiple_documents_and_search_phrases()
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document(
            document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets')
        self.assertEqual(len(holmes_manager.match()), 1)

    def test_remove_all_documents_with_label(self):
        """*remove_all_documents* with an argument removes by label prefix."""
        self._register_multiple_documents_and_search_phrases()
        holmes_manager.remove_all_documents()
        holmes_manager.parse_and_register_document(
            document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets11')
        holmes_manager.parse_and_register_document(
            document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets12')
        holmes_manager.parse_and_register_document(
            document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets21')
        holmes_manager.parse_and_register_document(
            document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets22')
        self.assertEqual(len(holmes_manager.match()), 4)
        holmes_manager.remove_all_documents('pets22')
        self.assertEqual(len(holmes_manager.match()), 3)
        holmes_manager.remove_all_documents('pets1')
        self.assertEqual(len(holmes_manager.match()), 1)
        holmes_manager.remove_all_documents('pets')
        # All documents are gone, so matching must fail.
        with self.assertRaises(NoDocumentError):
            holmes_manager.match()

    def test_remove_document(self):
        self._register_multiple_documents_and_search_phrases()
        holmes_manager.parse_and_register_document(
            document_text="All the time I am testing here, dogs keep on chasing cats.", label='pets2')
        self.assertEqual(len(holmes_manager.match()), 3)
        holmes_manager.remove_document(label='pets')
        holmes_manager.remove_document(label='safari')
        matches = holmes_manager.match()
        self.assertEqual(len(matches), 1)
        self.assertEqual(matches[0]['document'], 'pets2')

    def test_match_search_phrases_against(self):
        self._register_multiple_documents_and_search_phrases()
        self.assertEqual(len(holmes_manager.match(document_text=
            "All the time I am testing here, dogs keep on chasing cats.")), 1)

    def test_match_documents_against(self):
        self._register_multiple_documents_and_search_phrases()
        self.assertEqual(len(holmes_manager.match(search_phrase_text=
            "A lion eats a gnu.")), 1)

    def test_match_documents_and_search_phrases_against(self):
        self._register_multiple_documents_and_search_phrases()
        self.assertEqual(len(holmes_manager.match(search_phrase_text= "burn",
            document_text="Burn. Everything I know suggests that lions enjoy eating gnu")), 1)
        holmes_manager.remove_all_documents()
        holmes_manager.remove_all_search_phrases()
        # Ad-hoc matching works even with no registered documents or phrases.
        self.assertEqual(len(holmes_manager.match(search_phrase_text= "burn",
            document_text="Burn. Everything I know suggests that lions enjoy eating gnu")), 1)

    def test_get_labels(self):
        self._register_multiple_documents_and_search_phrases()
        self.assertEqual(holmes_manager.list_search_phrase_labels(),
            ['alpha', 'test'])

    def test_get_document(self):
        self._register_multiple_documents_and_search_phrases()
        self.assertEqual(holmes_manager.get_document('safari')[5]._.holmes.lemma,
            'lion')

    def test_remove_all_search_phrases_with_label(self):
        """Removal is by exact label; a non-existent label is a no-op."""
        holmes_manager.remove_all_search_phrases()
        holmes_manager.register_search_phrase("testa", label="test1")
        holmes_manager.register_search_phrase("testb", label="test1")
        holmes_manager.register_search_phrase("testc", label="test2")
        holmes_manager.register_search_phrase("testd", label="test2")
        holmes_manager.remove_all_search_phrases_with_label("test2")
        holmes_manager.remove_all_search_phrases_with_label("testb")
        self.assertEqual(holmes_manager.list_search_phrase_labels(),
            ['test1'])
        self.assertEqual(len(holmes_manager.match(document_text=
            "testa")), 1)
        self.assertEqual(len(holmes_manager.match(document_text=
            "testb")), 1)
        self.assertEqual(len(holmes_manager.match(document_text=
            "testc")), 0)
        self.assertEqual(len(holmes_manager.match(document_text=
            "testd")), 0)

    def test_pipe_with_single_process(self):
        docs = lg_holmes_manager.nlp.pipe(['document1', 'document2'])
        self.assertEqual(str(next(docs)), 'document1')

    def test_pipe_with_multiple_processes(self):
        docs = lg_holmes_manager.nlp.pipe(['document1', 'document2'], n_process=2)
        self.assertEqual(str(next(docs)), 'document1')
        self.assertEqual(str(next(docs)), 'document2')
Rowling", "") 39 | book = book[1:] 40 | chapter_headings = [heading for heading in re.finditer("(?<=((\\n\\n\\n\\n)|(\* \\n\\n)))((?!.*(WEASLEY WILL MAKE SURE)|(DO NOT OPEN THE PARCEL)|(HEADMISTRESS OF HOGWARTS))[A-Z][A-Z\-’., ]+)(\\n{1,2}((?!.*(WHO\-MUST))[A-Z\-’., ]+))?(?=(\\n\\n([^\\n]|(\\n\\n((“Harry!”)|(Harry’s)|(Ron’s)|(“Hagrid)|(Three o’clock))))))", book)] 41 | chapter_counter = 1 42 | labels = [] 43 | chapter_texts = [] 44 | chapter_dict = {} 45 | for chapter_heading in chapter_headings: 46 | label = ''.join(( 47 | 'Book ', title, ' Ch ', str(chapter_counter), " ‘", 48 | chapter_heading.group().replace('\n', '').strip(), "’")) 49 | labels.append(label) 50 | if chapter_counter == len(chapter_headings): # last chapter 51 | content = book[chapter_heading.end():] 52 | else: 53 | content = book[chapter_heading.end():chapter_headings[chapter_counter].start()] 54 | content = content.replace('\n', '') 55 | if content.endswith('& '): 56 | content = content[:-2] 57 | chapter_texts.append(content) 58 | print('Extracted', label) 59 | chapter_counter += 1 60 | parsed_chapters = holmes_manager.nlp.pipe(chapter_texts) 61 | for index, parsed_chapter in enumerate(parsed_chapters): 62 | label = labels[index] 63 | print('Saving', label) 64 | output_filename = os.sep.join((working_directory, label)) 65 | output_filename = '.'.join((output_filename, HOLMES_EXTENSION)) 66 | with open(output_filename, "wb") as file: 67 | file.write(parsed_chapter.to_bytes()) 68 | 69 | def load_documents_from_working_directory(): 70 | serialized_documents = {} 71 | for file in os.listdir(working_directory): 72 | if file.endswith(HOLMES_EXTENSION): 73 | print('Loading', file) 74 | label = file[:-4] 75 | long_filename = os.sep.join((working_directory, file)) 76 | with open(long_filename, "rb") as file: 77 | contents = file.read() 78 | serialized_documents[label] = contents 79 | print('Indexing documents (this may take some time) ...') 80 | 
holmes_manager.register_serialized_documents(serialized_documents) 81 | 82 | if os.path.exists(working_directory): 83 | if not os.path.isdir(working_directory): 84 | raise RuntimeError(' '.join((working_directory, 'must be a directory'))) 85 | else: 86 | os.mkdir(working_directory) 87 | 88 | if os.path.isfile(flag_filename): 89 | load_documents_from_working_directory() 90 | else: 91 | extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt", "1 ‘The Philosopher\'s Stone’") 92 | extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%202%20-%20The%20Chamber%20of%20Secrets.txt", "2 ‘The Chamber of Secrets’") 93 | extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%203%20-%20The%20Prisoner%20of%20Azkaban.txt", "3 ‘The Prisoner of Azkaban’") 94 | extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%204%20-%20The%20Goblet%20of%20Fire.txt", "4 ‘The Goblet of Fire’") 95 | extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%205%20-%20The%20Order%20of%20the%20Phoenix.txt", "5 ‘The Order of the Phoenix’") 96 | extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%206%20-%20The%20Half%20Blood%20Prince.txt", "6 ‘The Half Blood Prince’") 97 | extract_chapters_from_book("https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%207%20-%20The%20Deathly%20Hallows.txt", "7 ‘The Deathly Hallows’") 98 | # Generate flag file to indicate files can be reloaded on next run 99 | open(flag_filename, 'a').close() 100 | load_documents_from_working_directory() 
101 | 102 | #Comment following line in to activate interactive console 103 | #holmes_manager.start_topic_matching_search_mode_console() 104 | 105 | # The following code starts a RESTful Http service to perform topic searches. It is deployed as 106 | # as WSGI application. An example of how to start it - issued from the directory that 107 | # contains the script - is 108 | 109 | # python -m waitress example_search_EN_literature:application 110 | 111 | # You will need to install waitress (python -m pip install waitress) 112 | 113 | class RestHandler(): 114 | def on_get(self, req, resp): 115 | resp.text = \ 116 | json.dumps(holmes_manager.topic_match_documents_against( 117 | req.params['entry'][0:200])) 118 | resp.cache_control = ["s-maxage=31536000"] 119 | 120 | application = falcon.App() 121 | application.add_route('/english', RestHandler()) 122 | -------------------------------------------------------------------------------- /holmes_extractor/word_matching/entity_embedding.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Dict 2 | from spacy.tokens import Token 3 | from thinc.types import Floats1d 4 | from .general import WordMatch, WordMatchingStrategy 5 | from ..parsing import SearchPhrase, MultiwordSpan, SemanticMatchingHelper 6 | 7 | 8 | class EntityEmbeddingWordMatchingStrategy(WordMatchingStrategy): 9 | 10 | WORD_MATCH_TYPE_LABEL = "entity_embedding" 11 | 12 | @staticmethod 13 | def _get_explanation(similarity: float, search_phrase_display_word: str) -> str: 14 | printable_similarity = str(int(similarity * 100)) 15 | return "".join( 16 | ( 17 | "Has an entity label that is ", 18 | printable_similarity, 19 | "% similar to the word embedding corresponding to ", 20 | search_phrase_display_word.upper(), 21 | ".", 22 | ) 23 | ) 24 | 25 | def __init__( 26 | self, 27 | semantic_matching_helper: SemanticMatchingHelper, 28 | perform_coreference_resolution: bool, 29 | 
class EntityEmbeddingWordMatchingStrategy(WordMatchingStrategy):
    """Matches a search phrase word to a document word whose named-entity label
    has a vector sufficiently similar to the search phrase word's embedding."""

    WORD_MATCH_TYPE_LABEL = "entity_embedding"

    @staticmethod
    def _get_explanation(similarity: float, search_phrase_display_word: str) -> str:
        """Returns the human-readable tooltip text for an entity-embedding match."""
        return (
            f"Has an entity label that is {int(similarity * 100)}"
            f"% similar to the word embedding corresponding to "
            f"{search_phrase_display_word.upper()}."
        )

    def __init__(
        self,
        semantic_matching_helper: SemanticMatchingHelper,
        perform_coreference_resolution: bool,
        overall_similarity_threshold: float,
        initial_question_word_overall_similarity_threshold: float,
        entity_label_to_vector_dict: Dict[str, Floats1d],
    ):
        self.overall_similarity_threshold = overall_similarity_threshold
        self.initial_question_word_overall_similarity_threshold = (
            initial_question_word_overall_similarity_threshold
        )
        self.entity_label_to_vector_dict = entity_label_to_vector_dict
        super().__init__(semantic_matching_helper, perform_coreference_resolution)

    def _get_search_phrase_vector(
        self, search_phrase: SearchPhrase, search_phrase_token: Token, document_token: Token
    ):
        """Returns the vector for *search_phrase_token* if embedding matching is
        permitted for both tokens, otherwise *None*."""
        vectors = search_phrase.matchable_non_entity_tokens_to_vectors
        if search_phrase_token.i not in vectors:
            return None
        if not self.semantic_matching_helper.embedding_matching_permitted(
            search_phrase_token
        ):
            return None
        search_phrase_vector = vectors[search_phrase_token.i]
        if search_phrase_vector is None:
            return None
        if not self.semantic_matching_helper.embedding_matching_permitted(
            document_token
        ):
            return None
        return search_phrase_vector

    def match_multiwords(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_multiwords: List[MultiwordSpan],
    ) -> Optional[WordMatch]:
        """Attempts an entity-embedding match between *search_phrase_token* and
        each multiword in turn, returning the first successful match."""
        search_phrase_vector = self._get_search_phrase_vector(
            search_phrase, search_phrase_token, document_token
        )
        if search_phrase_vector is None:
            return None
        entity_label = document_token.ent_type_
        if entity_label == "":
            return None
        for multiword in document_multiwords:
            # All tokens within the multiword must share the same entity label.
            if any(
                document_token.doc[i].ent_type_ != entity_label
                for i in multiword.token_indexes
            ):
                continue
            word_match = self._check_for_word_match(
                search_phrase=search_phrase,
                search_phrase_token=search_phrase_token,
                search_phrase_vector=search_phrase_vector,
                document_token=document_token,
                first_document_token=document_token.doc[multiword.token_indexes[0]],
                last_document_token=document_token.doc[multiword.token_indexes[-1]],
            )
            if word_match is not None:
                return word_match
        return None

    def match_token(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
    ) -> Optional[WordMatch]:
        """Attempts an entity-embedding match between *search_phrase_token* and
        a single document token."""
        search_phrase_vector = self._get_search_phrase_vector(
            search_phrase, search_phrase_token, document_token
        )
        if search_phrase_vector is None:
            return None
        if document_token.ent_type_ == "":
            return None
        return self._check_for_word_match(
            search_phrase=search_phrase,
            search_phrase_token=search_phrase_token,
            search_phrase_vector=search_phrase_vector,
            document_token=document_token,
            first_document_token=document_token,
            last_document_token=document_token,
        )

    def _check_for_word_match(
        self,
        *,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        search_phrase_vector: Floats1d,
        document_token: Token,
        first_document_token: Token,
        last_document_token: Token,
    ) -> Optional[WordMatch]:
        """Compares *search_phrase_vector* with the vector for the document
        token's entity label and returns a *WordMatch* if the similarity clears
        the per-token threshold."""
        question_word_involved = (
            search_phrase_token._.holmes.is_initial_question_word
            or search_phrase_token._.holmes.has_initial_question_word_in_phrase
        )
        if (
            question_word_involved
            and self.initial_question_word_overall_similarity_threshold is not None
        ):
            working_threshold = self.initial_question_word_overall_similarity_threshold
        else:
            working_threshold = self.overall_similarity_threshold
        # The per-token threshold compounds the overall threshold across all
        # matchable non-entity tokens of the search phrase.
        single_token_similarity_threshold = working_threshold ** len(
            search_phrase.matchable_non_entity_tokens_to_vectors
        )
        similarity_measure = self.semantic_matching_helper.token_matches_ent_type(
            search_phrase_vector,
            self.entity_label_to_vector_dict,
            (document_token.ent_type_,),
            single_token_similarity_threshold,
        )
        if similarity_measure <= 0:
            return None
        if (
            not search_phrase.topic_match_phraselet
            and len(search_phrase_token._.holmes.lemma.split()) > 1
        ):
            display_word = search_phrase_token.lemma_
        else:
            display_word = search_phrase_token._.holmes.lemma
        word_match = WordMatch(
            search_phrase_token=search_phrase_token,
            search_phrase_word=display_word,
            document_token=document_token,
            first_document_token=first_document_token,
            last_document_token=last_document_token,
            document_subword=None,
            document_word=document_token.lemma_,
            word_match_type=self.WORD_MATCH_TYPE_LABEL,
            explanation=self._get_explanation(similarity_measure, display_word),
        )
        word_match.similarity_measure = similarity_measure
        return word_match
class WordMatchingStrategy:
    """Parent class for all word matching strategies.
    Each strategy only implements those methods that are relevant to it."""

    def __init__(
        self,
        semantic_matching_helper: SemanticMatchingHelper,
        perform_coreference_resolution: bool,
    ):
        self.semantic_matching_helper = semantic_matching_helper
        self.perform_coreference_resolution = perform_coreference_resolution

    def match_multiwords(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_multiwords: List[MultiwordSpan],
    ) -> Optional["WordMatch"]:
        """Attempts to match a search phrase token to a list of multiwords headed by a document token and ordered by decreasing size."""
        pass

    def match_token(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
    ) -> Optional["WordMatch"]:
        """Attempts to match a search phrase token to a document token."""
        pass

    def match_subword(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_subword: Subword,
    ) -> Optional["WordMatch"]:
        """Attempts to match a search phrase token to a document subword (currently only relevant for German)."""
        pass

    def add_words_matching_search_phrase_root_token(
        self, search_phrase: SearchPhrase
    ) -> None:
        """Determines words that match a search phrase root token and notifies the *SearchPhrase* object of them."""
        pass

    def add_reverse_dict_entries(
        self, doc: Doc, document_label: str, reverse_dict: Dict[str, List[CorpusWordPosition]]
    ) -> None:
        """Determines words that match each token within a document and adds corresponding entries to the reverse dictionary.

        NOTE(review): some subclasses (e.g. DerivationWordMatchingStrategy)
        declare these parameters in a different order — confirm call sites use
        keyword arguments or align the signatures.
        """
        pass

    @staticmethod
    def add_reverse_dict_entry(
        reverse_dict: Dict[str, List[CorpusWordPosition]],
        key_word: str,
        document_label: str,
        token_index: int,
        subword_index: Optional[int],
    ) -> None:
        """Adds a single entry to the reverse dictionary. Called by implementing classes.

        Duplicate corpus word positions for the same key are not added twice.
        """
        index = Index(token_index, subword_index)
        corpus_word_position = CorpusWordPosition(document_label, index)
        # setdefault avoids the separate membership test / double lookup.
        positions = reverse_dict.setdefault(key_word, [])
        if corpus_word_position not in positions:
            positions.append(corpus_word_position)

    def get_extracted_word_for_token(self, token: Token, document_word: str) -> str:
        """Gets the extracted word for a token. If the token is part of a coreference chain, the extracted word is the most specific
        term within that chain; otherwise it is the same as the document word.
        """
        extracted_word = document_word
        if (
            self.perform_coreference_resolution
            and token._.holmes.most_specific_coreferring_term_index is not None
        ):
            most_specific_token = token.doc[
                token._.holmes.most_specific_coreferring_term_index
            ]
            if token._.holmes.lemma != most_specific_token._.holmes.lemma:
                if most_specific_token._.holmes.multiword_spans is not None:
                    # The text of the LAST multiword span wins, matching the
                    # original iteration order.
                    for multiword_span in most_specific_token._.holmes.multiword_spans:
                        extracted_word = multiword_span.text
                    return extracted_word
                extracted_word = most_specific_token.text.lower()
        return extracted_word
class WordMatch:
    """A match between a search phrase word and a document word.

    Properties:

    search_phrase_token -- the spaCy token from the search phrase.
    search_phrase_word -- the word that matched from the search phrase.
    document_token -- the spaCy token from the document.
    first_document_token -- the first token that matched from the document, which will equal
        *document_token* except with multiword matches.
    last_document_token -- the last token that matched from the document, which will equal
        *document_token* except with multiword matches.
    document_subword -- the subword from the token that matched, or *None* if the match was
        with the whole token.
    document_word -- the word or subword that matched structurally from the document.
    word_match_type -- *direct*, *entity*, *embedding*, or *derivation*.
    depth -- the vertical difference in the ontology from *search_phrase_word* to *document_word*
        (can be negative).
    extracted_word -- the most specific term that corresponded to *document_word* within the
        coreference chain.
    explanation -- a human-readable explanation of how the word match was determined designed
        e.g. for use as a tooltip.
    similarity_measure -- for type *embedding*, the similarity between the two tokens,
        otherwise 1.0.
    involves_coreference -- *True* if *document_token* and *structurally_matched_document_token*
        are different.
    """

    def __init__(
        self,
        *,
        search_phrase_token: Token,
        search_phrase_word: str,
        document_token: Token,
        first_document_token: Token,
        last_document_token: Token,
        document_subword: Optional[Subword],
        document_word: str,
        word_match_type: str,
        depth: int = 0,
        extracted_word: Optional[str] = None,
        explanation: str
    ):
        # All parameters are keyword-only; *explanation* is mandatory and may
        # legally follow the defaulted parameters because of the bare '*'.
        self.search_phrase_token = search_phrase_token
        self.search_phrase_word = search_phrase_word
        self.document_token = document_token
        self.first_document_token = first_document_token
        self.last_document_token = last_document_token
        self.document_subword = document_subword
        self.document_word = document_word
        self.word_match_type = word_match_type
        self.is_negated = False  # will be set by StructuralMatcher
        self.is_uncertain = False  # will be set by StructuralMatcher
        self.structurally_matched_document_token = (
            None  # will be set by StructuralMatcher
        )
        # Fall back to the document word when no more specific extracted word
        # was supplied.
        self.extracted_word = (
            extracted_word if extracted_word is not None else document_word
        )
        self.depth = depth
        self.similarity_measure = 1.0
        self.explanation = explanation

    @property
    def involves_coreference(self) -> bool:
        """*True* if the structurally matched token differs from the matched token."""
        return self.document_token != self.structurally_matched_document_token

    def get_document_index(self) -> Index:
        """Returns the *Index* (token position plus optional subword position)
        of this match within the document."""
        if self.document_subword is not None:
            subword_index = self.document_subword.index
        else:
            subword_index = None
        return Index(self.document_token.i, subword_index)
class DerivationWordMatchingStrategy(WordMatchingStrategy):
    """Matches search phrase words to document words that share a common stem
    (derivation), e.g. matching a noun against the verb it derives from."""

    WORD_MATCH_TYPE_LABEL = "derivation"

    @staticmethod
    def _get_explanation(search_phrase_display_word: str) -> str:
        """Returns the human-readable tooltip text for a derivation match."""
        return "".join(
            ("Has a common stem with ", search_phrase_display_word.upper(), ".")
        )

    @staticmethod
    def _find_common_representation(
        search_phrase_token: Token,
        document_direct_matching_reprs: List[str],
        document_derivation_matching_reprs: Optional[List[str]],
    ):
        """Returns a (search_phrase_repr, document_repr) pair shared between the
        search phrase token and the document side, or *None* if there is none.

        Derived representations on either side are compared against the other
        side's representations; when only one side has derived representations,
        they are compared against the other side's direct representations.
        This factors out logic that was previously duplicated across
        *match_multiwords*, *match_token* and *match_subword*.
        """
        search_phrase_reprs: List[str] = []
        document_reprs: List[str] = []
        if search_phrase_token._.holmes.derivation_matching_reprs is not None:
            search_phrase_reprs.extend(
                search_phrase_token._.holmes.derivation_matching_reprs
            )
            document_reprs.extend(document_direct_matching_reprs)
        if document_derivation_matching_reprs is not None:
            document_reprs.extend(document_derivation_matching_reprs)
            search_phrase_reprs.extend(
                search_phrase_token._.holmes.direct_matching_reprs
            )
        for search_phrase_representation in search_phrase_reprs:
            for document_representation in document_reprs:
                if search_phrase_representation == document_representation:
                    return search_phrase_representation, document_representation
        return None

    def match_multiwords(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_multiwords: List[MultiwordSpan],
    ) -> Optional[WordMatch]:
        """Attempts a derivation match against each multiword in turn, returning
        the first successful match (multiwords are ordered by decreasing size)."""
        # Multiword matching is only relevant for multiword search phrase lemmas.
        if len(search_phrase_token._.holmes.lemma.split()) == 1:
            return None
        if search_phrase_token._.holmes.derivation_matching_reprs is None and not any(
            m for m in document_multiwords if m.derivation_matching_reprs is not None
        ):
            return None
        for multiword in document_multiwords:
            matched = self._find_common_representation(
                search_phrase_token,
                multiword.direct_matching_reprs,
                multiword.derivation_matching_reprs,
            )
            if matched is None:
                continue
            search_phrase_representation, document_representation = matched
            return WordMatch(
                search_phrase_token=search_phrase_token,
                search_phrase_word=search_phrase_representation,
                document_token=document_token,
                first_document_token=document_token.doc[
                    multiword.token_indexes[0]
                ],
                last_document_token=document_token.doc[
                    multiword.token_indexes[-1]
                ],
                document_subword=None,
                document_word=document_representation,
                word_match_type=self.WORD_MATCH_TYPE_LABEL,
                explanation=self._get_explanation(
                    search_phrase_token._.holmes.lemma
                ),
            )
        return None

    def match_token(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
    ) -> Optional[WordMatch]:
        """Attempts a derivation match between a search phrase token and a
        document token."""
        matched = self._find_common_representation(
            search_phrase_token,
            document_token._.holmes.direct_matching_reprs,
            document_token._.holmes.derivation_matching_reprs,
        )
        if matched is None:
            return None
        search_phrase_representation, document_representation = matched
        return WordMatch(
            search_phrase_token=search_phrase_token,
            search_phrase_word=search_phrase_representation,
            document_token=document_token,
            first_document_token=document_token,
            last_document_token=document_token,
            document_subword=None,
            document_word=document_representation,
            word_match_type=self.WORD_MATCH_TYPE_LABEL,
            extracted_word=self.get_extracted_word_for_token(
                document_token, document_representation
            ),
            explanation=self._get_explanation(search_phrase_token._.holmes.lemma),
        )

    def match_subword(
        self,
        search_phrase: SearchPhrase,
        search_phrase_token: Token,
        document_token: Token,
        document_subword: Subword,
    ) -> Optional[WordMatch]:
        """Attempts a derivation match between a search phrase token and a
        document subword (currently only relevant for German)."""
        matched = self._find_common_representation(
            search_phrase_token,
            document_subword.direct_matching_reprs,
            document_subword.derivation_matching_reprs,
        )
        if matched is None:
            return None
        search_phrase_representation, document_representation = matched
        return WordMatch(
            search_phrase_token=search_phrase_token,
            search_phrase_word=search_phrase_representation,
            document_token=document_token,
            first_document_token=document_token,
            last_document_token=document_token,
            document_subword=document_subword,
            document_word=document_representation,
            word_match_type=self.WORD_MATCH_TYPE_LABEL,
            explanation=self._get_explanation(search_phrase_token._.holmes.lemma),
        )

    def add_words_matching_search_phrase_root_token(
        self, search_phrase: SearchPhrase
    ) -> None:
        """Registers the root token's derived lemma with the search phrase when
        it differs from the plain lemma."""
        if (
            search_phrase.root_token._.holmes.derived_lemma
            != search_phrase.root_token._.holmes.lemma
        ):
            search_phrase.add_word_information(
                search_phrase.root_token._.holmes.derived_lemma,
            )

    def add_reverse_dict_entries(
        self,
        reverse_dict: Dict[str, List[CorpusWordPosition]],
        doc: Doc,
        document_label: str,
    ) -> None:
        """Adds reverse-dictionary entries for every token and subword whose
        derived lemma differs from its plain lemma.

        NOTE(review): the parameter order here differs from the base class
        declaration in general.py — confirm call sites or align the signatures.
        """
        for token in doc:
            if token._.holmes.derived_lemma != token._.holmes.lemma:
                self.add_reverse_dict_entry(
                    reverse_dict,
                    token._.holmes.derived_lemma.lower(),
                    document_label,
                    token.i,
                    None,
                )
            for subword in token._.holmes.subwords:
                if subword.derived_lemma != subword.lemma:
                    self.add_reverse_dict_entry(
                        reverse_dict,
                        subword.derived_lemma.lower(),
                        document_label,
                        token.i,
                        subword.index,
                    )
19 | """ 20 | 21 | WORD_MATCH_TYPE_LABEL = "ontology" 22 | 23 | ONTOLOGY_DEPTHS_TO_NAMES = { 24 | -4: "an ancestor", 25 | -3: "a great-grandparent", 26 | -2: "a grandparent", 27 | -1: "a parent", 28 | 0: "a synonym", 29 | 1: "a child", 30 | 2: "a grandchild", 31 | 3: "a great-grandchild", 32 | 4: "a descendant", 33 | } 34 | 35 | def _get_explanation(self, search_phrase_display_word: str, depth: int) -> str: 36 | depth = min(depth, 4) 37 | depth = max(depth, -4) 38 | return "".join( 39 | ( 40 | "Is ", 41 | self.ONTOLOGY_DEPTHS_TO_NAMES[depth], 42 | " of ", 43 | search_phrase_display_word.upper(), 44 | " in the ontology.", 45 | ) 46 | ) 47 | 48 | def __init__( 49 | self, 50 | semantic_matching_helper: SemanticMatchingHelper, 51 | perform_coreference_resolution: bool, 52 | ontology: Ontology, 53 | analyze_derivational_morphology: bool, 54 | ontology_reverse_derivational_dict: Optional[Dict[str, str]], 55 | ): 56 | self.ontology = ontology 57 | self.analyze_derivational_morphology = analyze_derivational_morphology 58 | self.ontology_reverse_derivational_dict = ontology_reverse_derivational_dict 59 | super().__init__(semantic_matching_helper, perform_coreference_resolution) 60 | 61 | def match_multiwords( 62 | self, 63 | search_phrase: SearchPhrase, 64 | search_phrase_token: Token, 65 | document_token: Token, 66 | document_multiwords: List[MultiwordSpan], 67 | ) -> Optional[WordMatch]: 68 | 69 | for search_phrase_representation in self._get_reprs( 70 | search_phrase_token._.holmes 71 | ): 72 | for multiword in document_multiwords: 73 | entry = self.ontology.matches( 74 | search_phrase_representation, self._get_reprs(multiword) 75 | ) 76 | if entry is not None: 77 | search_phrase_display_word = search_phrase_token._.holmes.lemma 78 | return WordMatch( 79 | search_phrase_token=search_phrase_token, 80 | search_phrase_word=search_phrase_representation, 81 | document_token=document_token, 82 | first_document_token=document_token.doc[ 83 | multiword.token_indexes[0] 84 | ], 
85 | last_document_token=document_token.doc[ 86 | multiword.token_indexes[-1] 87 | ], 88 | document_subword=None, 89 | document_word=entry.word, 90 | word_match_type=self.WORD_MATCH_TYPE_LABEL, 91 | depth=entry.depth, 92 | explanation=self._get_explanation( 93 | search_phrase_display_word, entry.depth 94 | ), 95 | ) 96 | return None 97 | 98 | def match_token( 99 | self, 100 | search_phrase: SearchPhrase, 101 | search_phrase_token: Token, 102 | document_token: Token, 103 | ) -> Optional[WordMatch]: 104 | 105 | for search_phrase_representation in self._get_reprs( 106 | search_phrase_token._.holmes 107 | ): 108 | entry = self.ontology.matches( 109 | search_phrase_representation, self._get_reprs(document_token._.holmes) 110 | ) 111 | if entry is not None: 112 | search_phrase_display_word = search_phrase_token._.holmes.lemma 113 | return WordMatch( 114 | search_phrase_token=search_phrase_token, 115 | search_phrase_word=search_phrase_representation, 116 | document_token=document_token, 117 | first_document_token=document_token, 118 | last_document_token=document_token, 119 | document_subword=None, 120 | document_word=entry.word, 121 | word_match_type=self.WORD_MATCH_TYPE_LABEL, 122 | extracted_word=self.get_extracted_word_for_token( 123 | document_token, entry.word 124 | ), 125 | depth=entry.depth, 126 | explanation=self._get_explanation( 127 | search_phrase_display_word, entry.depth 128 | ), 129 | ) 130 | return None 131 | 132 | def match_subword( 133 | self, 134 | search_phrase: SearchPhrase, 135 | search_phrase_token: Token, 136 | document_token: Token, 137 | document_subword: Subword, 138 | ) -> Optional[WordMatch]: 139 | 140 | for search_phrase_representation in self._get_reprs( 141 | search_phrase_token._.holmes 142 | ): 143 | entry = self.ontology.matches( 144 | search_phrase_representation, self._get_reprs(document_subword) 145 | ) 146 | if entry is not None: 147 | search_phrase_display_word = search_phrase_token._.holmes.lemma 148 | return WordMatch( 149 | 
search_phrase_token=search_phrase_token, 150 | search_phrase_word=search_phrase_representation, 151 | document_token=document_token, 152 | first_document_token=document_token, 153 | last_document_token=document_token, 154 | document_subword=document_subword, 155 | document_word=entry.word, 156 | word_match_type=self.WORD_MATCH_TYPE_LABEL, 157 | depth=entry.depth, 158 | explanation=self._get_explanation( 159 | search_phrase_display_word, entry.depth 160 | ), 161 | ) 162 | return None 163 | 164 | def add_words_matching_search_phrase_root_token( 165 | self, search_phrase: SearchPhrase 166 | ) -> None: 167 | search_phrase_reprs = search_phrase.root_token._.holmes.direct_matching_reprs[:] 168 | if ( 169 | self.analyze_derivational_morphology 170 | and search_phrase.root_token._.holmes.derivation_matching_reprs is not None 171 | ): 172 | search_phrase_reprs.extend( 173 | search_phrase.root_token._.holmes.derivation_matching_reprs 174 | ) 175 | for word in search_phrase_reprs: 176 | for entry in self.ontology.get_matching_entries(word): 177 | for repr in entry.reprs: 178 | search_phrase.add_word_information(repr) 179 | 180 | def add_reverse_dict_entries( 181 | self, 182 | reverse_dict: Dict[str, List[CorpusWordPosition]], 183 | doc: Doc, 184 | document_label: str, 185 | ) -> None: 186 | for token in doc: 187 | odw = self.semantic_matching_helper.get_ontology_defined_multiword( 188 | token, self.ontology 189 | ) 190 | if odw is not None: 191 | for representation in odw.direct_matching_reprs: 192 | self.add_reverse_dict_entry( 193 | reverse_dict, 194 | representation.lower(), 195 | document_label, 196 | token.i, 197 | None, 198 | ) 199 | if ( 200 | self.analyze_derivational_morphology 201 | and odw.derivation_matching_reprs is not None 202 | ): 203 | for representation in odw.derivation_matching_reprs: 204 | self.add_reverse_dict_entry( 205 | reverse_dict, 206 | representation.lower(), 207 | document_label, 208 | token.i, 209 | None, 210 | ) 211 | 212 | def _get_reprs( 213 
| self, repr_bearer: Union[HolmesDictionary, Subword, MultiwordSpan] 214 | ) -> List[str]: 215 | reprs = repr_bearer.direct_matching_reprs 216 | if ( 217 | self.analyze_derivational_morphology 218 | and repr_bearer.derivation_matching_reprs is not None 219 | ): 220 | reprs.extend(repr_bearer.derivation_matching_reprs) 221 | return reprs 222 | -------------------------------------------------------------------------------- /holmes_extractor/lang/de/data/derivation.csv: -------------------------------------------------------------------------------- 1 | abbau,abbauen 2 | abbonement,abbonnieren 3 | abbruch,abbrechen 4 | abfahrt,abfahren 5 | abflug,abfliegen 6 | abgabe,abgeben 7 | ablauf,ablaufen 8 | abnahme,abnehmen 9 | abreise,abreisen 10 | absage,absagen 11 | abschluss,abschließen 12 | abschrift,abschreiben 13 | absicht,beabsichtigen 14 | abstieg,absteigen 15 | abwehr,abwehren 16 | adoption,adoptieren,adoptiert 17 | akzeptanz,akzeptieren 18 | amputation,amputieren 19 | analyse,analysieren 20 | anbau,anbauen 21 | anfang,anfangen 22 | angabe,angeben 23 | angebot,anbieten 24 | angriff,angreifen 25 | ankunft,ankommen 26 | anlage,anlegen 27 | annahme,annehmen 28 | anprobe,anprobieren 29 | anreise,anreisen 30 | anruf,anrufen 31 | anschluss,anschließen 32 | ansporn,anspornen 33 | anstieg,ansteigen 34 | anstoß,anstoßen 35 | anstrich,anstreichen 36 | antrieb,antreiben 37 | antwort,antworten 38 | anzeige,anzeigen 39 | arbeit,arbeiten 40 | arrangement,arrangieren 41 | assimilation,assimilieren 42 | attacke,attackieren 43 | ärger,ärgern 44 | audit,auditieren,auditierung 45 | aufbau,aufbauen 46 | aufbruch,aufbrechen 47 | aufgabe,aufgeben 48 | aufnahme,aufnehmen 49 | aufsicht,beaufsichtigen 50 | aufstieg,aufsteigen 51 | auftrag,beauftragen 52 | aufwand,aufwenden 53 | ausbau,ausbauen 54 | ausdruck,ausdrücken 55 | ausfall,ausfallen 56 | ausgabe,ausgeben 57 | ausgang,ausgehen 58 | ausgleich,ausgleichen 59 | ausleihe,ausleihen 60 | ausschluss,ausschließen 61 | aussprache,aussprechen 
62 | ausstieg,aussteigen 63 | austausch,austauschen 64 | auswahl,auswählen 65 | bau,bauen 66 | bedrängen,bedrängnis 67 | befehl,befehlen 68 | beginn,beginnen 69 | beichte,beichten 70 | beistand,beistehen 71 | beitrag,beitragen 72 | beitritt,beitreten 73 | bekennen,bekenntnis 74 | beleg,belegen 75 | bericht,berichten 76 | beschluss,beschließen 77 | beschwerde,beschweren 78 | besitz,besitzen 79 | besuch,besuchen 80 | beten,gebet 81 | betrieb,betreiben 82 | betrug,betrügen 83 | beweis,beweisen 84 | biss,beißen 85 | bitte,bitten 86 | blamage,blamieren 87 | blick,blicken 88 | blitz,blitzen 89 | blockade,blockieren 90 | blüte,blühen 91 | boykott,boykottieren 92 | brand,brennen 93 | brüllen,gebrüll 94 | bummel,bummeln 95 | dank,bedanken 96 | dank,danken 97 | dauer,dauern 98 | debatte,debattieren 99 | deklaration,deklarieren 100 | dekoration,dekorieren 101 | dementi,dementieren 102 | demonstration,demonstrieren 103 | demontage,demontieren 104 | denken,gedanke 105 | deportation,deportieren 106 | desertation,desertieren 107 | desinfektion,desinfizieren 108 | destillation,destillieren 109 | diagnose,diagnostizieren 110 | dienen,dienst 111 | diskussion,diskutieren 112 | dokumentation,dokumentieren 113 | donner,donnern 114 | dopen,doping 115 | druck,drucken 116 | duft,duften 117 | dusche,duschen 118 | ehre,ehren 119 | eile,eilen 120 | einfall,einfallen 121 | eingabe,eingeben 122 | eingriff,eingreifen 123 | einkauf,einkaufen 124 | einnahme,einnehmen 125 | einsatz,einsetzen 126 | einsehen,einsicht 127 | einstieg,einsteigen 128 | einsturz,einstürzen 129 | einwurf,einwerfen 130 | einzug,einziehen 131 | emigration,emigrieren 132 | empfang,empfangen 133 | ende,enden 134 | engagement,engagieren 135 | entnahme,entnehmen 136 | entschluss,entschließen 137 | entwurf,entwerfen 138 | ereignen,ereignis 139 | erhalt,erhalten 140 | erkennen,erkenntnis 141 | erlass,erlassen 142 | erlauben,erlaubnis 143 | erleben,erlebnis 144 | ernte,ernten 145 | erschweren,erschwernis 146 | erwerb,erwerben 147 
| existenz,existieren 148 | experiment,experimentieren 149 | explosion,explodieren 150 | export,exportieren 151 | extraktion,extrahieren 152 | fahrt,fahren 153 | fall,fallen 154 | fang,fangen 155 | faszination,faszinieren 156 | feier,feiern 157 | festnahme,festnehmen 158 | flirt,flirten 159 | flucht,fliehen 160 | flucht,flüchten 161 | flug,fliegen 162 | folge,folgen 163 | fortschritt,fortschreiten 164 | frage,fragen 165 | freigabe,freigeben 166 | freude,freuen 167 | frost,frieren 168 | frustration,frustrieren 169 | frühstück,frühstücken 170 | fund,finden 171 | furcht,fürchten 172 | fühlen,gefühl 173 | gabe,geben 174 | garantie,garantieren 175 | geruch,riechen 176 | gesang,singen 177 | geschmack,schmecken 178 | glanz,glänzen 179 | glaube,glauben 180 | glückwunsch,beglückwünschen 181 | gratulation,gratulieren 182 | griff,greifen 183 | gruß,grüßen 184 | guss,gießen 185 | hagel,hageln 186 | halt,halten 187 | harmonie,harmonieren 188 | hass,hassen 189 | hauch,hauchen 190 | heirat,heiraten 191 | herrschen,herrschaft 192 | hetze,hetzen 193 | hilfe,helfen 194 | hinweis,hinweisen 195 | identifikation,identifizieren 196 | ignoranz,ignorieren 197 | illustration,illustrieren 198 | immigration,immigrieren 199 | import,importieren 200 | infektion,infizieren 201 | information,informieren 202 | inhalt,beinhalten 203 | inspiration,inspirieren 204 | installation,installieren 205 | integration,integrieren 206 | interesse,interessieren 207 | interpretation,interpretieren 208 | interview,interviewen 209 | investieren,investition 210 | irritation,irritieren 211 | jagd,jagen 212 | joggen,jogging 213 | jubel,jubeln 214 | kampf,kämpfen 215 | kauf,kaufen 216 | kennen,kenntnis 217 | klage,klagen,beklagen 218 | klang,klingen 219 | kollision,kollidieren 220 | kombination,kombinieren 221 | kommunikation,kommunizieren 222 | komponieren,komposition 223 | konfrontation,konfrontieren 224 | konstruieren,konstruktion 225 | kontraktion,kontrahieren 226 | kontrolle,kontrollieren 227 | 
konzentration,konzentrieren 228 | kopie,kopieren 229 | korrektur,korrigieren 230 | korrespondenz,korrespondieren 231 | kritik,kritisieren 232 | kummer,bekümmern 233 | kuss,küssen 234 | langeweile,langweilen 235 | lauf,laufen 236 | lehre,lehren 237 | leihen,verleih,ausleihe 238 | liebe,lieben 239 | lob,loben 240 | lüge,lügen 241 | managen,management 242 | mangel,mangeln 243 | marsch,marschieren 244 | massage,massieren 245 | miete,mieten 246 | mitarbeit,mitarbeiten 247 | mitfühlen,mitgefühl 248 | mitschrift,mitschreiben 249 | montage,montieren 250 | müde,müdigkeit 251 | nachfolge,nachfolgen 252 | nachfrage,nachfragen 253 | nachlass,nachlassen 254 | nachweis,nachweisen 255 | neid,beneiden 256 | notiz,notieren 257 | operation,operieren 258 | opfer,opfern 259 | patrouille,patrouillieren 260 | pflege,pflegen 261 | picknick,picknicken 262 | plädoyer,plädieren 263 | politur,polieren 264 | pose,posieren 265 | predigt,predigen 266 | privileg,privilegieren 267 | probe,proben,probieren 268 | produktion,produzieren 269 | protest,protestieren 270 | protokoll,protokollieren 271 | provokation,provozieren 272 | qual,quälen 273 | quatschen,gequatsche 274 | rache,rächen 275 | rat,raten 276 | raub,rauben 277 | reaktion,reagieren 278 | rebellion,rebellieren 279 | rede,reden 280 | reduktion,reduzieren 281 | reform,reformieren 282 | regen,regnen 283 | regeneration,regenerieren 284 | reise,reisen 285 | reiz,reizen 286 | reklamation,reklamieren 287 | reparatur,reparieren 288 | respekt,respektieren 289 | restauration,restaurieren 290 | reue,bereuen 291 | revision,revidieren 292 | risiko,riskieren 293 | riss,reißen 294 | ritt,reiten 295 | rotation,rotieren 296 | ruf,rufen 297 | ruhe,ruhen 298 | ruin,ruinieren 299 | rückgabe,zurückgeben 300 | rückgriff,zurückgreifen 301 | rückkehr,zurückkehren 302 | rücktritt,zurücktreten 303 | rückzug,zurückziehen 304 | sabotage,sabotieren 305 | schau,schauen 306 | schauder,schaudern 307 | schein,scheinen 308 | scherz,scherzen 309 | schikane,schikanieren 310 
| schimmer,schimmern 311 | schimpfen,geschimpfe 312 | schlaf,schlafen 313 | schlag,schlagen 314 | schmerz,schmerzen 315 | schmuggel,schmuggeln 316 | schnee,schneien 317 | schrei,schreien 318 | schrift,schreiben 319 | schritt,schreiten 320 | schuss,schießen 321 | schutz,schützen,beschützen 322 | schwatz,schwatzen 323 | schweiß,schwitzen 324 | schwindel,schwindeln 325 | schwur,schwüren 326 | schwung,schwingen 327 | sehen,sicht 328 | seufzen,seufzer 329 | sieg,siegen,besiegen 330 | sorge,sorgen 331 | spazieren,spaziergang 332 | spekulation,spekulieren 333 | spende,spenden 334 | spiel,spielen 335 | spionage,spionieren 336 | spott,spotten 337 | sprung,springen 338 | stagnation,stagnieren 339 | start,starten 340 | stau,stauen 341 | stimulation,stimulieren 342 | stopp,stoppen 343 | stoß,stoßen 344 | streik,streiken 345 | streit,streiten 346 | studium,studieren 347 | sturm,stürmen 348 | sturz,stürzen 349 | suche,suchen 350 | sünde,sündigen 351 | süß,sußigkeit 352 | tanz,tanzen 353 | tat,tun 354 | taufe,taufen 355 | tausch,tauschen 356 | teilnahme,teilnehmen 357 | telefonat,telefonieren 358 | test,testen 359 | training,trainieren 360 | transport,transportieren 361 | trauer,trauern 362 | traum,träumen 363 | tritt,treten 364 | triumph,triumphieren 365 | trost,trösten 366 | überfall,überfallen 367 | übergabe,übergeben 368 | umbau,umbauen 369 | umgang,umgehen 370 | umkehr,umkehren 371 | umstieg,umsteigen 372 | umtausch,umtauschen 373 | umzug,umziehen 374 | unterricht,unterrichten 375 | unterschrift,unterschreiben 376 | urteil,urteilen 377 | variation,variieren 378 | verbot,verbieten 379 | verbrauch,verbrauchen 380 | verbund,verbinden 381 | verdienen,verdienst 382 | vergabe,vergeben 383 | vergleich,vergleichen 384 | verhör,verhören 385 | verkauf,verkaufen 386 | verlauf,verlaufen 387 | verlust,verlieren 388 | verrat,verraten 389 | versand,versenden 390 | verschleiß,verschleißen 391 | verschluss,verschließen 392 | versteck,verstecken 393 | verstehen,verständnis 394 | 
versuch,versuchen 395 | versäumen,versäumnis 396 | verzehr,verzehren 397 | verzicht,verzichten 398 | voraussage,voraussagen 399 | vorgabe,vorgeben 400 | vorhersage,vorhersagen 401 | vorkommen,vorkommnis 402 | vorschlag,vorschlagen 403 | vorschrift,vorschreiben 404 | vortrag,vortragen 405 | wachsen,wachstum 406 | wagen,wagnis 407 | wahl,wählen 408 | wandel,wandeln 409 | wechsel,wechseln 410 | weggang,weggehen 411 | wegnahme,wegnehmen 412 | weiterfahrt,weiterfahren 413 | weitergabe,weitergeben 414 | wende,wenden 415 | wette,wetten 416 | widerruf,widerrufen 417 | widerspruch,widersprechen 418 | widerstand,widerstehen 419 | wiegen,gewicht 420 | wille,wollen 421 | wunsch,wünschen 422 | wurf,werfen 423 | wäsche,waschen 424 | zensur,zensieren 425 | zitation,zitieren 426 | zug,ziehen 427 | zunahme,zunehmen 428 | zusammenarbeit,zusammenarbeiten 429 | zusammenbau,zusammenbauen 430 | zusammenstoß,zusammenstoßen 431 | zwang,zwingen 432 | zweifel,bezweifeln 433 | zweifel,zweifeln 434 | -------------------------------------------------------------------------------- /holmes_extractor/lang/en/data/derivation.csv: -------------------------------------------------------------------------------- 1 | abandon,abandonment 2 | able,ability 3 | abort,abortion 4 | abstract,abstraction 5 | abuse,abusive 6 | accept,acceptance 7 | accident,accidental 8 | accompany,accompaniment 9 | accomplish,accomplishment 10 | accountable,accountability 11 | accuracy,accurate 12 | accuse,accusation 13 | achieve,achievement 14 | acknowledge,acknowledgement 15 | act,action,activity 16 | adapt,adaptation 17 | add,addition,additional 18 | adjust,adjustment 19 | admire,admiration 20 | adopt,adoption 21 | advertise,advertize,advertisement 22 | advice,advise 23 | affect,effect 24 | agree,agreement 25 | alcohol,alcoholic 26 | allow,allowance 27 | alter,alteration 28 | amaze,amazing,amazement 29 | ambiguity,ambiguous,ambiguousness 30 | amuse,amusing,amusement 31 | analyse,analyze,analysis 32 | anger,angry 33 | 
announce,announcement 34 | anonymity,anonymous 35 | apology,apologize,apologetic 36 | appear,appearance 37 | applaud,applause 38 | appoint,appointment 39 | approve,approval 40 | argue,argument 41 | arrange,arrangement 42 | assert,assertion 43 | assess,assessment 44 | assure,assurance 45 | astonish,astonishing,astonishment 46 | attach,attachment 47 | attain,attainment 48 | attract,attraction 49 | attribute,attribution 50 | avoid,avoidance 51 | base,basic,basis 52 | beast,beastly 53 | behave,behavior,behaviour,behavioral 54 | belief,believe 55 | breath,breathe 56 | bury,burial 57 | capable,capability 58 | cease,cessation 59 | ceremony,ceremonial 60 | certain,certainty 61 | charm,charming 62 | cite,citation 63 | clean,cleanliness 64 | clear,clarity 65 | clinic,clinical 66 | collaboration,collaborative 67 | collect,collection 68 | combine,combination 69 | commerce,commercial 70 | commit,commitment 71 | compare,comparison 72 | compete,competition 73 | compile,compilation 74 | complete,completion 75 | compliant,compliance 76 | compose,composition 77 | comprehend,comprehension 78 | conclude,conclusion 79 | confirm,confirmation 80 | conform,conformity 81 | confront,confrontation 82 | confuse,confusion 83 | connect,connection 84 | consequent,consequence 85 | conservatism,conservative 86 | consider,consideration 87 | consistent,consistency 88 | constrain,constraint 89 | construct,construction 90 | consult,consultation 91 | continue,continuation 92 | contradict,contradiction 93 | contribute,contribution 94 | controversy,controversial 95 | convene,convention 96 | convenient,convenience 97 | cooperate,cooperative 98 | correct,correction 99 | correlate,correlative 100 | correspond,correspondence 101 | cover,coverage 102 | critical,criticise,criticism,criticize 103 | cruel,cruelty 104 | day,daily 105 | deceit,deceive,deception,deceptive 106 | decide,decision 107 | declare,declaration,declarative 108 | deep,depth 109 | defend,defence,defense,defensive 110 | define,definition 111 | 
deny,denial 112 | depend,dependence 113 | depress,depression,depressive 114 | describe,description 115 | despair,desperation 116 | destroy,destruction,destructive 117 | detach,detachment 118 | detect,detection 119 | deter,deterrent,deterrence 120 | determine,determination 121 | develop,development,developmental 122 | devote,devotion 123 | diagnose,diagnosis 124 | dictator,dictatorial 125 | die,dead,death 126 | differ,different,difference 127 | digest,digestion,digestive 128 | dimension,dimensional 129 | disagree,disagreement 130 | disappoint,disappointing,disappointment 131 | disaster,disastrous 132 | discourage,discouragement 133 | discuss,discussion 134 | dishonest,dishonesty 135 | dismiss,dismissal 136 | disobey,disobedient,disobedience 137 | dispose,disposal 138 | disrespect,disrespectful 139 | dissatisfy,dissatisfaction 140 | distant,distance 141 | distinct,distinction,distinctive 142 | distort,distortion 143 | distract,distracting,distraction 144 | disturb,disturbing,disturbance 145 | diverse,diversity 146 | divide,division 147 | domestic,domesticate 148 | dominant,dominate,dominance 149 | doubt,doubtful 150 | ease,easy 151 | edit,edition 152 | efficient,efficiency 153 | elect,election 154 | embarrass,embarrassment 155 | emerge,emergence 156 | emit,emission 157 | emphasis,emphatic,emphasise,emphasize 158 | employ,employment 159 | enclose,enclosure 160 | encourage,encouragement 161 | endure,endurance 162 | energy,energize,energetic 163 | enforce,enforcement 164 | engage,engagement 165 | enhance,enhancement 166 | enjoy,enjoyment 167 | enlarge,enlargement 168 | enormity,enormous 169 | enter,entrance 170 | entertain,entertainment 171 | entitle,entitlement 172 | envy,envious 173 | equal,equality 174 | equip,equipment 175 | evolve,evolution 176 | examine,examination 177 | excel,excellent,excellence 178 | excess,excessive 179 | excite,excitement 180 | execute,execution 181 | exhibit,exhibition 182 | exist,existence 183 | expand,expansion 184 | expanse,expansive 185 
| expect,expectation 186 | expend,expenditure 187 | expense,expensive 188 | expire,expiry,expiration 189 | explain,explanation 190 | explode,explosion,explosive 191 | exploit,exploitation 192 | explore,exploration 193 | express,expression 194 | expel,expulsion 195 | extract,extraction 196 | fail,failure 197 | familiar,familiarise,familiarity,familiarize 198 | fear,fearful 199 | feasible,feasibility 200 | fiction,fictional 201 | finance,financial 202 | fly,flight 203 | forgive,forgiveness 204 | frequent,frequency 205 | fur,furry 206 | generous,generosity 207 | gift,give 208 | glass,glassy 209 | govern,government 210 | grand,grandeur 211 | grateful,gratitude 212 | guilt,guilty 213 | hard,hardship 214 | haste,hasty 215 | hierarchy,hierarchical 216 | high,height 217 | hinder,hindrance 218 | history,historical 219 | honest,honesty 220 | hope,hopeful 221 | hostile,hostility 222 | humid,humidity 223 | hunger,hungry 224 | hypothesis,hypothetical 225 | ice,icy 226 | identify,identity,identification 227 | ideology,ideological 228 | imagine,imagination 229 | impatient,impatience 230 | important,importance 231 | impress,impression 232 | imprison,imprisonment 233 | improbable,improbability 234 | improve,improvement 235 | impure,impurity 236 | incapable,incapability 237 | incident,incidence,incidental 238 | include,inclusion 239 | inconsistent,inconsistency 240 | independent,independence 241 | indifferent,indifference 242 | infeasible,infeasibility 243 | infect,infection 244 | infinite,infinity 245 | inform,information 246 | inhibit,inhibition 247 | injure,injury 248 | innocent,innocence 249 | insist,insistent,insistence 250 | inspect,inspection 251 | instant,instance 252 | institution,institutional 253 | instruct,instruction 254 | integral,integrate 255 | intelligent,intelligence 256 | intend,intention 257 | intense,intensity 258 | interrupt,interruption 259 | intervene,intervention 260 | introduce,introduction 261 | invade,invasion 262 | invent,invention 263 | 
invite,invitation 264 | involve,involvement 265 | liable,liability 266 | logic,logical 267 | loose,loosen 268 | lose,loss 269 | loyal,loyalty 270 | magic,magical 271 | maintain,maintenance 272 | manage,management 273 | manipulate,manipulative 274 | marry,marriage 275 | mass,massive 276 | maximal,maximum 277 | measure,measurement 278 | minimal,minimum 279 | mix,mixture 280 | modern,modernity 281 | modest,modesty 282 | month,monthly 283 | music,musical 284 | necessary,necessity,necessitate 285 | neglect,negligent,negligence 286 | nerve,nervous 287 | noble,nobility 288 | norm,normal,normality 289 | obey,obedient,obedience 290 | oblige,obligation,obligatory 291 | offend,offence,offense 292 | omit,omission 293 | option,optional 294 | package,packaging 295 | patient,patience 296 | patriot,patriotic,patriotism 297 | peace,peaceful 298 | peculiar,peculiarity 299 | perfect,perfection 300 | perform,performance 301 | permit,permission 302 | persist,persistent,persistence 303 | persuade,persuasion 304 | poem,poetic 305 | poor,poverty 306 | possess,possession,possessive 307 | possible,possibility 308 | post,postal 309 | practical,practicality 310 | practice,practise 311 | precise,precision 312 | prefer,preference 313 | prejudice,prejudicial 314 | prepare,preparation 315 | present,presence 316 | preserve,preservation,preservative 317 | presume,presumption 318 | presuppose,presupposition 319 | pretend,pretence 320 | prevalent,prevalence 321 | prevent,prevention 322 | probable,probability 323 | produce,production 324 | progress,progression 325 | prohibit,prohibition,prohibitory 326 | project,projection 327 | promote,promotion 328 | proof,prove 329 | propose,proposal 330 | protect,protection,protective 331 | publicise,publicize,publication 332 | punish,punishment 333 | pure,purity 334 | rare,rarity 335 | react,reaction 336 | reappear,reappearance 337 | reassure,reassurance 338 | rebel,rebellious 339 | receipt,receive 340 | recognise,recognize,recognition 341 | 
reconcile,reconciliation 342 | reconsider,reconsideration 343 | recruit,recruitment 344 | refer,referral 345 | refresh,refreshment 346 | refuse,refusal 347 | reinforce,reinforcement 348 | relax,relaxation 349 | relief,relieve 350 | reluctant,reluctance 351 | rely,reliance 352 | represent,representation 353 | reproduce,reproduction 354 | require,requirement 355 | reside,residence,residential 356 | resign,resignation 357 | resist,resistance 358 | resolve,resolution 359 | respect,respectful 360 | responsible,responsibility 361 | restrain,restraint 362 | restrict,restriction,restrictive 363 | reverse,reversal 364 | rigor,rigour,rigorous 365 | rival,rivalry 366 | rose,rosy 367 | satisfy,satisfaction 368 | secret,secrecy 369 | sector,sectoral 370 | sequence,sequential 371 | serve,service 372 | settle,settlement 373 | sex,sexual 374 | sign,signature 375 | sincere,sincerity 376 | solve,solution 377 | speak,speech 378 | sphere,spherical 379 | spite,spiteful 380 | spontaneity,spontaneous 381 | strong,strength 382 | stupid,stupidity 383 | substance,substantial 384 | succeed,success 385 | suggest,suggestion 386 | summer,summery 387 | superior,superiority 388 | suppose,supposition 389 | survive,survival 390 | suspend,suspension 391 | talent,talented 392 | tempt,temptation 393 | tense,tension 394 | thirst,thirsty 395 | threat,threaten 396 | transmit,transmission 397 | treat,treatment 398 | true,truth 399 | trivia,trivial 400 | unable,inability 401 | uncertain,uncertainty 402 | unimportant,unimportance 403 | unite,unity 404 | use,usage 405 | vary,variation 406 | virtue,virtuous 407 | warm,warmth 408 | waste,wastage 409 | week,weekly 410 | weigh,weight 411 | wide,width 412 | winter,wintery 413 | wood,wooden 414 | wool,wooly,woolen,woolly,woollen 415 | year,yearly 416 | young,youth 417 | -------------------------------------------------------------------------------- /tests/de/test_questions_DE.py: -------------------------------------------------------------------------------- 1 | 
import unittest 2 | import holmes_extractor as holmes 3 | from holmes_extractor.topic_matching import TopicMatcher 4 | 5 | manager = holmes.Manager(model='de_core_news_lg', number_of_workers=1) 6 | 7 | class GermanInitialQuestionsTest(unittest.TestCase): 8 | 9 | def _check_equals(self, text_to_match, document_text, highest_score, answer_start, answer_end, 10 | word_embedding_match_threshold=0.42, initial_question_word_embedding_match_threshold=0.42, 11 | use_frequency_factor=True, *, alternative_highest_score=None): 12 | manager.remove_all_documents() 13 | manager.parse_and_register_document(document_text) 14 | topic_matches = manager.topic_match_documents_against(text_to_match, 15 | word_embedding_match_threshold= 16 | word_embedding_match_threshold, 17 | initial_question_word_embedding_match_threshold=initial_question_word_embedding_match_threshold, 18 | initial_question_word_answer_score=40, 19 | relation_score=20, 20 | reverse_only_relation_score=15, single_word_score=10, single_word_any_tag_score=5, 21 | different_match_cutoff_score=10, 22 | relation_matching_frequency_threshold=0.0, 23 | embedding_matching_frequency_threshold=0.0, 24 | use_frequency_factor=use_frequency_factor) 25 | if alternative_highest_score is None: 26 | self.assertEqual(int(topic_matches[0]['score']), highest_score) 27 | else: 28 | self.assertIn(int(topic_matches[0]['score']), (highest_score, alternative_highest_score)) 29 | if answer_start is not None: 30 | self.assertEqual(topic_matches[0]['answers'][0][0], answer_start) 31 | self.assertEqual(topic_matches[0]['answers'][0][1], answer_end) 32 | else: 33 | self.assertEqual(len(topic_matches[0]['answers']), 0) 34 | 35 | def test_basic_matching_with_subword(self): 36 | self._check_equals("Was betrachtet man?", 'Informationsbetrachtung', 45, 0, 11) 37 | 38 | def test_governed_interrogative_pronoun_with_subword(self): 39 | self._check_equals("Welche Information betrachtet man?", 'Informationsbetrachtung', 55, 0, 11) 40 | 41 | def 
test_governed_interrogative_pronoun_with_subword_control(self): 42 | self._check_equals("Die Information betrachtet man.", 'Informationsbetrachtung', 35, None, None) 43 | 44 | def test_governed_interrogative_pronoun_with_complex_subword(self): 45 | self._check_equals("Welche Information betrachtet man?", 46 | 'Extraktionsinformationsbetrachtung', 55, 0, 22) 47 | 48 | def test_governed_interrogative_pronoun_with_complex_subword_control(self): 49 | self._check_equals("Die Information betrachtet man.", 50 | 'Extraktionsinformationsbetrachtung', 35, None, None) 51 | 52 | def test_governed_interrogative_pronoun_with_subword_and_coreference(self): 53 | self._check_equals("Welchen Löwen betrachten wir.", 'Es gab einen Extraktionslöwen. Leute haben ihn betrachtet', 54, 13, 29) 54 | 55 | def test_governed_interrogative_pronoun_with_subword_and_coreference_control(self): 56 | self._check_equals("Den Löwen betrachten wir.", 'Es gab einen Extraktionslöwen. Leute haben ihn betrachtet', 34, None, None) 57 | 58 | def test_governed_interrogative_pronoun_with_subword_and_embedding_matching(self): 59 | self._check_equals("Welchen Hund betrachten wir?", 'Leute betrachteten die Informationskatze', 25, 23, 40) 60 | 61 | def test_governed_interrogative_pronoun_with_subword_and_embedding_matching_control(self): 62 | self._check_equals("Den Hund betrachten wir.", 'Leute betrachteten den Informationskatze', 15, None, None) 63 | 64 | def test_check_was_predicate_positive_case(self): 65 | manager.remove_all_documents() 66 | manager.parse_and_register_document("Das ist ein Haus.", 'q') 67 | topic_matches = manager.topic_match_documents_against("Was ist das?") 68 | self.assertEqual(topic_matches, [{'document_label': 'q', 'text': 'Das ist ein Haus.', 'text_to_match': 'Was ist das?', 'rank': '1', 'index_within_document': 1, 'subword_index': None, 'start_index': 1, 'end_index': 3, 'sentences_start_index': 0, 'sentences_end_index': 4, 'sentences_character_start_index': 0, 
'sentences_character_end_index': 17, 'score': 620.0, 'word_infos': [[4, 7, 'relation', True, 'Matches SEIN directly.'], [12, 16, 'relation', False, 'Matches the question word WAS.']], 'answers': [[8, 16]]}]) 69 | 70 | def test_check_wer_positive_case(self): 71 | self._check_equals('Wer schaute in die Sonne?', 'Die Person schaute in die Sonne', 127, 0, 10) 72 | 73 | def test_check_wer_wrong_syntax(self): 74 | self._check_equals('Wer schaute in die Sonne?', 'Die Sonne schaute in den Mann', 19, None, None) 75 | 76 | def test_check_wer_wrong_noun(self): 77 | self._check_equals('Wer schaute in die Sonne?', 'Das Gebäude schaute in die Sonne', 70, None, None) 78 | 79 | @unittest.skipIf(manager.nlp.meta['version'] == '3.2.0', 'Version fluke') 80 | def test_check_wen_positive_case(self): 81 | self._check_equals('Wen sah das Gebäude?', 'Das Gebäude sah die Person', 54, 16, 26, alternative_highest_score=104) 82 | 83 | def test_check_wen_wrong_syntax(self): 84 | self._check_equals('Wen sah das Gebäude?', 'Das Gebäude sah das Gebäude', 34, None, None) 85 | 86 | def test_check_was_acc(self): 87 | self._check_equals('Was sah das Gebäude?', 'Das Gebäude sah das Gebäude', 104, 16, 27) 88 | 89 | def test_check_wem_positive_case(self): 90 | self._check_equals('wem gibst du es?', 'Ich gebe es der Person', 45, 12, 22) 91 | 92 | def test_check_wo_positive_case(self): 93 | self._check_equals('Wo wohnst du?', 'Ich wohne in einem Haus', 45, 10, 23) 94 | 95 | def test_check_wo_positive_case_definite_preposition(self): 96 | self._check_equals('Wo wohnst du?', 'Ich wohne im Haus', 45, 10, 17) 97 | 98 | def test_check_wo_wrong_case_definite_preposition(self): 99 | self._check_equals('Wo wohnst du?', 'Ich wohne ins Haus', 5, None, None) 100 | 101 | def test_check_wo_wrong_case(self): 102 | self._check_equals('Wo wohnst du?', 'Ich wohne in ein Haus', 5, None, None) 103 | 104 | def test_check_wohin_positive_case(self): 105 | self._check_equals('Wohin fährst du?', 'Ich fahre in ein Haus', 45, 10, 
21) 106 | 107 | def test_check_wohin_positive_case_definite_preposition(self): 108 | self._check_equals('Wohin fährst du?', 'Ich fahre ins Haus', 45, 10, 18) 109 | 110 | def test_check_wohin_wrong_case_definite_preposition(self): 111 | self._check_equals('Wohin fährst du?', 'Ich fahre im Haus', 5, None, None) 112 | 113 | def test_check_womit_positive_case(self): 114 | self._check_equals('Womit fährst du?', 'Ich fahre mit meinem Auto', 45, 10, 25) 115 | 116 | def test_check_womit_other_preposition(self): 117 | self._check_equals('Womit fährst du?', 'Ich fahre ohne mein Auto', 5, None, None) 118 | 119 | @unittest.skipIf(manager.nlp.meta['version'] == '3.2.0', 'Version fluke') 120 | def test_check_wann_noun(self): 121 | self._check_equals('Wann fährst du?', 'Ich fahre nächste Woche', 45, 10, 23) 122 | 123 | def test_check_wann_preposition(self): 124 | self._check_equals('Wann fährst du?', 'Ich fahre in zwei Wochen', 45, 10, 24) 125 | 126 | def test_check_wann_wrong_preposition(self): 127 | self._check_equals('Wann fährst du?', 'Ich fahre wegen des Problems', 5, None, None) 128 | 129 | def test_check_wann_adverb(self): 130 | self._check_equals('Wann fährst du?', 'Ich fahre morgen', 45, 10, 16) 131 | 132 | def test_check_wann_verb_phrase(self): 133 | self._check_equals('Wann fährst du?', 'Ich fahre, wenn du mitkommst.', 45, 11, 28) 134 | 135 | def test_check_wie_preposition(self): 136 | self._check_equals('Wie fährst du?', 'Ich fahre mit dem Auto', 45, 10, 22) 137 | 138 | def test_check_wie_wrong_preposition(self): 139 | self._check_equals('Wie fährst du?', 'Ich fahre wegen des Problems', 5, None, None) 140 | 141 | def test_check_wie_adverb(self): 142 | self._check_equals('Wie fährst du?', 'Ich fahre langsam', 45, 10, 17) 143 | 144 | def test_check_wie_indem_phrase(self): 145 | self._check_equals('Wie fährst du?', 'Ich fahre, indem ich per Anhalter fahre', 45, 11, 39) 146 | 147 | def test_check_wie_other_phrase(self): 148 | self._check_equals('Wie fährst du?', 'Ich 
fahre, weil ich per Anhalter fahre', 5, None, None) 149 | 150 | def test_check_woher_preposition(self): 151 | self._check_equals('Woher denkst Du es?', 'Ich denke es wegen der Evidenz', 45, 13, 30) 152 | 153 | def test_check_woher_wrong_preposition(self): 154 | self._check_equals('Woher denkst Du es?', 'Ich denke es trotz der Evidenz', 5, None, None) 155 | 156 | def test_check_woher_weil(self): 157 | self._check_equals('Woher denkst Du es?', 'Ich denke es, weil es stimmt', 45, 14, 28) 158 | 159 | def test_check_woher_wrong_conjunction(self): 160 | self._check_equals('Woher denkst Du es?', 'Ich denke es, obwohl es nicht stimmt', 5, None, None) 161 | 162 | def test_check_warum_preposition(self): 163 | self._check_equals('Warum machst Du es?', 'Ich mache es wegen der Evidenz', 45, 13, 30) 164 | 165 | def test_check_warum_wrong_preposition(self): 166 | self._check_equals('Warum machst Du es?', 'Ich mache es trotz der Evidenz', 5, None, None) 167 | 168 | def test_check_warum_weil(self): 169 | self._check_equals('Warum machst Du es?', 'Ich mache es, weil es stimmt', 45, 14, 28) 170 | 171 | def test_check_warum_weil_sein(self): 172 | self._check_equals('Warum machst Du es?', 'Ich mache es, weil es gut ist', 45, 14, 29) 173 | 174 | def test_check_warum_damit(self): 175 | self._check_equals('Wieso machst Du es?', 'Ich mache es, damit Du kommst', 45, 14, 29) 176 | 177 | def test_check_warum_wrong_conjunction(self): 178 | self._check_equals('Woher machst Du es?', 'Ich mache es, obwohl es nicht stimmt', 5, None, None) 179 | 180 | def test_question_word_is_not_treated_as_answer(self): 181 | self._check_equals('Wohin geht der Mann?', 'Wohin geht der Mann', 73, None, None) 182 | 183 | def test_non_initial_question_word_is_not_treated_as_answer(self): 184 | self._check_equals('Wohin geht der Mann?', 'Und wohin geht der Mann', 73, None, None) 185 | -------------------------------------------------------------------------------- /examples/example_chatbot_EN_insurance_ontology.owl: 
-------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | 11 | 12 | 13 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | -------------------------------------------------------------------------------- /tests/en/test_ontology.owl: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | 11 | 12 | 13 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 
101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | -------------------------------------------------------------------------------- /examples/example_chatbot_DE_insurance_ontology.owl: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | 11 | 12 | 13 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 
107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | -------------------------------------------------------------------------------- /tests/de/test_doc_examples_DE.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import holmes_extractor as holmes 3 | 4 | holmes_manager = holmes.Manager(model="de_core_news_lg", number_of_workers=1) 5 | holmes_manager.register_search_phrase("Ein großer Hund jagt eine Katze") 6 | holmes_manager.register_search_phrase("Ein ENTITYPER geht in die Stadt") 7 | 8 | 9 | class EnglishDocumentationExamplesTest(unittest.TestCase): 10 | 11 | positive_examples = ( 12 | "Der große Hund hat die Katze ständig gejagt", 13 | "Der große Hund, der müde war, jagte die Katze", 14 | "Die Katze wurde vom großen Hund gejagt", 15 | "Die Katze wurde immer wieder durch den großen Hund gejagt", 16 | "Der große Hund wollte die Katze jagen", 17 | "Der große Hund entschied sich, die Katze zu jagen", 18 | "Die Katze, die der große Hund gejagt hatte, hatte Angst", 19 | "Dass der große Hund die Katze jagte, war ein 
Problem", 20 | "Es gab einen großen Hund, der eine Katze jagte", 21 | "Die Katzenjagd durch den großen Hund", 22 | "Es gab einmal einen großen Hund, und er jagte eine Katze", 23 | "Es gab einen großen Hund. Er hieß Fido. Er jagte meine Katze", 24 | "Es erschien ein Hund. Er jagte eine Katze. Er war sehr groß.", 25 | "Die Katze schlich sich in unser Wohnzimmer zurück, weil ein großer Hund sie draußen gejagt hatte", 26 | "Unser großer Hund war aufgeregt, weil er eine Katze gejagt hatte", 27 | ) 28 | 29 | def test_positive_examples(self): 30 | for positive_example in self.positive_examples: 31 | with self.subTest(): 32 | assert len(holmes_manager.match(document_text=positive_example)) == 1 33 | 34 | negative_examples = ( 35 | "Der Hund jagte eine große Katze", 36 | "Die Katze jagte den großen Hund", 37 | "Der große Hund und die Katze jagten", 38 | "Der große Hund jagte eine Maus aber die Katze war müde", 39 | "Der große Hund wurde ständig von der Katze gejagt", 40 | "Der große Hund entschloss sich, von der Katze gejagt zu werden", 41 | "Die Hundejagd durch den große Katze", 42 | ) 43 | 44 | def test_negative_examples(self): 45 | for negative_example in self.negative_examples: 46 | with self.subTest(): 47 | assert len(holmes_manager.match(document_text=negative_example)) == 0 48 | 49 | def test_complex_example(self): 50 | matches = holmes_manager.match( 51 | document_text="Letzte Woche sah ich Richard Hudson und Max Mustermann. Sie wollten nicht mehr in die Stadt gehen." 52 | ) 53 | self.assertEqual( 54 | matches, 55 | [ 56 | { 57 | "search_phrase_label": "Ein ENTITYPER geht in die Stadt", 58 | "search_phrase_text": "Ein ENTITYPER geht in die Stadt", 59 | "document": "", 60 | "index_within_document": 17, 61 | "sentences_within_document": "Letzte Woche sah ich Richard Hudson und Max Mustermann. 
Sie wollten nicht mehr in die Stadt gehen.", 62 | "negated": True, 63 | "uncertain": True, 64 | "involves_coreference": True, 65 | "overall_similarity_measure": 1.0, 66 | "word_matches": [ 67 | { 68 | "search_phrase_token_index": 1, 69 | "search_phrase_word": "ENTITYPER", 70 | "document_token_index": 5, 71 | "first_document_token_index": 4, 72 | "last_document_token_index": 5, 73 | "structurally_matched_document_token_index": 10, 74 | "document_subword_index": None, 75 | "document_subword_containing_token_index": None, 76 | "document_word": "richard hudson", 77 | "document_phrase": "Richard Hudson", 78 | "match_type": "entity", 79 | "negated": False, 80 | "uncertain": True, 81 | "similarity_measure": 1.0, 82 | "involves_coreference": True, 83 | "extracted_word": "richard hudson", 84 | "depth": 0, 85 | "explanation": "Has an entity label matching ENTITYPER.", 86 | }, 87 | { 88 | "search_phrase_token_index": 2, 89 | "search_phrase_word": "gehen", 90 | "document_token_index": 17, 91 | "first_document_token_index": 17, 92 | "last_document_token_index": 17, 93 | "structurally_matched_document_token_index": 17, 94 | "document_subword_index": None, 95 | "document_subword_containing_token_index": None, 96 | "document_word": "gehen", 97 | "document_phrase": "gehen", 98 | "match_type": "direct", 99 | "negated": True, 100 | "uncertain": False, 101 | "similarity_measure": 1.0, 102 | "involves_coreference": False, 103 | "extracted_word": "gehen", 104 | "depth": 0, 105 | "explanation": "Matches GEHEN directly.", 106 | }, 107 | { 108 | "search_phrase_token_index": 3, 109 | "search_phrase_word": "in", 110 | "document_token_index": 14, 111 | "first_document_token_index": 14, 112 | "last_document_token_index": 14, 113 | "structurally_matched_document_token_index": 14, 114 | "document_subword_index": None, 115 | "document_subword_containing_token_index": None, 116 | "document_word": "in", 117 | "document_phrase": "in", 118 | "match_type": "direct", 119 | "negated": True, 120 | 
"uncertain": True, 121 | "similarity_measure": 1.0, 122 | "involves_coreference": False, 123 | "extracted_word": "in", 124 | "depth": 0, 125 | "explanation": "Matches IN directly.", 126 | }, 127 | { 128 | "search_phrase_token_index": 5, 129 | "search_phrase_word": "stadt", 130 | "document_token_index": 16, 131 | "first_document_token_index": 16, 132 | "last_document_token_index": 16, 133 | "structurally_matched_document_token_index": 16, 134 | "document_subword_index": None, 135 | "document_subword_containing_token_index": None, 136 | "document_word": "stadt", 137 | "document_phrase": "die Stadt", 138 | "match_type": "direct", 139 | "negated": True, 140 | "uncertain": False, 141 | "similarity_measure": 1.0, 142 | "involves_coreference": False, 143 | "extracted_word": "stadt", 144 | "depth": 0, 145 | "explanation": "Matches STADT directly.", 146 | }, 147 | ], 148 | }, 149 | { 150 | "search_phrase_label": "Ein ENTITYPER geht in die Stadt", 151 | "search_phrase_text": "Ein ENTITYPER geht in die Stadt", 152 | "document": "", 153 | "index_within_document": 17, 154 | "sentences_within_document": "Letzte Woche sah ich Richard Hudson und Max Mustermann. 
Sie wollten nicht mehr in die Stadt gehen.", 155 | "negated": True, 156 | "uncertain": True, 157 | "involves_coreference": True, 158 | "overall_similarity_measure": 1.0, 159 | "word_matches": [ 160 | { 161 | "search_phrase_token_index": 1, 162 | "search_phrase_word": "ENTITYPER", 163 | "document_token_index": 8, 164 | "first_document_token_index": 7, 165 | "last_document_token_index": 8, 166 | "structurally_matched_document_token_index": 10, 167 | "document_subword_index": None, 168 | "document_subword_containing_token_index": None, 169 | "document_word": "max mustermann", 170 | "document_phrase": "Max Mustermann", 171 | "match_type": "entity", 172 | "negated": False, 173 | "uncertain": True, 174 | "similarity_measure": 1.0, 175 | "involves_coreference": True, 176 | "extracted_word": "max mustermann", 177 | "depth": 0, 178 | "explanation": "Has an entity label matching ENTITYPER.", 179 | }, 180 | { 181 | "search_phrase_token_index": 2, 182 | "search_phrase_word": "gehen", 183 | "document_token_index": 17, 184 | "first_document_token_index": 17, 185 | "last_document_token_index": 17, 186 | "structurally_matched_document_token_index": 17, 187 | "document_subword_index": None, 188 | "document_subword_containing_token_index": None, 189 | "document_word": "gehen", 190 | "document_phrase": "gehen", 191 | "match_type": "direct", 192 | "negated": True, 193 | "uncertain": False, 194 | "similarity_measure": 1.0, 195 | "involves_coreference": False, 196 | "extracted_word": "gehen", 197 | "depth": 0, 198 | "explanation": "Matches GEHEN directly.", 199 | }, 200 | { 201 | "search_phrase_token_index": 3, 202 | "search_phrase_word": "in", 203 | "document_token_index": 14, 204 | "first_document_token_index": 14, 205 | "last_document_token_index": 14, 206 | "structurally_matched_document_token_index": 14, 207 | "document_subword_index": None, 208 | "document_subword_containing_token_index": None, 209 | "document_word": "in", 210 | "document_phrase": "in", 211 | "match_type": 
"direct", 212 | "negated": True, 213 | "uncertain": True, 214 | "similarity_measure": 1.0, 215 | "involves_coreference": False, 216 | "extracted_word": "in", 217 | "depth": 0, 218 | "explanation": "Matches IN directly.", 219 | }, 220 | { 221 | "search_phrase_token_index": 5, 222 | "search_phrase_word": "stadt", 223 | "document_token_index": 16, 224 | "first_document_token_index": 16, 225 | "last_document_token_index": 16, 226 | "structurally_matched_document_token_index": 16, 227 | "document_subword_index": None, 228 | "document_subword_containing_token_index": None, 229 | "document_word": "stadt", 230 | "document_phrase": "die Stadt", 231 | "match_type": "direct", 232 | "negated": True, 233 | "uncertain": False, 234 | "similarity_measure": 1.0, 235 | "involves_coreference": False, 236 | "extracted_word": "stadt", 237 | "depth": 0, 238 | "explanation": "Matches STADT directly.", 239 | }, 240 | ], 241 | }, 242 | ], 243 | ) 244 | -------------------------------------------------------------------------------- /tests/en/test_doc_examples_EN.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import holmes_extractor as holmes 3 | 4 | holmes_manager = holmes.Manager(model="en_core_web_lg", number_of_workers=1) 5 | holmes_manager.register_search_phrase("A big dog chases a cat") 6 | holmes_manager.register_search_phrase("An ENTITYPERSON goes into town") 7 | holmes_manager.register_search_phrase("A company gives permission to publish something") 8 | 9 | 10 | class EnglishDocumentationExamplesTest(unittest.TestCase): 11 | 12 | positive_examples = ( 13 | "A big dog chased a cat", 14 | "The big dog would not stop chasing the cat", 15 | "The big dog who was tired chased the cat", 16 | "The cat was chased by the big dog", 17 | "The cat always used to be chased by the big dog", 18 | "The big dog was going to chase the cat", 19 | "The big dog decided to chase the cat", 20 | "The cat was afraid of being chased by the big 
dog", 21 | "I saw a cat-chasing big dog", 22 | "The cat the big dog chased was scared", 23 | "The big dog chasing the cat was a problem", 24 | "There was a big dog that was chasing a cat", 25 | "The cat chase by the big dog", 26 | "There was a big dog and it was chasing a cat.", 27 | "I saw a big dog. My cat was afraid of being chased by the dog.", 28 | "There was a big dog. His name was Fido. He was chasing my cat.", 29 | "A dog appeared. It was chasing a cat. It was very big.", 30 | "The cat sneaked back into our lounge because a big dog had been chasing her.", 31 | "Our big dog was excited because he had been chasing a cat.", 32 | ) 33 | 34 | def test_positive_examples(self): 35 | for positive_example in self.positive_examples: 36 | with self.subTest(): 37 | assert len(holmes_manager.match(document_text=positive_example)) == 1 38 | 39 | negative_examples = ( 40 | "The dog chased a big cat", 41 | "The big dog and the cat chased about", 42 | "The big dog chased a mouse but the cat was tired", 43 | "The big dog always used to be chased by the cat", 44 | "The big dog the cat chased was scared", 45 | "Our big dog was upset because he had been chased by a cat.", 46 | "The dog chase of the big cat", 47 | ) 48 | 49 | def test_negative_examples(self): 50 | for negative_example in self.negative_examples: 51 | with self.subTest(): 52 | assert len(holmes_manager.match(document_text=negative_example)) == 0 53 | 54 | def test_complex_example(self): 55 | matches = holmes_manager.match( 56 | document_text="I met Richard Hudson and John Doe last week. They didn't want to go into town." 57 | ) 58 | self.assertEqual( 59 | matches, 60 | [ 61 | { 62 | "search_phrase_label": "An ENTITYPERSON goes into town", 63 | "search_phrase_text": "An ENTITYPERSON goes into town", 64 | "document": "", 65 | "index_within_document": 15, 66 | "sentences_within_document": "I met Richard Hudson and John Doe last week. 
They didn't want to go into town.", 67 | "negated": True, 68 | "uncertain": True, 69 | "involves_coreference": True, 70 | "overall_similarity_measure": 1.0, 71 | "word_matches": [ 72 | { 73 | "search_phrase_token_index": 1, 74 | "search_phrase_word": "ENTITYPERSON", 75 | "document_token_index": 3, 76 | "first_document_token_index": 2, 77 | "last_document_token_index": 3, 78 | "structurally_matched_document_token_index": 10, 79 | "document_subword_index": None, 80 | "document_subword_containing_token_index": None, 81 | "document_word": "richard hudson", 82 | "document_phrase": "Richard Hudson", 83 | "match_type": "entity", 84 | "negated": False, 85 | "uncertain": True, 86 | "similarity_measure": 1.0, 87 | "involves_coreference": True, 88 | "extracted_word": "richard hudson", 89 | "depth": 0, 90 | "explanation": "Has an entity label matching ENTITYPERSON.", 91 | }, 92 | { 93 | "search_phrase_token_index": 2, 94 | "search_phrase_word": "go", 95 | "document_token_index": 15, 96 | "first_document_token_index": 15, 97 | "last_document_token_index": 15, 98 | "structurally_matched_document_token_index": 15, 99 | "document_subword_index": None, 100 | "document_subword_containing_token_index": None, 101 | "document_word": "go", 102 | "document_phrase": "go", 103 | "match_type": "direct", 104 | "negated": True, 105 | "uncertain": False, 106 | "similarity_measure": 1.0, 107 | "involves_coreference": False, 108 | "extracted_word": "go", 109 | "depth": 0, 110 | "explanation": "Matches GO directly.", 111 | }, 112 | { 113 | "search_phrase_token_index": 3, 114 | "search_phrase_word": "into", 115 | "document_token_index": 16, 116 | "first_document_token_index": 16, 117 | "last_document_token_index": 16, 118 | "structurally_matched_document_token_index": 16, 119 | "document_subword_index": None, 120 | "document_subword_containing_token_index": None, 121 | "document_word": "into", 122 | "document_phrase": "into", 123 | "match_type": "direct", 124 | "negated": True, 125 | "uncertain": 
False, 126 | "similarity_measure": 1.0, 127 | "involves_coreference": False, 128 | "extracted_word": "into", 129 | "depth": 0, 130 | "explanation": "Matches INTO directly.", 131 | }, 132 | { 133 | "search_phrase_token_index": 4, 134 | "search_phrase_word": "town", 135 | "document_token_index": 17, 136 | "first_document_token_index": 17, 137 | "last_document_token_index": 17, 138 | "structurally_matched_document_token_index": 17, 139 | "document_subword_index": None, 140 | "document_subword_containing_token_index": None, 141 | "document_word": "town", 142 | "document_phrase": "town", 143 | "match_type": "direct", 144 | "negated": True, 145 | "uncertain": False, 146 | "similarity_measure": 1.0, 147 | "involves_coreference": False, 148 | "extracted_word": "town", 149 | "depth": 0, 150 | "explanation": "Matches TOWN directly.", 151 | }, 152 | ], 153 | }, 154 | { 155 | "search_phrase_label": "An ENTITYPERSON goes into town", 156 | "search_phrase_text": "An ENTITYPERSON goes into town", 157 | "document": "", 158 | "index_within_document": 15, 159 | "sentences_within_document": "I met Richard Hudson and John Doe last week. 
They didn't want to go into town.", 160 | "negated": True, 161 | "uncertain": True, 162 | "involves_coreference": True, 163 | "overall_similarity_measure": 1.0, 164 | "word_matches": [ 165 | { 166 | "search_phrase_token_index": 1, 167 | "search_phrase_word": "ENTITYPERSON", 168 | "document_token_index": 6, 169 | "first_document_token_index": 5, 170 | "last_document_token_index": 6, 171 | "structurally_matched_document_token_index": 10, 172 | "document_subword_index": None, 173 | "document_subword_containing_token_index": None, 174 | "document_word": "john doe", 175 | "document_phrase": "John Doe", 176 | "match_type": "entity", 177 | "negated": False, 178 | "uncertain": True, 179 | "similarity_measure": 1.0, 180 | "involves_coreference": True, 181 | "extracted_word": "john doe", 182 | "depth": 0, 183 | "explanation": "Has an entity label matching ENTITYPERSON.", 184 | }, 185 | { 186 | "search_phrase_token_index": 2, 187 | "search_phrase_word": "go", 188 | "document_token_index": 15, 189 | "first_document_token_index": 15, 190 | "last_document_token_index": 15, 191 | "structurally_matched_document_token_index": 15, 192 | "document_subword_index": None, 193 | "document_subword_containing_token_index": None, 194 | "document_word": "go", 195 | "document_phrase": "go", 196 | "match_type": "direct", 197 | "negated": True, 198 | "uncertain": False, 199 | "similarity_measure": 1.0, 200 | "involves_coreference": False, 201 | "extracted_word": "go", 202 | "depth": 0, 203 | "explanation": "Matches GO directly.", 204 | }, 205 | { 206 | "search_phrase_token_index": 3, 207 | "search_phrase_word": "into", 208 | "document_token_index": 16, 209 | "first_document_token_index": 16, 210 | "last_document_token_index": 16, 211 | "structurally_matched_document_token_index": 16, 212 | "document_subword_index": None, 213 | "document_subword_containing_token_index": None, 214 | "document_word": "into", 215 | "document_phrase": "into", 216 | "match_type": "direct", 217 | "negated": True, 218 
| "uncertain": False, 219 | "similarity_measure": 1.0, 220 | "involves_coreference": False, 221 | "extracted_word": "into", 222 | "depth": 0, 223 | "explanation": "Matches INTO directly.", 224 | }, 225 | { 226 | "search_phrase_token_index": 4, 227 | "search_phrase_word": "town", 228 | "document_token_index": 17, 229 | "first_document_token_index": 17, 230 | "last_document_token_index": 17, 231 | "structurally_matched_document_token_index": 17, 232 | "document_subword_index": None, 233 | "document_subword_containing_token_index": None, 234 | "document_word": "town", 235 | "document_phrase": "town", 236 | "match_type": "direct", 237 | "negated": True, 238 | "uncertain": False, 239 | "similarity_measure": 1.0, 240 | "involves_coreference": False, 241 | "extracted_word": "town", 242 | "depth": 0, 243 | "explanation": "Matches TOWN directly.", 244 | }, 245 | ], 246 | }, 247 | ], 248 | ) 249 | 250 | def test_extracted_word_example(self): 251 | matches = holmes_manager.match( 252 | document_text="We discussed AstraZeneca. The company had given us permission to publish this library under the MIT license." 253 | ) 254 | self.assertEqual(len(matches), 1) 255 | self.assertEqual(matches[0]["word_matches"][0]["extracted_word"], "astrazeneca") 256 | --------------------------------------------------------------------------------