├── .gitignore
├── LICENSE
├── README.md
├── deploy.sh
├── examples
│   ├── example_1.py
│   ├── pattern_0.json
│   ├── pattern_1.json
│   ├── pattern_2.json
│   └── pattern_3.json
├── requirements.txt
├── requirements_dev.txt
├── setup.py
├── spacy_pattern_builder
│   ├── __init__.py
│   ├── build.py
│   ├── exceptions.py
│   ├── match.py
│   ├── mutate.py
│   └── util.py
└── test
    ├── __init__.py
    └── test_spacy_pattern_builder.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
experiments

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2019 Nick Morley

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# SpaCy Pattern Builder

Use training examples to build and refine patterns for use with SpaCy's DependencyMatcher.

## Motivation

Writing dependency-match patterns by hand is slow and error-prone; generating them programmatically from training examples is more efficient.

## Installation

With pip:

```bash
pip install spacy-pattern-builder
```

## Usage

```python
# Import a SpaCy model and parse a string to create a Doc object
import en_core_web_sm

text = 'We introduce efficient methods for fitting Boolean models to molecular data.'
nlp = en_core_web_sm.load()
doc = nlp(text)

from spacy_pattern_builder import build_dependency_pattern

# Provide a list of tokens we want to match.
match_tokens = [doc[i] for i in [0, 1, 3]]  # [We, introduce, methods]

''' Note that these tokens must be fully connected. That is,
all tokens must have a path to all other tokens in the list,
without needing to traverse tokens outside of the list.
Otherwise, spacy-pattern-builder will raise a TokensNotFullyConnectedError.
You can get a connected set that includes your tokens with the following: '''
from spacy_pattern_builder import util
connected_tokens = util.smallest_connected_subgraph(match_tokens, doc)
assert match_tokens == connected_tokens  # In this case, the tokens we provided are already fully connected

# Specify the token attributes / features to use
feature_dict = {  # This is equal to the default feature_dict
    'DEP': 'dep_',
    'TAG': 'tag_'
}

# Build the pattern
pattern = build_dependency_pattern(doc, match_tokens, feature_dict=feature_dict)

from pprint import pprint
pprint(pattern)  # In the format consumed by SpaCy's DependencyMatcher:
'''
[{'PATTERN': {'DEP': 'ROOT', 'TAG': 'VBP'}, 'SPEC': {'NODE_NAME': 'node1'}},
 {'PATTERN': {'DEP': 'nsubj', 'TAG': 'PRP'},
  'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node0'}},
 {'PATTERN': {'DEP': 'dobj', 'TAG': 'NNS'},
  'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node3'}}]
'''

# Create a matcher and add the newly generated pattern
from spacy.matcher import DependencyMatcher

matcher = DependencyMatcher(doc.vocab)
matcher.add('pattern', None, pattern)

# And get matches. Each match carries a list of matched trees,
# each of which is a list of token indices.
matches = matcher(doc)
for match_id, trees in matches:
    for token_idxs in trees:
        tokens = [doc[i] for i in token_idxs]
        tokens = sorted(tokens, key=lambda w: w.i)  # Make sure tokens are in their original order
        print(tokens)  # [We, introduce, methods]
```
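## Serializing patterns

Patterns are plain lists of dictionaries, so they serialize naturally to JSON (the test suite writes the `examples/pattern_*.json` files in this repository the same way). A minimal sketch, continuing from the usage example above; the `pattern.json` file name is illustrative:

```python
import json

# Save the pattern built above for later reuse
with open('pattern.json', 'w') as f:
    json.dump(pattern, f, indent=2)

# Reload it and register it with a fresh matcher
with open('pattern.json') as f:
    saved_pattern = json.load(f)

matcher = DependencyMatcher(doc.vocab)
matcher.add('saved_pattern', None, saved_pattern)
```

## Generating pattern variants

The helpers in `spacy_pattern_builder.mutate` generate variants of a pattern, which can be useful when refining a pattern against further examples. A sketch based on the package's test suite, reusing `pattern` and `match_tokens` from the usage example above (the feature dicts shown are illustrative):

```python
from spacy_pattern_builder import yield_node_level_pattern_variants

feature_dicts = (
    {'DEP': 'dep_', 'TAG': 'tag_'},
    {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'},
)

# Yields one pattern per combination of feature dicts over the pattern's nodes
pattern_variants = list(
    yield_node_level_pattern_variants(pattern, match_tokens, feature_dicts)
)
```
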
## Acknowledgements

Uses:

- [SpaCy](https://spacy.io)
- [networkx](https://github.com/networkx/networkx)

--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
deactivate
rm -rf dist
rm -rf build
python3 setup.py sdist bdist_wheel
python3 -m twine upload dist/*

--------------------------------------------------------------------------------
/examples/example_1.py:
--------------------------------------------------------------------------------
# Import a SpaCy model and parse a string to create a Doc object
import en_core_web_sm

text = 'We introduce efficient methods for fitting Boolean models to molecular data.'
nlp = en_core_web_sm.load()
doc = nlp(text)

from spacy_pattern_builder import build_dependency_pattern

# Provide a list of tokens we want to match.
match_tokens = [doc[i] for i in [0, 1, 3]]  # [We, introduce, methods]

''' Note that these tokens must be fully connected. That is,
all tokens must have a path to all other tokens in the list,
without needing to traverse tokens outside of the list.
Otherwise, spacy-pattern-builder will raise a TokensNotFullyConnectedError.
You can get a connected set that includes your tokens with the following: '''
from spacy_pattern_builder import util
connected_tokens = util.smallest_connected_subgraph(match_tokens, doc)
assert match_tokens == connected_tokens

# Specify the token attributes / features to use
feature_dict = {  # This is equal to the default feature_dict
    'DEP': 'dep_',
    'TAG': 'tag_'
}

# Build the pattern
pattern = build_dependency_pattern(doc, match_tokens, feature_dict=feature_dict)

from pprint import pprint
pprint(pattern)  # In the format consumed by SpaCy's DependencyMatcher:
'''
[{'PATTERN': {'DEP': 'ROOT', 'TAG': 'VBP'}, 'SPEC': {'NODE_NAME': 'node1'}},
 {'PATTERN': {'DEP': 'nsubj', 'TAG': 'PRP'},
  'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node0'}},
 {'PATTERN': {'DEP': 'dobj', 'TAG': 'NNS'},
  'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node3'}}]
'''

# Create a matcher and add the newly generated pattern
from spacy.matcher import DependencyMatcher

matcher = DependencyMatcher(doc.vocab)
matcher.add('pattern', None, pattern)

# And match away
matches = matcher(doc)
for match_id, trees in matches:
    for token_idxs in trees:
        tokens = [doc[i] for i in token_idxs]
        tokens = sorted(tokens, key=lambda w: w.i)
        print(tokens)  # [We, introduce, methods]

--------------------------------------------------------------------------------
/examples/pattern_0.json:
--------------------------------------------------------------------------------
[
  {
    "SPEC": {
      "NODE_NAME": "node1"
    },
    "PATTERN": {
      "DEP": "ROOT",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  },
  {
    "SPEC": {
      "NODE_NAME": "node0",
      "NBOR_NAME": "node1",
      "NBOR_RELOP": ">"
    },
    "PATTERN": {
      "DEP": "nsubj",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  },
  {
    "SPEC": {
      "NODE_NAME": "node3",
      "NBOR_NAME": "node0",
      "NBOR_RELOP": "$--"
    },
    "PATTERN": {
      "DEP": "dobj",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  }
]

--------------------------------------------------------------------------------
/examples/pattern_1.json:
--------------------------------------------------------------------------------
[
  {
    "SPEC": {
      "NODE_NAME": "node13"
    },
    "PATTERN": {
      "DEP": "advcl",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  },
  {
    "SPEC": {
      "NODE_NAME": "node15",
      "NBOR_NAME": "node13",
      "NBOR_RELOP": ">"
    },
    "PATTERN": {
      "DEP": "dobj",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  },
  {
    "SPEC": {
      "NODE_NAME": "node16",
      "NBOR_NAME": "node15",
      "NBOR_RELOP": "$--"
    },
    "PATTERN": {
      "DEP": "prep",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  },
  {
    "SPEC": {
      "NODE_NAME": "node19",
      "NBOR_NAME": "node16",
      "NBOR_RELOP": ">"
    },
    "PATTERN": {
      "DEP": "pobj",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  }
]
"NBOR_RELOP": "$--" 31 | }, 32 | "PATTERN": { 33 | "DEP": "prep", 34 | "_": { 35 | "custom_attr": "my_attr" 36 | } 37 | } 38 | }, 39 | { 40 | "SPEC": { 41 | "NODE_NAME": "node19", 42 | "NBOR_NAME": "node16", 43 | "NBOR_RELOP": ">" 44 | }, 45 | "PATTERN": { 46 | "DEP": "pobj", 47 | "_": { 48 | "custom_attr": "my_attr" 49 | } 50 | } 51 | } 52 | ] -------------------------------------------------------------------------------- /examples/pattern_2.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "SPEC": { 4 | "NODE_NAME": "node1" 5 | }, 6 | "PATTERN": { 7 | "DEP": "ROOT", 8 | "_": { 9 | "custom_attr": "my_attr" 10 | } 11 | } 12 | }, 13 | { 14 | "SPEC": { 15 | "NODE_NAME": "node0", 16 | "NBOR_NAME": "node1", 17 | "NBOR_RELOP": ">" 18 | }, 19 | "PATTERN": { 20 | "DEP": "nsubj", 21 | "_": { 22 | "custom_attr": "my_attr" 23 | } 24 | } 25 | }, 26 | { 27 | "SPEC": { 28 | "NODE_NAME": "node2", 29 | "NBOR_NAME": "node0", 30 | "NBOR_RELOP": "$--" 31 | }, 32 | "PATTERN": { 33 | "DEP": "prep", 34 | "_": { 35 | "custom_attr": "my_attr" 36 | } 37 | } 38 | }, 39 | { 40 | "SPEC": { 41 | "NODE_NAME": "node4", 42 | "NBOR_NAME": "node2", 43 | "NBOR_RELOP": ">" 44 | }, 45 | "PATTERN": { 46 | "DEP": "pobj", 47 | "_": { 48 | "custom_attr": "my_attr" 49 | } 50 | } 51 | } 52 | ] -------------------------------------------------------------------------------- /examples/pattern_3.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "SPEC": { 4 | "NODE_NAME": "node4" 5 | }, 6 | "PATTERN": { 7 | "DEP": "ROOT", 8 | "_": { 9 | "custom_attr": "my_attr" 10 | } 11 | } 12 | }, 13 | { 14 | "SPEC": { 15 | "NODE_NAME": "node2", 16 | "NBOR_NAME": "node4", 17 | "NBOR_RELOP": ">" 18 | }, 19 | "PATTERN": { 20 | "DEP": "nsubj", 21 | "_": { 22 | "custom_attr": "my_attr" 23 | } 24 | } 25 | }, 26 | { 27 | "SPEC": { 28 | "NODE_NAME": "node8", 29 | "NBOR_NAME": "node2", 30 | "NBOR_RELOP": "$--" 31 | }, 32 | "PATTERN": { 33 | "DEP": "dobj", 34 | "_": { 35 | "custom_attr": "my_attr" 36 | } 37 | } 38 | } 39 | ] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | spacy 2 | networkx 3 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz 2 | visualise-spacy-tree -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | readme = open('README.md').read() 4 | requirements = open('requirements.txt').read().splitlines() 5 | 6 | setup( 7 | name='spacy-pattern-builder', 8 | version='0.0.7', 9 | description='Reverse engineer patterns for use with the SpaCy DependencyTreeMatcher', 10 | long_description=readme, 11 | long_description_content_type='text/markdown', 12 | author='Nick Morley', 13 | author_email='nick.morley111@gmail.com', 14 | url='https://github.com/cyclecycle/spacy-pattern-builder', 15 | packages=find_packages(), 16 | package_dir={'spacy-pattern-builder': 'spacy-pattern-builder'}, 17 | include_package_data=True, 18 | install_requires=requirements, 19 | license='MIT', 20 | zip_safe=False, 21 | 
--------------------------------------------------------------------------------
/spacy_pattern_builder/__init__.py:
--------------------------------------------------------------------------------
from spacy_pattern_builder.build import build_dependency_pattern
import spacy_pattern_builder.util
import spacy_pattern_builder.exceptions
import spacy_pattern_builder.mutate
import spacy_pattern_builder.match
from spacy_pattern_builder.mutate import (
    yield_pattern_permutations,
    yield_node_level_pattern_variants,
    yield_extended_trees,
)

--------------------------------------------------------------------------------
/spacy_pattern_builder/build.py:
--------------------------------------------------------------------------------
import itertools
from pprint import pprint
import spacy_pattern_builder.util as util
from spacy_pattern_builder.exceptions import (
    TokensNotFullyConnectedError,
    DuplicateTokensError,
    TokenNotInMatchTokensError,
)


DEFAULT_BUILD_PATTERN_FEATURE_DICT = {
    'DEP': 'dep_',
    'TAG': 'tag_'
}


def node_name(token):
    return 'node{0}'.format(token.i)


def node_features(token, feature_dict):
    native_feature_dict = {name: feature for name, feature in feature_dict.items() if name != '_'}
    extension_feature_dict = feature_dict.get('_', None)
    node_features = {
        name: getattr(token, feature) for name, feature in native_feature_dict.items()
    }
    if extension_feature_dict:
        extension_node_features = {
            name: getattr(token._, feature) for name, feature in extension_feature_dict.items()
        }
        node_features['_'] = extension_node_features
    return node_features


def build_pattern_element(token, feature_dict, nbor=None, operator='>'):
    features = node_features(token, feature_dict)
    if not nbor:
        pattern_element = {
            'SPEC': {'NODE_NAME': node_name(token)},
            'PATTERN': features
        }
    else:
        pattern_element = {
            'SPEC': {
                'NODE_NAME': node_name(token),
                'NBOR_NAME': node_name(nbor),
                'NBOR_RELOP': operator
            },
            'PATTERN': features
        }
    return pattern_element


def build_dependency_pattern(doc, match_tokens, feature_dict=DEFAULT_BUILD_PATTERN_FEATURE_DICT, nx_graph=None):
    '''Build a dependency pattern for use with DependencyMatcher that will match the set of tokens provided in "match_tokens". This set of tokens must form a fully connected graph.

    Arguments:
        doc {SpaCy Doc object}
        match_tokens {list} -- Set of tokens to match with the resulting dependency pattern
        feature_dict {dict} -- Mapping of pattern attribute names to the spaCy token attributes to match on
        nx_graph {NetworkX object} -- Graph representing the doc dependency tree

    Returns:
        [list] -- Dependency pattern in the format consumed by SpaCy's DependencyMatcher
    '''
    # Checks
    if not nx_graph:
        nx_graph = util.doc_to_nx_graph(doc)
    util.annotate_token_depth(doc)
    connected_tokens = util.smallest_connected_subgraph(
        match_tokens, doc, nx_graph=nx_graph)
    match_token_ids = util.token_idxs(match_tokens)
    connected_token_ids = util.token_idxs(connected_tokens)
    tokens_not_fully_connected = set(match_token_ids) != set(connected_token_ids)
    if tokens_not_fully_connected:
        raise TokensNotFullyConnectedError('Try expanding the training example to include all tokens in between those you are trying to match. Or, try the "role-pattern-nlp" module which handles this for you.')
    tokens_contain_duplicates = util.list_contains_duplicates(match_tokens)
    if tokens_contain_duplicates:
        raise DuplicateTokensError('Ensure the match_tokens is a unique list of tokens.')
    match_tokens = util.sort_by_depth(match_tokens)  # Process tokens root-first (shallowest depth first)
    dependency_pattern = []
    root_token = match_tokens[0]
    pattern_element = build_pattern_element(root_token, feature_dict)
    dependency_pattern.append(pattern_element)
    tokens_in_pattern = [root_token]
    non_root_tokens = match_tokens[1:]
    for token in non_root_tokens:
        # If the token is a right sibling of a token already in the pattern, add a sibling relationship.
        left_siblings = util.siblings(token, side='left')
        left_siblings_in_pattern = [t for t in left_siblings if t in tokens_in_pattern]
        if left_siblings_in_pattern:
            last_left_sibling_in_pattern = left_siblings_in_pattern[-1]
            pattern_element = build_pattern_element(
                token, feature_dict, nbor=last_left_sibling_in_pattern, operator='$--')
            dependency_pattern.append(pattern_element)
        else:  # Parent-child relation
            head = token.head
            if head not in match_tokens:
                raise TokenNotInMatchTokensError('Head token not in match_tokens. Is match_tokens fully connected?')
            pattern_element = build_pattern_element(token, feature_dict, nbor=head, operator='>')
            dependency_pattern.append(pattern_element)
        tokens_in_pattern.append(token)
    return dependency_pattern

--------------------------------------------------------------------------------
/spacy_pattern_builder/exceptions.py:
--------------------------------------------------------------------------------
class TokensNotFullyConnectedError(Exception):
    pass


class DuplicateTokensError(Exception):
    pass


class TokenNotInMatchTokensError(Exception):
    pass


class FeaturesMissingFromPatternError(Exception):
    pass

--------------------------------------------------------------------------------
/spacy_pattern_builder/match.py:
--------------------------------------------------------------------------------
from spacy.matcher import DependencyMatcher


def build_matcher(vocab, pattern):
    matcher = DependencyMatcher(vocab)
    matcher.add('pattern', None, pattern)
    return matcher


def find_matches(doc, pattern):
    matcher = build_matcher(doc.vocab, pattern)
    matches = matcher(doc)
    match_list = []
    for match_id, match_trees in matches:
        for token_idxs in match_trees:
            tokens = [doc[idx] for idx in token_idxs]
            tokens = sorted(tokens, key=lambda t: t.i)
            match_list.append(tokens)
    return match_list

--------------------------------------------------------------------------------
/spacy_pattern_builder/mutate.py:
--------------------------------------------------------------------------------
'''Generate pattern variants
'''
from pprint import pprint
import itertools
from spacy_pattern_builder import build
from spacy_pattern_builder import util
from spacy_pattern_builder.exceptions import FeaturesMissingFromPatternError


def yield_pattern_permutations(pattern, feature_sets):
    # First check that all features in feature_sets are present in all pattern elements
    all_features = set(util.flatten_list(feature_sets))
    all_features_are_in_pattern = util.features_are_in_pattern(all_features, pattern)
    if not all_features_are_in_pattern:
        raise FeaturesMissingFromPatternError(
            'Tried to create pattern permutations using features that are not present in the pattern. Ensure the pattern has all the features specified in feature_sets.'
        )
    pattern_element_combinations = []
    for pattern_element in pattern:
        token_features = pattern_element['PATTERN']
        new_pattern_elements = []
        for feature_set in feature_sets:
            new_token_features = {
                k: v for k, v in token_features.items() if k in feature_set
            }
            new_pattern_element = {
                'SPEC': pattern_element['SPEC'],
                'PATTERN': new_token_features,
            }
            new_pattern_elements.append(new_pattern_element)
        pattern_element_combinations.append(new_pattern_elements)
    return itertools.product(*pattern_element_combinations)


def yield_node_level_pattern_variants(pattern, match_tokens, feature_dicts, mutate_tokens=None):
    # Sort tokens by depth so they align one-to-one with the pattern elements
    if not mutate_tokens:
        mutate_tokens = match_tokens
    match_tokens = util.sort_by_depth(match_tokens)
    pattern_element_combinations = []
    for pattern_element, token in zip(pattern, match_tokens):
        if token not in mutate_tokens:
            new_pattern_elements = [pattern_element for _ in feature_dicts]
        else:
            new_pattern_elements = []
            for feature_dict in feature_dicts:
                new_token_features = build.node_features(token, feature_dict)
                new_pattern_element = {
                    'SPEC': pattern_element['SPEC'],
                    'PATTERN': new_token_features,
                }
                new_pattern_elements.append(new_pattern_element)
        pattern_element_combinations.append(new_pattern_elements)
    pattern_variants = itertools.product(*pattern_element_combinations)
    for variant in pattern_variants:
        assert len(variant) == len(pattern)
        yield variant


def yield_extended_trees(match_tokens):
    min_depth = min([t._.depth for t in match_tokens])
    extend_by = []
    for token in match_tokens:
        is_root = token._.depth == min_depth
        if is_root:
            extend_by.append(token.head)
        extend_by += token.children
        if not is_root:
            # Only extend by siblings if the token is not the root, as extending by the
            # root's siblings would also require adding the common head that connects
            # them, or else the tokens would not be fully connected.
            extend_by += util.siblings(token)
    extend_by = [t for t in extend_by if t]
    extend_by = [t for t in extend_by if t not in match_tokens]
    extend_by = util.de_duplicate_list(extend_by)
    for node in extend_by:
        match_token_variant = match_tokens + [node]
        yield match_token_variant

--------------------------------------------------------------------------------
/spacy_pattern_builder/util.py:
--------------------------------------------------------------------------------
import itertools
import networkx as nx
from spacy.tokens import Token


def annotate_token_depth(doc):
    '''Annotate token depth in the syntactic tree'''
    Token.set_extension('depth', default=None, force=True)
    for word in doc:
        depth = 0
        current_word = word
        while not current_word == current_word.head:
            depth += 1
            current_word = current_word.head
        word._.depth = depth
    return doc


def filter_by_depth(depths, tokens):
    if isinstance(depths, int):
        depths = set([depths])
    return [t for t in tokens if t._.depth in depths]


def shallowest_token(tokens):
    tokens = sort_by_depth(tokens)
    return tokens[0]


def sort_by_depth(tokens):
    return sorted(tokens, key=lambda w: (w._.depth, w.i))


def sort_by_idx(tokens):
    return sorted(tokens, key=lambda w: w.i)


def siblings(token, side=None):
    try:
        siblings = token.head.children
    except AttributeError:
        return []
    if side == 'left':
        siblings = [s for s in siblings if s.i < token.i]
    elif side == 'right':
        siblings = [s for s in siblings if s.i > token.i]
    return siblings


def doc_to_nx_graph(doc):
    edges = []
    for token in doc:
        for child in token.children:
            edges.append(('{0}-{1}'.format(token.text, token.i),
                          '{0}-{1}'.format(child.text, child.i)))
    graph = nx.Graph(edges)
    return graph


def shortest_dependency_path(nx_graph, doc, source, target):
    source = '{0}-{1}'.format(source.text, source.i)
    target = '{0}-{1}'.format(target.text, target.i)
    try:
        path = nx.shortest_path(nx_graph, source=source, target=target)
    except nx.exception.NetworkXNoPath:
        path = []
    dep_path = []
    for node in path:
        idx = int(node.split('-')[-1])
        token = doc[idx]
        dep_path.append(token)
    dep_path = sorted(dep_path, key=lambda t: t._.depth)
    return dep_path


def smallest_connected_subgraph(with_tokens, doc, nx_graph=None):
    # Find root nodes
    if not nx_graph:
        nx_graph = doc_to_nx_graph(doc)
    try:
        doc[0]._.depth
    except AttributeError:
        annotate_token_depth(doc)
    min_depth = min([t._.depth for t in with_tokens])
    roots = [t for t in with_tokens if t._.depth == min_depth]
    non_roots = [t for t in with_tokens if t not in roots]
    tokens_touched = roots + non_roots
    # For each non-root token, trace paths to each root.
    # This will touch every non-root token we're looking for.
    for token in non_roots:
        for root in roots:
            path = shortest_dependency_path(nx_graph, doc, token, root)
            for t in path:
                if t not in tokens_touched:
                    tokens_touched.append(t)
    tokens_touched = sorted(tokens_touched, key=lambda t: t.i)
    # Trace paths between roots
    for root_x, root_y in itertools.combinations(roots, 2):
        path = shortest_dependency_path(nx_graph, doc, root_x, root_y)
        for t in path:
            if t not in tokens_touched:
                tokens_touched.append(t)
    return tokens_touched


def idxs_to_tokens(doc, idxs):
    return [doc[idx] for idx in idxs]


def token_idxs(tokens):
    return [t.i for t in tokens]


def de_duplicate_list(list_):
    unique_list = []
    for item in list_:
        if item not in unique_list:
            unique_list.append(item)
    return unique_list


def list_contains_duplicates(list_):
    unique_list = de_duplicate_list(list_)
    if len(list_) > len(unique_list):
        return True
    return False


def features_are_in_pattern(features, pattern):
    for pattern_element in pattern:
        for feature in features:
            if feature not in pattern_element['PATTERN']:
                return False
    return True


def flatten_list(list_):
    return list(itertools.chain(*list_))

--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyclecycle/spacy-pattern-builder/51a1eb9a2cbd56163103e0e903af585442f8f912/test/__init__.py

--------------------------------------------------------------------------------
/test/test_spacy_pattern_builder.py:
--------------------------------------------------------------------------------
'''
Tests for `spacy-pattern-builder` module.
'''
import pytest
from pprint import pprint
import json
import en_core_web_sm
from spacy.tokens import Token
from spacy_pattern_builder import (
    build_dependency_pattern,
    yield_pattern_permutations,
    yield_node_level_pattern_variants,
    yield_extended_trees,
)
from spacy_pattern_builder.exceptions import (
    TokensNotFullyConnectedError,
    DuplicateTokensError,
)
import spacy_pattern_builder.util as util
import spacy_pattern_builder.match as match


nlp = en_core_web_sm.load()

text1 = 'We introduce efficient methods for fitting Boolean models to molecular data, successfully demonstrating their application to synthetic time courses generated by a number of established clock models, as well as experimental expression levels measured using luciferase imaging.'

text2 = 'Moreover, again only in sCON individuals, we observed a significant positive correlation between ASL and wine in overlapping left parietal WM indicating better baseline brain perfusion.'

text3 = 'We focused on green tea and performed a systematic review of observational studies that examined the association between green tea intake and dementia, Alzheimer\'s disease, mild cognitive impairment, or cognitive impairment.'

text4 = 'L-theanine alone improved self-reported relaxation, tension, and calmness starting at 200 mg.'

doc1 = nlp(text1)
doc2 = nlp(text2)
doc3 = nlp(text3)
doc4 = nlp(text4)

cases = [
    {
        'example': {
            'doc': doc1,
            'match': util.idxs_to_tokens(doc1, [0, 1, 3]),  # [We, introduce, methods]
        }
    },
    {
        'example': {
            'doc': doc1,
            'match': util.idxs_to_tokens(
                doc1, [13, 15, 16, 19]
            ),  # [demonstrating, application, to, courses]
        }
    },
    {
        'example': {
            'doc': doc3,
            'match': util.idxs_to_tokens(doc3, [0, 1, 2, 4]),  # [We, focused, on, tea]
        },
        'should_miss': [
            {
                'doc': doc2,
                'match': util.idxs_to_tokens(
                    doc2, [4, 8, 9, 18]
                ),  # [in, we, observed, in]
            }
        ],
    },
    {
        'example': {
            'doc': doc4,
            'match': util.idxs_to_tokens(
                doc4, [2, 4, 8]
            ),  # [theanine, relaxation, improved]
        }
    },
]


class TestSpacyPatternBuilder(object):
    def test_build_pattern(self):
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
        for i, case in enumerate(cases):
            doc = case['example']['doc']
            match_example = case['example']['match']
            pattern = build_dependency_pattern(doc, match_example, feature_dict)
            matches = match.find_matches(doc, pattern)
            assert match_example in matches, 'does not match example'
            pattern_file_name = 'examples/pattern_{}.json'.format(i)
            with open(pattern_file_name, 'w') as f:
                json.dump(pattern, f, indent=2)
            if 'should_hit' in case:
                for item in case['should_hit']:
                    doc = item['doc']
                    hit_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert hit_match in matches, 'false negative'
            if 'should_miss' in case:
                for item in case['should_miss']:
                    doc = item['doc']
                    miss_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert miss_match not in matches, 'false positive'

    def test_custom_extension(self):
        Token.set_extension('custom_attr', default=False)
        feature_dict = {'DEP': 'dep_', '_': {'custom_attr': 'custom_attr'}}
        for i, case in enumerate(cases):
            doc = case['example']['doc']
            for token in doc:
                token._.custom_attr = 'my_attr'
            match_example = case['example']['match']
            pattern = build_dependency_pattern(doc, match_example, feature_dict)
            matches = match.find_matches(doc, pattern)
            assert match_example in matches, 'does not match example'
            pattern_file_name = 'examples/pattern_{}.json'.format(i)
            with open(pattern_file_name, 'w') as f:
                json.dump(pattern, f, indent=2)
            if 'should_hit' in case:
                for item in case['should_hit']:
                    doc = item['doc']
                    hit_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert hit_match in matches, 'false negative'
            if 'should_miss' in case:
                for item in case['should_miss']:
                    doc = item['doc']
                    miss_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert miss_match not in matches, 'false positive'

    def test_tokens_not_connected_error(self):
        doc = doc1
        match_examples = [
            util.idxs_to_tokens(
                doc, [19, 20, 21, 27]
            )  # [courses, generated, by, models]
        ]
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
        for match_example in match_examples:
            with pytest.raises(TokensNotFullyConnectedError):
                build_dependency_pattern(doc, match_example, feature_dict)

    def test_duplicate_tokens_error(self):
        doc = doc1
        match_examples = [
            util.idxs_to_tokens(
                doc, [0, 1, 1, 3]
            )  # [We, introduce, introduce, methods]
        ]
        for match_example in match_examples:
            with pytest.raises(DuplicateTokensError):
                build_dependency_pattern(doc, match_example)

    def test_yield_node_level_pattern_variants(self):
        # Build initial pattern
        doc = doc1
        match_tokens = util.idxs_to_tokens(doc, [0, 1, 3])  # [We, introduce, methods]
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
        pattern = build_dependency_pattern(doc, match_tokens, feature_dict)

        feature_dicts = (
            {'DEP': 'dep_', 'TAG': 'tag_'},
            {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'},
        )
        pattern_variants = list(
            yield_node_level_pattern_variants(pattern, match_tokens, feature_dicts)
        )
        assert not util.list_contains_duplicates(pattern_variants)
        n_variants = len(pattern_variants)
        assert n_variants == len(feature_dicts) ** len(pattern)
        for pattern_variant in pattern_variants:
            matches = match.find_matches(doc, pattern_variant)
            assert match_tokens in matches

        # Test mutate_tokens parameter
        pattern_variants = list(
            yield_node_level_pattern_variants(
                pattern, match_tokens, feature_dicts, mutate_tokens=[match_tokens[1]]
            )
        )
        n_variants = len(pattern_variants)
        assert n_variants == len(feature_dicts) ** len(pattern)
        for pattern_variant in pattern_variants:
            matches = match.find_matches(doc, pattern_variant)
            assert match_tokens in matches

    def test_yield_extended_trees(self):
        # Build initial pattern
        doc = doc1
        match_tokens = util.idxs_to_tokens(doc, [0, 1, 3])  # [We, introduce, methods]
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'}
        pattern = build_dependency_pattern(doc, match_tokens, feature_dict)

        match_tokens_variants = list(yield_extended_trees(match_tokens))

        pattern_variants = [
            build_dependency_pattern(doc, match_token_variant, feature_dict)
            for match_token_variant in match_tokens_variants
        ]

        assert not util.list_contains_duplicates(pattern_variants)
        n_variants = len(pattern_variants)
        for pattern_variant, match_tokens_variant in zip(
            pattern_variants, match_tokens_variants
        ):
            matches = match.find_matches(doc, pattern_variant)
            match_tokens_variant = sorted(match_tokens_variant, key=lambda t: t.i)
            assert match_tokens_variant in matches

    # def test_yield_pattern_permutations(self):
    #     doc = doc1
    #     match_example = util.idxs_to_tokens(doc, [0, 1, 3])  # [We, introduce, methods]
    #     feature_dict = {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'}
    #     pattern = build_dependency_pattern(doc, match_example, feature_dict)

    #     feature_sets = (('DEP', 'TAG'), ('DEP', 'TAG', 'LOWER'))
    #     pattern_variants = list(yield_pattern_permutations(pattern, feature_sets))
    #     assert not util.list_contains_duplicates(pattern_variants)
    #     n_variants = len(pattern_variants)
    #     assert n_variants == len(feature_sets) ** len(pattern)
    #     for pattern_variant in pattern_variants:
    #         matches = match.find_matches(doc, pattern_variant)
    #         assert match_example in matches

    #     feature_sets = (('DEP',), ('DEP', 'TAG'), ('DEP', 'TAG', 'LOWER'))
    #     pattern_variants = list(yield_pattern_permutations(pattern, feature_sets))
    #     assert not util.list_contains_duplicates(pattern_variants)
    #     n_variants = len(pattern_variants)
    #     assert n_variants == len(feature_sets) ** len(pattern)
    #     for pattern_variant in pattern_variants:
    #         matches = match.find_matches(doc, pattern_variant)
    #         assert match_example in matches
--------------------------------------------------------------------------------