├── .gitignore
├── LICENSE
├── README.md
├── deploy.sh
├── examples
│   ├── example_1.py
│   ├── pattern_0.json
│   ├── pattern_1.json
│   ├── pattern_2.json
│   └── pattern_3.json
├── requirements.txt
├── requirements_dev.txt
├── setup.py
├── spacy_pattern_builder
│   ├── __init__.py
│   ├── build.py
│   ├── exceptions.py
│   ├── match.py
│   ├── mutate.py
│   └── util.py
└── test
    ├── __init__.py
    └── test_spacy_pattern_builder.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
experiments

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2019 Nick Morley

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# SpaCy Pattern Builder

Use training examples to build and refine patterns for use with SpaCy's DependencyMatcher.

## Motivation

Writing dependency-match patterns by hand is slow and error-prone; generating them programmatically from training examples is more efficient.

## Installation

With pip:

```bash
pip install spacy-pattern-builder
```

## Usage

```python
# Import a SpaCy model and parse a string to create a Doc object
import en_core_web_sm

text = 'We introduce efficient methods for fitting Boolean models to molecular data.'
nlp = en_core_web_sm.load()
doc = nlp(text)

from spacy_pattern_builder import build_dependency_pattern

# Provide a list of tokens we want to match.
match_tokens = [doc[i] for i in [0, 1, 3]]  # [We, introduce, methods]

''' Note that these tokens must be fully connected. That is,
all tokens must have a path to all other tokens in the list,
without needing to traverse tokens outside of the list.
Otherwise, spacy-pattern-builder will raise a TokensNotFullyConnectedError.
You can get a connected set that includes your tokens with the following: '''
from spacy_pattern_builder import util
connected_tokens = util.smallest_connected_subgraph(match_tokens, doc)
assert match_tokens == connected_tokens  # In this case, the tokens we provided are already fully connected

# Specify the token attributes / features to use
feature_dict = {  # This is equal to the default feature_dict
    'DEP': 'dep_',
    'TAG': 'tag_'
}

# Build the pattern
pattern = build_dependency_pattern(doc, match_tokens, feature_dict=feature_dict)

from pprint import pprint
pprint(pattern)  # In the format consumed by SpaCy's DependencyMatcher:
'''
[{'PATTERN': {'DEP': 'ROOT', 'TAG': 'VBP'}, 'SPEC': {'NODE_NAME': 'node1'}},
 {'PATTERN': {'DEP': 'nsubj', 'TAG': 'PRP'},
  'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node0'}},
 {'PATTERN': {'DEP': 'dobj', 'TAG': 'NNS'},
  'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node3'}}]
'''

# Create a matcher and add the newly generated pattern
from spacy.matcher import DependencyMatcher

matcher = DependencyMatcher(doc.vocab)
matcher.add('pattern', None, pattern)

# And get matches. Each match carries a list of matched trees,
# each of which is a list of token indices.
matches = matcher(doc)
for match_id, trees in matches:
    for token_idxs in trees:
        tokens = [doc[i] for i in token_idxs]
        tokens = sorted(tokens, key=lambda w: w.i)  # Make sure tokens are in their original order
        print(tokens)  # [We, introduce, methods]
```
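## Serializing patterns

Patterns are plain lists of dictionaries, so they serialize naturally to JSON (the test suite writes the `examples/pattern_*.json` files in this repository the same way). A minimal sketch, continuing from the usage example above; the `pattern.json` file name is illustrative:

```python
import json

# Save the pattern built above for later reuse
with open('pattern.json', 'w') as f:
    json.dump(pattern, f, indent=2)

# Reload it and register it with a fresh matcher
with open('pattern.json') as f:
    saved_pattern = json.load(f)

matcher = DependencyMatcher(doc.vocab)
matcher.add('saved_pattern', None, saved_pattern)
```

## Generating pattern variants

The helpers in `spacy_pattern_builder.mutate` generate variants of a pattern, which can be useful when refining a pattern against further examples. A sketch based on the package's test suite, reusing `pattern` and `match_tokens` from the usage example above (the feature dicts shown are illustrative):

```python
from spacy_pattern_builder import yield_node_level_pattern_variants

feature_dicts = (
    {'DEP': 'dep_', 'TAG': 'tag_'},
    {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'},
)

# Yields one pattern per combination of feature dicts over the pattern's nodes
pattern_variants = list(
    yield_node_level_pattern_variants(pattern, match_tokens, feature_dicts)
)
```
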
## Acknowledgements

Uses:

- [SpaCy](https://spacy.io)
- [networkx](https://github.com/networkx/networkx)

--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
deactivate
rm -rf dist
rm -rf build
python3 setup.py sdist bdist_wheel
python3 -m twine upload dist/*

--------------------------------------------------------------------------------
/examples/example_1.py:
--------------------------------------------------------------------------------
# Import a SpaCy model and parse a string to create a Doc object
import en_core_web_sm

text = 'We introduce efficient methods for fitting Boolean models to molecular data.'
nlp = en_core_web_sm.load()
doc = nlp(text)

from spacy_pattern_builder import build_dependency_pattern

# Provide a list of tokens we want to match.
match_tokens = [doc[i] for i in [0, 1, 3]]  # [We, introduce, methods]

''' Note that these tokens must be fully connected. That is,
all tokens must have a path to all other tokens in the list,
without needing to traverse tokens outside of the list.
Otherwise, spacy-pattern-builder will raise a TokensNotFullyConnectedError.
You can get a connected set that includes your tokens with the following: '''
from spacy_pattern_builder import util
connected_tokens = util.smallest_connected_subgraph(match_tokens, doc)
assert match_tokens == connected_tokens

# Specify the token attributes / features to use
feature_dict = {  # This is equal to the default feature_dict
    'DEP': 'dep_',
    'TAG': 'tag_'
}

# Build the pattern
pattern = build_dependency_pattern(doc, match_tokens, feature_dict=feature_dict)

from pprint import pprint
pprint(pattern)  # In the format consumed by SpaCy's DependencyMatcher:
'''
[{'PATTERN': {'DEP': 'ROOT', 'TAG': 'VBP'}, 'SPEC': {'NODE_NAME': 'node1'}},
 {'PATTERN': {'DEP': 'nsubj', 'TAG': 'PRP'},
  'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node0'}},
 {'PATTERN': {'DEP': 'dobj', 'TAG': 'NNS'},
  'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node3'}}]
'''

# Create a matcher and add the newly generated pattern
from spacy.matcher import DependencyMatcher

matcher = DependencyMatcher(doc.vocab)
matcher.add('pattern', None, pattern)

# And match away
matches = matcher(doc)
for match_id, trees in matches:
    for token_idxs in trees:
        tokens = [doc[i] for i in token_idxs]
        tokens = sorted(tokens, key=lambda w: w.i)
        print(tokens)  # [We, introduce, methods]

--------------------------------------------------------------------------------
/examples/pattern_0.json:
--------------------------------------------------------------------------------
[
  {
    "SPEC": {
      "NODE_NAME": "node1"
    },
    "PATTERN": {
      "DEP": "ROOT",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  },
  {
    "SPEC": {
      "NODE_NAME": "node0",
      "NBOR_NAME": "node1",
      "NBOR_RELOP": ">"
    },
    "PATTERN": {
      "DEP": "nsubj",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  },
  {
    "SPEC": {
      "NODE_NAME": "node3",
      "NBOR_NAME": "node0",
      "NBOR_RELOP": "$--"
    },
    "PATTERN": {
      "DEP": "dobj",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  }
]

--------------------------------------------------------------------------------
/examples/pattern_1.json:
--------------------------------------------------------------------------------
[
  {
    "SPEC": {
      "NODE_NAME": "node13"
    },
    "PATTERN": {
      "DEP": "advcl",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  },
  {
    "SPEC": {
      "NODE_NAME": "node15",
      "NBOR_NAME": "node13",
      "NBOR_RELOP": ">"
    },
    "PATTERN": {
      "DEP": "dobj",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  },
  {
    "SPEC": {
      "NODE_NAME": "node16",
      "NBOR_NAME": "node15",
      "NBOR_RELOP": "$--"
    },
    "PATTERN": {
      "DEP": "prep",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  },
  {
    "SPEC": {
      "NODE_NAME": "node19",
      "NBOR_NAME": "node16",
      "NBOR_RELOP": ">"
    },
    "PATTERN": {
      "DEP": "pobj",
      "_": {
        "custom_attr": "my_attr"
      }
    }
  }
]
"NBOR_RELOP": "$--" 31 | }, 32 | "PATTERN": { 33 | "DEP": "prep", 34 | "_": { 35 | "custom_attr": "my_attr" 36 | } 37 | } 38 | }, 39 | { 40 | "SPEC": { 41 | "NODE_NAME": "node19", 42 | "NBOR_NAME": "node16", 43 | "NBOR_RELOP": ">" 44 | }, 45 | "PATTERN": { 46 | "DEP": "pobj", 47 | "_": { 48 | "custom_attr": "my_attr" 49 | } 50 | } 51 | } 52 | ] -------------------------------------------------------------------------------- /examples/pattern_2.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "SPEC": { 4 | "NODE_NAME": "node1" 5 | }, 6 | "PATTERN": { 7 | "DEP": "ROOT", 8 | "_": { 9 | "custom_attr": "my_attr" 10 | } 11 | } 12 | }, 13 | { 14 | "SPEC": { 15 | "NODE_NAME": "node0", 16 | "NBOR_NAME": "node1", 17 | "NBOR_RELOP": ">" 18 | }, 19 | "PATTERN": { 20 | "DEP": "nsubj", 21 | "_": { 22 | "custom_attr": "my_attr" 23 | } 24 | } 25 | }, 26 | { 27 | "SPEC": { 28 | "NODE_NAME": "node2", 29 | "NBOR_NAME": "node0", 30 | "NBOR_RELOP": "$--" 31 | }, 32 | "PATTERN": { 33 | "DEP": "prep", 34 | "_": { 35 | "custom_attr": "my_attr" 36 | } 37 | } 38 | }, 39 | { 40 | "SPEC": { 41 | "NODE_NAME": "node4", 42 | "NBOR_NAME": "node2", 43 | "NBOR_RELOP": ">" 44 | }, 45 | "PATTERN": { 46 | "DEP": "pobj", 47 | "_": { 48 | "custom_attr": "my_attr" 49 | } 50 | } 51 | } 52 | ] -------------------------------------------------------------------------------- /examples/pattern_3.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "SPEC": { 4 | "NODE_NAME": "node4" 5 | }, 6 | "PATTERN": { 7 | "DEP": "ROOT", 8 | "_": { 9 | "custom_attr": "my_attr" 10 | } 11 | } 12 | }, 13 | { 14 | "SPEC": { 15 | "NODE_NAME": "node2", 16 | "NBOR_NAME": "node4", 17 | "NBOR_RELOP": ">" 18 | }, 19 | "PATTERN": { 20 | "DEP": "nsubj", 21 | "_": { 22 | "custom_attr": "my_attr" 23 | } 24 | } 25 | }, 26 | { 27 | "SPEC": { 28 | "NODE_NAME": "node8", 29 | "NBOR_NAME": "node2", 30 | "NBOR_RELOP": "$--" 31 | }, 32 | "PATTERN": { 33 | "DEP": "dobj", 34 | "_": { 35 | "custom_attr": "my_attr" 36 | } 37 | } 38 | } 39 | ] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | spacy 2 | networkx 3 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz 2 | visualise-spacy-tree -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | readme = open('README.md').read() 4 | requirements = open('requirements.txt').read().splitlines() 5 | 6 | setup( 7 | name='spacy-pattern-builder', 8 | version='0.0.7', 9 | description='Reverse engineer patterns for use with the SpaCy DependencyTreeMatcher', 10 | long_description=readme, 11 | long_description_content_type='text/markdown', 12 | author='Nick Morley', 13 | author_email='nick.morley111@gmail.com', 14 | url='https://github.com/cyclecycle/spacy-pattern-builder', 15 | packages=find_packages(), 16 | package_dir={'spacy-pattern-builder': 'spacy-pattern-builder'}, 17 | include_package_data=True, 18 | install_requires=requirements, 19 | license='MIT', 20 | zip_safe=False, 21 | 
--------------------------------------------------------------------------------
/spacy_pattern_builder/__init__.py:
--------------------------------------------------------------------------------
from spacy_pattern_builder.build import build_dependency_pattern
import spacy_pattern_builder.util
import spacy_pattern_builder.exceptions
import spacy_pattern_builder.mutate
import spacy_pattern_builder.match
from spacy_pattern_builder.mutate import (
    yield_pattern_permutations,
    yield_node_level_pattern_variants,
    yield_extended_trees,
)

--------------------------------------------------------------------------------
/spacy_pattern_builder/build.py:
--------------------------------------------------------------------------------
import itertools
from pprint import pprint
import spacy_pattern_builder.util as util
from spacy_pattern_builder.exceptions import (
    TokensNotFullyConnectedError,
    DuplicateTokensError,
    TokenNotInMatchTokensError,
)


DEFAULT_BUILD_PATTERN_FEATURE_DICT = {
    'DEP': 'dep_',
    'TAG': 'tag_'
}


def node_name(token):
    return 'node{0}'.format(token.i)


def node_features(token, feature_dict):
    native_feature_dict = {name: feature for name, feature in feature_dict.items() if name != '_'}
    extension_feature_dict = feature_dict.get('_', None)
    node_features = {
        name: getattr(token, feature) for name, feature in native_feature_dict.items()
    }
    if extension_feature_dict:
        extension_node_features = {
            name: getattr(token._, feature) for name, feature in extension_feature_dict.items()
        }
        node_features['_'] = extension_node_features
    return node_features


def build_pattern_element(token, feature_dict, nbor=None, operator='>'):
    features = node_features(token, feature_dict)
    if not nbor:
        pattern_element = {
            'SPEC': {'NODE_NAME': node_name(token)},
            'PATTERN': features
        }
    else:
        pattern_element = {
            'SPEC': {
                'NODE_NAME': node_name(token),
                'NBOR_NAME': node_name(nbor),
                'NBOR_RELOP': operator
            },
            'PATTERN': features
        }
    return pattern_element


def build_dependency_pattern(doc, match_tokens, feature_dict=DEFAULT_BUILD_PATTERN_FEATURE_DICT, nx_graph=None):
    '''Build a dependency pattern for use with DependencyMatcher that will match the set of tokens provided in "match_tokens". This set of tokens must form a fully connected graph.

    Arguments:
        doc {SpaCy Doc object}
        match_tokens {list} -- Set of tokens to match with the resulting dependency pattern
        feature_dict {dict} -- Mapping of pattern attribute names to the spaCy token attributes to match on
        nx_graph {NetworkX object} -- Graph representing the doc dependency tree

    Returns:
        [list] -- Dependency pattern in the format consumed by SpaCy's DependencyMatcher
    '''
    # Checks
    if not nx_graph:
        nx_graph = util.doc_to_nx_graph(doc)
    util.annotate_token_depth(doc)
    connected_tokens = util.smallest_connected_subgraph(
        match_tokens, doc, nx_graph=nx_graph)
    match_token_ids = util.token_idxs(match_tokens)
    connected_token_ids = util.token_idxs(connected_tokens)
    tokens_not_fully_connected = set(match_token_ids) != set(connected_token_ids)
    if tokens_not_fully_connected:
        raise TokensNotFullyConnectedError('Try expanding the training example to include all tokens in between those you are trying to match. Or, try the "role-pattern-nlp" module which handles this for you.')
    tokens_contain_duplicates = util.list_contains_duplicates(match_tokens)
    if tokens_contain_duplicates:
        raise DuplicateTokensError('Ensure the match_tokens is a unique list of tokens.')
    match_tokens = util.sort_by_depth(match_tokens)  # Process tokens root-first (shallowest depth first)
    dependency_pattern = []
    root_token = match_tokens[0]
    pattern_element = build_pattern_element(root_token, feature_dict)
    dependency_pattern.append(pattern_element)
    tokens_in_pattern = [root_token]
    non_root_tokens = match_tokens[1:]
    for token in non_root_tokens:
        # If the token is a right sibling of a token already in the pattern, add a sibling relationship.
        left_siblings = util.siblings(token, side='left')
        left_siblings_in_pattern = [t for t in left_siblings if t in tokens_in_pattern]
        if left_siblings_in_pattern:
            last_left_sibling_in_pattern = left_siblings_in_pattern[-1]
            pattern_element = build_pattern_element(
                token, feature_dict, nbor=last_left_sibling_in_pattern, operator='$--')
            dependency_pattern.append(pattern_element)
        else:  # Parent-child relation
            head = token.head
            if head not in match_tokens:
                raise TokenNotInMatchTokensError('Head token not in match_tokens. Is match_tokens fully connected?')
            pattern_element = build_pattern_element(token, feature_dict, nbor=head, operator='>')
            dependency_pattern.append(pattern_element)
        tokens_in_pattern.append(token)
    return dependency_pattern

--------------------------------------------------------------------------------
/spacy_pattern_builder/exceptions.py:
--------------------------------------------------------------------------------
class TokensNotFullyConnectedError(Exception):
    pass


class DuplicateTokensError(Exception):
    pass


class TokenNotInMatchTokensError(Exception):
    pass


class FeaturesMissingFromPatternError(Exception):
    pass

--------------------------------------------------------------------------------
/spacy_pattern_builder/match.py:
--------------------------------------------------------------------------------
from spacy.matcher import DependencyMatcher


def build_matcher(vocab, pattern):
    matcher = DependencyMatcher(vocab)
    matcher.add('pattern', None, pattern)
    return matcher


def find_matches(doc, pattern):
    matcher = build_matcher(doc.vocab, pattern)
    matches = matcher(doc)
    match_list = []
    for match_id, match_trees in matches:
        for token_idxs in match_trees:
            tokens = [doc[idx] for idx in token_idxs]
            tokens = sorted(tokens, key=lambda t: t.i)
            match_list.append(tokens)
    return match_list

--------------------------------------------------------------------------------
/spacy_pattern_builder/mutate.py:
--------------------------------------------------------------------------------
'''Generate pattern variants
'''
from pprint import pprint
import itertools
from spacy_pattern_builder import build
from spacy_pattern_builder import util
from spacy_pattern_builder.exceptions import FeaturesMissingFromPatternError


def yield_pattern_permutations(pattern, feature_sets):
    # First check that all features in feature_sets are present in all pattern elements
    all_features = set(util.flatten_list(feature_sets))
    all_features_are_in_pattern = util.features_are_in_pattern(all_features, pattern)
    if not all_features_are_in_pattern:
        raise FeaturesMissingFromPatternError(
            'Tried to create pattern permutations using features that are not present in the pattern. Ensure the pattern has all the features specified in feature_sets.'
        )
    pattern_element_combinations = []
    for pattern_element in pattern:
        token_features = pattern_element['PATTERN']
        new_pattern_elements = []
        for feature_set in feature_sets:
            new_token_features = {
                k: v for k, v in token_features.items() if k in feature_set
            }
            new_pattern_element = {
                'SPEC': pattern_element['SPEC'],
                'PATTERN': new_token_features,
            }
            new_pattern_elements.append(new_pattern_element)
        pattern_element_combinations.append(new_pattern_elements)
    return itertools.product(*pattern_element_combinations)


def yield_node_level_pattern_variants(pattern, match_tokens, feature_dicts, mutate_tokens=None):
    # Sort tokens by depth so they align one-to-one with the pattern elements
    if not mutate_tokens:
        mutate_tokens = match_tokens
    match_tokens = util.sort_by_depth(match_tokens)
    pattern_element_combinations = []
    for pattern_element, token in zip(pattern, match_tokens):
        if token not in mutate_tokens:
            new_pattern_elements = [pattern_element for _ in feature_dicts]
        else:
            new_pattern_elements = []
            for feature_dict in feature_dicts:
                new_token_features = build.node_features(token, feature_dict)
                new_pattern_element = {
                    'SPEC': pattern_element['SPEC'],
                    'PATTERN': new_token_features,
                }
                new_pattern_elements.append(new_pattern_element)
        pattern_element_combinations.append(new_pattern_elements)
    pattern_variants = itertools.product(*pattern_element_combinations)
    for variant in pattern_variants:
        assert len(variant) == len(pattern)
        yield variant


def yield_extended_trees(match_tokens):
    min_depth = min([t._.depth for t in match_tokens])
    extend_by = []
    for token in match_tokens:
        is_root = token._.depth == min_depth
        if is_root:
            extend_by.append(token.head)
        extend_by += token.children
        if not is_root:
            # Only extend by siblings if the token is not the root, as extending by the
            # root's siblings would also require adding the common head that connects
            # them, or else the tokens would not be fully connected.
            extend_by += util.siblings(token)
    extend_by = [t for t in extend_by if t]
    extend_by = [t for t in extend_by if t not in match_tokens]
    extend_by = util.de_duplicate_list(extend_by)
    for node in extend_by:
        match_token_variant = match_tokens + [node]
        yield match_token_variant

--------------------------------------------------------------------------------
/spacy_pattern_builder/util.py:
--------------------------------------------------------------------------------
import itertools
import networkx as nx
from spacy.tokens import Token


def annotate_token_depth(doc):
    '''Annotate token depth in the syntactic tree'''
    Token.set_extension('depth', default=None, force=True)
    for word in doc:
        depth = 0
        current_word = word
        while not current_word == current_word.head:
            depth += 1
            current_word = current_word.head
        word._.depth = depth
    return doc


def filter_by_depth(depths, tokens):
    if isinstance(depths, int):
        depths = set([depths])
    return [t for t in tokens if t._.depth in depths]


def shallowest_token(tokens):
    tokens = sort_by_depth(tokens)
    return tokens[0]


def sort_by_depth(tokens):
    return sorted(tokens, key=lambda w: (w._.depth, w.i))


def sort_by_idx(tokens):
    return sorted(tokens, key=lambda w: w.i)


def siblings(token, side=None):
    try:
        siblings = token.head.children
    except AttributeError:
        return []
    if side == 'left':
        siblings = [s for s in siblings if s.i < token.i]
    elif side == 'right':
        siblings = [s for s in siblings if s.i > token.i]
    return siblings


def doc_to_nx_graph(doc):
    edges = []
    for token in doc:
        for child in token.children:
            edges.append(('{0}-{1}'.format(token.text, token.i),
                          '{0}-{1}'.format(child.text, child.i)))
    graph = nx.Graph(edges)
    return graph


def shortest_dependency_path(nx_graph, doc, source, target):
    source = '{0}-{1}'.format(source.text, source.i)
    target = '{0}-{1}'.format(target.text, target.i)
    try:
        path = nx.shortest_path(nx_graph, source=source, target=target)
    except nx.exception.NetworkXNoPath:
        path = []
    dep_path = []
    for node in path:
        idx = int(node.split('-')[-1])
        token = doc[idx]
        dep_path.append(token)
    dep_path = sorted(dep_path, key=lambda t: t._.depth)
    return dep_path


def smallest_connected_subgraph(with_tokens, doc, nx_graph=None):
    # Find root nodes
    if not nx_graph:
        nx_graph = doc_to_nx_graph(doc)
    try:
        doc[0]._.depth
    except AttributeError:
        annotate_token_depth(doc)
    min_depth = min([t._.depth for t in with_tokens])
    roots = [t for t in with_tokens if t._.depth == min_depth]
    non_roots = [t for t in with_tokens if t not in roots]
    tokens_touched = roots + non_roots
    # For each non-root token, trace paths to each root.
    # This will touch every non-root token we're looking for.
    for token in non_roots:
        for root in roots:
            path = shortest_dependency_path(nx_graph, doc, token, root)
            for t in path:
                if t not in tokens_touched:
                    tokens_touched.append(t)
    tokens_touched = sorted(tokens_touched, key=lambda t: t.i)
    # Trace paths between roots
    for root_x, root_y in itertools.combinations(roots, 2):
        path = shortest_dependency_path(nx_graph, doc, root_x, root_y)
        for t in path:
            if t not in tokens_touched:
                tokens_touched.append(t)
    return tokens_touched


def idxs_to_tokens(doc, idxs):
    return [doc[idx] for idx in idxs]


def token_idxs(tokens):
    return [t.i for t in tokens]


def de_duplicate_list(list_):
    unique_list = []
    for item in list_:
        if item not in unique_list:
            unique_list.append(item)
    return unique_list


def list_contains_duplicates(list_):
    unique_list = de_duplicate_list(list_)
    if len(list_) > len(unique_list):
        return True
    return False


def features_are_in_pattern(features, pattern):
    for pattern_element in pattern:
        for feature in features:
            if feature not in pattern_element['PATTERN']:
                return False
    return True


def flatten_list(list_):
    return list(itertools.chain(*list_))

--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyclecycle/spacy-pattern-builder/51a1eb9a2cbd56163103e0e903af585442f8f912/test/__init__.py

--------------------------------------------------------------------------------
/test/test_spacy_pattern_builder.py:
--------------------------------------------------------------------------------
'''
Tests for `spacy-pattern-builder` module.
'''
import pytest
from pprint import pprint
import json
import en_core_web_sm
from spacy.tokens import Token
from spacy_pattern_builder import (
    build_dependency_pattern,
    yield_pattern_permutations,
    yield_node_level_pattern_variants,
    yield_extended_trees,
)
from spacy_pattern_builder.exceptions import (
    TokensNotFullyConnectedError,
    DuplicateTokensError,
)
import spacy_pattern_builder.util as util
import spacy_pattern_builder.match as match


nlp = en_core_web_sm.load()

text1 = 'We introduce efficient methods for fitting Boolean models to molecular data, successfully demonstrating their application to synthetic time courses generated by a number of established clock models, as well as experimental expression levels measured using luciferase imaging.'

text2 = 'Moreover, again only in sCON individuals, we observed a significant positive correlation between ASL and wine in overlapping left parietal WM indicating better baseline brain perfusion.'

text3 = 'We focused on green tea and performed a systematic review of observational studies that examined the association between green tea intake and dementia, Alzheimer\'s disease, mild cognitive impairment, or cognitive impairment.'

text4 = 'L-theanine alone improved self-reported relaxation, tension, and calmness starting at 200 mg.'

doc1 = nlp(text1)
doc2 = nlp(text2)
doc3 = nlp(text3)
doc4 = nlp(text4)

cases = [
    {
        'example': {
            'doc': doc1,
            'match': util.idxs_to_tokens(doc1, [0, 1, 3]),  # [We, introduce, methods]
        }
    },
    {
        'example': {
            'doc': doc1,
            'match': util.idxs_to_tokens(
                doc1, [13, 15, 16, 19]
            ),  # [demonstrating, application, to, courses]
        }
    },
    {
        'example': {
            'doc': doc3,
            'match': util.idxs_to_tokens(doc3, [0, 1, 2, 4]),  # [We, focused, on, tea]
        },
        'should_miss': [
            {
                'doc': doc2,
                'match': util.idxs_to_tokens(
                    doc2, [4, 8, 9, 18]
                ),  # [in, we, observed, in]
            }
        ],
    },
    {
        'example': {
            'doc': doc4,
            'match': util.idxs_to_tokens(
                doc4, [2, 4, 8]
            ),  # [theanine, relaxation, improved]
        }
    },
]


class TestSpacyPatternBuilder(object):
    def test_build_pattern(self):
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
        for i, case in enumerate(cases):
            doc = case['example']['doc']
            match_example = case['example']['match']
            pattern = build_dependency_pattern(doc, match_example, feature_dict)
            matches = match.find_matches(doc, pattern)
            assert match_example in matches, 'does not match example'
            pattern_file_name = 'examples/pattern_{}.json'.format(i)
            with open(pattern_file_name, 'w') as f:
                json.dump(pattern, f, indent=2)
            if 'should_hit' in case:
                for item in case['should_hit']:
                    doc = item['doc']
                    hit_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert hit_match in matches, 'false negative'
            if 'should_miss' in case:
                for item in case['should_miss']:
                    doc = item['doc']
                    miss_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert miss_match not in matches, 'false positive'

    def test_custom_extension(self):
        Token.set_extension('custom_attr', default=False)
        feature_dict = {'DEP': 'dep_', '_': {'custom_attr': 'custom_attr'}}
        for i, case in enumerate(cases):
            doc = case['example']['doc']
            for token in doc:
                token._.custom_attr = 'my_attr'
            match_example = case['example']['match']
            pattern = build_dependency_pattern(doc, match_example, feature_dict)
            matches = match.find_matches(doc, pattern)
            assert match_example in matches, 'does not match example'
            pattern_file_name = 'examples/pattern_{}.json'.format(i)
            with open(pattern_file_name, 'w') as f:
                json.dump(pattern, f, indent=2)
            if 'should_hit' in case:
                for item in case['should_hit']:
                    doc = item['doc']
                    hit_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert hit_match in matches, 'false negative'
            if 'should_miss' in case:
                for item in case['should_miss']:
                    doc = item['doc']
                    miss_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert miss_match not in matches, 'false positive'

    def test_tokens_not_connected_error(self):
        doc = doc1
        match_examples = [
            util.idxs_to_tokens(
                doc, [19, 20, 21, 27]
            )  # [courses, generated, by, models]
        ]
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
        for match_example in match_examples:
            with pytest.raises(TokensNotFullyConnectedError):
                build_dependency_pattern(doc, match_example, feature_dict)

    def test_duplicate_tokens_error(self):
        doc = doc1
        match_examples = [
            util.idxs_to_tokens(
                doc, [0, 1, 1, 3]
            )  # [We, introduce, introduce, methods]
        ]
        for match_example in match_examples:
            with pytest.raises(DuplicateTokensError):
                build_dependency_pattern(doc, match_example)

    def test_yield_node_level_pattern_variants(self):
        # Build initial pattern
        doc = doc1
        match_tokens = util.idxs_to_tokens(doc, [0, 1, 3])  # [We, introduce, methods]
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
        pattern = build_dependency_pattern(doc, match_tokens, feature_dict)

        feature_dicts = (
            {'DEP': 'dep_', 'TAG': 'tag_'},
            {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'},
        )
        pattern_variants = list(
            yield_node_level_pattern_variants(pattern, match_tokens, feature_dicts)
        )
        assert not util.list_contains_duplicates(pattern_variants)
        n_variants = len(pattern_variants)
        assert n_variants == len(feature_dicts) ** len(pattern)
        for pattern_variant in pattern_variants:
            matches = match.find_matches(doc, pattern_variant)
            assert match_tokens in matches

        # Test mutate_tokens parameter
        pattern_variants = list(
            yield_node_level_pattern_variants(
                pattern, match_tokens, feature_dicts, mutate_tokens=[match_tokens[1]]
            )
        )
        n_variants = len(pattern_variants)
        assert n_variants == len(feature_dicts) ** len(pattern)
        for pattern_variant in pattern_variants:
            matches = match.find_matches(doc, pattern_variant)
            assert match_tokens in matches

    def test_yield_extended_trees(self):
        # Build initial pattern
        doc = doc1
        match_tokens = util.idxs_to_tokens(doc, [0, 1, 3])  # [We, introduce, methods]
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'}
        pattern = build_dependency_pattern(doc, match_tokens, feature_dict)

        match_tokens_variants = list(yield_extended_trees(match_tokens))

        pattern_variants = [
            build_dependency_pattern(doc, match_token_variant, feature_dict)
            for match_token_variant in match_tokens_variants
        ]

        assert not util.list_contains_duplicates(pattern_variants)
        n_variants = len(pattern_variants)
        for pattern_variant, match_tokens_variant in zip(
            pattern_variants, match_tokens_variants
        ):
            matches = match.find_matches(doc, pattern_variant)
            match_tokens_variant = sorted(match_tokens_variant, key=lambda t: t.i)
            assert match_tokens_variant in matches

    # def test_yield_pattern_permutations(self):
    #     doc = doc1
    #     match_example = util.idxs_to_tokens(doc, [0, 1, 3])  # [We, introduce, methods]
    #     feature_dict = {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'}
    #     pattern = build_dependency_pattern(doc, match_example, feature_dict)

    #     feature_sets = (('DEP', 'TAG'), ('DEP', 'TAG', 'LOWER'))
    #     pattern_variants = list(yield_pattern_permutations(pattern, feature_sets))
    #     assert not util.list_contains_duplicates(pattern_variants)
    #     n_variants = len(pattern_variants)
    #     assert n_variants == len(feature_sets) ** len(pattern)
    #     for pattern_variant in pattern_variants:
    #         matches = match.find_matches(doc, pattern_variant)
    #         assert match_example in matches

    #     feature_sets = (('DEP',), ('DEP', 'TAG'), ('DEP', 'TAG', 'LOWER'))
    #     pattern_variants = list(yield_pattern_permutations(pattern, feature_sets))
    #     assert not util.list_contains_duplicates(pattern_variants)
    #     n_variants = len(pattern_variants)
    #     assert n_variants == len(feature_sets) ** len(pattern)
    #     for pattern_variant in pattern_variants:
    #         matches = match.find_matches(doc, pattern_variant)
    #         assert match_example in matches
--------------------------------------------------------------------------------